1 package org.apache.velocity.io;
2
3 /*
4 * Licensed to the Apache Software Foundation (ASF) under one
5 * or more contributor license agreements. See the NOTICE file
6 * distributed with this work for additional information
7 * regarding copyright ownership. The ASF licenses this file
8 * to you under the Apache License, Version 2.0 (the
9 * "License"); you may not use this file except in compliance
10 * with the License. You may obtain a copy of the License at
11 *
12 * http://www.apache.org/licenses/LICENSE-2.0
13 *
14 * Unless required by applicable law or agreed to in writing,
15 * software distributed under the License is distributed on an
16 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17 * KIND, either express or implied. See the License for the
18 * specific language governing permissions and limitations
19 * under the License.
20 */
21
22
23 import java.io.IOException;
24 import java.io.InputStream;
25 import java.io.PushbackInputStream;
26
27
28 /**
29 * This is an input stream that is unicode BOM aware. This allows you to e.g. read
30 * Windows Notepad Unicode files as Velocity templates.
31 *
 * It allows you to check the actual encoding of a file by calling {@link #getEncodingFromStream()} on
 * this input stream once it has been constructed.
34 *
35 * This class is not thread safe! When more than one thread wants to use an instance of UnicodeInputStream,
36 * the caller must provide synchronization.
37 *
38 * @author <a href="mailto:mailmur@yahoo.com">Aki Nieminen</a>
39 * @author <a href="mailto:henning@apache.org">Henning P. Schmiedehausen</a>
40 * @version $Id: UnicodeInputStream.java 998264 2010-09-17 19:13:02Z apetrelli $
41 * @since 1.5
42 */
public class UnicodeInputStream
    extends InputStream
{

    /** BOM Marker for UTF 8. See http://www.unicode.org/unicode/faq/utf_bom.html */
    public static final UnicodeBOM UTF8_BOM = new UnicodeBOM("UTF-8", new byte [] { (byte)0xef, (byte)0xbb, (byte)0xbf });

    /** BOM Marker for UTF 16, little endian. See http://www.unicode.org/unicode/faq/utf_bom.html */
    public static final UnicodeBOM UTF16LE_BOM = new UnicodeBOM("UTF-16LE", new byte [] { (byte)0xff, (byte)0xfe });

    /** BOM Marker for UTF 16, big endian. See http://www.unicode.org/unicode/faq/utf_bom.html */
    public static final UnicodeBOM UTF16BE_BOM = new UnicodeBOM("UTF-16BE", new byte [] { (byte)0xfe, (byte)0xff });

    /**
     * BOM Marker for UTF 32, little endian. See http://www.unicode.org/unicode/faq/utf_bom.html
     *
     * TODO: Does Java actually support this? (UTF-32LE does not appear to be one of the charsets
     * every Java platform is required to provide; verify availability on the target JRE.)
     */
    public static final UnicodeBOM UTF32LE_BOM = new UnicodeBOM("UTF-32LE", new byte [] { (byte)0xff, (byte)0xfe, (byte)0x00, (byte)0x00 });

    /**
     * BOM Marker for UTF 32, big endian. See http://www.unicode.org/unicode/faq/utf_bom.html
     *
     * TODO: Does Java actually support this? (UTF-32BE does not appear to be one of the charsets
     * every Java platform is required to provide; verify availability on the target JRE.)
     */
    public static final UnicodeBOM UTF32BE_BOM = new UnicodeBOM("UTF-32BE", new byte [] { (byte)0x00, (byte)0x00, (byte)0xfe, (byte)0xff });

    /** The maximum amount of bytes to read for a BOM */
    private static final int MAX_BOM_SIZE = 4;

    /** Buffer holding the bytes read from the stream while looking for a BOM. */
    private final byte [] buf = new byte[MAX_BOM_SIZE];

    /** Number of valid bytes in {@link #buf}; also the index the next byte is stored at. */
    private int pos = 0;

    /** The stream encoding as read from the BOM or null. */
    private final String encoding;

    /** True if the BOM itself should be skipped and not read. */
    private final boolean skipBOM;

    /** The wrapped stream; its pushback capacity is large enough to hold every byte read during BOM detection. */
    private final PushbackInputStream inputStream;

    /**
     * Creates a new UnicodeInputStream object. Skips a BOM which defines the file encoding.
     *
     * @param inputStream The input stream to use for reading.
     *
     * @throws IllegalStateException When the BOM could not be read from the stream.
     * @throws IOException Declared for compatibility with existing callers.
     */
    public UnicodeInputStream(final InputStream inputStream)
        throws IllegalStateException, IOException
    {
        this(inputStream, true);
    }

    /**
     * Creates a new UnicodeInputStream object. The BOM is detected right here in the constructor,
     * so up to four bytes are read from the given stream before this constructor returns.
     *
     * @param inputStream The input stream to use for reading.
     * @param skipBOM If this is set to true, a BOM read from the stream is discarded. This parameter should normally be true.
     *
     * @throws IllegalStateException When the BOM could not be read from the stream; the causing
     *         {@link IOException} is attached as the cause.
     * @throws IOException Declared for compatibility with existing callers.
     */
    public UnicodeInputStream(final InputStream inputStream, boolean skipBOM)
        throws IllegalStateException, IOException
    {
        super();

        this.skipBOM = skipBOM;
        this.inputStream = new PushbackInputStream(inputStream, MAX_BOM_SIZE);

        try
        {
            this.encoding = readEncoding();
        }
        catch (IOException ioe)
        {
            throw new IllegalStateException("Could not read BOM from Stream", ioe);
        }
    }

    /**
     * Returns true if the input stream discards the BOM.
     *
     * @return True if the input stream discards the BOM.
     */
    public boolean isSkipBOM()
    {
        return skipBOM;
    }

    /**
     * Returns the encoding that was determined from the BOM when this stream was constructed.
     *
     * @return The encoding based on the BOM, or null if the stream did not start with a known BOM.
     */
    public String getEncodingFromStream()
    {
        return encoding;
    }

    /**
     * This method gets the encoding from the stream contents if a BOM exists. If no BOM exists, the encoding
     * is undefined.
     *
     * All bytes that were read but are not part of a skipped BOM are pushed back onto the
     * underlying stream before this method returns. Note that this method is invoked from the
     * constructor.
     *
     * @return The encoding of this streams contents as decided by the BOM or null if no BOM was found.
     *
     * @throws IOException When reading from or pushing back to the underlying stream fails.
     */
    protected String readEncoding()
        throws IOException
    {
        pos = 0;

        UnicodeBOM detected = null;

        // read first byte.
        if (readByte())
        {
            // The first byte selects the candidate BOM(s):
            //
            // 00 00 FE FF --> UTF 32 BE
            // EF BB BF    --> UTF 8
            // FE FF       --> UTF 16 BE
            // FF FE       --> UTF 16 LE
            // FF FE 00 00 --> UTF 32 LE

            switch (buf[0])
            {
                case (byte)0x00: // UTF32 BE
                    detected = match(UTF32BE_BOM, null);
                    break;
                case (byte)0xef: // UTF8
                    detected = match(UTF8_BOM, null);
                    break;
                case (byte)0xfe: // UTF16 BE
                    detected = match(UTF16BE_BOM, null);
                    break;
                case (byte)0xff: // UTF16/32 LE
                    detected = match(UTF16LE_BOM, null);

                    if (detected != null)
                    {
                        // The UTF-32 LE BOM starts with the UTF-16 LE BOM. Try the longer
                        // one and fall back to UTF-16 LE if the next two bytes are not 00 00.
                        detected = match(UTF32LE_BOM, detected);
                    }
                    break;

                default:
                    detected = null;
                    break;
            }
        }

        pushback(detected);

        return (detected != null) ? detected.getEncoding() : null;
    }

    /**
     * Compares the leading bytes of the stream with the given BOM, reading additional bytes
     * into the buffer only when they have not been read yet.
     *
     * @param matchEncoding The BOM to test for; returned if all of its bytes match.
     * @param noMatchEncoding The value to return if the stream ends early or a byte differs.
     *
     * @return matchEncoding on a full match, noMatchEncoding otherwise.
     *
     * @throws IOException When reading from the underlying stream fails.
     */
    private UnicodeBOM match(final UnicodeBOM matchEncoding, final UnicodeBOM noMatchEncoding)
        throws IOException
    {
        // Use the private array directly; getBytes() would hand out a fresh copy on every call.
        final byte [] bom = matchEncoding.bytes;

        for (int i = 0; i < bom.length; i++)
        {
            if (pos <= i) // Byte has not yet been read
            {
                if (!readByte())
                {
                    return noMatchEncoding;
                }
            }

            if (bom[i] != buf[i])
            {
                return noMatchEncoding;
            }
        }

        return matchEncoding;
    }

    /**
     * Reads one byte from the underlying stream and appends it to the BOM buffer.
     *
     * @return True if a byte was stored, false if the end of the stream was reached.
     *
     * @throws IOException When the buffer is already full or the underlying read fails.
     */
    private boolean readByte()
        throws IOException
    {
        // Check the capacity first so that no byte is consumed from the stream (and lost)
        // when there is no room left to remember it for the later pushback.
        if (pos >= buf.length)
        {
            throw new IOException("BOM read error");
        }

        final int res = inputStream.read();
        if (res == -1)
        {
            return false;
        }

        buf[pos++] = (byte) res;
        return true;
    }

    /**
     * Returns the buffered bytes to the underlying stream. If a BOM was matched and BOMs are
     * skipped, only the bytes following the BOM are pushed back; otherwise every byte read so far is.
     *
     * @param matchBOM The BOM that was detected or null if none was found.
     *
     * @throws IOException When the underlying stream rejects the pushback.
     */
    private void pushback(final UnicodeBOM matchBOM)
        throws IOException
    {
        int count = pos; // By default, all bytes are pushed back.
        int start = 0;

        if (matchBOM != null && skipBOM)
        {
            // We have a match (some bytes are part of the BOM)
            // and we want to skip the BOM. Push back only the bytes
            // after the BOM.
            start = matchBOM.bytes.length;
            count = (pos - start);

            if (count < 0)
            {
                throw new IllegalStateException("Match has more bytes than available!");
            }
        }

        inputStream.unread(buf, start, count);
    }

    /**
     * @see java.io.InputStream#close()
     */
    @Override
    public void close()
        throws IOException
    {
        inputStream.close();
    }

    /**
     * @see java.io.InputStream#available()
     */
    @Override
    public int available()
        throws IOException
    {
        return inputStream.available();
    }

    /**
     * @see java.io.InputStream#mark(int)
     */
    @Override
    public void mark(final int readlimit)
    {
        inputStream.mark(readlimit);
    }

    /**
     * @see java.io.InputStream#markSupported()
     */
    @Override
    public boolean markSupported()
    {
        return inputStream.markSupported();
    }

    /**
     * @see java.io.InputStream#read()
     */
    @Override
    public int read()
        throws IOException
    {
        return inputStream.read();
    }

    /**
     * @see java.io.InputStream#read(byte[])
     */
    @Override
    public int read(final byte [] b)
        throws IOException
    {
        return inputStream.read(b);
    }

    /**
     * @see java.io.InputStream#read(byte[], int, int)
     */
    @Override
    public int read(final byte [] b, final int off, final int len)
        throws IOException
    {
        return inputStream.read(b, off, len);
    }

    /**
     * @see java.io.InputStream#reset()
     */
    @Override
    public void reset()
        throws IOException
    {
        inputStream.reset();
    }

    /**
     * @see java.io.InputStream#skip(long)
     */
    @Override
    public long skip(final long n)
        throws IOException
    {
        return inputStream.skip(n);
    }

    /**
     * Helper class to bundle encoding and BOM marker. Instances are immutable: the marker bytes
     * are copied on the way in and on the way out.
     *
     * @author <a href="mailto:henning@apache.org">Henning P. Schmiedehausen</a>
     * @version $Id: UnicodeInputStream.java 998264 2010-09-17 19:13:02Z apetrelli $
     */
    static final class UnicodeBOM
    {
        /** Name of the encoding announced by this BOM. */
        private final String encoding;

        /** The marker bytes; never handed out directly so the shared constants cannot be altered. */
        private final byte [] bytes;

        private UnicodeBOM(final String encoding, final byte [] bytes)
        {
            this.encoding = encoding;
            this.bytes = bytes.clone();
        }

        /**
         * @return The name of the encoding announced by this BOM.
         */
        String getEncoding()
        {
            return encoding;
        }

        /**
         * @return A copy of the marker bytes; modifying it does not affect this BOM.
         */
        byte [] getBytes()
        {
            return bytes.clone();
        }
    }
}