1 package org.apache.velocity.io;
2
3 /*
4 * Licensed to the Apache Software Foundation (ASF) under one
5 * or more contributor license agreements. See the NOTICE file
6 * distributed with this work for additional information
7 * regarding copyright ownership. The ASF licenses this file
8 * to you under the Apache License, Version 2.0 (the
9 * "License"); you may not use this file except in compliance
10 * with the License. You may obtain a copy of the License at
11 *
12 * http://www.apache.org/licenses/LICENSE-2.0
13 *
14 * Unless required by applicable law or agreed to in writing,
15 * software distributed under the License is distributed on an
16 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17 * KIND, either express or implied. See the License for the
18 * specific language governing permissions and limitations
19 * under the License.
20 */
21
22
23 import java.io.IOException;
24 import java.io.InputStream;
25 import java.io.PushbackInputStream;
26
27 import org.apache.velocity.util.ExceptionUtils;
28
29
/**
 * An input stream that is aware of Unicode byte order marks (BOMs). This allows you to e.g. read
 * Windows Notepad Unicode files as Velocity templates.
 *
 * The first bytes of the wrapped stream are inspected once, at construction time. The encoding
 * announced by a recognized BOM can then be queried with {@link #getEncodingFromStream()}, and the
 * BOM bytes themselves are either discarded or left in the stream, depending on the
 * <code>skipBOM</code> constructor argument.
 *
 * This class is not thread safe! When more than one thread wants to use an instance of UnicodeInputStream,
 * the caller must provide synchronization.
 *
 * @author <a href="mailto:mailmur@yahoo.com">Aki Nieminen</a>
 * @author <a href="mailto:henning@apache.org">Henning P. Schmiedehausen</a>
 * @version $Id: UnicodeInputStream.java 685685 2008-08-13 21:43:27Z nbubna $
 * @since 1.5
 */
public class UnicodeInputStream
    extends InputStream
{

    /** BOM Marker for UTF 8. See http://www.unicode.org/unicode/faq/utf_bom.html */
    public static final UnicodeBOM UTF8_BOM = new UnicodeBOM("UTF-8", new byte [] { (byte)0xef, (byte)0xbb, (byte)0xbf });

    /** BOM Marker for UTF 16, little endian. See http://www.unicode.org/unicode/faq/utf_bom.html */
    public static final UnicodeBOM UTF16LE_BOM = new UnicodeBOM("UTF-16LE", new byte [] { (byte)0xff, (byte)0xfe });

    /** BOM Marker for UTF 16, big endian. See http://www.unicode.org/unicode/faq/utf_bom.html */
    public static final UnicodeBOM UTF16BE_BOM = new UnicodeBOM("UTF-16BE", new byte [] { (byte)0xfe, (byte)0xff });

    /**
     * BOM Marker for UTF 32, little endian. See http://www.unicode.org/unicode/faq/utf_bom.html
     *
     * TODO: Does Java actually support this?
     */
    public static final UnicodeBOM UTF32LE_BOM = new UnicodeBOM("UTF-32LE", new byte [] { (byte)0xff, (byte)0xfe, (byte)0x00, (byte)0x00 });

    /**
     * BOM Marker for UTF 32, big endian. See http://www.unicode.org/unicode/faq/utf_bom.html
     *
     * TODO: Does Java actually support this?
     */
    public static final UnicodeBOM UTF32BE_BOM = new UnicodeBOM("UTF-32BE", new byte [] { (byte)0x00, (byte)0x00, (byte)0xfe, (byte)0xff });

    /** The maximum amount of bytes to read for a BOM */
    private static final int MAX_BOM_SIZE = 4;

    /** Buffer holding the bytes consumed while probing for a BOM. */
    private final byte [] buf = new byte[MAX_BOM_SIZE];

    /** Number of valid bytes currently stored in {@link #buf}. */
    private int pos = 0;

    /** The stream encoding as read from the BOM or null. */
    private final String encoding;

    /** True if the BOM itself should be skipped and not read. */
    private final boolean skipBOM;

    /** The wrapped stream; its pushback capacity is large enough to return every probed byte. */
    private final PushbackInputStream inputStream;

    /**
     * Creates a new UnicodeInputStream object. Skips a BOM which defines the file encoding.
     *
     * @param inputStream The input stream to use for reading.
     *
     * @throws IllegalStateException When the BOM could not be read from the stream.
     * @throws IOException Declared for interface compatibility; read failures during BOM
     *         detection are reported as IllegalStateException.
     */
    public UnicodeInputStream(final InputStream inputStream)
        throws IllegalStateException, IOException
    {
        this(inputStream, true);
    }

    /**
     * Creates a new UnicodeInputStream object.
     *
     * @param inputStream The input stream to use for reading.
     * @param skipBOM If this is set to true, a BOM read from the stream is discarded. This parameter should normally be true.
     *
     * @throws IllegalStateException When the BOM could not be read from the stream.
     * @throws IOException Declared for interface compatibility; read failures during BOM
     *         detection are reported as IllegalStateException.
     */
    public UnicodeInputStream(final InputStream inputStream, boolean skipBOM)
        throws IllegalStateException, IOException
    {
        super();

        this.skipBOM = skipBOM;
        this.inputStream = new PushbackInputStream(inputStream, MAX_BOM_SIZE);

        try
        {
            this.encoding = readEncoding();
        }
        catch (IOException ioe)
        {
            // Chain the original failure with the standard Throwable API so that
            // callers can still see why the BOM could not be read.
            IllegalStateException ex = new IllegalStateException("Could not read BOM from Stream");
            ex.initCause(ioe);
            throw ex;
        }
    }

    /**
     * Returns true if the input stream discards the BOM.
     *
     * @return True if the input stream discards the BOM.
     */
    public boolean isSkipBOM()
    {
        return skipBOM;
    }

    /**
     * Returns the encoding that was detected from the BOM when this stream was constructed.
     *
     * @return The encoding based on the BOM, or null if no BOM was found.
     */
    public String getEncodingFromStream()
    {
        return encoding;
    }

    /**
     * This method gets the encoding from the stream contents if a BOM exists. If no BOM exists, the encoding
     * is undefined.
     *
     * All bytes consumed while probing are pushed back onto the stream afterwards, except for
     * the bytes of a matched BOM when BOM skipping is enabled.
     *
     * @return The encoding of this streams contents as decided by the BOM or null if no BOM was found.
     *
     * @throws IOException When reading from or pushing back to the underlying stream fails.
     */
    protected String readEncoding()
        throws IOException
    {
        pos = 0;

        UnicodeBOM encoding = null;

        // read first byte.
        if (readByte())
        {
            // The first byte selects the candidate BOM(s):
            //
            // 00 00 FE FF --> UTF 32 BE
            // EF BB BF    --> UTF 8
            // FE FF       --> UTF 16 BE
            // FF FE       --> UTF 16 LE
            // FF FE 00 00 --> UTF 32 LE

            switch (buf[0])
            {
            case (byte)0x00: // UTF32 BE
                encoding = match(UTF32BE_BOM, null);
                break;
            case (byte)0xef: // UTF8
                encoding = match(UTF8_BOM, null);
                break;
            case (byte)0xfe: // UTF16 BE
                encoding = match(UTF16BE_BOM, null);
                break;
            case (byte)0xff: // UTF16/32 LE
                encoding = match(UTF16LE_BOM, null);

                if (encoding != null)
                {
                    // The UTF-32 LE marker starts with the UTF-16 LE marker, so try
                    // the longer one and fall back to UTF-16 LE if it does not match.
                    encoding = match(UTF32LE_BOM, encoding);
                }
                break;

            default:
                encoding = null;
                break;
            }
        }

        pushback(encoding);

        return (encoding != null) ? encoding.getEncoding() : null;
    }

    /**
     * Compares the bytes read so far (reading more on demand) against the given BOM.
     *
     * @param matchEncoding The BOM to test for; returned when all of its bytes match.
     * @param noMatchEncoding The value returned when the stream ends early or a byte differs.
     *
     * @return matchEncoding on a full match, noMatchEncoding otherwise.
     *
     * @throws IOException When reading from the underlying stream fails.
     */
    private final UnicodeBOM match(final UnicodeBOM matchEncoding, final UnicodeBOM noMatchEncoding)
        throws IOException
    {
        byte [] bom = matchEncoding.getBytes();

        for (int i = 0; i < bom.length; i++)
        {
            // Byte i has not been read yet; end of stream means no match.
            if (pos <= i && !readByte())
            {
                return noMatchEncoding;
            }

            if (bom[i] != buf[i])
            {
                return noMatchEncoding;
            }
        }

        return matchEncoding;
    }

    /**
     * Reads one byte from the underlying stream into the probe buffer.
     *
     * @return True if a byte was stored, false if the end of the stream was reached.
     *
     * @throws IOException When the probe buffer is already full or the read fails.
     */
    private final boolean readByte()
        throws IOException
    {
        // Check the capacity before touching the stream: a byte that has been
        // consumed but cannot be stored could never be pushed back again.
        if (pos >= buf.length)
        {
            throw new IOException("BOM read error");
        }

        int res = inputStream.read();
        if (res == -1)
        {
            return false;
        }

        buf[pos++] = (byte) res;
        return true;
    }

    /**
     * Returns probed bytes to the underlying stream.
     *
     * @param matchBOM The BOM that was recognized, or null if none was.
     *
     * @throws IOException When the bytes cannot be pushed back.
     */
    private final void pushback(final UnicodeBOM matchBOM)
        throws IOException
    {
        int count = pos; // By default, all bytes are pushed back.
        int start = 0;

        if (matchBOM != null && skipBOM)
        {
            // We have a match (some bytes are part of the BOM)
            // and we want to skip the BOM. Push back only the bytes
            // after the BOM.
            start = matchBOM.getBytes().length;
            count = (pos - start);

            if (count < 0)
            {
                throw new IllegalStateException("Match has more bytes than available!");
            }
        }

        inputStream.unread(buf, start, count);
    }

    /**
     * Closes the wrapped stream.
     *
     * @see java.io.InputStream#close()
     */
    public void close()
        throws IOException
    {
        inputStream.close();
    }

    /**
     * Delegates to the wrapped stream.
     *
     * @see java.io.InputStream#available()
     */
    public int available()
        throws IOException
    {
        return inputStream.available();
    }

    /**
     * Delegates to the wrapped stream.
     *
     * @see java.io.InputStream#mark(int)
     */
    public void mark(final int readlimit)
    {
        inputStream.mark(readlimit);
    }

    /**
     * Delegates to the wrapped stream.
     *
     * @see java.io.InputStream#markSupported()
     */
    public boolean markSupported()
    {
        return inputStream.markSupported();
    }

    /**
     * Delegates to the wrapped stream.
     *
     * @see java.io.InputStream#read()
     */
    public int read()
        throws IOException
    {
        return inputStream.read();
    }

    /**
     * Delegates to the wrapped stream.
     *
     * @see java.io.InputStream#read(byte[])
     */
    public int read(final byte [] b)
        throws IOException
    {
        return inputStream.read(b);
    }

    /**
     * Delegates to the wrapped stream.
     *
     * @see java.io.InputStream#read(byte[], int, int)
     */
    public int read(final byte [] b, final int off, final int len)
        throws IOException
    {
        return inputStream.read(b, off, len);
    }

    /**
     * Delegates to the wrapped stream.
     *
     * @see java.io.InputStream#reset()
     */
    public void reset()
        throws IOException
    {
        inputStream.reset();
    }

    /**
     * Delegates to the wrapped stream.
     *
     * @see java.io.InputStream#skip(long)
     */
    public long skip(final long n)
        throws IOException
    {
        return inputStream.skip(n);
    }

    /**
     * Helper class to bundle encoding and BOM marker.
     *
     * @author <a href="mailto:henning@apache.org">Henning P. Schmiedehausen</a>
     * @version $Id: UnicodeInputStream.java 685685 2008-08-13 21:43:27Z nbubna $
     */
    static final class UnicodeBOM
    {
        /** Name of the encoding announced by this BOM. */
        private final String encoding;

        /** The marker bytes, in stream order. */
        private final byte [] bytes;

        private UnicodeBOM(final String encoding, final byte [] bytes)
        {
            this.encoding = encoding;
            this.bytes = bytes;
        }

        /**
         * @return The name of the encoding announced by this BOM.
         */
        String getEncoding()
        {
            return encoding;
        }

        /**
         * @return The marker bytes. This is the internal array, not a copy; callers must not modify it.
         */
        byte [] getBytes()
        {
            return bytes;
        }
    }
}