View Javadoc

1   package org.apache.velocity.io;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *   http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing,
15   * software distributed under the License is distributed on an
16   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17   * KIND, either express or implied.  See the License for the
18   * specific language governing permissions and limitations
19   * under the License.
20   */
21  
22  
23  import java.io.IOException;
24  import java.io.InputStream;
25  import java.io.PushbackInputStream;
26  
27  
28  /**
29   * This is an input stream that is unicode BOM aware. This allows you to e.g. read
30   * Windows Notepad Unicode files as Velocity templates.
31   *
32   * It allows you to check the actual encoding of a file by calling {@link #getEncodingFromStream()} on
33   * this input stream once it has been constructed.
34   *
35   * This class is not thread safe! When more than one thread wants to use an instance of UnicodeInputStream,
36   * the caller must provide synchronization.
37   *
38   * @author <a href="mailto:mailmur@yahoo.com">Aki Nieminen</a>
39   * @author <a href="mailto:henning@apache.org">Henning P. Schmiedehausen</a>
40   * @version $Id: UnicodeInputStream.java 998264 2010-09-17 19:13:02Z apetrelli $
41   * @since 1.5
42   */
public class UnicodeInputStream
    extends InputStream
{

    /** BOM marker for UTF-8. See http://www.unicode.org/unicode/faq/utf_bom.html */
    public static final UnicodeBOM UTF8_BOM = new UnicodeBOM("UTF-8", new byte [] { (byte)0xef, (byte)0xbb, (byte)0xbf });

    /** BOM marker for UTF-16, little endian. See http://www.unicode.org/unicode/faq/utf_bom.html */
    public static final UnicodeBOM UTF16LE_BOM = new UnicodeBOM("UTF-16LE", new byte [] { (byte)0xff, (byte)0xfe });

    /** BOM marker for UTF-16, big endian. See http://www.unicode.org/unicode/faq/utf_bom.html */
    public static final UnicodeBOM UTF16BE_BOM = new UnicodeBOM("UTF-16BE", new byte [] { (byte)0xfe, (byte)0xff });

    /**
     * BOM marker for UTF-32, little endian. See http://www.unicode.org/unicode/faq/utf_bom.html
     *
     * NOTE(review): UTF-32 is not one of the charsets every Java platform is required to
     * provide, so callers should confirm availability (e.g. Charset.isSupported) before
     * decoding with the reported name.
     */
    public static final UnicodeBOM UTF32LE_BOM = new UnicodeBOM("UTF-32LE", new byte [] { (byte)0xff, (byte)0xfe, (byte)0x00, (byte)0x00 });

    /**
     * BOM marker for UTF-32, big endian. See http://www.unicode.org/unicode/faq/utf_bom.html
     *
     * NOTE(review): UTF-32 is not one of the charsets every Java platform is required to
     * provide, so callers should confirm availability (e.g. Charset.isSupported) before
     * decoding with the reported name.
     */
    public static final UnicodeBOM UTF32BE_BOM = new UnicodeBOM("UTF-32BE", new byte [] { (byte)0x00, (byte)0x00, (byte)0xfe, (byte)0xff });

    /** The maximum number of bytes to read for a BOM (the longest marker above is four bytes). */
    private static final int MAX_BOM_SIZE = 4;

    /** Holds the bytes consumed from the stream while probing for a BOM. */
    private final byte [] buf = new byte[MAX_BOM_SIZE];

    /** Number of valid bytes currently stored in {@link #buf}. */
    private int pos = 0;

    /** The stream encoding as read from the BOM, or null when no BOM was found. */
    private final String encoding;

    /** True if the BOM itself should be skipped and not handed to the reader of this stream. */
    private final boolean skipBOM;

    /** Wrapped stream; the pushback capacity lets probed bytes be returned to the stream. */
    private final PushbackInputStream inputStream;

    /**
     * Creates a new UnicodeInputStream object. Skips a BOM which defines the file encoding.
     *
     * @param  inputStream The input stream to use for reading.
     *
     * @throws IllegalStateException When the BOM could not be read from the stream; the
     *         underlying IOException is attached as the cause.
     * @throws IOException Declared by the signature; I/O failures during BOM detection are
     *         reported as IllegalStateException instead.
     */
    public UnicodeInputStream(final InputStream inputStream)
            throws IllegalStateException, IOException
    {
        this(inputStream, true);
    }

    /**
     * Creates a new UnicodeInputStream object. The BOM is probed immediately, so up to
     * four bytes are read from the given stream during construction.
     *
     * @param  inputStream The input stream to use for reading.
     * @param skipBOM If this is set to true, a BOM read from the stream is discarded. This parameter should normally be true.
     *
     * @throws IllegalStateException When the BOM could not be read from the stream; the
     *         underlying IOException is attached as the cause.
     * @throws IOException Declared by the signature; I/O failures during BOM detection are
     *         reported as IllegalStateException instead.
     */
    public UnicodeInputStream(final InputStream inputStream, boolean skipBOM)
            throws IllegalStateException, IOException
    {
        super();

        this.skipBOM = skipBOM;
        this.inputStream = new PushbackInputStream(inputStream, MAX_BOM_SIZE);

        try
        {
            // Note: readEncoding() is protected, so a subclass override runs here
            // before the subclass constructor body has executed.
            this.encoding = readEncoding();
        }
        catch (IOException ioe)
        {
            throw new IllegalStateException("Could not read BOM from Stream", ioe);
        }
    }

    /**
     * Returns true if the input stream discards the BOM.
     *
     * @return  True if the input stream discards the BOM.
     */
    public boolean isSkipBOM()
    {
        return skipBOM;
    }

    /**
     * Returns the encoding that was detected from the BOM when this stream was constructed.
     *
     * @return  The encoding name based on the BOM, or null if the stream did not start with a known BOM.
     */
    public String getEncodingFromStream()
    {
        return encoding;
    }

    /**
     * This method gets the encoding from the stream contents if a BOM exists. If no BOM exists, the encoding
     * is undefined. Bytes that were read while probing and do not belong to a skipped BOM are pushed
     * back, so subsequent reads see them again.
     *
     * @return The encoding of this streams contents as decided by the BOM or null if no BOM was found.
     *
     * @throws IOException When reading from or pushing back to the underlying stream fails.
     */
    protected String readEncoding()
        throws IOException
    {
        pos = 0;

        UnicodeBOM detected = null;

        // The first byte decides which marker(s) are worth checking.
        if (readByte())
        {
            // Candidate markers by leading byte:
            //
            // 00 00 FE FF --> UTF 32 BE
            // EF BB BF    --> UTF 8
            // FE FF       --> UTF 16 BE
            // FF FE       --> UTF 16 LE
            // FF FE 00 00 --> UTF 32 LE

            switch (buf[0])
            {
            case (byte)0x00: // UTF32 BE
                detected = match(UTF32BE_BOM, null);
                break;
            case (byte)0xef: // UTF8
                detected = match(UTF8_BOM, null);
                break;
            case (byte)0xfe: // UTF16 BE
                detected = match(UTF16BE_BOM, null);
                break;
            case (byte)0xff: // UTF16/32 LE
                detected = match(UTF16LE_BOM, null);

                if (detected != null)
                {
                    // The UTF-32LE marker starts with the UTF-16LE marker; prefer the
                    // longer one and fall back to UTF-16LE when it does not match.
                    detected = match(UTF32LE_BOM, detected);
                }
                break;

            default:
                detected = null;
                break;
            }
        }

        pushback(detected);

        return (detected != null) ? detected.getEncoding() : null;
    }

    /**
     * Compares the stream prefix against a BOM, reading further bytes into the probe buffer
     * only as far as needed.
     *
     * @param matchEncoding The BOM to test for.
     * @param noMatchEncoding The value to return when the stream ends early or a byte differs.
     *
     * @return matchEncoding when every BOM byte matches, noMatchEncoding otherwise.
     *
     * @throws IOException When reading from the underlying stream fails.
     */
    private UnicodeBOM match(final UnicodeBOM matchEncoding, final UnicodeBOM noMatchEncoding)
        throws IOException
    {
        // Read-only access to the private array avoids a defensive copy per probe.
        final byte [] bom = matchEncoding.bytes;

        for (int i = 0; i < bom.length; i++)
        {
            if (pos <= i) // Byte has not yet been read
            {
                if (!readByte())
                {
                    return noMatchEncoding;
                }
            }

            if (bom[i] != buf[i])
            {
                return noMatchEncoding;
            }
        }

        return matchEncoding;
    }

    /**
     * Reads one byte from the underlying stream into the probe buffer.
     *
     * @return true if a byte was stored, false if the stream is at its end.
     *
     * @throws IOException When the underlying read fails or the probe buffer is already full.
     */
    private boolean readByte()
            throws IOException
    {
        int res = inputStream.read();
        if (res == -1)
        {
            return false;
        }

        if (pos >= buf.length)
        {
            throw new IOException("BOM read error");
        }

        buf[pos++] = (byte) res;
        return true;
    }

    /**
     * Returns probed bytes to the underlying stream. Everything is pushed back unless a BOM
     * matched and BOM skipping is enabled, in which case only the bytes after the BOM are.
     *
     * @param matchBOM The BOM that was detected, or null if none matched.
     *
     * @throws IOException When the underlying stream rejects the pushback.
     */
    private void pushback(final UnicodeBOM matchBOM)
        throws IOException
    {
        int count = pos; // By default, all bytes are pushed back.
        int start = 0;

        if (matchBOM != null && skipBOM)
        {
            // We have a match (some bytes are part of the BOM)
            // and we want to skip the BOM. Push back only the bytes
            // after the BOM.
            start = matchBOM.bytes.length;
            count = (pos - start);

            if (count < 0)
            {
                throw new IllegalStateException("Match has more bytes than available!");
            }
        }

        inputStream.unread(buf, start, count);
    }

    /**
     * @see java.io.InputStream#close()
     */
    @Override
    public void close()
        throws IOException
    {
        inputStream.close();
    }

    /**
     * @see java.io.InputStream#available()
     */
    @Override
    public int available()
        throws IOException
    {
        return inputStream.available();
    }

    /**
     * @see java.io.InputStream#mark(int)
     */
    @Override
    public void mark(final int readlimit)
    {
        inputStream.mark(readlimit);
    }

    /**
     * @see java.io.InputStream#markSupported()
     */
    @Override
    public boolean markSupported()
    {
        return inputStream.markSupported();
    }

    /**
     * @see java.io.InputStream#read()
     */
    @Override
    public int read()
        throws IOException
    {
        return inputStream.read();
    }

    /**
     * @see java.io.InputStream#read(byte[])
     */
    @Override
    public int read(final byte [] b)
        throws IOException
    {
        return inputStream.read(b);
    }

    /**
     * @see java.io.InputStream#read(byte[], int, int)
     */
    @Override
    public int read(final byte [] b, final int off, final int len)
        throws IOException
    {
        return inputStream.read(b, off, len);
    }

    /**
     * @see java.io.InputStream#reset()
     */
    @Override
    public void reset()
        throws IOException
    {
        inputStream.reset();
    }

    /**
     * @see java.io.InputStream#skip(long)
     */
    @Override
    public long skip(final long n)
        throws IOException
    {
        return inputStream.skip(n);
    }

    /**
     * Helper class to bundle encoding and BOM marker. Instances are immutable: the marker
     * bytes are copied on the way in and on the way out, so the shared BOM constants cannot
     * be altered through an array reference.
     *
     * @author <a href="mailto:henning@apache.org">Henning P. Schmiedehausen</a>
     * @version $Id: UnicodeInputStream.java 998264 2010-09-17 19:13:02Z apetrelli $
     */
    static final class UnicodeBOM
    {
        /** Name of the encoding announced by this BOM. */
        private final String encoding;

        /** The marker bytes; never handed out directly. */
        private final byte [] bytes;

        private UnicodeBOM(final String encoding, final byte [] bytes)
        {
            this.encoding = encoding;
            this.bytes = bytes.clone();
        }

        /**
         * @return The name of the encoding announced by this BOM.
         */
        String getEncoding()
        {
            return encoding;
        }

        /**
         * @return A copy of the marker bytes; modifying it does not affect this BOM.
         */
        byte [] getBytes()
        {
            return bytes.clone();
        }
    }
}