UnicodeInputStream xref

View Javadoc

1   package org.apache.velocity.io;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *   http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing,
15   * software distributed under the License is distributed on an
16   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17   * KIND, either express or implied.  See the License for the
18   * specific language governing permissions and limitations
19   * under the License.
20   */
21  
22  
23  import java.io.IOException;
24  import java.io.InputStream;
25  import java.io.PushbackInputStream;
26  
27  import org.apache.velocity.util.ExceptionUtils;
28  
29  
/**
 * This is an input stream that is unicode BOM aware. This allows you to e.g. read
 * Windows Notepad Unicode files as Velocity templates.
 *
 * It allows you to check the actual encoding of a file by calling {@link #getEncodingFromStream()} on
 * the input stream reader.
 *
 * The BOM is examined once, while the object is constructed. At most {@link #MAX_BOM_SIZE} bytes are
 * read ahead from the wrapped stream; every byte that is not part of a skipped BOM is pushed back so
 * that subsequent reads see it.
 *
 * This class is not thread safe! When more than one thread wants to use an instance of UnicodeInputStream,
 * the caller must provide synchronization.
 *
 * @author <a href="mailto:mailmur@yahoo.com">Aki Nieminen</a>
 * @author <a href="mailto:henning@apache.org">Henning P. Schmiedehausen</a>
 * @version $Id: UnicodeInputStream.java 685685 2008-08-13 21:43:27Z nbubna $
 * @since 1.5
 */
public class UnicodeInputStream
    extends InputStream
{

    /** BOM Marker for UTF 8. See http://www.unicode.org/unicode/faq/utf_bom.html */
    public static final UnicodeBOM UTF8_BOM = new UnicodeBOM("UTF-8", new byte [] { (byte)0xef, (byte)0xbb, (byte)0xbf });

    /** BOM Marker for UTF 16, little endian. See http://www.unicode.org/unicode/faq/utf_bom.html */
    public static final UnicodeBOM UTF16LE_BOM = new UnicodeBOM("UTF-16LE", new byte [] { (byte)0xff, (byte)0xfe });

    /** BOM Marker for UTF 16, big endian. See http://www.unicode.org/unicode/faq/utf_bom.html */
    public static final UnicodeBOM UTF16BE_BOM = new UnicodeBOM("UTF-16BE", new byte [] { (byte)0xfe, (byte)0xff });

    /**
     * BOM Marker for UTF 32, little endian. See http://www.unicode.org/unicode/faq/utf_bom.html
     *
     * TODO: Does Java actually support this?
     */
    public static final UnicodeBOM UTF32LE_BOM = new UnicodeBOM("UTF-32LE", new byte [] { (byte)0xff, (byte)0xfe, (byte)0x00, (byte)0x00 });

    /**
     * BOM Marker for UTF 32, big endian. See http://www.unicode.org/unicode/faq/utf_bom.html
     *
     * TODO: Does Java actually support this?
     */
    public static final UnicodeBOM UTF32BE_BOM = new UnicodeBOM("UTF-32BE", new byte [] { (byte)0x00, (byte)0x00, (byte)0xfe, (byte)0xff });

    /** The maximum amount of bytes to read for a BOM */
    private static final int MAX_BOM_SIZE = 4;

    /** Buffer holding the bytes read ahead while looking for a BOM. */
    private final byte [] buf = new byte[MAX_BOM_SIZE];

    /** Number of valid bytes in {@link #buf}. */
    private int pos = 0;

    /** The stream encoding as read from the BOM or null. */
    private final String encoding;

    /** True if the BOM itself should be skipped and not read. */
    private final boolean skipBOM;

    /** The wrapped stream; its pushback buffer is large enough to return every byte read ahead. */
    private final PushbackInputStream inputStream;

    /**
     * Creates a new UnicodeInputStream object. Skips a BOM which defines the file encoding.
     *
     * @param  inputStream The input stream to use for reading.
     *
     * @throws IllegalStateException When the BOM could not be read from the stream.
     * @throws IOException Declared for compatibility; read failures are reported as IllegalStateException.
     */
    public UnicodeInputStream(final InputStream inputStream)
            throws IllegalStateException, IOException
    {
        this(inputStream, true);
    }

    /**
     * Creates a new UnicodeInputStream object.
     *
     * Note that {@link #readEncoding()} is invoked from this constructor; a subclass overriding it
     * must not rely on its own fields being initialized at that point.
     *
     * @param  inputStream The input stream to use for reading.
     * @param skipBOM If this is set to true, a BOM read from the stream is discarded. This parameter should normally be true.
     *
     * @throws IllegalStateException When the BOM could not be read from the stream. The underlying
     *                               IOException is available as the cause.
     * @throws IOException Declared for compatibility; read failures are reported as IllegalStateException.
     */
    public UnicodeInputStream(final InputStream inputStream, boolean skipBOM)
            throws IllegalStateException, IOException
    {
        super();

        this.skipBOM = skipBOM;
        this.inputStream = new PushbackInputStream(inputStream, MAX_BOM_SIZE);

        try
        {
            this.encoding = readEncoding();
        }
        catch (IOException ioe)
        {
            IllegalStateException ex = new IllegalStateException("Could not read BOM from Stream");
            // Standard JDK 1.4+ way of attaching the cause.
            ex.initCause(ioe);
            throw ex;
        }
    }

    /**
     * Returns true if the input stream discards the BOM.
     *
     * @return  True if the input stream discards the BOM.
     */
    public boolean isSkipBOM()
    {
        return skipBOM;
    }

    /**
     * Returns the encoding that was determined from the BOM when this stream was constructed.
     *
     * @return  The encoding based on the BOM, or null if the stream did not start with a known BOM.
     */
    public String getEncodingFromStream()
    {
        return encoding;
    }

    /**
     * This method gets the encoding from the stream contents if a BOM exists. If no BOM exists, the encoding
     * is undefined.
     *
     * All bytes read ahead are pushed back onto the stream, except for the BOM itself when a BOM
     * matched and {@link #isSkipBOM()} is true.
     *
     * @return The encoding of this streams contents as decided by the BOM or null if no BOM was found.
     *
     * @throws IOException When reading from or pushing back to the underlying stream fails.
     */
    protected String readEncoding()
        throws IOException
    {
        pos = 0;

        UnicodeBOM bom = null;

        // read first byte.
        if (readByte())
        {
            // The first byte selects the candidate BOM(s):
            //
            // 00 00 FE FF --> UTF 32 BE
            // EF BB BF    --> UTF 8
            // FE FF       --> UTF 16 BE
            // FF FE       --> UTF 16 LE
            // FF FE 00 00 --> UTF 32 LE

            switch (buf[0])
            {
            case (byte)0x00: // UTF32 BE
                bom = match(UTF32BE_BOM, null);
                break;
            case (byte)0xef: // UTF8
                bom = match(UTF8_BOM, null);
                break;
            case (byte)0xfe: // UTF16 BE
                bom = match(UTF16BE_BOM, null);
                break;
            case (byte)0xff: // UTF16/32 LE
                bom = match(UTF16LE_BOM, null);

                if (bom != null)
                {
                    // UTF-16LE is a prefix of UTF-32LE: try the longer marker and
                    // fall back to UTF-16LE when it does not match.
                    bom = match(UTF32LE_BOM, bom);
                }
                break;

            default:
                bom = null;
                break;
            }
        }

        pushback(bom);

        return (bom != null) ? bom.getEncoding() : null;
    }

    /**
     * Compares the start of the stream with the given BOM, reading further bytes into the
     * look-ahead buffer as needed.
     *
     * @param matchEncoding The BOM to test for.
     * @param noMatchEncoding The value to return when the stream does not start with that BOM.
     *
     * @return matchEncoding if every BOM byte matched, noMatchEncoding on a mismatch or early end of stream.
     */
    private final UnicodeBOM match(final UnicodeBOM matchEncoding, final UnicodeBOM noMatchEncoding)
        throws IOException
    {
        // Read the private array directly: getBytes() hands out a copy.
        final byte [] bom = matchEncoding.bytes;

        for (int i = 0; i < bom.length; i++)
        {
            if (pos <= i) // Byte has not yet been read
            {
                if (!readByte())
                {
                    return noMatchEncoding;
                }
            }

            if (bom[i] != buf[i])
            {
                return noMatchEncoding;
            }
        }

        return matchEncoding;
    }

    /**
     * Reads one more byte from the underlying stream into the look-ahead buffer.
     *
     * @return true if a byte was stored, false if the end of the stream was reached.
     *
     * @throws IOException When the read fails or the look-ahead buffer is already full.
     */
    private final boolean readByte()
            throws IOException
    {
        int res = inputStream.read();
        if (res == -1)
        {
            return false;
        }

        if (pos >= buf.length)
        {
            throw new IOException("BOM read error");
        }

        buf[pos++] = (byte) res;
        return true;
    }

    /**
     * Returns the bytes read ahead to the underlying stream.
     *
     * @param matchBOM The BOM that was detected, or null if none was found.
     */
    private final void pushback(final UnicodeBOM matchBOM)
        throws IOException
    {
        int count = pos; // By default, all bytes are pushed back.
        int start = 0;

        if (matchBOM != null && skipBOM)
        {
            // We have a match (some bytes are part of the BOM)
            // and we want to skip the BOM. Push back only the bytes
            // after the BOM.
            start = matchBOM.bytes.length;
            count = (pos - start);

            if (count < 0)
            {
                throw new IllegalStateException("Match has more bytes than available!");
            }
        }

        inputStream.unread(buf, start, count);
    }

    /**
     * Closes the underlying stream.
     *
     * @see java.io.InputStream#close()
     */
    public void close()
        throws IOException
    {
        inputStream.close();
    }

    /**
     * @see java.io.InputStream#available()
     */
    public int available()
        throws IOException
    {
        return inputStream.available();
    }

    /**
     * Delegates to the wrapping PushbackInputStream.
     *
     * @see java.io.InputStream#mark(int)
     */
    public void mark(final int readlimit)
    {
        inputStream.mark(readlimit);
    }

    /**
     * Delegates to the wrapping PushbackInputStream.
     *
     * @see java.io.InputStream#markSupported()
     */
    public boolean markSupported()
    {
        return inputStream.markSupported();
    }

    /**
     * @see java.io.InputStream#read()
     */
    public int read()
        throws IOException
    {
        return inputStream.read();
    }

    /**
     * @see java.io.InputStream#read(byte[])
     */
    public int read(final byte [] b)
        throws IOException
    {
        return inputStream.read(b);
    }

    /**
     * @see java.io.InputStream#read(byte[], int, int)
     */
    public int read(final byte [] b, final int off, final int len)
        throws IOException
    {
        return inputStream.read(b, off, len);
    }

    /**
     * Delegates to the wrapping PushbackInputStream.
     *
     * @see java.io.InputStream#reset()
     */
    public void reset()
        throws IOException
    {
        inputStream.reset();
    }

    /**
     * @see java.io.InputStream#skip(long)
     */
    public long skip(final long n)
        throws IOException
    {
        return inputStream.skip(n);
    }

    /**
     * Helper class to bundle encoding and BOM marker. Instances are immutable: the marker bytes
     * are copied on the way in and on the way out.
     *
     * @author <a href="mailto:henning@apache.org">Henning P. Schmiedehausen</a>
     * @version $Id: UnicodeInputStream.java 685685 2008-08-13 21:43:27Z nbubna $
     */
    static final class UnicodeBOM
    {
        private final String encoding;

        private final byte [] bytes;

        private UnicodeBOM(final String encoding, final byte [] bytes)
        {
            this.encoding = encoding;
            // Copy so that later changes to the caller's array cannot alter the marker.
            this.bytes = (byte []) bytes.clone();
        }

        /**
         * @return The name of the encoding announced by this BOM.
         */
        String getEncoding()
        {
            return encoding;
        }

        /**
         * @return A copy of the marker bytes; modifying it does not affect this BOM.
         */
        byte [] getBytes()
        {
            return (byte []) bytes.clone();
        }
    }
}