View Javadoc

1   /* MultiByteReplayCharSequenceFactory
2    *
3    * (Re)Created on Dec 21, 2006
4    *
5    * Copyright (C) 2006 Internet Archive.
6    *
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    *
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   *
14   * Heritrix is distributed in the hope that it will be useful,
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   *
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.io;
24  
25  import java.io.BufferedReader;
26  import java.io.BufferedWriter;
27  import java.io.File;
28  import java.io.FileInputStream;
29  import java.io.FileNotFoundException;
30  import java.io.FileOutputStream;
31  import java.io.IOException;
32  import java.io.InputStreamReader;
33  import java.io.OutputStreamWriter;
34  import java.io.Writer;
35  import java.nio.ByteBuffer;
36  import java.nio.CharBuffer;
37  import java.nio.channels.FileChannel;
38  import java.nio.charset.Charset;
39  import java.nio.charset.CharsetDecoder;
40  import java.nio.charset.CoderResult;
41  import java.nio.charset.CodingErrorAction;
42  import java.util.logging.Level;
43  import java.util.logging.Logger;
44  
45  /***
46   * Provides a (Replay)CharSequence view on recorded streams (a prefix
47   * buffer and overflow backing file) that can handle streams of multibyte
48   * characters.
49   *
50   * If possible, use {@link ByteReplayCharSequence}.  It performs better even
51   * for the single byte case (Decoding is an expensive process).
52   *
53   * <p>Call close on this class when done so can clean up resources.
54   *
55   * <p>Implementation currently works by checking to see if content to read
56   * all fits the in-memory buffer.  If so, we decode into a CharBuffer and
57   * keep this around for CharSequence operations.  This CharBuffer is
58   * discarded on close.
59   *
60   * <p>If content length is greater than in-memory buffer, we decode the
61   * buffer plus backing file into a new file named for the backing file w/
62   * a suffix of the encoding we write the file as. We then run w/ a
63   * memory-mapped CharBuffer against this file to implement CharSequence.
64   * Reasons for this implemenation are that CharSequence wants to return the
65   * length of the CharSequence.
66   *
67   * <p>Obvious optimizations would keep around decodings whether the
68   * in-memory decoded buffer or the file of decodings written to disk but the
69   * general usage pattern processing URIs is that the decoding is used by one
70   * processor only.  Also of note, files usually fit into the in-memory
71   * buffer.
72   *
73   * <p>We might also be able to keep up 3 windows that moved across the file
74   * decoding a window at a time trying to keep one of the buffers just in
75   * front of the regex processing returning it a length that would be only
76   * the length of current position to end of current block or else the length
77   * could be got by multipling the backing files length by the decoders'
78   * estimate of average character size.  This would save us writing out the
79   * decoded file.  We'd have to do the latter for files that are
80   * > Integer.MAX_VALUE.
81   *
82   * @author stack
83   * @version $Revision: 5672 $, $Date: 2008-01-10 20:28:33 +0000 (Thu, 10 Jan 2008) $
84   */
85  public class MultiByteReplayCharSequence implements ReplayCharSequence {
86  
87      protected static Logger logger =
88          Logger.getLogger(MultiByteReplayCharSequence.class.getName());
89      
90      /***
91       * Name of the encoding we use writing out concatenated decoded prefix
92       * buffer and decoded backing file.
93       *
94       * <p>This define is also used as suffix for the file that holds the
95       * decodings.  The name of the file that holds the decoding is the name
96       * of the backing file w/ this encoding for a suffix.
97       *
98       * <p>See <a ref="http://java.sun.com/j2se/1.4.2/docs/guide/intl/encoding.doc.html">Encoding</a>.
99       */
100     private static final String WRITE_ENCODING = "UTF-16BE";
101 
102     /***
103      * CharBuffer of decoded content.
104      *
105      * Content of this buffer is unicode.
106      */
107     private CharBuffer content = null;
108 
109     /***
110      * File that has decoded content.
111      *
112      * Keep it around so we can remove on close.
113      */
114     private File decodedFile = null;
115 
116 
117     /***
118      * Constructor for all in-memory operation.
119      *
120      * @param buffer In-memory buffer of recordings prefix.  We read from
121      * here first and will only go to the backing file if <code>size</code>
122      * requested is greater than <code>buffer.length</code>.
123      * @param size Total size of stream to replay in bytes.  Used to find
124      * EOS. This is total length of content including HTTP headers if
125      * present.
126      * @param responseBodyStart Where the response body starts in bytes.
127      * Used to skip over the HTTP headers if present.
128      * @param backingFilename Path to backing file with content in excess of
129      * whats in <code>buffer</code>.
130      * @param encoding Encoding to use reading the passed prefix buffer and
131      * backing file.  For now, should be java canonical name for the
132      * encoding. (If null is passed, we will default to
133      * ByteReplayCharSequence).
134      *
135      * @throws IOException
136      */
137     public MultiByteReplayCharSequence(byte[] buffer, long size,
138             long responseBodyStart, String encoding)
139         throws IOException {
140         super();
141         this.content = decodeInMemory(buffer, size, responseBodyStart, 
142                 encoding);
143      }
144 
145     /***
146      * Constructor for overflow-to-disk-file operation.
147      *
148      * @param contentReplayInputStream inputStream of content
149      * @param backingFilename hint for name of temp file
150      * @param characterEncoding Encoding to use reading the stream.
151      * For now, should be java canonical name for the
152      * encoding. 
153      *
154      * @throws IOException
155      */
156     public MultiByteReplayCharSequence(
157             ReplayInputStream contentReplayInputStream,
158             String backingFilename,
159             String characterEncoding)
160         throws IOException {
161         super();
162         this.content = decodeToFile(contentReplayInputStream, 
163                 backingFilename, characterEncoding);
164     }
165 
166     /***
167      * Decode passed buffer and backing file into a CharBuffer.
168      *
169      * This method writes a new file made of the decoded concatenation of
170      * the in-memory prefix buffer and the backing file.  Returns a
171      * charSequence view onto this new file.
172      *
173      * @param buffer In-memory buffer of recordings prefix.  We read from
174      * here first and will only go to the backing file if <code>size</code>
175      * requested is greater than <code>buffer.length</code>.
176      * @param size Total size of stream to replay in bytes.  Used to find
177      * EOS. This is total length of content including HTTP headers if
178      * present.
179      * @param responseBodyStart Where the response body starts in bytes.
180      * Used to skip over the HTTP headers if present.
181      * @param backingFilename Path to backing file with content in excess of
182      * whats in <code>buffer</code>.
183      * @param encoding Encoding to use reading the passed prefix buffer and
184      * backing file.  For now, should be java canonical name for the
185      * encoding. (If null is passed, we will default to
186      * ByteReplayCharSequence).
187      *
188      * @return A CharBuffer view on decodings of the contents of passed
189      * buffer.
190      * @throws IOException
191      */
192     private CharBuffer decodeToFile(ReplayInputStream inStream, 
193             String backingFilename, String encoding)
194         throws IOException {
195 
196         CharBuffer charBuffer = null;
197 
198         BufferedReader reader = new BufferedReader(
199                 new InputStreamReader(inStream,encoding));
200         
201         this.decodedFile = new File(backingFilename + "." + WRITE_ENCODING);
202         FileOutputStream fos;
203         try {
204             fos = new FileOutputStream(this.decodedFile);
205         } catch (FileNotFoundException e) {
206             // Windows workaround attempt
207             System.gc();
208             System.runFinalization();
209             logger.info("Windows 'file with a user-mapped section open' "+
210                     "workaround gc-finalization performed.");
211             // try again 
212             fos = new FileOutputStream(this.decodedFile);
213         }
214         BufferedWriter writer = new BufferedWriter(
215                 new OutputStreamWriter(
216                         fos, 
217                         WRITE_ENCODING)); 
218 
219         int c;
220         while((c = reader.read())>=0) {
221             writer.write(c);
222         }
223         writer.close();
224         
225         charBuffer = getReadOnlyMemoryMappedBuffer(this.decodedFile).
226             asCharBuffer();
227 
228         return charBuffer;
229     }
230 
231     /***
232      * Decode passed buffer into a CharBuffer.
233      *
234      * This method decodes a memory buffer returning a memory buffer.
235      *
236      * @param buffer In-memory buffer of recordings prefix.  We read from
237      * here first and will only go to the backing file if <code>size</code>
238      * requested is greater than <code>buffer.length</code>.
239      * @param size Total size of stream to replay in bytes.  Used to find
240      * EOS. This is total length of content including HTTP headers if
241      * present.
242      * @param responseBodyStart Where the response body starts in bytes.
243      * Used to skip over the HTTP headers if present.
244      * @param encoding Encoding to use reading the passed prefix buffer and
245      * backing file.  For now, should be java canonical name for the
246      * encoding. (If null is passed, we will default to
247      * ByteReplayCharSequence).
248      *
249      * @return A CharBuffer view on decodings of the contents of passed
250      * buffer.
251      */
252     private CharBuffer decodeInMemory(byte[] buffer, long size,
253             long responseBodyStart, String encoding)
254     {
255         ByteBuffer bb = ByteBuffer.wrap(buffer);
256         // Move past the HTTP header if present.
257         bb.position((int)responseBodyStart);
258         // Set the end-of-buffer to be end-of-content.
259         bb.limit((int)size);
260         return (Charset.forName(encoding)).decode(bb).asReadOnlyBuffer();
261     }
262 
263     /***
264      * Create read-only memory-mapped buffer onto passed file.
265      *
266      * @param file File to get memory-mapped buffer on.
267      * @return Read-only memory-mapped ByteBuffer view on to passed file.
268      * @throws IOException
269      */
270     private ByteBuffer getReadOnlyMemoryMappedBuffer(File file)
271         throws IOException {
272 
273         ByteBuffer bb = null;
274         FileInputStream in = null;
275         FileChannel c = null;
276         assert file.exists(): "No file " + file.getAbsolutePath();
277 
278         try {
279             in = new FileInputStream(file);
280             c = in.getChannel();
281             // TODO: Confirm the READ_ONLY works.  I recall it not working.
282             // The buffers seem to always say that the buffer is writeable.
283             bb = c.map(FileChannel.MapMode.READ_ONLY, 0, c.size()).
284                 asReadOnlyBuffer();
285         }
286 
287         finally {
288             if (c != null && c.isOpen()) {
289                 c.close();
290             }
291             if (in != null) {
292                 in.close();
293             }
294         }
295 
296         return bb;
297     }
298 
299     private void deleteFile(File fileToDelete) {
300         deleteFile(fileToDelete, null);        
301     }
302 
303     private void deleteFile(File fileToDelete, final Exception e) {
304         if (e != null) {
305             // Log why the delete to help with debug of java.io.FileNotFoundException:
306             // ....tt53http.ris.UTF-16BE.
307             logger.severe("Deleting " + fileToDelete + " because of "
308                 + e.toString());
309         }
310         if (fileToDelete != null && fileToDelete.exists()) {
311             fileToDelete.delete();
312         }
313     }
314 
315     public void close()
316     {
317         this.content = null;
318         deleteFile(this.decodedFile);
319         // clear decodedFile -- so that double-close (as in 
320         // finalize()) won't delete a later instance with same name
321         // see bug [ 1218961 ] "failed get of replay" in ExtractorHTML... usu: UTF-16BE
322         this.decodedFile = null;
323     }
324 
325     protected void finalize() throws Throwable
326     {
327         super.finalize();
328         // Maybe TODO: eliminate close here, requiring explicit close instead
329         close();
330     }
331 
332     public int length()
333     {
334         return this.content.limit();
335     }
336 
337     public char charAt(int index)
338     {
339         return this.content.get(index);
340     }
341 
342     public CharSequence subSequence(int start, int end) {
343         return new CharSubSequence(this, start, end);
344     }
345     
346     public String toString() {
347         StringBuffer sb = new StringBuffer(length());
348         // could use StringBuffer.append(CharSequence) if willing to do 1.5 & up
349         for (int i = 0;i<length();i++) {
350             sb.append(charAt(i)); 
351         }
352         return sb.toString();
353     }
354 }