View Javadoc

1   /* ByteReplayCharSequenceFactory
2    *
3    * (Re)Created on Dec 21, 2006
4    *
5    * Copyright (C) 2006 Internet Archive.
6    *
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    *
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   *
14   * Heritrix is distributed in the hope that it will be useful,
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   *
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.io;
24  
25  import java.io.IOException;
26  import java.io.RandomAccessFile;
27  import java.io.UnsupportedEncodingException;
28  import java.util.logging.Level;
29  import java.util.logging.Logger;
30  
31  import org.archive.util.DevUtils;
32  
33  /***
34   * Provides a (Replay)CharSequence view on recorded stream bytes (a prefix
35   * buffer and overflow backing file).
36   *
37   * Treats the byte stream as 8-bit.
38   *
39   * <p>Uses a wraparound rolling buffer of the last windowSize bytes read
40   * from disk in memory; as long as the 'random access' of a CharSequence
41   * user stays within this window, access should remain fairly efficient.
42   * (So design any regexps pointed at these CharSequences to work within
43   * that range!)
44   *
45   * <p>When rereading of a location is necessary, the whole window is
46   * recentered around the location requested. (TODO: More research
47   * into whether this is the best strategy.)
48   *
49   * <p>An implementation of a ReplayCharSequence done with ByteBuffers -- one
50   * to wrap the passed prefix buffer and the second, a memory-mapped
51   * ByteBuffer view into the backing file -- was consistently slower: ~10%.
52   * My tests did the following. Made a buffer filled w/ regular content.
53   * This buffer was used as the prefix buffer.  The buffer content was
54   * written MULTIPLER times to a backing file.  I then did accesses w/ the
55   * following pattern: Skip forward 32 bytes, then back 16 bytes, and then
56   * read forward from byte 16-32.  Repeat.  Though I varied the size of the
57   * buffer to the size of the backing file,from 3-10, the difference of 10%
58   * or so seemed to persist.  Same if I tried to favor get() over get(index).
59   * I used a profiler, JMP, to study times taken (St.Ack did above comment).
60   *
61   * <p>TODO determine in memory mapped files is better way to do this;
62   * probably not -- they don't offer the level of control over
63   * total memory used that this approach does.
64   *
65   * @author Gordon Mohr
66   * @version $Revision: 5027 $, $Date: 2007-03-29 00:30:33 +0000 (Thu, 29 Mar 2007) $
67   */
68  class ByteReplayCharSequence implements ReplayCharSequence {
69  
70      protected static Logger logger =
71          Logger.getLogger(ByteReplayCharSequence.class.getName());
72  
73      /***
74       * Buffer that holds the first bit of content.
75       *
76       * Once this is exhausted we go to the backing file.
77       */
78      private byte[] prefixBuffer;
79  
80      /***
81       * Total length of character stream to replay minus the HTTP headers
82       * if present.
83       *
84       * Used to find EOS.
85       */
86      protected int length;
87  
88      /***
89       * Absolute length of the stream.
90       *
91       * Includes HTTP headers.  Needed doing calc. in the below figuring
92       * how much to load into buffer.
93       */
94      private int absoluteLength = -1;
95  
96      /***
97       * Buffer window on to backing file.
98       */
99      private byte[] wraparoundBuffer;
100 
101     /***
102      * Absolute index into underlying bytestream where wrap starts.
103      */
104     private int wrapOrigin;
105 
106     /***
107      * Index in wraparoundBuffer that corresponds to wrapOrigin
108      */
109     private int wrapOffset;
110 
111     /***
112      * Name of backing file we go to when we've exhausted content from the
113      * prefix buffer.
114      */
115     private String backingFilename;
116 
117     /***
118      * Random access to the backing file.
119      */
120     private RandomAccessFile raFile;
121 
122     /***
123      * Offset into prefix buffer at which content beings.
124      */
125     private int contentOffset;
126 
127     /***
128      * 8-bit encoding used reading single bytes from buffer and
129      * stream.
130      */
131     private static final String DEFAULT_SINGLE_BYTE_ENCODING =
132         "ISO-8859-1";
133 
134 
135     /***
136      * Constructor.
137      *
138      * @param buffer In-memory buffer of recordings prefix.  We read from
139      * here first and will only go to the backing file if <code>size</code>
140      * requested is greater than <code>buffer.length</code>.
141      * @param size Total size of stream to replay in bytes.  Used to find
142      * EOS. This is total length of content including HTTP headers if
143      * present.
144      * @param responseBodyStart Where the response body starts in bytes.
145      * Used to skip over the HTTP headers if present.
146      * @param backingFilename Path to backing file with content in excess of
147      * whats in <code>buffer</code>.
148      *
149      * @throws IOException
150      */
151     public ByteReplayCharSequence(byte[] buffer, long size,
152             long responseBodyStart, String backingFilename)
153         throws IOException {
154 
155         this.length = (int)(size - responseBodyStart);
156         this.absoluteLength = (int)size;
157         this.prefixBuffer = buffer;
158         this.contentOffset = (int)responseBodyStart;
159 
160         // If amount to read is > than what is in our prefix buffer, then
161         // open the backing file.
162         if (size > buffer.length) {
163             this.backingFilename = backingFilename;
164             this.raFile = new RandomAccessFile(backingFilename, "r");
165             this.wraparoundBuffer = new byte[this.prefixBuffer.length];
166             this.wrapOrigin = this.prefixBuffer.length;
167             this.wrapOffset = 0;
168             loadBuffer();
169         }
170     }
171 
172     /***
173      * @return Length of characters in stream to replay.  Starts counting
174      * at the HTTP header/body boundary.
175      */
176     public int length() {
177         return this.length;
178     }
179 
180     /***
181      * Get character at passed absolute position.
182      *
183      * Called by {@link #charAt(int)} which has a relative index into the
184      * content, one that doesn't account for HTTP header if present.
185      *
186      * @param index Index into content adjusted to accomodate initial offset
187      * to get us past the HTTP header if present (i.e.
188      * {@link #contentOffset}).
189      *
190      * @return Characater at offset <code>index</code>.
191      */
192     public char charAt(int index) {
193         int c = -1;
194         // Add to index start-of-content offset to get us over HTTP header
195         // if present.
196         index += this.contentOffset;
197         if (index < this.prefixBuffer.length) {
198             // If index is into our prefix buffer.
199             c = this.prefixBuffer[index];
200         } else if (index >= this.wrapOrigin &&
201             (index - this.wrapOrigin) < this.wraparoundBuffer.length) {
202             // If index is into our buffer window on underlying backing file.
203             c = this.wraparoundBuffer[
204                     ((index - this.wrapOrigin) + this.wrapOffset) %
205                         this.wraparoundBuffer.length];
206         } else {
207             // Index is outside of both prefix buffer and our buffer window
208             // onto the underlying backing file.  Fix the buffer window
209             // location.
210             c = faultCharAt(index);
211         }
212         // Stream is treated as single byte.  Make sure characters returned
213         // are not negative.
214         return (char)(c & 0xff);
215     }
216 
217     /***
218      * Get a character that's outside the current buffers.
219      *
220      * will cause the wraparoundBuffer to be changed to
221      * cover a region including the index
222      *
223      * if index is higher than the highest index in the
224      * wraparound buffer, buffer is moved forward such
225      * that requested char is last item in buffer
226      *
227      * if index is lower than lowest index in the
228      * wraparound buffer, buffet is reset centered around
229      * index
230      *
231      * @param index Index of character to fetch.
232      * @return A character that's outside the current buffers
233      */
234     private int faultCharAt(int index) {
235         if(Thread.interrupted()) {
236             throw new RuntimeException("thread interrupted");
237         }
238         if(index >= this.wrapOrigin + this.wraparoundBuffer.length) {
239             // Moving forward
240             while (index >= this.wrapOrigin + this.wraparoundBuffer.length)
241             {
242                 // TODO optimize this
243                 advanceBuffer();
244             }
245             return charAt(index - this.contentOffset);
246         }
247         // Moving backward
248         recenterBuffer(index);
249         return charAt(index - this.contentOffset);
250     }
251 
252     /***
253      * Move the buffer window on backing file back centering current access
254      * position in middle of window.
255      *
256      * @param index Index of character to access.
257      */
258     private void recenterBuffer(int index) {
259         if (logger.isLoggable(Level.FINE)) {
260             logger.fine("Recentering around " + index + " in " +
261                 this.backingFilename);
262         }
263         this.wrapOrigin = index - (this.wraparoundBuffer.length / 2);
264         if(this.wrapOrigin < this.prefixBuffer.length) {
265             this.wrapOrigin = this.prefixBuffer.length;
266         }
267         this.wrapOffset = 0;
268         loadBuffer();
269     }
270 
271     /***
272      * Load from backing file into the wrapper buffer.
273      */
274     private void loadBuffer()
275     {
276         long len = -1;
277         try {
278             len = this.raFile.length();
279             this.raFile.seek(this.wrapOrigin - this.prefixBuffer.length);
280             this.raFile.readFully(this.wraparoundBuffer, 0,
281                 Math.min(this.wraparoundBuffer.length,
282                      this.absoluteLength - this.wrapOrigin));
283         }
284 
285         catch (IOException e) {
286             // TODO convert this to a runtime error?
287             DevUtils.logger.log (
288                 Level.SEVERE,
289                 "raFile.seek(" +
290                 (this.wrapOrigin - this.prefixBuffer.length) +
291                 ")\n" +
292                 "raFile.readFully(wraparoundBuffer,0," +
293                 (Math.min(this.wraparoundBuffer.length,
294                     this.length - this.wrapOrigin )) +
295                 ")\n"+
296                 "raFile.length()" + len + "\n" +
297                 DevUtils.extraInfo(),
298                 e);
299             throw new RuntimeException(e);
300         }
301     }
302 
303     /***
304      * Roll the wraparound buffer forward one position
305      */
306     private void advanceBuffer() {
307         try {
308             this.wraparoundBuffer[this.wrapOffset] =
309                 (byte)this.raFile.read();
310             this.wrapOffset++;
311             this.wrapOffset %= this.wraparoundBuffer.length;
312             this.wrapOrigin++;
313         } catch (IOException e) {
314             DevUtils.logger.log(Level.SEVERE, "advanceBuffer()" +
315                 DevUtils.extraInfo(), e);
316             throw new RuntimeException(e);
317         }
318     }
319 
320     public CharSequence subSequence(int start, int end) {
321         return new CharSubSequence(this, start, end);
322     }
323 
324     /***
325      * Cleanup resources.
326      *
327      * @exception IOException Failed close of random access file.
328      */
329     public void close() throws IOException
330     {
331         this.prefixBuffer = null;
332         if (this.raFile != null) {
333             this.raFile.close();
334             this.raFile = null;
335         }
336     }
337 
338     /* (non-Javadoc)
339      * @see java.lang.Object#finalize()
340      */
341     protected void finalize() throws Throwable
342     {
343         super.finalize();
344         close();
345     }
346     
347     /***
348      * Convenience method for getting a substring. 
349      * @deprecated please use subSequence() and then toString() directly 
350      */
351     public String substring(int offset, int len) {
352         return subSequence(offset, offset+len).toString();
353     }
354 
355     /* (non-Javadoc)
356      * @see java.lang.Object#toString()
357      */
358     public String toString() {
359         StringBuilder sb = new StringBuilder(this.length());
360         sb.append(this);
361         return sb.toString();
362     }
363 }