1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.io;
24
25 import java.io.IOException;
26 import java.io.RandomAccessFile;
27 import java.io.UnsupportedEncodingException;
28 import java.util.logging.Level;
29 import java.util.logging.Logger;
30
31 import org.archive.util.DevUtils;
32
33 /***
34 * Provides a (Replay)CharSequence view on recorded stream bytes (a prefix
35 * buffer and overflow backing file).
36 *
37 * Treats the byte stream as 8-bit.
38 *
39 * <p>Uses a wraparound rolling buffer of the last windowSize bytes read
40 * from disk in memory; as long as the 'random access' of a CharSequence
41 * user stays within this window, access should remain fairly efficient.
42 * (So design any regexps pointed at these CharSequences to work within
43 * that range!)
44 *
45 * <p>When rereading of a location is necessary, the whole window is
46 * recentered around the location requested. (TODO: More research
47 * into whether this is the best strategy.)
48 *
49 * <p>An implementation of a ReplayCharSequence done with ByteBuffers -- one
50 * to wrap the passed prefix buffer and the second, a memory-mapped
51 * ByteBuffer view into the backing file -- was consistently slower: ~10%.
52 * My tests did the following. Made a buffer filled w/ regular content.
53 * This buffer was used as the prefix buffer. The buffer content was
54 * written MULTIPLER times to a backing file. I then did accesses w/ the
55 * following pattern: Skip forward 32 bytes, then back 16 bytes, and then
56 * read forward from byte 16-32. Repeat. Though I varied the size of the
57 * buffer to the size of the backing file,from 3-10, the difference of 10%
58 * or so seemed to persist. Same if I tried to favor get() over get(index).
59 * I used a profiler, JMP, to study times taken (St.Ack did above comment).
60 *
61 * <p>TODO determine in memory mapped files is better way to do this;
62 * probably not -- they don't offer the level of control over
63 * total memory used that this approach does.
64 *
65 * @author Gordon Mohr
66 * @version $Revision: 5027 $, $Date: 2007-03-29 00:30:33 +0000 (Thu, 29 Mar 2007) $
67 */
68 class ByteReplayCharSequence implements ReplayCharSequence {
69
70 protected static Logger logger =
71 Logger.getLogger(ByteReplayCharSequence.class.getName());
72
73 /***
74 * Buffer that holds the first bit of content.
75 *
76 * Once this is exhausted we go to the backing file.
77 */
78 private byte[] prefixBuffer;
79
80 /***
81 * Total length of character stream to replay minus the HTTP headers
82 * if present.
83 *
84 * Used to find EOS.
85 */
86 protected int length;
87
88 /***
89 * Absolute length of the stream.
90 *
91 * Includes HTTP headers. Needed doing calc. in the below figuring
92 * how much to load into buffer.
93 */
94 private int absoluteLength = -1;
95
96 /***
97 * Buffer window on to backing file.
98 */
99 private byte[] wraparoundBuffer;
100
101 /***
102 * Absolute index into underlying bytestream where wrap starts.
103 */
104 private int wrapOrigin;
105
106 /***
107 * Index in wraparoundBuffer that corresponds to wrapOrigin
108 */
109 private int wrapOffset;
110
111 /***
112 * Name of backing file we go to when we've exhausted content from the
113 * prefix buffer.
114 */
115 private String backingFilename;
116
117 /***
118 * Random access to the backing file.
119 */
120 private RandomAccessFile raFile;
121
122 /***
123 * Offset into prefix buffer at which content beings.
124 */
125 private int contentOffset;
126
127 /***
128 * 8-bit encoding used reading single bytes from buffer and
129 * stream.
130 */
131 private static final String DEFAULT_SINGLE_BYTE_ENCODING =
132 "ISO-8859-1";
133
134
135 /***
136 * Constructor.
137 *
138 * @param buffer In-memory buffer of recordings prefix. We read from
139 * here first and will only go to the backing file if <code>size</code>
140 * requested is greater than <code>buffer.length</code>.
141 * @param size Total size of stream to replay in bytes. Used to find
142 * EOS. This is total length of content including HTTP headers if
143 * present.
144 * @param responseBodyStart Where the response body starts in bytes.
145 * Used to skip over the HTTP headers if present.
146 * @param backingFilename Path to backing file with content in excess of
147 * whats in <code>buffer</code>.
148 *
149 * @throws IOException
150 */
151 public ByteReplayCharSequence(byte[] buffer, long size,
152 long responseBodyStart, String backingFilename)
153 throws IOException {
154
155 this.length = (int)(size - responseBodyStart);
156 this.absoluteLength = (int)size;
157 this.prefixBuffer = buffer;
158 this.contentOffset = (int)responseBodyStart;
159
160
161
162 if (size > buffer.length) {
163 this.backingFilename = backingFilename;
164 this.raFile = new RandomAccessFile(backingFilename, "r");
165 this.wraparoundBuffer = new byte[this.prefixBuffer.length];
166 this.wrapOrigin = this.prefixBuffer.length;
167 this.wrapOffset = 0;
168 loadBuffer();
169 }
170 }
171
172 /***
173 * @return Length of characters in stream to replay. Starts counting
174 * at the HTTP header/body boundary.
175 */
176 public int length() {
177 return this.length;
178 }
179
180 /***
181 * Get character at passed absolute position.
182 *
183 * Called by {@link #charAt(int)} which has a relative index into the
184 * content, one that doesn't account for HTTP header if present.
185 *
186 * @param index Index into content adjusted to accomodate initial offset
187 * to get us past the HTTP header if present (i.e.
188 * {@link #contentOffset}).
189 *
190 * @return Characater at offset <code>index</code>.
191 */
192 public char charAt(int index) {
193 int c = -1;
194
195
196 index += this.contentOffset;
197 if (index < this.prefixBuffer.length) {
198
199 c = this.prefixBuffer[index];
200 } else if (index >= this.wrapOrigin &&
201 (index - this.wrapOrigin) < this.wraparoundBuffer.length) {
202
203 c = this.wraparoundBuffer[
204 ((index - this.wrapOrigin) + this.wrapOffset) %
205 this.wraparoundBuffer.length];
206 } else {
207
208
209
210 c = faultCharAt(index);
211 }
212
213
214 return (char)(c & 0xff);
215 }
216
217 /***
218 * Get a character that's outside the current buffers.
219 *
220 * will cause the wraparoundBuffer to be changed to
221 * cover a region including the index
222 *
223 * if index is higher than the highest index in the
224 * wraparound buffer, buffer is moved forward such
225 * that requested char is last item in buffer
226 *
227 * if index is lower than lowest index in the
228 * wraparound buffer, buffet is reset centered around
229 * index
230 *
231 * @param index Index of character to fetch.
232 * @return A character that's outside the current buffers
233 */
234 private int faultCharAt(int index) {
235 if(Thread.interrupted()) {
236 throw new RuntimeException("thread interrupted");
237 }
238 if(index >= this.wrapOrigin + this.wraparoundBuffer.length) {
239
240 while (index >= this.wrapOrigin + this.wraparoundBuffer.length)
241 {
242
243 advanceBuffer();
244 }
245 return charAt(index - this.contentOffset);
246 }
247
248 recenterBuffer(index);
249 return charAt(index - this.contentOffset);
250 }
251
252 /***
253 * Move the buffer window on backing file back centering current access
254 * position in middle of window.
255 *
256 * @param index Index of character to access.
257 */
258 private void recenterBuffer(int index) {
259 if (logger.isLoggable(Level.FINE)) {
260 logger.fine("Recentering around " + index + " in " +
261 this.backingFilename);
262 }
263 this.wrapOrigin = index - (this.wraparoundBuffer.length / 2);
264 if(this.wrapOrigin < this.prefixBuffer.length) {
265 this.wrapOrigin = this.prefixBuffer.length;
266 }
267 this.wrapOffset = 0;
268 loadBuffer();
269 }
270
271 /***
272 * Load from backing file into the wrapper buffer.
273 */
274 private void loadBuffer()
275 {
276 long len = -1;
277 try {
278 len = this.raFile.length();
279 this.raFile.seek(this.wrapOrigin - this.prefixBuffer.length);
280 this.raFile.readFully(this.wraparoundBuffer, 0,
281 Math.min(this.wraparoundBuffer.length,
282 this.absoluteLength - this.wrapOrigin));
283 }
284
285 catch (IOException e) {
286
287 DevUtils.logger.log (
288 Level.SEVERE,
289 "raFile.seek(" +
290 (this.wrapOrigin - this.prefixBuffer.length) +
291 ")\n" +
292 "raFile.readFully(wraparoundBuffer,0," +
293 (Math.min(this.wraparoundBuffer.length,
294 this.length - this.wrapOrigin )) +
295 ")\n"+
296 "raFile.length()" + len + "\n" +
297 DevUtils.extraInfo(),
298 e);
299 throw new RuntimeException(e);
300 }
301 }
302
303 /***
304 * Roll the wraparound buffer forward one position
305 */
306 private void advanceBuffer() {
307 try {
308 this.wraparoundBuffer[this.wrapOffset] =
309 (byte)this.raFile.read();
310 this.wrapOffset++;
311 this.wrapOffset %= this.wraparoundBuffer.length;
312 this.wrapOrigin++;
313 } catch (IOException e) {
314 DevUtils.logger.log(Level.SEVERE, "advanceBuffer()" +
315 DevUtils.extraInfo(), e);
316 throw new RuntimeException(e);
317 }
318 }
319
320 public CharSequence subSequence(int start, int end) {
321 return new CharSubSequence(this, start, end);
322 }
323
324 /***
325 * Cleanup resources.
326 *
327 * @exception IOException Failed close of random access file.
328 */
329 public void close() throws IOException
330 {
331 this.prefixBuffer = null;
332 if (this.raFile != null) {
333 this.raFile.close();
334 this.raFile = null;
335 }
336 }
337
338
339
340
341 protected void finalize() throws Throwable
342 {
343 super.finalize();
344 close();
345 }
346
347 /***
348 * Convenience method for getting a substring.
349 * @deprecated please use subSequence() and then toString() directly
350 */
351 public String substring(int offset, int len) {
352 return subSequence(offset, offset+len).toString();
353 }
354
355
356
357
358 public String toString() {
359 StringBuilder sb = new StringBuilder(this.length());
360 sb.append(this);
361 return sb.toString();
362 }
363 }