1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.io;
24
25 import java.io.BufferedReader;
26 import java.io.BufferedWriter;
27 import java.io.File;
28 import java.io.FileInputStream;
29 import java.io.FileNotFoundException;
30 import java.io.FileOutputStream;
31 import java.io.IOException;
32 import java.io.InputStreamReader;
33 import java.io.OutputStreamWriter;
34 import java.io.Writer;
35 import java.nio.ByteBuffer;
36 import java.nio.CharBuffer;
37 import java.nio.channels.FileChannel;
38 import java.nio.charset.Charset;
39 import java.nio.charset.CharsetDecoder;
40 import java.nio.charset.CoderResult;
41 import java.nio.charset.CodingErrorAction;
42 import java.util.logging.Level;
43 import java.util.logging.Logger;
44
45 /***
46 * Provides a (Replay)CharSequence view on recorded streams (a prefix
47 * buffer and overflow backing file) that can handle streams of multibyte
48 * characters.
49 *
50 * If possible, use {@link ByteReplayCharSequence}. It performs better even
51 * for the single byte case (Decoding is an expensive process).
52 *
53 * <p>Call close on this class when done so can clean up resources.
54 *
55 * <p>Implementation currently works by checking to see if content to read
56 * all fits the in-memory buffer. If so, we decode into a CharBuffer and
57 * keep this around for CharSequence operations. This CharBuffer is
58 * discarded on close.
59 *
60 * <p>If content length is greater than in-memory buffer, we decode the
61 * buffer plus backing file into a new file named for the backing file w/
62 * a suffix of the encoding we write the file as. We then run w/ a
63 * memory-mapped CharBuffer against this file to implement CharSequence.
64 * Reasons for this implemenation are that CharSequence wants to return the
65 * length of the CharSequence.
66 *
67 * <p>Obvious optimizations would keep around decodings whether the
68 * in-memory decoded buffer or the file of decodings written to disk but the
69 * general usage pattern processing URIs is that the decoding is used by one
70 * processor only. Also of note, files usually fit into the in-memory
71 * buffer.
72 *
73 * <p>We might also be able to keep up 3 windows that moved across the file
74 * decoding a window at a time trying to keep one of the buffers just in
75 * front of the regex processing returning it a length that would be only
76 * the length of current position to end of current block or else the length
77 * could be got by multipling the backing files length by the decoders'
78 * estimate of average character size. This would save us writing out the
79 * decoded file. We'd have to do the latter for files that are
80 * > Integer.MAX_VALUE.
81 *
82 * @author stack
83 * @version $Revision: 5672 $, $Date: 2008-01-10 20:28:33 +0000 (Thu, 10 Jan 2008) $
84 */
85 public class MultiByteReplayCharSequence implements ReplayCharSequence {
86
87 protected static Logger logger =
88 Logger.getLogger(MultiByteReplayCharSequence.class.getName());
89
90 /***
91 * Name of the encoding we use writing out concatenated decoded prefix
92 * buffer and decoded backing file.
93 *
94 * <p>This define is also used as suffix for the file that holds the
95 * decodings. The name of the file that holds the decoding is the name
96 * of the backing file w/ this encoding for a suffix.
97 *
98 * <p>See <a ref="http://java.sun.com/j2se/1.4.2/docs/guide/intl/encoding.doc.html">Encoding</a>.
99 */
100 private static final String WRITE_ENCODING = "UTF-16BE";
101
102 /***
103 * CharBuffer of decoded content.
104 *
105 * Content of this buffer is unicode.
106 */
107 private CharBuffer content = null;
108
109 /***
110 * File that has decoded content.
111 *
112 * Keep it around so we can remove on close.
113 */
114 private File decodedFile = null;
115
116
117 /***
118 * Constructor for all in-memory operation.
119 *
120 * @param buffer In-memory buffer of recordings prefix. We read from
121 * here first and will only go to the backing file if <code>size</code>
122 * requested is greater than <code>buffer.length</code>.
123 * @param size Total size of stream to replay in bytes. Used to find
124 * EOS. This is total length of content including HTTP headers if
125 * present.
126 * @param responseBodyStart Where the response body starts in bytes.
127 * Used to skip over the HTTP headers if present.
128 * @param backingFilename Path to backing file with content in excess of
129 * whats in <code>buffer</code>.
130 * @param encoding Encoding to use reading the passed prefix buffer and
131 * backing file. For now, should be java canonical name for the
132 * encoding. (If null is passed, we will default to
133 * ByteReplayCharSequence).
134 *
135 * @throws IOException
136 */
137 public MultiByteReplayCharSequence(byte[] buffer, long size,
138 long responseBodyStart, String encoding)
139 throws IOException {
140 super();
141 this.content = decodeInMemory(buffer, size, responseBodyStart,
142 encoding);
143 }
144
145 /***
146 * Constructor for overflow-to-disk-file operation.
147 *
148 * @param contentReplayInputStream inputStream of content
149 * @param backingFilename hint for name of temp file
150 * @param characterEncoding Encoding to use reading the stream.
151 * For now, should be java canonical name for the
152 * encoding.
153 *
154 * @throws IOException
155 */
156 public MultiByteReplayCharSequence(
157 ReplayInputStream contentReplayInputStream,
158 String backingFilename,
159 String characterEncoding)
160 throws IOException {
161 super();
162 this.content = decodeToFile(contentReplayInputStream,
163 backingFilename, characterEncoding);
164 }
165
166 /***
167 * Decode passed buffer and backing file into a CharBuffer.
168 *
169 * This method writes a new file made of the decoded concatenation of
170 * the in-memory prefix buffer and the backing file. Returns a
171 * charSequence view onto this new file.
172 *
173 * @param buffer In-memory buffer of recordings prefix. We read from
174 * here first and will only go to the backing file if <code>size</code>
175 * requested is greater than <code>buffer.length</code>.
176 * @param size Total size of stream to replay in bytes. Used to find
177 * EOS. This is total length of content including HTTP headers if
178 * present.
179 * @param responseBodyStart Where the response body starts in bytes.
180 * Used to skip over the HTTP headers if present.
181 * @param backingFilename Path to backing file with content in excess of
182 * whats in <code>buffer</code>.
183 * @param encoding Encoding to use reading the passed prefix buffer and
184 * backing file. For now, should be java canonical name for the
185 * encoding. (If null is passed, we will default to
186 * ByteReplayCharSequence).
187 *
188 * @return A CharBuffer view on decodings of the contents of passed
189 * buffer.
190 * @throws IOException
191 */
192 private CharBuffer decodeToFile(ReplayInputStream inStream,
193 String backingFilename, String encoding)
194 throws IOException {
195
196 CharBuffer charBuffer = null;
197
198 BufferedReader reader = new BufferedReader(
199 new InputStreamReader(inStream,encoding));
200
201 this.decodedFile = new File(backingFilename + "." + WRITE_ENCODING);
202 FileOutputStream fos;
203 try {
204 fos = new FileOutputStream(this.decodedFile);
205 } catch (FileNotFoundException e) {
206
207 System.gc();
208 System.runFinalization();
209 logger.info("Windows 'file with a user-mapped section open' "+
210 "workaround gc-finalization performed.");
211
212 fos = new FileOutputStream(this.decodedFile);
213 }
214 BufferedWriter writer = new BufferedWriter(
215 new OutputStreamWriter(
216 fos,
217 WRITE_ENCODING));
218
219 int c;
220 while((c = reader.read())>=0) {
221 writer.write(c);
222 }
223 writer.close();
224
225 charBuffer = getReadOnlyMemoryMappedBuffer(this.decodedFile).
226 asCharBuffer();
227
228 return charBuffer;
229 }
230
231 /***
232 * Decode passed buffer into a CharBuffer.
233 *
234 * This method decodes a memory buffer returning a memory buffer.
235 *
236 * @param buffer In-memory buffer of recordings prefix. We read from
237 * here first and will only go to the backing file if <code>size</code>
238 * requested is greater than <code>buffer.length</code>.
239 * @param size Total size of stream to replay in bytes. Used to find
240 * EOS. This is total length of content including HTTP headers if
241 * present.
242 * @param responseBodyStart Where the response body starts in bytes.
243 * Used to skip over the HTTP headers if present.
244 * @param encoding Encoding to use reading the passed prefix buffer and
245 * backing file. For now, should be java canonical name for the
246 * encoding. (If null is passed, we will default to
247 * ByteReplayCharSequence).
248 *
249 * @return A CharBuffer view on decodings of the contents of passed
250 * buffer.
251 */
252 private CharBuffer decodeInMemory(byte[] buffer, long size,
253 long responseBodyStart, String encoding)
254 {
255 ByteBuffer bb = ByteBuffer.wrap(buffer);
256
257 bb.position((int)responseBodyStart);
258
259 bb.limit((int)size);
260 return (Charset.forName(encoding)).decode(bb).asReadOnlyBuffer();
261 }
262
263 /***
264 * Create read-only memory-mapped buffer onto passed file.
265 *
266 * @param file File to get memory-mapped buffer on.
267 * @return Read-only memory-mapped ByteBuffer view on to passed file.
268 * @throws IOException
269 */
270 private ByteBuffer getReadOnlyMemoryMappedBuffer(File file)
271 throws IOException {
272
273 ByteBuffer bb = null;
274 FileInputStream in = null;
275 FileChannel c = null;
276 assert file.exists(): "No file " + file.getAbsolutePath();
277
278 try {
279 in = new FileInputStream(file);
280 c = in.getChannel();
281
282
283 bb = c.map(FileChannel.MapMode.READ_ONLY, 0, c.size()).
284 asReadOnlyBuffer();
285 }
286
287 finally {
288 if (c != null && c.isOpen()) {
289 c.close();
290 }
291 if (in != null) {
292 in.close();
293 }
294 }
295
296 return bb;
297 }
298
299 private void deleteFile(File fileToDelete) {
300 deleteFile(fileToDelete, null);
301 }
302
303 private void deleteFile(File fileToDelete, final Exception e) {
304 if (e != null) {
305
306
307 logger.severe("Deleting " + fileToDelete + " because of "
308 + e.toString());
309 }
310 if (fileToDelete != null && fileToDelete.exists()) {
311 fileToDelete.delete();
312 }
313 }
314
315 public void close()
316 {
317 this.content = null;
318 deleteFile(this.decodedFile);
319
320
321
322 this.decodedFile = null;
323 }
324
325 protected void finalize() throws Throwable
326 {
327 super.finalize();
328
329 close();
330 }
331
332 public int length()
333 {
334 return this.content.limit();
335 }
336
337 public char charAt(int index)
338 {
339 return this.content.get(index);
340 }
341
342 public CharSequence subSequence(int start, int end) {
343 return new CharSubSequence(this, start, end);
344 }
345
346 public String toString() {
347 StringBuffer sb = new StringBuffer(length());
348
349 for (int i = 0;i<length();i++) {
350 sb.append(charAt(i));
351 }
352 return sb.toString();
353 }
354 }