1   /* RecordingOutputStream
2    *
3    * $Id: RecordingOutputStream.java 5365 2007-07-20 00:07:35Z gojomo $
4    *
5    * Created on Sep 23, 2003
6    *
7    * Copyright (C) 2003 Internet Archive.
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.io;
26  
27  import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
28  
29  import java.io.FileOutputStream;
30  import java.io.IOException;
31  import java.io.OutputStream;
32  import java.security.MessageDigest;
33  import java.security.NoSuchAlgorithmException;
34  import java.util.logging.Level;
35  import java.util.logging.Logger;
36  
37  import org.archive.util.IoUtils;
38  
39  
40  /***
41   * An output stream that records all writes to wrapped output
42   * stream.
43   *
44   * A RecordingOutputStream can be wrapped around any other
45   * OutputStream to record all bytes written to it.  You can
46   * then request a ReplayInputStream to read those bytes.
47   *
48   * <p>The RecordingOutputStream uses an in-memory buffer and
49   * backing disk file to allow it to record streams of
50   * arbitrary length limited only by available disk space.
51   *
52   * <p>As long as the stream recorded is smaller than the
53   * in-memory buffer, no disk access will occur.
54   *
55   * <p>Recorded content can be recovered as a ReplayInputStream
56   * (via getReplayInputStream() or, for only the content after
57   * the content-begin-mark is set, getContentReplayInputStream() )
58   * or as a ReplayCharSequence (via getReplayCharSequence()).
59   *
60   * <p>This class is also used as a straight output stream
61   * by {@link RecordingInputStream} to which it records all reads.
62   * {@link RecordingInputStream} is exploiting the file backed buffer
63   * facility of this class passing <code>null</code> for the stream
64   * to wrap.  TODO: Make a FileBackedOutputStream class that is
65   * subclassed by RecordingInputStream.
66   *
67   * @author gojomo
68   *
69   */
70  public class RecordingOutputStream extends OutputStream {
71      protected static Logger logger =
72          Logger.getLogger(RecordingOutputStream.class.getName());
73      
74      /***
75       * Size of recording.
76       *
77       * Later passed to ReplayInputStream on creation, which uses it to
78       * detect end of stream (EOS).
79       */
80      private long size = 0;
81  
82      private String backingFilename;
83      private OutputStream diskStream = null;
84  
85      /***
86       * Buffer we write recordings to.
87       *
88       * All recorded bytes are written here first until it is full.
89       * Thereafter they are written to the backing file.
90       */
91      private byte[] buffer;
92  
93      /*** current virtual position in the recording */
94      private long position;
95      
96      /*** flag to disable recording */
97      private boolean recording;
98      
99      /***
100      * Reusable buffer for FastBufferedOutputStream
101      */
102     protected byte[] bufStreamBuf = 
103         new byte [ FastBufferedOutputStream.DEFAULT_BUFFER_SIZE ];
104     
105     /***
106      * True if we're to digest content.
107      */
108     private boolean shouldDigest = false;
109  
110     /***
111      * Digest instance.
112      */
113     private MessageDigest digest = null;
114 
115     /***
116      * Name of the SHA1 algorithm.
117      */
118     private static final String SHA1 = "SHA1";
119 
120     /***
121      * Maximum amount of header material to accept without the content
122      * body beginning -- if more, throw a RecorderTooMuchHeaderException.
123      * TODO: make configurable? make smaller?
124      */
125     protected static final long MAX_HEADER_MATERIAL = 1024*1024; // 1MB
126     
127     // configurable max length, max time limits
128     /*** maximum length of material to record before throwing exception */ 
129     protected long maxLength = Long.MAX_VALUE;
130     /*** maximum time to record before throwing exception */ 
131     protected long timeoutMs = Long.MAX_VALUE;
132     /*** maximum rate to record (adds delays to hit target rate) */ 
133     protected long maxRateBytesPerMs = Long.MAX_VALUE;
134     /*** time recording begins for timeout, rate calculations */ 
135     protected long startTime = Long.MAX_VALUE;
136     
137     /***
138      * When recording HTTP, where the content-body starts.
139      */
140     private long contentBeginMark;
141 
142     /***
143      * Stream to record.
144      */
145     private OutputStream out = null;
146 
147     // mark/reset support 
148     /*** furthest position reached before any reset()s */
149     private long maxPosition = 0;
150     /*** remembered position to reset() to */ 
151     private long markPosition = 0; 
152 
153     /***
154      * Create a new RecordingOutputStream.
155      *
156      * @param bufferSize Buffer size to use.
157      * @param backingFilename Name of backing file to use.
158      */
159     public RecordingOutputStream(int bufferSize, String backingFilename) {
160         this.buffer = new byte[bufferSize];
161         this.backingFilename = backingFilename;
162         recording = true;
163     }
164 
165     /***
166      * Wrap the given stream, both recording and passing along any data written
167      * to this RecordingOutputStream.
168      *
169      * @throws IOException If failed creation of backing file.
170      */
171     public void open() throws IOException {
172         this.open(null);
173     }
174 
175     /***
176      * Wrap the given stream, both recording and passing along any data written
177      * to this RecordingOutputStream.
178      *
179      * @param wrappedStream Stream to wrap.  May be null for case where we
180      * want to write to a file backed stream only.
181      *
182      * @throws IOException If failed creation of backing file.
183      */
184     public void open(OutputStream wrappedStream) throws IOException {
185         if(isOpen()) {
186             // error; should not be opening/wrapping in an unclosed 
187             // stream remains open
188             throw new IOException("ROS already open for "
189                     +Thread.currentThread().getName());
190         }
191         this.out = wrappedStream;
192         this.position = 0;
193         this.markPosition = 0;
194         this.maxPosition = 0; 
195         this.size = 0;
196         this.contentBeginMark = -1;
197         // ensure recording turned on
198         this.recording = true;
199         // Always begins false; must use startDigest() to begin
200         this.shouldDigest = false;
201         if (this.diskStream != null) {
202             closeDiskStream();
203         }
204         if (this.diskStream == null) {
205             // TODO: Fix so we only make file when its actually needed.
206             FileOutputStream fis = new FileOutputStream(this.backingFilename);
207             
208             this.diskStream = new RecyclingFastBufferedOutputStream(fis, bufStreamBuf);
209         }
210         startTime = System.currentTimeMillis();
211     }
212 
213     public void write(int b) throws IOException {
214         if(position<maxPosition) {
215             // revisiting previous content; do nothing but advance position
216             position++;
217             return; 
218         }
219         if(recording) {
220             record(b);
221         }
222         if (this.out != null) {
223             this.out.write(b);
224         }
225         checkLimits();
226     }
227 
228     public void write(byte[] b, int off, int len) throws IOException {
229         if(position < maxPosition) {
230             if(position+len<=maxPosition) {
231                 // revisiting; do nothing but advance position
232                 position += len;
233                 return;
234             }
235             // consume part of the array doing nothing but advancing position
236             long consumeRange = maxPosition - position; 
237             position += consumeRange;
238             off += consumeRange;
239             len -= consumeRange; 
240         }
241         if(recording) {
242             record(b, off, len);
243         }
244         if (this.out != null) {
245             this.out.write(b, off, len);
246         }
247         checkLimits();
248     }
249     
250     /***
251      * Check any enforced limits. 
252      */
253     protected void checkLimits() throws RecorderIOException {
254         // too much material before finding end of headers? 
255         if (contentBeginMark<0) {
256             // no mark yet
257             if(position>MAX_HEADER_MATERIAL) {
258                 throw new RecorderTooMuchHeaderException();
259             }
260         }
261         // overlong?
262         if(position>maxLength) {
263             throw new RecorderLengthExceededException(); 
264         }
265         // taking too long? 
266         long duration = System.currentTimeMillis() - startTime; 
267         duration = Math.max(duration,1); // !divzero
268         if(duration>timeoutMs) {
269             throw new RecorderTimeoutException(); 
270         }
271         // need to throttle reading to hit max configured rate? 
272         if(position/duration > maxRateBytesPerMs) {
273             long desiredDuration = position / maxRateBytesPerMs;
274             try {
275                 Thread.sleep(desiredDuration-duration);
276             } catch (InterruptedException e) {
277                 logger.log(Level.WARNING,
278                         "bandwidth throttling sleep interrupted", e);
279             } 
280         }
281     }
282 
283     /***
284      * Record the given byte for later recovery
285      *
286      * @param b Int to record.
287      *
288      * @exception IOException Failed write to backing file.
289      */
290     private void record(int b) throws IOException {
291         if (this.shouldDigest) {
292             this.digest.update((byte)b);
293         }
294         if (this.position >= this.buffer.length) {
295             // TODO: Its possible to call write w/o having first opened a
296             // stream.  Protect ourselves against this.
297             assert this.diskStream != null: "Diskstream is null";
298             this.diskStream.write(b);
299         } else {
300             this.buffer[(int) this.position] = (byte) b;
301         }
302         this.position++;
303     }
304 
305     /***
306      * Record the given byte-array range for recovery later
307      *
308      * @param b Buffer to record.
309      * @param off Offset into buffer at which to start recording.
310      * @param len Length of buffer to record.
311      *
312      * @exception IOException Failed write to backing file.
313      */
314     private void record(byte[] b, int off, int len) throws IOException {
315         if(this.shouldDigest) {
316             assert this.digest != null: "Digest is null.";
317             this.digest.update(b, off, len);
318         }
319         tailRecord(b, off, len);
320     }
321 
322     /***
323      * Record without digesting.
324      * 
325      * @param b Buffer to record.
326      * @param off Offset into buffer at which to start recording.
327      * @param len Length of buffer to record.
328      *
329      * @exception IOException Failed write to backing file.
330      */
331     private void tailRecord(byte[] b, int off, int len) throws IOException {
332         if(this.position >= this.buffer.length){
333             // TODO: Its possible to call write w/o having first opened a
334             // stream.  Lets protect ourselves against this.
335             if (this.diskStream == null) {
336                 throw new IOException("diskstream is null");
337             }
338             this.diskStream.write(b, off, len);
339             this.position += len;
340         } else {
341             assert this.buffer != null: "Buffer is null";
342             int toCopy = (int)Math.min(this.buffer.length - this.position, len);
343             assert b != null: "Passed buffer is null";
344             System.arraycopy(b, off, this.buffer, (int)this.position, toCopy);
345             this.position += toCopy;
346             // TODO verify these are +1 -1 right
347             if (toCopy < len) {
348                 tailRecord(b, off + toCopy, len - toCopy);
349             }
350         }
351     }
352 
353     public void close() throws IOException {
354         if(contentBeginMark<0) {
355             // if unset, consider 0 posn as content-start
356             // (so that a -1 never survives to replay step)
357             contentBeginMark = 0;
358         }
359         if (this.out != null) {
360             this.out.close();
361             this.out = null;
362         }
363         closeRecorder();
364     }
365     
366     protected synchronized void closeDiskStream()
367     throws IOException {
368         if (this.diskStream != null) {
369             this.diskStream.close();
370             this.diskStream = null;
371         }
372     }
373 
374     public void closeRecorder() throws IOException {
375         recording = false;
376         closeDiskStream(); // if any
377         // This setting of size is important.  Its passed to ReplayInputStream
378         // on creation.  It uses it to know EOS.
379         if (this.size == 0) {
380             this.size = this.position;
381         }
382     }
383 
384     /* (non-Javadoc)
385      * @see java.io.OutputStream#flush()
386      */
387     public void flush() throws IOException {
388         if (this.out != null) {
389             this.out.flush();
390         }
391         if (this.diskStream != null) {
392             this.diskStream.flush();
393         }
394     }
395 
396     public ReplayInputStream getReplayInputStream() throws IOException {
397         return getReplayInputStream(0);
398     }
399     
400     public ReplayInputStream getReplayInputStream(long skip) throws IOException {
401         // If this method is being called, then assumption must be that the
402         // stream is closed. If it ain't, then the stream gotten won't work
403         // -- the size will zero so any attempt at a read will get back EOF.
404         assert this.out == null: "Stream is still open.";
405         ReplayInputStream replay = new ReplayInputStream(this.buffer, 
406                 this.size, this.contentBeginMark, this.backingFilename);
407         replay.skip(skip);
408         return replay; 
409     }
410 
411     /***
412      * Return a replay stream, cued up to begining of content
413      *
414      * @throws IOException
415      * @return An RIS.
416      */
417     public ReplayInputStream getContentReplayInputStream() throws IOException {
418         return getReplayInputStream(this.contentBeginMark);
419     }
420 
421     public long getSize() {
422         return this.size;
423     }
424 
425     /***
426      * Remember the current position as the start of the "response
427      * body". Useful when recording HTTP traffic as a way to start
428      * replays after the headers.
429      */
430     public void markContentBegin() {
431         this.contentBeginMark = this.position;
432         startDigest();
433     }
434 
435     /***
436      * Return stored content-begin-mark (which is also end-of-headers)
437      */
438     public long getContentBegin() {
439         return this.contentBeginMark;
440     }
441     
442     /***
443      * Starts digesting recorded data, if a MessageDigest has been
444      * set.
445      */
446     public void startDigest() {
447         if (this.digest != null) {
448             this.digest.reset();
449             this.shouldDigest = true;
450         }
451     }
452 
453     /***
454      * Convenience method for setting SHA1 digest.
455      * @see #setDigest(String)
456      */
457     public void setSha1Digest() {
458         setDigest(SHA1);
459     }
460     
461 
462     /***
463      * Sets a digest function which may be applied to recorded data.
464      * The difference between calling this method and {@link #setDigest(MessageDigest)}
465      * is that this method tries to reuse MethodDigest instance if already allocated
466      * and of appropriate algorithm.
467      * @param algorithm Message digest algorithm to use.
468      * @see #setDigest(MessageDigest)
469      */
470     public void setDigest(String algorithm) {
471         try {
472             // Reuse extant digest if its sha1 algorithm.
473             if (this.digest == null ||
474                     !this.digest.getAlgorithm().equals(algorithm)) {
475                 setDigest(MessageDigest.getInstance(algorithm));
476             }
477         } catch (NoSuchAlgorithmException e) {
478             e.printStackTrace();
479         }
480     }
481 
482     /***
483      * Sets a digest function which may be applied to recorded data.
484      *
485      * As usually only a subset of the recorded data should
486      * be fed to the digest, you must also call startDigest()
487      * to begin digesting.
488      *
489      * @param md Message digest function to use.
490      */
491     public void setDigest(MessageDigest md) {
492         this.digest = md;
493     }
494 
495     /***
496      * Return the digest value for any recorded, digested data. Call
497      * only after all data has been recorded; otherwise, the running
498      * digest state is ruined.
499      *
500      * @return the digest final value
501      */
502     public byte[] getDigestValue() {
503         if(this.digest == null) {
504             return null;
505         }
506         return this.digest.digest();
507     }
508 
509     public ReplayCharSequence getReplayCharSequence() throws IOException {
510         return getReplayCharSequence(null);
511     }
512 
513     public ReplayCharSequence getReplayCharSequence(String characterEncoding) 
514     throws IOException {
515         return getReplayCharSequence(characterEncoding, this.contentBeginMark);
516     }
517     
518     /***
519      * @param characterEncoding Encoding of recorded stream.
520      * @return A ReplayCharSequence  Will return null if an IOException.  Call
521      * close on returned RCS when done.
522      * @throws IOException
523      */
524     public ReplayCharSequence getReplayCharSequence(String characterEncoding, 
525             long startOffset) throws IOException {
526         // TODO: handled transfer-encoding: chunked content-bodies properly
527         float maxBytesPerChar = IoUtils.encodingMaxBytesPerChar(characterEncoding);
528         if(maxBytesPerChar<=1) {
529             // single
530             // TODO: take into account single-byte encoding may be non-default
531             return new ByteReplayCharSequence(
532                     this.buffer, 
533                     this.size, 
534                     startOffset,
535                     this.backingFilename);
536         } else {
537             // multibyte 
538             if(this.size <= this.buffer.length) {
539                 // raw data is all in memory; do in memory
540                 return new MultiByteReplayCharSequence(
541                         this.buffer, 
542                         this.size, 
543                         startOffset,
544                         characterEncoding);
545                 
546             } else {
547                 // raw data overflows to disk; use temp file
548                 ReplayInputStream ris = getReplayInputStream(startOffset);
549                 ReplayCharSequence rcs = new MultiByteReplayCharSequence(
550                         ris, 
551                         this.backingFilename,
552                         characterEncoding);
553                 ris.close(); 
554                 return rcs;
555             }
556             
557         }
558         
559     }
560 
561     public long getResponseContentLength() {
562         return this.size - this.contentBeginMark;
563     }
564 
565     /***
566      * @return True if this ROS is open.
567      */
568     public boolean isOpen() {
569         return this.out != null;
570     }
571     
572     /***
573      * When used alongside a mark-supporting RecordingInputStream, remember
574      * a position reachable by a future reset().
575      */
576     public void mark() {
577         // remember this position for subsequent reset()
578         this.markPosition = position; 
579     }
580     
581     /***
582      * When used alongside a mark-supporting RecordingInputStream, reset 
583      * the position to that saved by previous mark(). Until the position 
584      * again reached "new" material, none of the bytes pushed to this 
585      * stream will be digested or recorded. 
586      */
587     public void reset() {
588         // take note of furthest-position-reached to avoid double-recording
589         maxPosition = Math.max(maxPosition, position); 
590         // reset to previous position
591         position = markPosition;
592     }
593     
594     /***
595      * Set limits on length, time, and rate to enforce.
596      * 
597      * @param length
598      * @param milliseconds
599      * @param rateKBps
600      */
601     public void setLimits(long length, long milliseconds, long rateKBps) {
602         maxLength = (length>0) ? length : Long.MAX_VALUE;
603         timeoutMs = (milliseconds>0) ? milliseconds : Long.MAX_VALUE;
604         maxRateBytesPerMs = (rateKBps>0) ? rateKBps*1024/1000 : Long.MAX_VALUE;
605     }
606     
607     /***
608      * Reset limits to effectively-unlimited defaults
609      */
610     public void resetLimits() {
611         maxLength = Long.MAX_VALUE;
612         timeoutMs = Long.MAX_VALUE;
613         maxRateBytesPerMs = Long.MAX_VALUE;
614     }
615     
616     /***
617      * Return number of bytes that could be recorded without hitting 
618      * length limit
619      * 
620      * @return long byte count
621      */
622     public long getRemainingLength() {
623         return maxLength - position; 
624     }
625 }