1   /* $Id: ArchiveReader.java 5369 2007-07-31 00:36:35Z gojomo $
2    *
3    * Created on August 21st, 2006
4    *
5    * Copyright (C) 2006 Internet Archive.
6    *
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    *
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   *
14   * Heritrix is distributed in the hope that it will be useful,
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   *
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.io;
24  
25  import it.unimi.dsi.fastutil.io.RepositionableStream;
26  
27  import java.io.BufferedInputStream;
28  import java.io.BufferedWriter;
29  import java.io.EOFException;
30  import java.io.File;
31  import java.io.FileWriter;
32  import java.io.IOException;
33  import java.io.InputStream;
34  import java.util.ArrayList;
35  import java.util.Iterator;
36  import java.util.List;
37  import java.util.logging.Level;
38  import java.util.logging.Logger;
39  import org.apache.commons.cli.Option;
40  import org.apache.commons.cli.Options;
41  import org.archive.util.MimetypeUtils;
42  
43  
44  /***
45   * Reader for an Archive file of Archive {@link ArchiveRecord}s.
46   * @author stack
47   * @version $Date: 2007-07-31 00:36:35 +0000 (Tue, 31 Jul 2007) $ $Version$
48   */
49  public abstract class ArchiveReader implements ArchiveFileConstants {    
    /**
     * Is this Archive file compressed?
     * Read through {@link #isCompressed()} and written through
     * {@link #setCompressed(boolean)}.
     */
    private boolean compressed = false;
    
    /**
     * Should we digest as we read?  Defaults to <code>true</code>.
     */
    private boolean digest = true;
    
    /**
     * Should the parse be strict?  Defaults to <code>false</code>;
     * {@link #validate(int)} switches it on.
     */
    private boolean strict = false;
    
    /**
     * Archive file input stream.
     *
     * Keep it around so we can close it when done.
     *
     * <p>Must support the {@link RepositionableStream} interface:
     * {@link #get(long)} and {@link #get()} cast it without checking.
     * The field itself is private; subclasses reach it through
     * {@link #getIn()} and {@link #setIn(InputStream)}.
     */
    private InputStream in = null;
    
    /**
     * Maximum number of recoverable exceptions tolerated in a row.
     * Once this many consecutive attempts at reading a record have failed,
     * the exception is let out rather than going back in for yet another
     * retry.
     */
    public static final int MAX_ALLOWED_RECOVERABLES = 10;
    

    /**
     * The Record currently being read.
     *
     * Keep this ongoing reference so we'll close the record even if the caller
     * doesn't.  Cleared by {@link #cleanupCurrentRecord()}.
     */
    private ArchiveRecord currentRecord = null;
    
    /**
     * Descriptive string for the Archive file we're going against:
     * full path, url, etc. -- depends on context in which file was made.
     */
    private String identifier = null;
    
    /**
     * Archive file version.  Stays <code>null</code> until
     * {@link #setVersion(String)} is called.
     */
    private String version = null;
101     
102     
103     protected ArchiveReader() {
104         super();
105     }
106     
107     /***
108      * Convenience method used by subclass constructors.
109      * @param i Identifier for Archive file this reader goes against.
110      */
111     protected void initialize(final String i) {
112         setReaderIdentifier(i);
113     }
114     
115     /***
116      * Convenience method for constructors.
117      * 
118      * @param f File to read.
119      * @param offset Offset at which to start reading.
120      * @return InputStream to read from.
121      * @throws IOException If failed open or fail to get a memory
122      * mapped byte buffer on file.
123      */
124     protected InputStream getInputStream(final File f, final long offset)
125     throws IOException {
126         return new RandomAccessBufferedInputStream(
127             new RandomAccessInputStream(f, offset));
128     }
129 
130     public boolean isCompressed() {
131         return this.compressed;
132     }
133 
134     /***
135      * Get record at passed <code>offset</code>.
136      * 
137      * @param offset Byte index into file at which a record starts.
138      * @return An Archive Record reference.
139      * @throws IOException
140      */
141     public ArchiveRecord get(long offset) throws IOException {
142         cleanupCurrentRecord();
143         RepositionableStream ps = (RepositionableStream)this.in;
144         long currentOffset = ps.position();
145         if (currentOffset != offset) {
146             currentOffset = offset;
147             ps.position(offset);
148         }
149         return createArchiveRecord(this.in, currentOffset);
150     }
151     
152     /***
153      * @return Return Archive Record created against current offset.
154      * @throws IOException
155      */
156     public ArchiveRecord get() throws IOException {
157         return createArchiveRecord(this.in,
158             ((RepositionableStream)this.in).position());
159     }
160 
161     public void close() throws IOException {
162         if (this.in != null) {
163             this.in.close();
164             this.in = null;
165         }
166     }
167     
168     /***
169      * Rewinds stream to start of the Archive file.
170      * @throws IOException if stream is not resettable.
171      */
172     protected void rewind() throws IOException {
173         cleanupCurrentRecord();
174         if (this.in instanceof RepositionableStream) {
175             try {
176                 ((RepositionableStream)this.in).position(0);
177             } catch (IOException e) {
178                 throw new RuntimeException(e);
179             }
180        } else {
181            throw new IOException("Stream is not resettable.");
182        }
183     }
184     
185     /***
186      * Cleanout the current record if there is one.
187      * @throws IOException
188      */
189     protected void cleanupCurrentRecord() throws IOException {
190         if (this.currentRecord != null) {
191             this.currentRecord.close();
192             gotoEOR(this.currentRecord);
193             this.currentRecord = null;
194         }
195     }
196     
    /**
     * Return an Archive Record homed on <code>offset</code> into
     * <code>is</code>.
     *
     * <p>Called by {@link #get(long)} and {@link #get()} with this reader's
     * own stream and the offset the stream is positioned at.
     *
     * @param is Stream to read Record from.
     * @param offset Offset to find Record at.
     * @return ArchiveRecord instance.
     * @throws IOException
     */
    protected abstract ArchiveRecord createArchiveRecord(InputStream is,
    	long offset)
    throws IOException;
208     
    /**
     * Skip over any trailing new lines at end of the record so we're lined up
     * ready to read the next.
     *
     * <p>Invoked from {@link #cleanupCurrentRecord()} after the record has
     * been closed.
     *
     * @param record Record whose end we are to move past.
     * @throws IOException
     */
    protected abstract void gotoEOR(ArchiveRecord record) throws IOException;
216     
    /**
     * @return File extension for the Archive type this reader handles
     * (presumably without a leading dot -- confirm against implementations).
     */
    public abstract String getFileExtension();

    /**
     * @return File extension including its leading dot ('.arc', '.warc',
     * etc.).  This is the suffix {@link #getStrippedFileName()} strips.
     */
    public abstract String getDotFileExtension();
219 
220     /***
221      * @return Version of this Archive file.
222      */
223     public String getVersion() {
224     	return this.version;
225     }
226 
227     /***
228      * Validate the Archive file.
229      *
230      * This method iterates over the file throwing exception if it fails
231      * to successfully parse any record.
232      *
233      * <p>Assumes the stream is at the start of the file.
234      * @return List of all read Archive Headers.
235      *
236      * @throws IOException
237      */
238     public List validate() throws IOException {
239         return validate(-1);
240     }
241 
242     /***
243      * Validate the Archive file.
244      *
245      * This method iterates over the file throwing exception if it fails
246      * to successfully parse.
247      *
248      * <p>We start validation from whereever we are in the stream.
249      *
250      * @param noRecords Number of records expected.  Pass -1 if number is
251      * unknown.
252      *
253      * @return List of all read metadatas. As we validate records, we add
254      * a reference to the read metadata.
255      *
256      * @throws IOException
257      */
258     public List validate(int noRecords) throws IOException {
259         List<ArchiveRecordHeader> hs = new ArrayList<ArchiveRecordHeader>();
260         int count = 0;
261         setStrict(true);
262         for (Iterator<ArchiveRecord> i = iterator(); i.hasNext();) {
263             count++;
264             ArchiveRecord r = i.next();
265             if (r.getHeader().getLength() <= 0
266                 && r.getHeader().getMimetype().
267                     equals(MimetypeUtils.NO_TYPE_MIMETYPE)) {
268                 throw new IOException("ARCRecord content is empty.");
269             }
270             r.close();
271             // Add reference to metadata into a list of metadatas.
272             hs.add(r.getHeader());
273         }
274 
275         if (noRecords != -1) {
276             if (count != noRecords) {
277                 throw new IOException("Count of records, " +
278                     Integer.toString(count) + " is less than expected " +
279                     Integer.toString(noRecords));
280             }
281         }
282 
283         return hs;
284     }
285 
286     /***
287      * Test Archive file is valid.
288      * Assumes the stream is at the start of the file.  Be aware that this
289      * method makes a pass over the whole file. 
290      * @return True if file can be successfully parsed.
291      */
292     public boolean isValid() {
293         boolean valid = false;
294         try {
295             validate();
296             valid = true;
297         } catch(Exception e) {
298             // File is not valid if exception thrown parsing.
299             valid = false;
300         }
301     
302         return valid;
303     }
304 
305     /***
306      * @return Returns the strict.
307      */
308     public boolean isStrict() {
309         return this.strict;
310     }
311 
312     /***
313      * @param s The strict to set.
314      */
315     public void setStrict(boolean s) {
316         this.strict = s;
317     }
318 
319     /***
320      * @param d True if we're to digest.
321      */
322     public void setDigest(boolean d) {
323         this.digest = d;
324     }
325 
326     /***
327      * @return True if we're digesting as we read.
328      */
329     public boolean isDigest() {
330         return this.digest;
331     }
332  
333     protected Logger getLogger() {
334         return Logger.getLogger(this.getClass().getName());
335     }
336     
337     protected InputStream getInputStream() {
338         return this.in;
339     }
340     
341     /***
342      * Returns an ArchiveRecord iterator.
343      * Of note, on IOException, especially if ZipException reading compressed
344      * ARCs, rather than fail the iteration, try moving to the next record.
345      * If {@link ArchiveReader#strict} is not set, this will usually succeed.
346      * @return An iterator over ARC records.
347      */
348     public Iterator<ArchiveRecord> iterator() {
349         // Eat up any record outstanding.
350         try {
351             cleanupCurrentRecord();
352         } catch (IOException e) {
353             throw new RuntimeException(e);
354         }
355         
356         // Now reset stream to the start of the arc file.
357         try {
358             rewind();
359         } catch (IOException e) {
360             throw new RuntimeException(e);
361         }
362         return new ArchiveRecordIterator();
363     }
364 
365 	protected void setCompressed(boolean compressed) {
366 		this.compressed = compressed;
367 	}
368 
369     /***
370      * @return The current ARC record or null if none.
371      * After construction has the arcfile header record.
372      * @see #get()
373      */
374 	protected ArchiveRecord getCurrentRecord() {
375 		return this.currentRecord;
376 	}
377 
378 	protected ArchiveRecord currentRecord(final ArchiveRecord currentRecord) {
379 		this.currentRecord = currentRecord;
380         return currentRecord;
381 	}
382 
383 	protected InputStream getIn() {
384 		return in;
385 	}
386 
387 	protected void setIn(InputStream in) {
388 		this.in = in;
389 	}
390 
391 	protected void setVersion(String version) {
392 		this.version = version;
393 	}
394 
395 	public String getReaderIdentifier() {
396 		return this.identifier;
397 	}
398 
399 	protected void setReaderIdentifier(final String i) {
400 		this.identifier = i;
401 	}
402 	
403     /***
404      * Log on stderr.
405      * Logging should go via the logging system.  This method
406      * bypasses the logging system going direct to stderr.
407      * Should not generally be used.  Its used for rare messages
408      * that come of cmdline usage of ARCReader ERRORs and WARNINGs.
409      * Override if using ARCReader in a context where no stderr or
410      * where you'd like to redirect stderr to other than System.err.
411      * @param level Level to log message at.
412      * @param message Message to log.
413      */
414     public void logStdErr(Level level, String message) {
415         System.err.println(level.toString() + " " + message);
416     }
417     
418     /***
419      * Add buffering to RandomAccessInputStream.
420      */
421     protected class RandomAccessBufferedInputStream
422     extends BufferedInputStream implements RepositionableStream {
423 
424         public RandomAccessBufferedInputStream(RandomAccessInputStream is)
425         		throws IOException {
426             super(is);
427         }
428 
429         public RandomAccessBufferedInputStream(RandomAccessInputStream is, int size)
430         		throws IOException {
431             super(is, size);
432         }
433 
434         public long position() throws IOException {
435             // Current position is the underlying files position
436             // minus the amount thats in the buffer yet to be read.
437             return ((RandomAccessInputStream)this.in).position() -
438             	(this.count - this.pos);
439         }
440 
441         public void position(long position) throws IOException {
442             // Force refill of buffer whenever there's been a seek.
443             this.pos = 0;
444             this.count = 0;
445             ((RandomAccessInputStream)this.in).position(position);
446         }
447         
448         public int available() throws IOException {
449             // Avoid overflow on large datastreams
450             long amount = (long)in.available() + (long)(count - pos);
451             return (amount >= Integer.MAX_VALUE)? Integer.MAX_VALUE: (int)amount;
452         }
453     }
454     
    /**
     * Inner ArchiveRecord Iterator class.
     * Throws RuntimeExceptions in {@link #hasNext()} and {@link #next()} if
     * trouble pulling record from underlying stream.
     *
     * <p>Works directly against the enclosing reader's stream and current
     * record, so it shares state with the reader that created it.
     * @author stack
     */
    protected class ArchiveRecordIterator implements Iterator<ArchiveRecord> {
        // Named for the iterator's own class.  Note that next() and
        // exceptionNext() log through the enclosing reader's getLogger()
        // instead.
        private final Logger logger =
            Logger.getLogger(this.getClass().getName());
        /**
         * Cleans up any outstanding record, then reports whether the stream
         * has bytes left.
         * @return True if we have more records to read.  False also when,
         * in non-strict mode, cleanup hit a premature end of file.
         * @exception RuntimeException Can throw an IOException wrapped in a
         * RuntimeException if a problem reading underlying stream (Corrupted
         * gzip, etc.).
         */
        public boolean hasNext() {
            // Call close on any extant record.  This will scoot us past
            // any content not yet read.
            try {
                cleanupCurrentRecord();
            } catch (IOException e) {
                if (isStrict()) {
                    throw new RuntimeException(e);
                }
                // The messages below read the record that failed cleanup.
                if (e instanceof EOFException) {
                    logger.warning("Premature EOF cleaning up " + 
                        currentRecord.getHeader().toString() + ": " +
                        e.getMessage());
                    return false;
                }
                // If not strict, try going again.  We might be able to skip
                // over the bad record.
                logger.warning("Trying skip of failed record cleanup of " +
                    currentRecord.getHeader().toString() + ": " +
                    e.getMessage());
            }
            return innerHasNext();
        }
        
        /**
         * @return True if the stream reports any bytes available.
         * @exception RuntimeException Wraps an IOException from the stream;
         * the message carries the offset if it was read before the failure,
         * else -1.
         */
        protected boolean innerHasNext() {
            long offset = -1;
            try {
                // Offset is only captured for the error message below.
                offset = ((RepositionableStream)getInputStream()).position();
                return getInputStream().available() > 0;
            } catch (IOException e) {
                throw new RuntimeException("Offset " + offset, e);
            }
        }

        /**
         * Tries to move to next record if we get
         * {@link RecoverableIOException}. If not <code>strict</code>
         * tries to move to next record if we get an
         * {@link IOException}.
         * @return Next object.
         * @exception RuntimeException Throws a runtime exception,
         * usually a wrapping of an IOException, if trouble getting
         * a record (Throws exception rather than return null).
         */
        public ArchiveRecord next() {
            long offset = -1;
            try {
                // Remember where we started for the log and error messages.
                offset = ((RepositionableStream)getInputStream()).position();
                return exceptionNext();
            } catch (IOException e) {
                if (!isStrict()) {
                    // Retry though an IOE.  Maybe we will succeed reading
                    // subsequent record.  Only one such retry is made.
                    try {
                        if (hasNext()) {
                            getLogger().warning("Bad Record. Trying skip " +
                                "(Current offset " +  offset + "): " +
                                e.getMessage());
                            return exceptionNext();
                        }
                        // Else we are at last record.  Iterator#next is
                        // expecting value. We do not have one. Throw exception.
                        throw new RuntimeException("Retried but no next " + 
                            "record (Offset " + offset + ")", e);
                    } catch (IOException e1) {
                        throw new RuntimeException("After retry (Offset " +
                                offset + ")", e1);
                    }
                }
                // Strict mode: no retry, report the failure as is.
                throw new RuntimeException("(Offset " + offset + ")", e);
            }
        }
        
        /**
         * A next that throws exceptions and has handling of
         * recoverable exceptions moving us to next record. Can call
         * hasNext which itself may throw exceptions.
         *
         * <p>Makes at most {@link ArchiveReader#MAX_ALLOWED_RECOVERABLES}
         * attempts, stopping early once a record is obtained or once
         * hasNext reports nothing left.
         * @return Next record.
         * @throws IOException Any non-recoverable IOException from reading.
         * @throws RuntimeException Thrown when we've reached maximum
         * retries, or when a recoverable failure was followed by there
         * being no further record.
         */
        protected ArchiveRecord exceptionNext()
        throws IOException, RuntimeException {
            ArchiveRecord result = null;
            IOException ioe = null;
            for (int i = MAX_ALLOWED_RECOVERABLES; i > 0 &&
                    result == null; i--) {
                // Forget the failure of the previous attempt; only the
                // outcome of the latest attempt matters after the loop.
                ioe = null;
                try {
                    result = innerNext();
                } catch (RecoverableIOException e) {
                    ioe = e;
                    getLogger().warning(e.getMessage());
                    if (hasNext()) {
                        continue;
                    }
                    // No records left.  Throw exception rather than
                    // return null.  The caller is expecting to get
                    // back a record since they've just called
                    // hasNext.
                    break;
                }
            }
            if (ioe != null) {
                // Either we did MAX_ALLOWED_RECOVERABLES retries or we ran
                // out of records after a recoverable failure (the message
                // is the same in both cases).  Throw the recoverable ioe
                // wrapped in a RuntimeException so it goes out past checks
                // for IOE.
                throw new RuntimeException("Retried " +
                    MAX_ALLOWED_RECOVERABLES + " times in a row", ioe);
            }
            return result;
        }
        
        /**
         * @return Record read at the stream's current position, by way of
         * the enclosing reader's {@link ArchiveReader#get(long)}.
         * @throws IOException
         */
        protected ArchiveRecord innerNext() throws IOException {
            return get(((RepositionableStream)getInputStream()).position());
        }
        
        /**
         * Not supported.
         * @exception UnsupportedOperationException Always.
         */
        public void remove() {
            throw new UnsupportedOperationException();
        }
    }
592     
593     protected static String stripExtension(final String name,
594     		final String ext) {
595         return (!name.endsWith(ext))? name:
596             name.substring(0, name.length() - ext.length());
597     }
598     
599     /***
600      * @return short name of Archive file.
601      */
602     public String getFileName() {
603         return (new File(getReaderIdentifier())).getName();
604     }
605 
606     /***
607      * @return short name of Archive file.
608      */
609     public String getStrippedFileName() {
610         return getStrippedFileName(getFileName(),
611     		getDotFileExtension());
612     }
613     
614     /***
615      * @param name Name of ARCFile.
616      * @param dotFileExtension '.arc' or '.warc', etc.
617      * @return short name of Archive file.
618      */
619     public static String getStrippedFileName(String name,
620     		final String dotFileExtension) {
621     	name = stripExtension(name,
622     		ArchiveFileConstants.DOT_COMPRESSED_FILE_EXTENSION);
623     	return stripExtension(name, dotFileExtension);
624     }
625     
626     /***
627      * @param value Value to test.
628      * @return True if value is 'true', else false.
629      */
630     protected static boolean getTrueOrFalse(final String value) {
631     	if (value == null || value.length() <= 0) {
632     		return false;
633     	}
634         return Boolean.TRUE.toString().equals(value.toLowerCase());
635     }
636     
637     /***
638      * @param format Format to use outputting.
639      * @throws IOException
640      * @throws java.text.ParseException
641      * @return True if handled.
642      */
643     protected boolean output(final String format)
644     throws IOException, java.text.ParseException {
645     	boolean result = true;
646         // long start = System.currentTimeMillis();
647     	
648         // Write output as pseudo-CDX file.  See
649         // http://www.archive.org/web/researcher/cdx_legend.php
650         // and http://www.archive.org/web/researcher/example_cdx.php.
651         // Hash is hard-coded straight SHA-1 hash of content.
652         if (format.equals(DUMP)) {
653         	// No point digesting dumping.
654         	setDigest(false);
655             dump(false);
656         } else if (format.equals(GZIP_DUMP)) {
657         	// No point digesting dumping.
658         	setDigest(false);
659             dump(true);
660         } else if (format.equals(CDX)) {
661         	cdxOutput(false);   
662         } else if (format.equals(CDX_FILE)) {
663             cdxOutput(true);
664         } else {
665         	result = false;
666         }	
667         return result;
668     }
669     
670     protected void cdxOutput(boolean toFile)
671     throws IOException {
672         BufferedWriter cdxWriter = null;
673         if (toFile) {
674             String cdxFilename = stripExtension(getReaderIdentifier(),
675                 DOT_COMPRESSED_FILE_EXTENSION);
676             cdxFilename = stripExtension(cdxFilename, getDotFileExtension());
677             cdxFilename += ('.' + CDX);
678             cdxWriter = new BufferedWriter(new FileWriter(cdxFilename));
679         }
680         
681         String header = "CDX b e a m s c " + ((isCompressed()) ? "V" : "v")
682             + " n g";
683         if (toFile) {
684             cdxWriter.write(header);
685             cdxWriter.newLine();
686         } else {
687             System.out.println(header);
688         }
689         
690         String strippedFileName = getStrippedFileName();
691         try {
692             for (Iterator<ArchiveRecord> ii = iterator(); ii.hasNext();) {
693             	ArchiveRecord r = ii.next();
694                 if (toFile) {
695                     cdxWriter.write(r.outputCdx(strippedFileName));
696                     cdxWriter.newLine();
697                 } else {
698                     System.out.println(r.outputCdx(strippedFileName));
699                 }
700             }
701         } finally {
702             if (toFile) {
703                 cdxWriter.close();
704             }
705         }
706     }
707     
708     /***
709      * Output passed record using passed format specifier.
710      * @param format What format to use outputting.
711      * @throws IOException
712      * @return True if handled.
713      */
714     public boolean outputRecord(final String format)
715     throws IOException {
716     	boolean result = true;
717         if (format.equals(CDX)) {
718             System.out.println(get().outputCdx(getStrippedFileName()));
719         } else if(format.equals(ArchiveFileConstants.DUMP)) {
720             // No point digesting if dumping content.
721             setDigest(false);
722             get().dump();
723         } else {
724         	result = false;
725         }
726         return result;
727     }
728 
    /**
     * Dump this file on STDOUT
     * @param compress True if dumped output is compressed.
     * @throws IOException
     * @throws java.text.ParseException
     */
    public abstract void dump(final boolean compress)
    throws IOException, java.text.ParseException;
737     
    /**
     * @param f Local file the returned reader is to delete on close
     * (presumably also the file it reads -- confirm against
     * implementations).
     * @return an ArchiveReader that will delete a local file on close.  Used
     * when we bring Archive files local and need to clean up afterward.
     */
    public abstract ArchiveReader getDeleteFileOnCloseReader(final File f);
743     
744     /***
745      * Output passed record using passed format specifier.
746      * @param r ARCReader instance to output.
747      * @param format What format to use outputting.
748      * @throws IOException
749      */
750     protected static void outputRecord(final ArchiveReader r,
751         final String format)
752     throws IOException {
753         if (!r.outputRecord(format)) {
754             throw new IOException("Unsupported format" +
755                 " (or unsupported on a single record): " + format);
756         }
757     }
758     
759     /***
760      * @return Base Options object filled out with help, digest, strict, etc.
761      * options.
762      */
763     protected static Options getOptions() {
764         Options options = new Options();
765         options.addOption(new Option("h","help", false,
766             "Prints this message and exits."));
767         options.addOption(new Option("o","offset", true,
768             "Outputs record at this offset into file."));
769         options.addOption(new Option("d","digest", true,
770             "Pass true|false. Expensive. Default: true (SHA-1)."));
771         options.addOption(new Option("s","strict", false,
772             "Strict mode. Fails parse if incorrectly formatted file."));
773         options.addOption(new Option("f","format", true,
774             "Output options: 'cdx', cdxfile', 'dump', 'gzipdump'," +
775             "'or 'nohead'. Default: 'cdx'."));
776         return options;
777     }
778 }