1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.io;
24
25 import it.unimi.dsi.fastutil.io.RepositionableStream;
26
27 import java.io.BufferedInputStream;
28 import java.io.BufferedWriter;
29 import java.io.EOFException;
30 import java.io.File;
31 import java.io.FileWriter;
32 import java.io.IOException;
33 import java.io.InputStream;
34 import java.util.ArrayList;
35 import java.util.Iterator;
36 import java.util.List;
37 import java.util.logging.Level;
38 import java.util.logging.Logger;
39 import org.apache.commons.cli.Option;
40 import org.apache.commons.cli.Options;
41 import org.archive.util.MimetypeUtils;
42
43
44 /***
45 * Reader for an Archive file of Archive {@link ArchiveRecord}s.
46 * @author stack
47 * @version $Date: 2007-07-31 00:36:35 +0000 (Tue, 31 Jul 2007) $ $Version$
48 */
49 public abstract class ArchiveReader implements ArchiveFileConstants {
50 /***
51 * Is this Archive file compressed?
52 */
53 private boolean compressed = false;
54
55 /***
56 * Should we digest as we read?
57 */
58 private boolean digest = true;
59
60 /***
61 * Should the parse be strict?
62 */
63 private boolean strict = false;
64
65 /***
66 * Archive file input stream.
67 *
68 * Keep it around so we can close it when done.
69 *
70 * <p>Set in constructor. Must support {@link RepositionableStream}
71 * interface. Make it protected so subclasses have access.
72 */
73 private InputStream in = null;
74
75 /***
76 * Maximum amount of recoverable exceptions in a row.
77 * If more than this amount in a row, we'll let out the exception rather
78 * than go back in for yet another retry.
79 */
80 public static final int MAX_ALLOWED_RECOVERABLES = 10;
81
82
83 /***
84 * The Record currently being read.
85 *
86 * Keep this ongoing reference so we'll close the record even if the caller
87 * doesn't.
88 */
89 private ArchiveRecord currentRecord = null;
90
91 /***
92 * Descriptive string for the Archive file we're going against:
93 * full path, url, etc. -- depends on context in which file was made.
94 */
95 private String identifier = null;
96
97 /***
98 * Archive file version.
99 */
100 private String version = null;
101
102
103 protected ArchiveReader() {
104 super();
105 }
106
107 /***
108 * Convenience method used by subclass constructors.
109 * @param i Identifier for Archive file this reader goes against.
110 */
111 protected void initialize(final String i) {
112 setReaderIdentifier(i);
113 }
114
115 /***
116 * Convenience method for constructors.
117 *
118 * @param f File to read.
119 * @param offset Offset at which to start reading.
120 * @return InputStream to read from.
121 * @throws IOException If failed open or fail to get a memory
122 * mapped byte buffer on file.
123 */
124 protected InputStream getInputStream(final File f, final long offset)
125 throws IOException {
126 return new RandomAccessBufferedInputStream(
127 new RandomAccessInputStream(f, offset));
128 }
129
130 public boolean isCompressed() {
131 return this.compressed;
132 }
133
134 /***
135 * Get record at passed <code>offset</code>.
136 *
137 * @param offset Byte index into file at which a record starts.
138 * @return An Archive Record reference.
139 * @throws IOException
140 */
141 public ArchiveRecord get(long offset) throws IOException {
142 cleanupCurrentRecord();
143 RepositionableStream ps = (RepositionableStream)this.in;
144 long currentOffset = ps.position();
145 if (currentOffset != offset) {
146 currentOffset = offset;
147 ps.position(offset);
148 }
149 return createArchiveRecord(this.in, currentOffset);
150 }
151
152 /***
153 * @return Return Archive Record created against current offset.
154 * @throws IOException
155 */
156 public ArchiveRecord get() throws IOException {
157 return createArchiveRecord(this.in,
158 ((RepositionableStream)this.in).position());
159 }
160
161 public void close() throws IOException {
162 if (this.in != null) {
163 this.in.close();
164 this.in = null;
165 }
166 }
167
168 /***
169 * Rewinds stream to start of the Archive file.
170 * @throws IOException if stream is not resettable.
171 */
172 protected void rewind() throws IOException {
173 cleanupCurrentRecord();
174 if (this.in instanceof RepositionableStream) {
175 try {
176 ((RepositionableStream)this.in).position(0);
177 } catch (IOException e) {
178 throw new RuntimeException(e);
179 }
180 } else {
181 throw new IOException("Stream is not resettable.");
182 }
183 }
184
185 /***
186 * Cleanout the current record if there is one.
187 * @throws IOException
188 */
189 protected void cleanupCurrentRecord() throws IOException {
190 if (this.currentRecord != null) {
191 this.currentRecord.close();
192 gotoEOR(this.currentRecord);
193 this.currentRecord = null;
194 }
195 }
196
197 /***
198 * Return an Archive Record homed on <code>offset</code> into
199 * <code>is</code>.
200 * @param is Stream to read Record from.
201 * @param offset Offset to find Record at.
202 * @return ArchiveRecord instance.
203 * @throws IOException
204 */
205 protected abstract ArchiveRecord createArchiveRecord(InputStream is,
206 long offset)
207 throws IOException;
208
209 /***
210 * Skip over any trailing new lines at end of the record so we're lined up
211 * ready to read the next.
212 * @param record
213 * @throws IOException
214 */
215 protected abstract void gotoEOR(ArchiveRecord record) throws IOException;
216
217 public abstract String getFileExtension();
218 public abstract String getDotFileExtension();
219
220 /***
221 * @return Version of this Archive file.
222 */
223 public String getVersion() {
224 return this.version;
225 }
226
227 /***
228 * Validate the Archive file.
229 *
230 * This method iterates over the file throwing exception if it fails
231 * to successfully parse any record.
232 *
233 * <p>Assumes the stream is at the start of the file.
234 * @return List of all read Archive Headers.
235 *
236 * @throws IOException
237 */
238 public List validate() throws IOException {
239 return validate(-1);
240 }
241
242 /***
243 * Validate the Archive file.
244 *
245 * This method iterates over the file throwing exception if it fails
246 * to successfully parse.
247 *
248 * <p>We start validation from whereever we are in the stream.
249 *
250 * @param noRecords Number of records expected. Pass -1 if number is
251 * unknown.
252 *
253 * @return List of all read metadatas. As we validate records, we add
254 * a reference to the read metadata.
255 *
256 * @throws IOException
257 */
258 public List validate(int noRecords) throws IOException {
259 List<ArchiveRecordHeader> hs = new ArrayList<ArchiveRecordHeader>();
260 int count = 0;
261 setStrict(true);
262 for (Iterator<ArchiveRecord> i = iterator(); i.hasNext();) {
263 count++;
264 ArchiveRecord r = i.next();
265 if (r.getHeader().getLength() <= 0
266 && r.getHeader().getMimetype().
267 equals(MimetypeUtils.NO_TYPE_MIMETYPE)) {
268 throw new IOException("ARCRecord content is empty.");
269 }
270 r.close();
271
272 hs.add(r.getHeader());
273 }
274
275 if (noRecords != -1) {
276 if (count != noRecords) {
277 throw new IOException("Count of records, " +
278 Integer.toString(count) + " is less than expected " +
279 Integer.toString(noRecords));
280 }
281 }
282
283 return hs;
284 }
285
286 /***
287 * Test Archive file is valid.
288 * Assumes the stream is at the start of the file. Be aware that this
289 * method makes a pass over the whole file.
290 * @return True if file can be successfully parsed.
291 */
292 public boolean isValid() {
293 boolean valid = false;
294 try {
295 validate();
296 valid = true;
297 } catch(Exception e) {
298
299 valid = false;
300 }
301
302 return valid;
303 }
304
305 /***
306 * @return Returns the strict.
307 */
308 public boolean isStrict() {
309 return this.strict;
310 }
311
312 /***
313 * @param s The strict to set.
314 */
315 public void setStrict(boolean s) {
316 this.strict = s;
317 }
318
319 /***
320 * @param d True if we're to digest.
321 */
322 public void setDigest(boolean d) {
323 this.digest = d;
324 }
325
326 /***
327 * @return True if we're digesting as we read.
328 */
329 public boolean isDigest() {
330 return this.digest;
331 }
332
333 protected Logger getLogger() {
334 return Logger.getLogger(this.getClass().getName());
335 }
336
337 protected InputStream getInputStream() {
338 return this.in;
339 }
340
341 /***
342 * Returns an ArchiveRecord iterator.
343 * Of note, on IOException, especially if ZipException reading compressed
344 * ARCs, rather than fail the iteration, try moving to the next record.
345 * If {@link ArchiveReader#strict} is not set, this will usually succeed.
346 * @return An iterator over ARC records.
347 */
348 public Iterator<ArchiveRecord> iterator() {
349
350 try {
351 cleanupCurrentRecord();
352 } catch (IOException e) {
353 throw new RuntimeException(e);
354 }
355
356
357 try {
358 rewind();
359 } catch (IOException e) {
360 throw new RuntimeException(e);
361 }
362 return new ArchiveRecordIterator();
363 }
364
365 protected void setCompressed(boolean compressed) {
366 this.compressed = compressed;
367 }
368
369 /***
370 * @return The current ARC record or null if none.
371 * After construction has the arcfile header record.
372 * @see #get()
373 */
374 protected ArchiveRecord getCurrentRecord() {
375 return this.currentRecord;
376 }
377
378 protected ArchiveRecord currentRecord(final ArchiveRecord currentRecord) {
379 this.currentRecord = currentRecord;
380 return currentRecord;
381 }
382
383 protected InputStream getIn() {
384 return in;
385 }
386
387 protected void setIn(InputStream in) {
388 this.in = in;
389 }
390
391 protected void setVersion(String version) {
392 this.version = version;
393 }
394
395 public String getReaderIdentifier() {
396 return this.identifier;
397 }
398
399 protected void setReaderIdentifier(final String i) {
400 this.identifier = i;
401 }
402
403 /***
404 * Log on stderr.
405 * Logging should go via the logging system. This method
406 * bypasses the logging system going direct to stderr.
407 * Should not generally be used. Its used for rare messages
408 * that come of cmdline usage of ARCReader ERRORs and WARNINGs.
409 * Override if using ARCReader in a context where no stderr or
410 * where you'd like to redirect stderr to other than System.err.
411 * @param level Level to log message at.
412 * @param message Message to log.
413 */
414 public void logStdErr(Level level, String message) {
415 System.err.println(level.toString() + " " + message);
416 }
417
418 /***
419 * Add buffering to RandomAccessInputStream.
420 */
421 protected class RandomAccessBufferedInputStream
422 extends BufferedInputStream implements RepositionableStream {
423
424 public RandomAccessBufferedInputStream(RandomAccessInputStream is)
425 throws IOException {
426 super(is);
427 }
428
429 public RandomAccessBufferedInputStream(RandomAccessInputStream is, int size)
430 throws IOException {
431 super(is, size);
432 }
433
434 public long position() throws IOException {
435
436
437 return ((RandomAccessInputStream)this.in).position() -
438 (this.count - this.pos);
439 }
440
441 public void position(long position) throws IOException {
442
443 this.pos = 0;
444 this.count = 0;
445 ((RandomAccessInputStream)this.in).position(position);
446 }
447
448 public int available() throws IOException {
449
450 long amount = (long)in.available() + (long)(count - pos);
451 return (amount >= Integer.MAX_VALUE)? Integer.MAX_VALUE: (int)amount;
452 }
453 }
454
455 /***
456 * Inner ArchiveRecord Iterator class.
457 * Throws RuntimeExceptions in {@link #hasNext()} and {@link #next()} if
458 * trouble pulling record from underlying stream.
459 * @author stack
460 */
461 protected class ArchiveRecordIterator implements Iterator<ArchiveRecord> {
462 private final Logger logger =
463 Logger.getLogger(this.getClass().getName());
464 /***
465 * @return True if we have more records to read.
466 * @exception RuntimeException Can throw an IOException wrapped in a
467 * RuntimeException if a problem reading underlying stream (Corrupted
468 * gzip, etc.).
469 */
470 public boolean hasNext() {
471
472
473 try {
474 cleanupCurrentRecord();
475 } catch (IOException e) {
476 if (isStrict()) {
477 throw new RuntimeException(e);
478 }
479 if (e instanceof EOFException) {
480 logger.warning("Premature EOF cleaning up " +
481 currentRecord.getHeader().toString() + ": " +
482 e.getMessage());
483 return false;
484 }
485
486
487 logger.warning("Trying skip of failed record cleanup of " +
488 currentRecord.getHeader().toString() + ": " +
489 e.getMessage());
490 }
491 return innerHasNext();
492 }
493
494 protected boolean innerHasNext() {
495 long offset = -1;
496 try {
497 offset = ((RepositionableStream)getInputStream()).position();
498 return getInputStream().available() > 0;
499 } catch (IOException e) {
500 throw new RuntimeException("Offset " + offset, e);
501 }
502 }
503
504 /***
505 * Tries to move to next record if we get
506 * {@link RecoverableIOException}. If not <code>strict</code>
507 * tries to move to next record if we get an
508 * {@link IOException}.
509 * @return Next object.
510 * @exception RuntimeException Throws a runtime exception,
511 * usually a wrapping of an IOException, if trouble getting
512 * a record (Throws exception rather than return null).
513 */
514 public ArchiveRecord next() {
515 long offset = -1;
516 try {
517 offset = ((RepositionableStream)getInputStream()).position();
518 return exceptionNext();
519 } catch (IOException e) {
520 if (!isStrict()) {
521
522
523 try {
524 if (hasNext()) {
525 getLogger().warning("Bad Record. Trying skip " +
526 "(Current offset " + offset + "): " +
527 e.getMessage());
528 return exceptionNext();
529 }
530
531
532 throw new RuntimeException("Retried but no next " +
533 "record (Offset " + offset + ")", e);
534 } catch (IOException e1) {
535 throw new RuntimeException("After retry (Offset " +
536 offset + ")", e1);
537 }
538 }
539 throw new RuntimeException("(Offset " + offset + ")", e);
540 }
541 }
542
543 /***
544 * A next that throws exceptions and has handling of
545 * recoverable exceptions moving us to next record. Can call
546 * hasNext which itself may throw exceptions.
547 * @return Next record.
548 * @throws IOException
549 * @throws RuntimeException Thrown when we've reached maximum
550 * retries.
551 */
552 protected ArchiveRecord exceptionNext()
553 throws IOException, RuntimeException {
554 ArchiveRecord result = null;
555 IOException ioe = null;
556 for (int i = MAX_ALLOWED_RECOVERABLES; i > 0 &&
557 result == null; i--) {
558 ioe = null;
559 try {
560 result = innerNext();
561 } catch (RecoverableIOException e) {
562 ioe = e;
563 getLogger().warning(e.getMessage());
564 if (hasNext()) {
565 continue;
566 }
567
568
569
570
571 break;
572 }
573 }
574 if (ioe != null) {
575
576
577
578 throw new RuntimeException("Retried " +
579 MAX_ALLOWED_RECOVERABLES + " times in a row", ioe);
580 }
581 return result;
582 }
583
584 protected ArchiveRecord innerNext() throws IOException {
585 return get(((RepositionableStream)getInputStream()).position());
586 }
587
588 public void remove() {
589 throw new UnsupportedOperationException();
590 }
591 }
592
593 protected static String stripExtension(final String name,
594 final String ext) {
595 return (!name.endsWith(ext))? name:
596 name.substring(0, name.length() - ext.length());
597 }
598
599 /***
600 * @return short name of Archive file.
601 */
602 public String getFileName() {
603 return (new File(getReaderIdentifier())).getName();
604 }
605
606 /***
607 * @return short name of Archive file.
608 */
609 public String getStrippedFileName() {
610 return getStrippedFileName(getFileName(),
611 getDotFileExtension());
612 }
613
614 /***
615 * @param name Name of ARCFile.
616 * @param dotFileExtension '.arc' or '.warc', etc.
617 * @return short name of Archive file.
618 */
619 public static String getStrippedFileName(String name,
620 final String dotFileExtension) {
621 name = stripExtension(name,
622 ArchiveFileConstants.DOT_COMPRESSED_FILE_EXTENSION);
623 return stripExtension(name, dotFileExtension);
624 }
625
626 /***
627 * @param value Value to test.
628 * @return True if value is 'true', else false.
629 */
630 protected static boolean getTrueOrFalse(final String value) {
631 if (value == null || value.length() <= 0) {
632 return false;
633 }
634 return Boolean.TRUE.toString().equals(value.toLowerCase());
635 }
636
637 /***
638 * @param format Format to use outputting.
639 * @throws IOException
640 * @throws java.text.ParseException
641 * @return True if handled.
642 */
643 protected boolean output(final String format)
644 throws IOException, java.text.ParseException {
645 boolean result = true;
646
647
648
649
650
651
652 if (format.equals(DUMP)) {
653
654 setDigest(false);
655 dump(false);
656 } else if (format.equals(GZIP_DUMP)) {
657
658 setDigest(false);
659 dump(true);
660 } else if (format.equals(CDX)) {
661 cdxOutput(false);
662 } else if (format.equals(CDX_FILE)) {
663 cdxOutput(true);
664 } else {
665 result = false;
666 }
667 return result;
668 }
669
670 protected void cdxOutput(boolean toFile)
671 throws IOException {
672 BufferedWriter cdxWriter = null;
673 if (toFile) {
674 String cdxFilename = stripExtension(getReaderIdentifier(),
675 DOT_COMPRESSED_FILE_EXTENSION);
676 cdxFilename = stripExtension(cdxFilename, getDotFileExtension());
677 cdxFilename += ('.' + CDX);
678 cdxWriter = new BufferedWriter(new FileWriter(cdxFilename));
679 }
680
681 String header = "CDX b e a m s c " + ((isCompressed()) ? "V" : "v")
682 + " n g";
683 if (toFile) {
684 cdxWriter.write(header);
685 cdxWriter.newLine();
686 } else {
687 System.out.println(header);
688 }
689
690 String strippedFileName = getStrippedFileName();
691 try {
692 for (Iterator<ArchiveRecord> ii = iterator(); ii.hasNext();) {
693 ArchiveRecord r = ii.next();
694 if (toFile) {
695 cdxWriter.write(r.outputCdx(strippedFileName));
696 cdxWriter.newLine();
697 } else {
698 System.out.println(r.outputCdx(strippedFileName));
699 }
700 }
701 } finally {
702 if (toFile) {
703 cdxWriter.close();
704 }
705 }
706 }
707
708 /***
709 * Output passed record using passed format specifier.
710 * @param format What format to use outputting.
711 * @throws IOException
712 * @return True if handled.
713 */
714 public boolean outputRecord(final String format)
715 throws IOException {
716 boolean result = true;
717 if (format.equals(CDX)) {
718 System.out.println(get().outputCdx(getStrippedFileName()));
719 } else if(format.equals(ArchiveFileConstants.DUMP)) {
720
721 setDigest(false);
722 get().dump();
723 } else {
724 result = false;
725 }
726 return result;
727 }
728
729 /***
730 * Dump this file on STDOUT
731 * @throws compress True if dumped output is compressed.
732 * @throws IOException
733 * @throws java.text.ParseException
734 */
735 public abstract void dump(final boolean compress)
736 throws IOException, java.text.ParseException;
737
738 /***
739 * @return an ArchiveReader that will delete a local file on close. Used
740 * when we bring Archive files local and need to clean up afterward.
741 */
742 public abstract ArchiveReader getDeleteFileOnCloseReader(final File f);
743
744 /***
745 * Output passed record using passed format specifier.
746 * @param r ARCReader instance to output.
747 * @param format What format to use outputting.
748 * @throws IOException
749 */
750 protected static void outputRecord(final ArchiveReader r,
751 final String format)
752 throws IOException {
753 if (!r.outputRecord(format)) {
754 throw new IOException("Unsupported format" +
755 " (or unsupported on a single record): " + format);
756 }
757 }
758
759 /***
760 * @return Base Options object filled out with help, digest, strict, etc.
761 * options.
762 */
763 protected static Options getOptions() {
764 Options options = new Options();
765 options.addOption(new Option("h","help", false,
766 "Prints this message and exits."));
767 options.addOption(new Option("o","offset", true,
768 "Outputs record at this offset into file."));
769 options.addOption(new Option("d","digest", true,
770 "Pass true|false. Expensive. Default: true (SHA-1)."));
771 options.addOption(new Option("s","strict", false,
772 "Strict mode. Fails parse if incorrectly formatted file."));
773 options.addOption(new Option("f","format", true,
774 "Output options: 'cdx', cdxfile', 'dump', 'gzipdump'," +
775 "'or 'nohead'. Default: 'cdx'."));
776 return options;
777 }
778 }