/* $Id: ARCReader.java 5039 2007-04-06 00:29:39Z gojomo $
 *
 * Created on May 1, 2004
 *
 * Copyright (C) 2004 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package org.archive.io.arc;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveRecord;
import org.archive.io.ArchiveRecordHeader;
import org.archive.io.RecoverableIOException;
import org.archive.io.WriterPoolMember;
import org.archive.util.ArchiveUtils;
import org.archive.util.InetAddressUtil;
import org.archive.util.TextUtils;


/**
 * Get an iterator on an ARC file or get a record by absolute position.
 *
 * ARC files are described here:
 * <a href="http://www.archive.org/web/researcher/ArcFileFormat.php">Arc
 * File Format</a>.
 *
 * <p>This class knows how to parse an ARC file.  Pass it a file path
 * or a URL to an ARC.  It can parse ARC Versions 1 and 2.
 *
 * <p>The iterator returns <code>ARCRecord</code> instances even though
 * {@link Iterator#next()} is declared to return java.lang.Object;
 * cast the return value.
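 *
 * <p>A minimal usage sketch (the arc path is hypothetical; the factory is
 * this package's {@link ARCReaderFactory}):
 * <pre>
 * ARCReader reader = ARCReaderFactory.get("/tmp/IAH-20040107015752.arc.gz");
 * for (Iterator&lt;ArchiveRecord&gt; i = reader.iterator(); i.hasNext();) {
 *     ARCRecord record = (ARCRecord)i.next();
 *     // Process the record before advancing to the next.
 * }
 * reader.close();
 * </pre>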
 *
 * <p>Profiling java.io vs. memory-mapped ByteBufferInputStream shows the
 * latter slightly slower -- but not by much.  TODO: Test more.  Just
 * change {@link #getInputStream(File, long)}.
 *
 * @author stack
 * @version $Date: 2007-04-06 00:29:39 +0000 (Fri, 06 Apr 2007) $ $Revision: 5039 $
 */
public abstract class ARCReader extends ArchiveReader
implements ARCConstants {
    Logger logger = Logger.getLogger(ARCReader.class.getName());
    
    /**
     * Set to true if we are aligned on the first record of the Archive file.
     * We used to depend on offset: if the offset was zero, then we were
     * aligned on the first record.  This is no longer necessarily the case
     * when a Reader is created at an offset into an Archive file: the offset
     * is zero, but it's relative to where we started reading.
     */
    private boolean alignedOnFirstRecord = true;
    
    /**
     * Assumed maximum size of a record meta header line.
     *
     * This is 100k, which seems massive, but it's the same as LINE_LENGTH in
     * <code>alexa/include/a_arcio.h</code>:
     * <pre>
     * #define LINE_LENGTH     (100*1024)
     * </pre>
     */
    private static final int MAX_HEADER_LINE_LENGTH = 1024 * 100;

    /**
     * Array of field names.
     * 
     * Used to initialize <code>headerFieldNameKeys</code>.
     */
    private final String [] headerFieldNameKeysArray = {
        URL_FIELD_KEY,
        IP_HEADER_FIELD_KEY,
        DATE_FIELD_KEY,
        MIMETYPE_FIELD_KEY,
        LENGTH_FIELD_KEY
    };
    
    /**
     * A list of the header field names found in the ARC file header on
     * the 3rd line.
     * 
     * We used to read these from the 3rd line of the arc file's first record,
     * but now we hardcode them for the sake of improved performance.
     */
    private final List<String> headerFieldNameKeys =
        Arrays.asList(this.headerFieldNameKeysArray);
    
    private boolean parseHttpHeaders = true;
    
    ARCReader() {
        super();
    }
    
    /**
     * Skip over any trailing newlines at the end of the record so we're
     * lined up ready to read the next.
     * @param record Record we have just finished reading.
     * @throws IOException
     */
    protected void gotoEOR(ArchiveRecord record) throws IOException {
        if (getIn().available() <= 0) {
            return;
        }
        
        // Remove any trailing LINE_SEPARATOR
        int c = -1;
        while (getIn().available() > 0) {
            if (getIn().markSupported()) {
                getIn().mark(1);
            }
            c = getIn().read();
            if (c != -1) {
                if (c == LINE_SEPARATOR) {
                    continue;
                }
                if (getIn().markSupported()) {
                    // We've overread.  We're probably in next record.  There is
                    // no way of telling for sure. It may be dross at end of
                    // current record. Backup.
                    getIn().reset();
                    break;
                }
                ArchiveRecordHeader h = (getCurrentRecord() != null)?
                    record.getHeader(): null;
                throw new IOException("Read " + (char)c +
                    " when only " + LINE_SEPARATOR + " expected. " +
                    getReaderIdentifier() + ((h != null)?
                        h.getHeaderFields().toString(): ""));
            }
        }
    }
    
    /**
     * Create a new arc record.
     *
     * Encapsulates the housekeeping involved in creating a new record.
     *
     * <p>Call this method at the end of the constructor to read in the
     * arcfile header.  There will be problems reading subsequent arc records
     * if you don't, since the arcfile header has the list of metadata fields
     * for all records that follow.
     * 
     * <p>When parsing through ARCs writing out CDX info, we spend about
     * 38% of CPU in here -- about 30% of which is in getTokenizedHeaderLine
     * -- of which 16% is reading.
     *
     * @param is InputStream to use.
     * @param offset Absolute offset into arc file.
     * @return An arc record.
     * @throws IOException
     */
    protected ARCRecord createArchiveRecord(InputStream is, long offset)
    throws IOException {
        ArrayList<String> firstLineValues = new ArrayList<String>(20);
        getTokenizedHeaderLine(is, firstLineValues);
        int bodyOffset = 0;
        if (offset == 0 && isAlignedOnFirstRecord()) {
            // If offset is zero and we were aligned at first record on
            // creation (See #alignedOnFirstRecord for more on this), then no
            // records have been read yet and we're reading our first one, the
            // record of ARC file meta info.  It's special.  In ARC versions
            // 1.x, first record has three lines of meta info. We've just read
            // the first line. There are two more.  The second line has misc.
            // info.  We're only interested in the first field, the version
            // number.  The third line is the list of field names. Here's what
            // ARC file version 1.x meta content looks like:
            //
            // filedesc://testIsBoundary-JunitIAH200401070157520.arc 0.0.0.0 \
            //      20040107015752 text/plain 77
            // 1 0 InternetArchive
            // URL IP-address Archive-date Content-type Archive-length
            //
            ArrayList<String> secondLineValues = new ArrayList<String>(20);
            bodyOffset += getTokenizedHeaderLine(is, secondLineValues);
            setVersion(secondLineValues.get(0) + "." +
                secondLineValues.get(1));
            // Just read over the 3rd line.  We used to parse it and use
            // values found here but now we just hardcode them to avoid
            // having to read this 3rd line even for random arc file accesses.
            bodyOffset += getTokenizedHeaderLine(is, null);
        }

        try {
            currentRecord(new ARCRecord(is,
                (ArchiveRecordHeader)computeMetaData(this.headerFieldNameKeys,
                    firstLineValues,
                    getVersion(), offset), bodyOffset, isDigest(),
                    isStrict(), isParseHttpHeaders()));
        } catch (IOException e) {
            if (e instanceof RecoverableIOException) {
                // Don't mess with RecoverableIOExceptions.  Let them out.
                throw e;
            }
            IOException newE = new IOException(e.getMessage() + " (Offset " +
                    offset + ").");
            newE.setStackTrace(e.getStackTrace());
            throw newE;
        }
        return (ARCRecord)getCurrentRecord();
    }
    
    /**
     * Returns the version of this ARC file.  Usually read from the first
     * record of the ARC.  If we're reading without having first read the
     * first record -- e.g. random access into the middle of an ARC -- then
     * the version will not have been set.  For now, we return a default,
     * version 1.1.  Later, if more than one version of ARC is in use, we
     * could examine the meta line to see which version this is.
     * @return Version of this ARC file.
     */
    public String getVersion() {
        return (super.getVersion() == null)? "1.1": super.getVersion();
    }

    /**
     * Get a record header line as list of tokens.
     *
     * We keep reading until we find a LINE_SEPARATOR, we reach the end of
     * file without finding one, or the line length exceeds the allowed
     * maximum.
     *
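     * <p>For example, a (hypothetical) URL-record line such as
     * <pre>
     * http://archive.org/ 207.241.224.2 20040107015752 text/html 3478
     * </pre>
     * tokenizes into five strings, split on the single-space
     * <code>HEADER_FIELD_SEPARATOR</code>.
     *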
     * @param stream InputStream to read from.
     * @param list Empty list that gets filled w/ string tokens.
     * @return Count of characters read.
     * @exception IOException If problem reading stream or no line separator
     * found or EOF before EOL or we didn't get minimum header fields.
     */
    private int getTokenizedHeaderLine(final InputStream stream,
            List<String> list) throws IOException {
        // Preallocate usual line size.
        StringBuilder buffer = new StringBuilder(2048 + 20);
        int read = 0;
        int previous = -1;
        for (int c = -1; true;) {
            previous = c;
            c = stream.read();
            if (c == -1) {
                throw new RecoverableIOException("Hit EOF before header EOL.");
            }
            c &= 0xff;
            read++;
            if (read > MAX_HEADER_LINE_LENGTH) {
                throw new IOException("Header line longer than max allowed" +
                    " -- " + String.valueOf(MAX_HEADER_LINE_LENGTH) +
                    " -- or passed buffer doesn't contain a line (Read: " +
                    buffer.length() + ").  Here's" +
                    " some of what was read: " +
                    buffer.substring(0, Math.min(buffer.length(), 256)));
            }

            if (c == LINE_SEPARATOR) {
                if (buffer.length() == 0) {
                    // Empty line at start of buffer.  Skip it and try again.
                    continue;
                }

                if (list != null) {
                    list.add(buffer.toString());
                }
                // LOOP TERMINATION.
                break;
            } else if (c == HEADER_FIELD_SEPARATOR) {
                if (!isStrict() && previous == HEADER_FIELD_SEPARATOR) {
                    // Early ARCs sometimes had multiple spaces between fields.
                    continue;
                }
                if (list != null) {
                    list.add(buffer.toString());
                }
                // Reset to empty.
                buffer.setLength(0);
            } else {
                buffer.append((char)c);
            }
        }

        // List must have at least 3 elements in it and no more than 100.
        // Anything else indicates a bogus parse.
        if (list != null && (list.size() < 3 || list.size() > 100)) {
            throw new IOException("Unparseable header line: " + list);
        }

        return read;
    }

    /**
     * Compute metadata fields.
     *
     * Here we check that the meta line has the right number of items in it.
     *
     * @param keys Keys to use composing headerFields map.
     * @param values Values to set into the headerFields map.
     * @param v The version of this ARC file.
     * @param offset Offset into arc file.
     *
     * @return Metadata structure for this record.
     *
     * @exception IOException If the number of keys doesn't match the number
     * of values.
     */
    private ARCRecordMetaData computeMetaData(List<String> keys,
            List<String> values, String v, long offset)
    throws IOException {
        if (keys.size() != values.size()) {
            List<String> originalValues = values;
            if (!isStrict()) {
                values = fixSpaceInURL(values, keys.size());
                // If values still doesn't match the key count, try to do
                // further repair.
                if (keys.size() != values.size()) {
                    // Early ARCs had a space in the mimetype.
                    if (values.size() == (keys.size() + 1) &&
                            values.get(4).toLowerCase().startsWith("charset=")) {
                        List<String> nuvalues =
                            new ArrayList<String>(keys.size());
                        nuvalues.add(0, values.get(0));
                        nuvalues.add(1, values.get(1));
                        nuvalues.add(2, values.get(2));
                        nuvalues.add(3, values.get(3) + values.get(4));
                        nuvalues.add(4, values.get(5));
                        values = nuvalues;
                    } else if ((values.size() + 1) == keys.size() &&
                            isLegitimateIPValue(values.get(1)) &&
                            isDate(values.get(2)) && isNumber(values.get(3))) {
                        // Mimetype is empty.
                        List<String> nuvalues =
                            new ArrayList<String>(keys.size());
                        nuvalues.add(0, values.get(0));
                        nuvalues.add(1, values.get(1));
                        nuvalues.add(2, values.get(2));
                        nuvalues.add(3, "-");
                        nuvalues.add(4, values.get(3));
                        values = nuvalues;
                    }
                }
            }
            if (keys.size() != values.size()) {
                throw new IOException("Size of field name keys does" +
                    " not match count of field values: " + values);
            }
            // Note on stderr that the field was fixed.
            logStdErr(Level.WARNING, "Fixed spaces in metadata line at " +
                "offset " + offset +
                " Original: " + originalValues + ", New: " + values);
        }
        
        Map<Object, Object> headerFields =
            new HashMap<Object, Object>(keys.size() + 2);
        for (int i = 0; i < keys.size(); i++) {
            headerFields.put(keys.get(i), values.get(i));
        }
        
        // Check for tabs in URLs.  If any, replace with '%09'.
        // See https://sourceforge.net/tracker/?group_id=73833&atid=539099&func=detail&aid=1010966,
        // [ 1010966 ] crawl.log has URIs with spaces in them.
        String url = (String)headerFields.get(URL_FIELD_KEY);
        if (url != null && url.indexOf('\t') >= 0) {
            headerFields.put(URL_FIELD_KEY,
                TextUtils.replaceAll("\t", url, "%09"));
        }

        headerFields.put(VERSION_FIELD_KEY, v);
        headerFields.put(ABSOLUTE_OFFSET_KEY, Long.valueOf(offset));

        return new ARCRecordMetaData(getReaderIdentifier(), headerFields);
    }
    
    protected boolean isDate(final String date) {
        if (date.length() != 14) {
            return false;
        }
        return isNumber(date);
    }
    
    protected boolean isNumber(final String n) {
        for (int i = 0; i < n.length(); i++) {
            if (!Character.isDigit(n.charAt(i))) {
                return false;
            }
        }
        return true;
    }
    
    protected boolean isLegitimateIPValue(final String ip) {
        if ("-".equals(ip)) {
            return true;
        }
        Matcher m = InetAddressUtil.IPV4_QUADS.matcher(ip);
        return m != null && m.matches();
    }
    
    /**
     * Fix spaces in URLs.
     * The ARCWriter used to write URLs with spaces in them into the ARC.
     * See <a
     * href="https://sourceforge.net/tracker/?group_id=73833&atid=539099&func=detail&aid=1010966">[ 1010966 ]
     * crawl.log has URIs with spaces in them</a>.
     * This method fixes up such headers, converting all spaces found
     * to '%20'.
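     * <p>For example (all values hypothetical), the six tokens
     * <pre>
     * {http://a.com/x, page.html, 1.2.3.4, 20040107015752, text/html, 100}
     * </pre>
     * collapse to five, the fields before the IP rejoined as the single URL
     * <code>http://a.com/x%20page.html</code>.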
     * @param values List of metadata values.
     * @param requiredSize Expected size of resultant values list.
     * @return New list if we successfully fixed up values or original if
     * fixup failed.
     */
    protected List<String> fixSpaceInURL(List<String> values, int requiredSize) {
        // Do validity check.  The 3rd field from the end is a date of 14
        // numeric characters, and the 4th field from the end is the IP.
        // All fields before the IP should be concatenated together with a
        // '%20' joiner.
        if (values.size() <= requiredSize || values.size() < 4) {
            return values;
        }
        // Test that the 3rd field from the end is a valid date.
        if (!isDate(values.get(values.size() - 3))) {
            return values;
        }

        // Test that the 4th field from the end is a valid IP.
        if (!isLegitimateIPValue(values.get(values.size() - 4))) {
            return values;
        }

        List<String> newValues = new ArrayList<String>(requiredSize);
        StringBuilder url = new StringBuilder();
        for (int i = 0; i < (values.size() - 4); i++) {
            if (i > 0) {
                url.append("%20");
            }
            url.append(values.get(i));
        }
        newValues.add(url.toString());
        for (int i = values.size() - 4; i < values.size(); i++) {
            newValues.add(values.get(i));
        }
        return newValues;
    }
    

    protected boolean isAlignedOnFirstRecord() {
        return alignedOnFirstRecord;
    }

    protected void setAlignedOnFirstRecord(boolean alignedOnFirstRecord) {
        this.alignedOnFirstRecord = alignedOnFirstRecord;
    }
    
    /**
     * @return True if we parse HTTP headers while reading records.
     */
    public boolean isParseHttpHeaders() {
        return this.parseHttpHeaders;
    }
    
    /**
     * @param parse True to parse HTTP headers while reading records.
     */
    public void setParseHttpHeaders(boolean parse) {
        this.parseHttpHeaders = parse;
    }
    
    public String getFileExtension() {
        return ARC_FILE_EXTENSION;
    }
    
    public String getDotFileExtension() {
        return DOT_ARC_FILE_EXTENSION;
    }
    
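    /**
     * Output this ARC in the passed format.  Defers to the superclass for
     * formats it handles; the NOHEAD and HEADER formats only make sense for
     * a single record, so they are rejected here (see
     * {@link #outputRecord(String)}).
     */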
    protected boolean output(final String format)
    throws IOException, java.text.ParseException {
        boolean result = super.output(format);
        if (!result && (format.equals(NOHEAD) || format.equals(HEADER))) {
            throw new IOException(format +
                " format only supported for single Records");
        }
        return result;
    }
    
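    /**
     * Output the current record in the passed format.  NOHEAD dumps the
     * record body with the HTTP header skipped; HEADER dumps just the HTTP
     * header.  All other formats are handled by the superclass.
     */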
    public boolean outputRecord(final String format) throws IOException {
        boolean result = super.outputRecord(format);
        if (result) {
            return result;
        }
        if (format.equals(NOHEAD)) {
            // No point digesting if dumping content.
            setDigest(false);
            ARCRecord r = (ARCRecord) get();
            r.skipHttpHeader();
            r.dump();
            result = true;
        } else if (format.equals(HEADER)) {
            // No point digesting if dumping content.
            setDigest(false);
            ARCRecord r = (ARCRecord) get();
            r.dumpHttpHeader();
            result = true;
        }

        return result;
    }

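    /**
     * Dump this ARC on stdout, rewriting each record through an
     * {@link ARCWriter} that is seeded with the content of the ARC's own
     * first (filedesc) record.
     * @param compress True to gzip-compress the rewritten ARC.
     * @throws IOException
     * @throws java.text.ParseException
     */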
    public void dump(final boolean compress)
    throws IOException, java.text.ParseException {
        // No point digesting if we're doing a dump.
        setDigest(false);
        boolean firstRecord = true;
        ARCWriter writer = null;
        for (Iterator<ArchiveRecord> ii = iterator(); ii.hasNext();) {
            ARCRecord r = (ARCRecord)ii.next();
            // We're to dump the arc on stdout.
            // Get the first record's data if any.
            ARCRecordMetaData meta = r.getMetaData();
            if (firstRecord) {
                firstRecord = false;
                // Get an ARCWriter.
                ByteArrayOutputStream baos =
                    new ByteArrayOutputStream(r.available());
                // This is slow but done only once at top of ARC.
                while (r.available() > 0) {
                    baos.write(r.read());
                }
                List<String> listOfMetadata = new ArrayList<String>();
                listOfMetadata.add(baos.toString(WriterPoolMember.UTF8));
                // Assume getArc returns full path to file.  ARCWriter
                // or new File will complain if it is otherwise.
                writer = new ARCWriter(new AtomicInteger(), System.out,
                    new File(meta.getArc()),
                    compress, meta.getDate(), listOfMetadata);
                continue;
            }
            
            writer.write(meta.getUrl(), meta.getMimetype(), meta.getIp(),
                ArchiveUtils.parse14DigitDate(meta.getDate()).getTime(),
                (int)meta.getLength(), r);
        }
    }
    
    /**
     * Get a reader that will delete a local file on close.  Used when we
     * bring Archive files local and need to clean up afterward.
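     * <p>A minimal usage sketch (the reader and file names are hypothetical):
     * <pre>
     * ARCReader local = reader.getDeleteFileOnCloseReader(tmpArc);
     * // ... read records via local.iterator() ...
     * local.close(); // Closes the wrapped reader and deletes tmpArc.
     * </pre>
     * @param f Local file to delete when the returned reader is closed.
     * @return An ARCReader that deletes <code>f</code> on close.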
     */
    public ARCReader getDeleteFileOnCloseReader(final File f) {
        final ARCReader d = this;
        return new ARCReader() {
            private final ARCReader delegate = d;
            private File archiveFile = f;
            
            public void close() throws IOException {
                this.delegate.close();
                if (this.archiveFile != null) {
                    if (archiveFile.exists()) {
                        archiveFile.delete();
                    }
                    this.archiveFile = null;
                }
            }
            
            public ArchiveRecord get(long o) throws IOException {
                return this.delegate.get(o);
            }
            
            public boolean isDigest() {
                return this.delegate.isDigest();
            }
            
            public boolean isStrict() {
                return this.delegate.isStrict();
            }
            
            public Iterator<ArchiveRecord> iterator() {
                return this.delegate.iterator();
            }
            
            public void setDigest(boolean d) {
                this.delegate.setDigest(d);
            }
            
            public void setStrict(boolean s) {
                this.delegate.setStrict(s);
            }
            
            public List validate() throws IOException {
                return this.delegate.validate();
            }

            @Override
            public ArchiveRecord get() throws IOException {
                return this.delegate.get();
            }

            @Override
            public String getVersion() {
                return this.delegate.getVersion();
            }

            @Override
            public List validate(int noRecords) throws IOException {
                return this.delegate.validate(noRecords);
            }

            @Override
            protected ARCRecord createArchiveRecord(InputStream is,
                    long offset)
            throws IOException {
                return this.delegate.createArchiveRecord(is, offset);
            }

            @Override
            protected void gotoEOR(ArchiveRecord record) throws IOException {
                this.delegate.gotoEOR(record);
            }

            @Override
            public void dump(boolean compress)
            throws IOException, java.text.ParseException {
                this.delegate.dump(compress);
            }

            @Override
            public String getDotFileExtension() {
                return this.delegate.getDotFileExtension();
            }

            @Override
            public String getFileExtension() {
                return this.delegate.getFileExtension();
            }
        };
    }
    
    // Static methods follow.

    /**
     * Print a usage message and exit.
     *
     * @param formatter Help formatter instance.
     * @param options Usage options.
     * @param exitCode Exit code.
     */
    private static void usage(HelpFormatter formatter, Options options,
            int exitCode) {
        formatter.printHelp("java org.archive.io.arc.ARCReader" +
            " [--digest=true|false] \\\n" +
            " [--format=cdx|cdxfile|dump|gzipdump|header|nohead]" +
            " [--offset=#] \\\n[--strict] [--parse] ARC_FILE|ARC_URL",
                options);
        System.exit(exitCode);
    }

    /**
     * Write out the arcfile.
     * 
     * @param reader ARCReader to output from.
     * @param format Format to use outputting.
     * @throws IOException
     * @throws java.text.ParseException
     */
    protected static void output(ARCReader reader, String format)
    throws IOException, java.text.ParseException {
        if (!reader.output(format)) {
            throw new IOException("Unsupported format: " + format);
        }
    }

    /**
     * Generate a CDX index file for an ARC file.
     *
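     * <p>For example (the path is hypothetical):
     * <pre>
     * ARCReader.createCDXIndexFile("/tmp/IAH-20040107015752.arc.gz");
     * </pre>
     *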
     * @param urlOrPath The ARC file to generate a CDX index for
     * @throws IOException
     * @throws java.text.ParseException
     */
    public static void createCDXIndexFile(String urlOrPath)
    throws IOException, java.text.ParseException {
        ARCReader r = ARCReaderFactory.get(urlOrPath);
        r.setStrict(false);
        r.setParseHttpHeaders(true);
        r.setDigest(true);
        output(r, CDX_FILE);
    }

    /**
     * Command-line interface to ARCReader.
     *
     * Here is the command-line interface:
     * <pre>
     * usage: java org.archive.io.arc.ARCReader [--offset=#] ARCFILE
     *  -h,--help      Prints this message and exits.
     *  -o,--offset    Outputs record at this offset into arc file.</pre>
     *
     * <p>See <code>$HERITRIX_HOME/bin/arcreader</code> for a script that'll
     * take care of classpaths and the calling of ARCReader.
     *
     * <p>Outputs using a pseudo-CDX format as described here:
     * <a href="http://www.archive.org/web/researcher/cdx_legend.php">CDX
     * Legend</a> and here
     * <a href="http://www.archive.org/web/researcher/example_cdx.php">Example</a>.
     * The legend used below is: 'CDX b e a m s c V (or v if uncompressed) n g'.
     * The hash is a hard-coded, straight SHA-1 hash of the content.
     *
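     * <p>Example invocation (the arc path is hypothetical):
     * <pre>
     * java org.archive.io.arc.ARCReader --format=cdx /tmp/IAH-20040107015752.arc.gz
     * </pre>
     *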
     * @param args Command-line arguments.
     * @throws ParseException Failed parse of the command line.
     * @throws IOException
     * @throws java.text.ParseException
     */
    public static void main(String [] args)
    throws ParseException, IOException, java.text.ParseException {
        Options options = getOptions();
        options.addOption(new Option("p", "parse", false, "Parse headers."));
        PosixParser parser = new PosixParser();
        CommandLine cmdline = parser.parse(options, args, false);
        List cmdlineArgs = cmdline.getArgList();
        Option [] cmdlineOptions = cmdline.getOptions();
        HelpFormatter formatter = new HelpFormatter();

        // If no args, print help.
        if (cmdlineArgs.size() <= 0) {
            usage(formatter, options, 0);
        }

        // Now look at options passed.
        long offset = -1;
        boolean digest = false;
        boolean strict = false;
        boolean parse = false;
        String format = CDX;
        for (int i = 0; i < cmdlineOptions.length; i++) {
            switch(cmdlineOptions[i].getId()) {
                case 'h':
                    usage(formatter, options, 0);
                    break;

                case 'o':
                    offset =
                        Long.parseLong(cmdlineOptions[i].getValue());
                    break;
                    
                case 's':
                    strict = true;
                    break;
                    
                case 'p':
                    parse = true;
                    break;
                    
                case 'd':
                    digest = getTrueOrFalse(cmdlineOptions[i].getValue());
                    break;
                    
                case 'f':
                    format = cmdlineOptions[i].getValue().toLowerCase();
                    boolean match = false;
                    // List of supported formats.
                    final String [] supportedFormats =
                        {CDX, DUMP, GZIP_DUMP, HEADER, NOHEAD, CDX_FILE};
                    for (int ii = 0; ii < supportedFormats.length; ii++) {
                        if (supportedFormats[ii].equals(format)) {
                            match = true;
                            break;
                        }
                    }
                    if (!match) {
                        usage(formatter, options, 1);
                    }
                    break;

                default:
                    throw new RuntimeException("Unexpected option: " +
                        cmdlineOptions[i].getId());
            }
        }
        
        if (offset >= 0) {
            if (cmdlineArgs.size() != 1) {
                System.out.println("Error: Pass one arcfile only.");
                usage(formatter, options, 1);
            }
            ARCReader arc = ARCReaderFactory.get((String)cmdlineArgs.get(0),
                offset);
            arc.setStrict(strict);
            // We must parse headers if we need to skip them.
            if (format.equals(NOHEAD) || format.equals(HEADER)) {
                parse = true;
            }
            arc.setParseHttpHeaders(parse);
            outputRecord(arc, format);
        } else {
            for (Iterator i = cmdlineArgs.iterator(); i.hasNext();) {
                String urlOrPath = (String)i.next();
                try {
                    ARCReader r = ARCReaderFactory.get(urlOrPath);
                    r.setStrict(strict);
                    r.setParseHttpHeaders(parse);
                    r.setDigest(digest);
                    output(r, format);
                } catch (RuntimeException e) {
                    // Write out the name of the file we failed on to help
                    // with debugging.  Then print the stack trace and keep
                    // going.  We do this for the case where we're being fed
                    // a bunch of ARCs; just note the bad one and move on
                    // to the next.
                    System.err.println("Exception processing " + urlOrPath +
                        ": " + e.getMessage());
                    e.printStackTrace(System.err);
                }
            }
        }
    }
}