View Javadoc

1   /* $Id: WARCReader.java 4754 2006-11-28 02:03:03Z stack-sf $
2    *
3    * Created Aug 23, 2006
4    *
5    * Copyright (C) 2006 Internet Archive.
6    *
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    *
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   *
14   * Heritrix is distributed in the hope that it will be useful,
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   *
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.io.warc;
24  
25  import java.io.File;
26  import java.io.IOException;
27  import java.io.InputStream;
28  import java.util.Iterator;
29  import java.util.List;
30  
31  import org.apache.commons.cli.CommandLine;
32  import org.apache.commons.cli.HelpFormatter;
33  import org.apache.commons.cli.Option;
34  import org.apache.commons.cli.Options;
35  import org.apache.commons.cli.ParseException;
36  import org.apache.commons.cli.PosixParser;
37  import org.apache.commons.lang.NotImplementedException;
38  import org.archive.io.ArchiveReader;
39  import org.archive.io.ArchiveRecord;
40  import org.archive.io.warc.WARCConstants;
41  
42  /***
43   * WARCReader.
44   * Go via {@link WARCReaderFactory} to get instance.
45   * @author stack
46   * @version $Date: 2006-11-27 18:03:03 -0800 (Mon, 27 Nov 2006) $ $Version$
47   */
48  public class WARCReader extends ArchiveReader implements WARCConstants {
    /**
     * Package-private constructor; it only invokes the superclass
     * constructor.  As the class comment says, obtain instances through
     * {@link WARCReaderFactory} rather than constructing one directly.
     */
    WARCReader() {
        super();
    }
52      
    /**
     * Runs the superclass initialization and then stamps this reader with
     * {@code WARC_VERSION}.
     *
     * @param i Passed unchanged to {@code super.initialize}.
     * NOTE(review): presumably the reader identifier -- confirm against
     * ArchiveReader.
     */
    @Override
    protected void initialize(String i) {
        super.initialize(i);
        // Version is set after the superclass has finished its own setup.
        setVersion(WARC_VERSION);
    }
58      
59      /***
60       * Skip over any trailing new lines at end of the record so we're lined up
61       * ready to read the next.
62       * @param record
63       * @throws IOException
64       */
65      protected void gotoEOR(ArchiveRecord record) throws IOException {
66          if (record.available() != 0) {
67              throw new IOException("Record should be exhausted before coming " +
68                  "in here");
69          }
70  
71          // Records end in 2*CRLF.  Suck it up.
72          readExpectedChar(getIn(), CRLF.charAt(0));
73          readExpectedChar(getIn(), CRLF.charAt(1));
74          readExpectedChar(getIn(), CRLF.charAt(0));
75          readExpectedChar(getIn(), CRLF.charAt(1));
76      }
77      
78      protected void readExpectedChar(final InputStream is, final int expected)
79      throws IOException {
80          int c = is.read();
81          if (c != expected) {
82              throw new IOException("Unexpected character " +
83                  Integer.toHexString(c) + "(Expecting " +
84                  Integer.toHexString(expected) + ")");
85          }
86      }
87      
88      /***
89       * Create new WARC record.
90       * Encapsulate housekeeping that has to do w/ creating new Record.
91       * @param is InputStream to use.
92       * @param offset Absolute offset into WARC file.
93       * @return A WARCRecord.
94       * @throws IOException
95       */
96      protected WARCRecord createArchiveRecord(InputStream is, long offset)
97      throws IOException {
98          return (WARCRecord)currentRecord(new WARCRecord(is,
99          	getReaderIdentifier(), offset, isDigest(), isStrict()));
100     }
101     
102 	@Override
103 	public void dump(boolean compress)
104 	throws IOException, java.text.ParseException {
105 	    for (final Iterator<ArchiveRecord> i = iterator(); i.hasNext();) {
106             ArchiveRecord r = i.next();
107             System.out.println(r.getHeader().toString());
108             r.dump();
109             System.out.println();
110         }
111 	}
112     
113 
    /**
     * Not implemented for WARC readers.
     *
     * @param f Ignored.
     * @return Never returns normally.
     * @throws NotImplementedException Always.
     */
    @Override
    public ArchiveReader getDeleteFileOnCloseReader(final File f) {
        throw new NotImplementedException("TODO");
    }
118 
    /**
     * @return The {@code DOT_WARC_FILE_EXTENSION} constant.
     */
    @Override
    public String getDotFileExtension() {
        return DOT_WARC_FILE_EXTENSION;
    }
123 
    /**
     * @return The {@code WARC_FILE_EXTENSION} constant.
     */
    @Override
    public String getFileExtension() {
        return WARC_FILE_EXTENSION;
    }
128     
129     // Static methods follow.  Mostly for command-line processing.
130 
131     /***
132      *
133      * @param formatter Help formatter instance.
134      * @param options Usage options.
135      * @param exitCode Exit code.
136      */
137     private static void usage(HelpFormatter formatter, Options options,
138             int exitCode) {
139         formatter.printHelp("java org.archive.io.arc.WARCReader" +
140             " [--digest=true|false] //\n" +
141             " [--format=cdx|cdxfile|dump|gzipdump]" +
142             " [--offset=#] //\n[--strict] [--parse] WARC_FILE|WARC_URL",
143                 options);
144         System.exit(exitCode);
145     }
146 
147     /***
148      * Write out the arcfile.
149      * 
150      * @param reader
151      * @param format Format to use outputting.
152      * @throws IOException
153      * @throws java.text.ParseException
154      */
155     protected static void output(WARCReader reader, String format)
156     throws IOException, java.text.ParseException {
157     	if (!reader.output(format)) {
158             throw new IOException("Unsupported format: " + format);
159     	}
160     }
161 
162     /***
163      * Generate a CDX index file for an ARC file.
164      *
165      * @param urlOrPath The ARC file to generate a CDX index for
166      * @throws IOException
167      * @throws java.text.ParseException
168      */
169     public static void createCDXIndexFile(String urlOrPath)
170     throws IOException, java.text.ParseException {
171     	WARCReader r = WARCReaderFactory.get(urlOrPath);
172     	r.setStrict(false);
173     	r.setDigest(true);
174     	output(r, CDX_FILE);
175     }
176 
177     /***
178      * Command-line interface to WARCReader.
179      *
180      * Here is the command-line interface:
181      * <pre>
182      * usage: java org.archive.io.arc.WARCReader [--offset=#] ARCFILE
183      *  -h,--help      Prints this message and exits.
184      *  -o,--offset    Outputs record at this offset into arc file.</pre>
185      *
186      * <p>Outputs using a pseudo-CDX format as described here:
187      * <a href="http://www.archive.org/web/researcher/cdx_legend.php">CDX
188      * Legent</a> and here
189      * <a href="http://www.archive.org/web/researcher/example_cdx.php">Example</a>.
190      * Legend used in below is: 'CDX b e a m s c V (or v if uncompressed) n g'.
191      * Hash is hard-coded straight SHA-1 hash of content.
192      *
193      * @param args Command-line arguments.
194      * @throws ParseException Failed parse of the command line.
195      * @throws IOException
196      * @throws java.text.ParseException
197      */
198     public static void main(String [] args)
199     throws ParseException, IOException, java.text.ParseException {
200         Options options = getOptions();
201         PosixParser parser = new PosixParser();
202         CommandLine cmdline = parser.parse(options, args, false);
203         List cmdlineArgs = cmdline.getArgList();
204         Option [] cmdlineOptions = cmdline.getOptions();
205         HelpFormatter formatter = new HelpFormatter();
206 
207         // If no args, print help.
208         if (cmdlineArgs.size() <= 0) {
209             usage(formatter, options, 0);
210         }
211 
212         // Now look at options passed.
213         long offset = -1;
214         boolean digest = false;
215         boolean strict = false;
216         String format = CDX;
217         for (int i = 0; i < cmdlineOptions.length; i++) {
218             switch(cmdlineOptions[i].getId()) {
219                 case 'h':
220                     usage(formatter, options, 0);
221                     break;
222 
223                 case 'o':
224                     offset =
225                         Long.parseLong(cmdlineOptions[i].getValue());
226                     break;
227                     
228                 case 's':
229                     strict = true;
230                     break;
231                     
232                 case 'd':
233                 	digest = getTrueOrFalse(cmdlineOptions[i].getValue());
234                     break;
235                     
236                 case 'f':
237                     format = cmdlineOptions[i].getValue().toLowerCase();
238                     boolean match = false;
239                     // List of supported formats.
240                     final String [] supportedFormats =
241                 		{CDX, DUMP, GZIP_DUMP, CDX_FILE};
242                     for (int ii = 0; ii < supportedFormats.length; ii++) {
243                         if (supportedFormats[ii].equals(format)) {
244                             match = true;
245                             break;
246                         }
247                     }
248                     if (!match) {
249                         usage(formatter, options, 1);
250                     }
251                     break;
252 
253                 default:
254                     throw new RuntimeException("Unexpected option: " +
255                         + cmdlineOptions[i].getId());
256             }
257         }
258         
259         if (offset >= 0) {
260             if (cmdlineArgs.size() != 1) {
261                 System.out.println("Error: Pass one arcfile only.");
262                 usage(formatter, options, 1);
263             }
264             WARCReader r = WARCReaderFactory.get(
265             	new File((String)cmdlineArgs.get(0)), offset);
266             r.setStrict(strict);
267             outputRecord(r, format);
268         } else {
269             for (Iterator i = cmdlineArgs.iterator(); i.hasNext();) {
270                 String urlOrPath = (String)i.next();
271                 try {
272                 	WARCReader r = WARCReaderFactory.get(urlOrPath);
273                 	r.setStrict(strict);
274                 	r.setDigest(digest);
275                     output(r, format);
276                 } catch (RuntimeException e) {
277                     // Write out name of file we failed on to help with
278                     // debugging.  Then print stack trace and try to keep
279                     // going.  We do this for case where we're being fed
280                     // a bunch of ARCs; just note the bad one and move
281                     // on to the next.
282                     System.err.println("Exception processing " + urlOrPath +
283                         ": " + e.getMessage());
284                     e.printStackTrace(System.err);
285                     System.exit(1);
286                 }
287             }
288         }
289     } 
290 }