1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.io.warc;
24
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.commons.lang.NotImplementedException;
import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveRecord;
import org.archive.io.warc.WARCConstants;
41
42 /***
43 * WARCReader.
44 * Go via {@link WARCReaderFactory} to get instance.
45 * @author stack
46 * @version $Date: 2006-11-27 18:03:03 -0800 (Mon, 27 Nov 2006) $ $Version$
47 */
48 public class WARCReader extends ArchiveReader implements WARCConstants {
49 WARCReader() {
50 super();
51 }
52
53 @Override
54 protected void initialize(String i) {
55 super.initialize(i);
56 setVersion(WARC_VERSION);
57 }
58
59 /***
60 * Skip over any trailing new lines at end of the record so we're lined up
61 * ready to read the next.
62 * @param record
63 * @throws IOException
64 */
65 protected void gotoEOR(ArchiveRecord record) throws IOException {
66 if (record.available() != 0) {
67 throw new IOException("Record should be exhausted before coming " +
68 "in here");
69 }
70
71
72 readExpectedChar(getIn(), CRLF.charAt(0));
73 readExpectedChar(getIn(), CRLF.charAt(1));
74 readExpectedChar(getIn(), CRLF.charAt(0));
75 readExpectedChar(getIn(), CRLF.charAt(1));
76 }
77
78 protected void readExpectedChar(final InputStream is, final int expected)
79 throws IOException {
80 int c = is.read();
81 if (c != expected) {
82 throw new IOException("Unexpected character " +
83 Integer.toHexString(c) + "(Expecting " +
84 Integer.toHexString(expected) + ")");
85 }
86 }
87
88 /***
89 * Create new WARC record.
90 * Encapsulate housekeeping that has to do w/ creating new Record.
91 * @param is InputStream to use.
92 * @param offset Absolute offset into WARC file.
93 * @return A WARCRecord.
94 * @throws IOException
95 */
96 protected WARCRecord createArchiveRecord(InputStream is, long offset)
97 throws IOException {
98 return (WARCRecord)currentRecord(new WARCRecord(is,
99 getReaderIdentifier(), offset, isDigest(), isStrict()));
100 }
101
102 @Override
103 public void dump(boolean compress)
104 throws IOException, java.text.ParseException {
105 for (final Iterator<ArchiveRecord> i = iterator(); i.hasNext();) {
106 ArchiveRecord r = i.next();
107 System.out.println(r.getHeader().toString());
108 r.dump();
109 System.out.println();
110 }
111 }
112
113
114 @Override
115 public ArchiveReader getDeleteFileOnCloseReader(final File f) {
116 throw new NotImplementedException("TODO");
117 }
118
119 @Override
120 public String getDotFileExtension() {
121 return DOT_WARC_FILE_EXTENSION;
122 }
123
124 @Override
125 public String getFileExtension() {
126 return WARC_FILE_EXTENSION;
127 }
128
129
130
131 /***
132 *
133 * @param formatter Help formatter instance.
134 * @param options Usage options.
135 * @param exitCode Exit code.
136 */
137 private static void usage(HelpFormatter formatter, Options options,
138 int exitCode) {
139 formatter.printHelp("java org.archive.io.arc.WARCReader" +
140 " [--digest=true|false] //\n" +
141 " [--format=cdx|cdxfile|dump|gzipdump]" +
142 " [--offset=#] //\n[--strict] [--parse] WARC_FILE|WARC_URL",
143 options);
144 System.exit(exitCode);
145 }
146
147 /***
148 * Write out the arcfile.
149 *
150 * @param reader
151 * @param format Format to use outputting.
152 * @throws IOException
153 * @throws java.text.ParseException
154 */
155 protected static void output(WARCReader reader, String format)
156 throws IOException, java.text.ParseException {
157 if (!reader.output(format)) {
158 throw new IOException("Unsupported format: " + format);
159 }
160 }
161
162 /***
163 * Generate a CDX index file for an ARC file.
164 *
165 * @param urlOrPath The ARC file to generate a CDX index for
166 * @throws IOException
167 * @throws java.text.ParseException
168 */
169 public static void createCDXIndexFile(String urlOrPath)
170 throws IOException, java.text.ParseException {
171 WARCReader r = WARCReaderFactory.get(urlOrPath);
172 r.setStrict(false);
173 r.setDigest(true);
174 output(r, CDX_FILE);
175 }
176
177 /***
178 * Command-line interface to WARCReader.
179 *
180 * Here is the command-line interface:
181 * <pre>
182 * usage: java org.archive.io.arc.WARCReader [--offset=#] ARCFILE
183 * -h,--help Prints this message and exits.
184 * -o,--offset Outputs record at this offset into arc file.</pre>
185 *
186 * <p>Outputs using a pseudo-CDX format as described here:
187 * <a href="http://www.archive.org/web/researcher/cdx_legend.php">CDX
188 * Legent</a> and here
189 * <a href="http://www.archive.org/web/researcher/example_cdx.php">Example</a>.
190 * Legend used in below is: 'CDX b e a m s c V (or v if uncompressed) n g'.
191 * Hash is hard-coded straight SHA-1 hash of content.
192 *
193 * @param args Command-line arguments.
194 * @throws ParseException Failed parse of the command line.
195 * @throws IOException
196 * @throws java.text.ParseException
197 */
198 public static void main(String [] args)
199 throws ParseException, IOException, java.text.ParseException {
200 Options options = getOptions();
201 PosixParser parser = new PosixParser();
202 CommandLine cmdline = parser.parse(options, args, false);
203 List cmdlineArgs = cmdline.getArgList();
204 Option [] cmdlineOptions = cmdline.getOptions();
205 HelpFormatter formatter = new HelpFormatter();
206
207
208 if (cmdlineArgs.size() <= 0) {
209 usage(formatter, options, 0);
210 }
211
212
213 long offset = -1;
214 boolean digest = false;
215 boolean strict = false;
216 String format = CDX;
217 for (int i = 0; i < cmdlineOptions.length; i++) {
218 switch(cmdlineOptions[i].getId()) {
219 case 'h':
220 usage(formatter, options, 0);
221 break;
222
223 case 'o':
224 offset =
225 Long.parseLong(cmdlineOptions[i].getValue());
226 break;
227
228 case 's':
229 strict = true;
230 break;
231
232 case 'd':
233 digest = getTrueOrFalse(cmdlineOptions[i].getValue());
234 break;
235
236 case 'f':
237 format = cmdlineOptions[i].getValue().toLowerCase();
238 boolean match = false;
239
240 final String [] supportedFormats =
241 {CDX, DUMP, GZIP_DUMP, CDX_FILE};
242 for (int ii = 0; ii < supportedFormats.length; ii++) {
243 if (supportedFormats[ii].equals(format)) {
244 match = true;
245 break;
246 }
247 }
248 if (!match) {
249 usage(formatter, options, 1);
250 }
251 break;
252
253 default:
254 throw new RuntimeException("Unexpected option: " +
255 + cmdlineOptions[i].getId());
256 }
257 }
258
259 if (offset >= 0) {
260 if (cmdlineArgs.size() != 1) {
261 System.out.println("Error: Pass one arcfile only.");
262 usage(formatter, options, 1);
263 }
264 WARCReader r = WARCReaderFactory.get(
265 new File((String)cmdlineArgs.get(0)), offset);
266 r.setStrict(strict);
267 outputRecord(r, format);
268 } else {
269 for (Iterator i = cmdlineArgs.iterator(); i.hasNext();) {
270 String urlOrPath = (String)i.next();
271 try {
272 WARCReader r = WARCReaderFactory.get(urlOrPath);
273 r.setStrict(strict);
274 r.setDigest(digest);
275 output(r, format);
276 } catch (RuntimeException e) {
277
278
279
280
281
282 System.err.println("Exception processing " + urlOrPath +
283 ": " + e.getMessage());
284 e.printStackTrace(System.err);
285 System.exit(1);
286 }
287 }
288 }
289 }
290 }