1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.crawler.writer;
24
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.net.InetAddress;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.UnknownHostException;
import java.text.ParseException;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpMethodBase;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.lang.StringUtils;
import org.archive.crawler.Heritrix;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.deciderules.recrawl.IdenticalDigestDecideRule;
import org.archive.crawler.event.CrawlStatusListener;
import org.archive.crawler.extractor.Link;
import org.archive.crawler.framework.WriterPoolProcessor;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
import org.archive.io.ReplayInputStream;
import org.archive.io.WriterPoolMember;
import org.archive.io.WriterPoolSettings;
import org.archive.io.warc.WARCConstants;
import org.archive.io.warc.WARCWriter;
import org.archive.io.warc.WARCWriterPool;
import org.archive.uid.GeneratorFactory;
import org.archive.util.ArchiveUtils;
import org.archive.util.XmlUtils;
import org.archive.util.anvl.ANVLRecord;
import org.w3c.dom.Document;
66
/**
 * Experimental WARCWriterProcessor.
 * Writes records targeting version 0.17 of the WARC specification.
 * See http://crawler.archive.org/warc/0.17/WARC0.17ISO.doc .
 *
 * <p>TODO: Remove ANVLRecord. Rename NameValue or use RFC822
 * (commons-httpclient?) or find something else.
 *
 * @author stack
 */
77 public class WARCWriterProcessor extends WriterPoolProcessor
78 implements CoreAttributeConstants, CrawlStatusListener,
79 WriterPoolSettings, FetchStatusCodes, WARCConstants {
80 private static final long serialVersionUID = 6182850087635847443L;
81
82 private final Logger logger = Logger.getLogger(this.getClass().getName());
83
84 /***
85 * Key for whether to write 'request' type records where possible
86 */
87 public static final String ATTR_WRITE_REQUESTS =
88 "write-requests";
89
90 /***
91 * Key for whether to write 'metadata' type records where possible
92 */
93 public static final String ATTR_WRITE_METADATA =
94 "write-metadata";
95
96 /***
97 * Key for whether to write 'revisit' type records when
98 * consecutive identical digest
99 */
100 public static final String ATTR_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS =
101 "write-revisit-for-identical-digests";
102
103 /***
104 * Key for whether to write 'revisit' type records for server
105 * "304 not modified" responses
106 */
107 public static final String ATTR_WRITE_REVISIT_FOR_NOT_MODIFIED =
108 "write-revisit-for-not-modified";
109
110 /***
111 * Default path list.
112 */
113 private static final String [] DEFAULT_PATH = {"warcs"};
114
115 protected String [] getDefaultPath() {
116 return DEFAULT_PATH;
117 }
118
119 /***
120 * @param name Name of this writer.
121 */
122 public WARCWriterProcessor(final String name) {
123 super(name, "Experimental WARCWriter processor (Version 0.17)");
124 Type e = addElementToDefinition(
125 new SimpleType(ATTR_WRITE_REQUESTS,
126 "Whether to write 'request' type records. " +
127 "Default is true.", new Boolean(true)));
128 e.setOverrideable(true);
129 e.setExpertSetting(true);
130 e = addElementToDefinition(
131 new SimpleType(ATTR_WRITE_METADATA,
132 "Whether to write 'metadata' type records. " +
133 "Default is true.", new Boolean(true)));
134 e.setOverrideable(true);
135 e.setExpertSetting(true);
136 e = addElementToDefinition(
137 new SimpleType(ATTR_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS,
138 "Whether to write 'revisit' type records when a URI's " +
139 "history indicates the previous fetch had an identical " +
140 "content digest. " +
141 "Default is true.", new Boolean(true)));
142 e.setOverrideable(true);
143 e.setExpertSetting(true);
144 e = addElementToDefinition(
145 new SimpleType(ATTR_WRITE_REVISIT_FOR_NOT_MODIFIED,
146 "Whether to write 'revisit' type records when a " +
147 "304-Not Modified response is received. " +
148 "Default is true.", new Boolean(true)));
149 e.setOverrideable(true);
150 e.setExpertSetting(true);
151 }
152
153 protected void setupPool(final AtomicInteger serialNo) {
154 setPool(new WARCWriterPool(serialNo, this, getPoolMaximumActive(),
155 getPoolMaximumWait()));
156 }
157
158 /***
159 * Writes a CrawlURI and its associated data to store file.
160 *
161 * Currently this method understands the following uri types: dns, http, and
162 * https.
163 *
164 * @param curi CrawlURI to process.
165 *
166 */
167 protected void innerProcess(CrawlURI curi) {
168
169 if (curi.getFetchStatus() <= 0) {
170 return;
171 }
172
173
174 long recordLength = curi.getContentSize();
175 if (recordLength <= 0) {
176
177
178 return;
179 }
180
181 String scheme = curi.getUURI().getScheme().toLowerCase();
182 try {
183 if (shouldWrite(curi)) {
184 write(scheme, curi);
185 } else {
186 logger.info("This writer does not write out scheme " +
187 scheme + " content");
188 }
189 } catch (IOException e) {
190 curi.addLocalizedError(this.getName(), e, "WriteRecord: " +
191 curi.toString());
192 logger.log(Level.SEVERE, "Failed write of Record: " +
193 curi.toString(), e);
194 }
195 }
196
197 protected void write(final String lowerCaseScheme, final CrawlURI curi)
198 throws IOException {
199 WriterPoolMember writer = getPool().borrowFile();
200 long position = writer.getPosition();
201
202
203
204 writer.checkSize();
205 if (writer.getPosition() != position) {
206
207
208
209 setTotalBytesWritten(getTotalBytesWritten() +
210 (writer.getPosition() - position));
211 position = writer.getPosition();
212 }
213
214 WARCWriter w = (WARCWriter)writer;
215 try {
216
217
218 final URI baseid = getRecordID();
219 final String timestamp =
220 ArchiveUtils.getLog14Date(curi.getLong(A_FETCH_BEGAN_TIME));
221 if (lowerCaseScheme.startsWith("http")) {
222
223
224
225
226 ANVLRecord headers = new ANVLRecord(5);
227 if (curi.getContentDigest() != null) {
228 headers.addLabelValue(HEADER_KEY_PAYLOAD_DIGEST,
229 curi.getContentDigestSchemeString());
230 }
231 headers.addLabelValue(HEADER_KEY_IP, getHostAddress(curi));
232 URI rid;
233
234 if (IdenticalDigestDecideRule.hasIdenticalDigest(curi) &&
235 ((Boolean)getUncheckedAttribute(curi,
236 ATTR_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS))) {
237 rid = writeRevisitDigest(w, timestamp, HTTP_RESPONSE_MIMETYPE,
238 baseid, curi, headers);
239 } else if (curi.getFetchStatus() == HttpStatus.SC_NOT_MODIFIED &&
240 ((Boolean)getUncheckedAttribute(curi,
241 ATTR_WRITE_REVISIT_FOR_NOT_MODIFIED))) {
242 rid = writeRevisitNotModified(w, timestamp,
243 baseid, curi, headers);
244 } else {
245 if (curi.isTruncatedFetch()) {
246 String value = curi.isTimeTruncatedFetch()?
247 NAMED_FIELD_TRUNCATED_VALUE_TIME:
248 curi.isLengthTruncatedFetch()?
249 NAMED_FIELD_TRUNCATED_VALUE_LENGTH:
250 curi.isHeaderTruncatedFetch()?
251 NAMED_FIELD_TRUNCATED_VALUE_HEAD:
252
253 TRUNCATED_VALUE_UNSPECIFIED;
254 headers.addLabelValue(HEADER_KEY_TRUNCATED, value);
255 }
256 rid = writeResponse(w, timestamp, HTTP_RESPONSE_MIMETYPE,
257 baseid, curi, headers);
258 }
259
260 headers = new ANVLRecord(1);
261 headers.addLabelValue(HEADER_KEY_CONCURRENT_TO,
262 '<' + rid.toString() + '>');
263
264 if(((Boolean)getUncheckedAttribute(curi, ATTR_WRITE_REQUESTS))) {
265 writeRequest(w, timestamp, HTTP_REQUEST_MIMETYPE,
266 baseid, curi, headers);
267 }
268 if(((Boolean)getUncheckedAttribute(curi, ATTR_WRITE_METADATA))) {
269 writeMetadata(w, timestamp, baseid, curi, headers);
270 }
271 } else if (lowerCaseScheme.equals("dns")) {
272 ANVLRecord headers = null;
273 String ip = curi.getString(A_DNS_SERVER_IP_LABEL);
274 if (ip != null && ip.length() > 0) {
275 headers = new ANVLRecord(1);
276 headers.addLabelValue(HEADER_KEY_IP, ip);
277 }
278 writeResponse(w, timestamp, curi.getContentType(), baseid,
279 curi, headers);
280 } else {
281 logger.warning("No handler for scheme " + lowerCaseScheme);
282 }
283 } catch (IOException e) {
284
285 getPool().invalidateFile(writer);
286
287
288
289 writer = null;
290 throw e;
291 } finally {
292 if (writer != null) {
293 setTotalBytesWritten(getTotalBytesWritten() +
294 (writer.getPosition() - position));
295 getPool().returnFile(writer);
296 }
297 }
298 checkBytesWritten();
299 }
300
301 protected URI writeRequest(final WARCWriter w,
302 final String timestamp, final String mimetype,
303 final URI baseid, final CrawlURI curi,
304 final ANVLRecord namedFields)
305 throws IOException {
306 final URI uid = qualifyRecordID(baseid, TYPE, REQUEST);
307 ReplayInputStream ris =
308 curi.getHttpRecorder().getRecordedOutput().getReplayInputStream();
309 try {
310 w.writeRequestRecord(curi.toString(), timestamp, mimetype, uid,
311 namedFields, ris,
312 curi.getHttpRecorder().getRecordedOutput().getSize());
313 } finally {
314 if (ris != null) {
315 ris.close();
316 }
317 }
318 return uid;
319 }
320
321 protected URI writeResponse(final WARCWriter w,
322 final String timestamp, final String mimetype,
323 final URI baseid, final CrawlURI curi,
324 final ANVLRecord namedFields)
325 throws IOException {
326 ReplayInputStream ris =
327 curi.getHttpRecorder().getRecordedInput().getReplayInputStream();
328 try {
329 w.writeResponseRecord(curi.toString(), timestamp, mimetype, baseid,
330 namedFields, ris,
331 curi.getHttpRecorder().getRecordedInput().getSize());
332 } finally {
333 if (ris != null) {
334 ris.close();
335 }
336 }
337 return baseid;
338 }
339
340 protected URI writeResource(final WARCWriter w,
341 final String timestamp, final String mimetype,
342 final URI baseid, final CrawlURI curi,
343 final ANVLRecord namedFields)
344 throws IOException {
345 ReplayInputStream ris =
346 curi.getHttpRecorder().getRecordedInput().getReplayInputStream();
347 try {
348 w.writeResourceRecord(curi.toString(), timestamp, mimetype, baseid,
349 namedFields, ris,
350 curi.getHttpRecorder().getRecordedInput().getSize());
351 } finally {
352 if (ris != null) {
353 ris.close();
354 }
355 }
356 return baseid;
357 }
358
359 protected URI writeRevisitDigest(final WARCWriter w,
360 final String timestamp, final String mimetype,
361 final URI baseid, final CrawlURI curi,
362 final ANVLRecord namedFields)
363 throws IOException {
364 long revisedLength = curi.getHttpRecorder().getRecordedInput().getContentBegin();
365 revisedLength = revisedLength > 0
366 ? revisedLength
367 : curi.getHttpRecorder().getRecordedInput().getSize();
368 namedFields.addLabelValue(
369 HEADER_KEY_PROFILE, PROFILE_REVISIT_IDENTICAL_DIGEST);
370 namedFields.addLabelValue(
371 HEADER_KEY_TRUNCATED, NAMED_FIELD_TRUNCATED_VALUE_LENGTH);
372 ReplayInputStream ris =
373 curi.getHttpRecorder().getRecordedInput().getReplayInputStream();
374 try {
375 w.writeRevisitRecord(curi.toString(), timestamp, mimetype, baseid,
376 namedFields, ris, revisedLength);
377 } finally {
378 if (ris != null) {
379 ris.close();
380 }
381 }
382 return baseid;
383 }
384
385 protected URI writeRevisitNotModified(final WARCWriter w,
386 final String timestamp,
387 final URI baseid, final CrawlURI curi,
388 final ANVLRecord namedFields)
389 throws IOException {
390 namedFields.addLabelValue(
391 HEADER_KEY_PROFILE, PROFILE_REVISIT_NOT_MODIFIED);
392
393 if(curi.containsKey(A_HTTP_TRANSACTION)) {
394 HttpMethodBase method =
395 (HttpMethodBase) curi.getObject(A_HTTP_TRANSACTION);
396 saveHeader(A_ETAG_HEADER,method,namedFields,HEADER_KEY_ETAG);
397 saveHeader(A_LAST_MODIFIED_HEADER,method,namedFields,
398 HEADER_KEY_LAST_MODIFIED);
399 }
400
401 namedFields.addLabelValue(HEADER_KEY_TRUNCATED,
402 NAMED_FIELD_TRUNCATED_VALUE_LENGTH);
403 ReplayInputStream ris =
404 curi.getHttpRecorder().getRecordedInput().getReplayInputStream();
405 try {
406 w.writeRevisitRecord(curi.toString(), timestamp, null, baseid,
407 namedFields, ris, 0);
408 } finally {
409 if (ris != null) {
410 ris.close();
411 }
412 }
413 return baseid;
414 }
415
416 /***
417 * Save a header from the given HTTP operation into the
418 * provider headers under a new name
419 *
420 * @param origName header name to get if present
421 * @param method http operation containing headers
422 */
423 protected void saveHeader(String origName, HttpMethodBase method,
424 ANVLRecord headers, String newName) {
425 Header header = method.getResponseHeader(origName);
426 if(header!=null) {
427 headers.addLabelValue(newName, header.getValue());
428 }
429 }
430
431 protected URI writeMetadata(final WARCWriter w,
432 final String timestamp,
433 final URI baseid, final CrawlURI curi,
434 final ANVLRecord namedFields)
435 throws IOException {
436 final URI uid = qualifyRecordID(baseid, TYPE, METADATA);
437
438
439
440
441 ANVLRecord r = new ANVLRecord();
442 if (curi.isSeed()) {
443 r.addLabel("seed");
444 } else {
445 if (curi.forceFetch()) {
446 r.addLabel("force-fetch");
447 }
448 r.addLabelValue("via", curi.flattenVia());
449 r.addLabelValue("hopsFromSeed", curi.getPathFromSeed());
450 if (curi.containsKey(A_SOURCE_TAG)) {
451 r.addLabelValue("sourceTag", curi.getString(A_SOURCE_TAG));
452 }
453 }
454 long duration = curi.getFetchDuration();
455 if(duration>-1) {
456 r.addLabelValue("fetchTimeMs", Long.toString(duration));
457 }
458
459
460 Collection<Link> links = curi.getOutLinks();
461 if (links != null && links.size() > 0) {
462 for (Link link: links) {
463 r.addLabelValue("outlink", link.toString());
464 }
465 }
466
467
468
469
470
471
472
473
474
475
476 byte [] b = r.getUTF8Bytes();
477 w.writeMetadataRecord(curi.toString(), timestamp, ANVLRecord.MIMETYPE,
478 uid, namedFields, new ByteArrayInputStream(b), b.length);
479 return uid;
480 }
481
482 protected URI getRecordID() throws IOException {
483 URI result;
484 try {
485 result = GeneratorFactory.getFactory().getRecordID();
486 } catch (URISyntaxException e) {
487 throw new IOException(e.toString());
488 }
489 return result;
490 }
491
492 protected URI qualifyRecordID(final URI base, final String key,
493 final String value)
494 throws IOException {
495 URI result;
496 Map<String, String> qualifiers = new HashMap<String, String>(1);
497 qualifiers.put(key, value);
498 try {
499 result = GeneratorFactory.getFactory().
500 qualifyRecordID(base, qualifiers);
501 } catch (URISyntaxException e) {
502 throw new IOException(e.toString());
503 }
504 return result;
505 }
506
507 @Override
508 protected String getFirstrecordStylesheet() {
509 return "/warcinfobody.xsl";
510 }
511
512 /***
513 * Return relevant values as header-like fields (here ANVLRecord, but
514 * spec-defined "application/warc-fields" type when written). Field
515 * names from from DCMI Terms and the WARC/0.17 specification.
516 *
517 * @see org.archive.crawler.framework.WriterPoolProcessor#getFirstrecordBody(java.io.File)
518 */
519 @Override
520 protected String getFirstrecordBody(File orderFile) {
521 ANVLRecord record = new ANVLRecord(7);
522 record.addLabelValue("software", "Heritrix/" +
523 Heritrix.getVersion() + " http://crawler.archive.org");
524 try {
525 InetAddress host = InetAddress.getLocalHost();
526 record.addLabelValue("ip", host.getHostAddress());
527 record.addLabelValue("hostname", host.getHostName());
528 } catch (UnknownHostException e) {
529 logger.log(Level.WARNING,"unable top obtain local crawl engine host",e);
530 }
531 record.addLabelValue("format","WARC File Format 0.17");
532 record.addLabelValue("conformsTo","http://crawler.archive.org/warc/0.17/WARC0.17ISO.doc");
533
534 try {
535 Document doc = XmlUtils.getDocument(orderFile);
536 addIfNotBlank(record,"operator",
537 XmlUtils.xpathOrNull(doc,"//meta/operator"));
538 addIfNotBlank(record,"publisher",
539 XmlUtils.xpathOrNull(doc,"//meta/organization"));
540 addIfNotBlank(record,"audience",
541 XmlUtils.xpathOrNull(doc,"//meta/audience"));
542 addIfNotBlank(record,"isPartOf",
543 XmlUtils.xpathOrNull(doc,"//meta/name"));
544 String rawDate = XmlUtils.xpathOrNull(doc,"//meta/date");
545 if(StringUtils.isNotBlank(rawDate)) {
546 Date date;
547 try {
548 date = ArchiveUtils.parse14DigitDate(rawDate);
549 addIfNotBlank(record,"created",ArchiveUtils.getLog14Date(date));
550 } catch (ParseException e) {
551 logger.log(Level.WARNING,"obtaining warc created date",e);
552 }
553 }
554 addIfNotBlank(record,"description",
555 XmlUtils.xpathOrNull(doc,"//meta/description"));
556 addIfNotBlank(record,"robots",
557 XmlUtils.xpathOrNull(doc,
558 "//newObject[@name='robots-honoring-policy']/string[@name='type']"));
559 addIfNotBlank(record,"http-header-user-agent",
560 XmlUtils.xpathOrNull(doc,
561 "//map[@name='http-headers']/string[@name='user-agent']"));
562 addIfNotBlank(record,"http-header-from",
563 XmlUtils.xpathOrNull(doc,
564 "//map[@name='http-headers']/string[@name='from']"));
565 } catch (IOException e) {
566 logger.log(Level.WARNING,"obtaining warcinfo",e);
567 }
568
569
570
571 return record.toString();
572 }
573
574
575 protected void addIfNotBlank(ANVLRecord record, String label, String value) {
576 if(StringUtils.isNotBlank(value)) {
577 record.addLabelValue(label, value);
578 }
579 }
580 }