1   /* $Id: ExperimentalWARCWriterProcessor.java 4935 2007-02-23 00:27:24Z gojomo $
2    *
3    * Created on August 1st, 2006.
4    *
5    * Copyright (C) 2006 Internet Archive.
6    *
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    *
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   *
14   * Heritrix is distributed in the hope that it will be useful,
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   *
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.crawler.writer;
24  
25  import java.io.ByteArrayInputStream;
26  import java.io.File;
27  import java.io.IOException;
28  import java.net.InetAddress;
29  import java.net.URI;
30  import java.net.URISyntaxException;
31  import java.net.UnknownHostException;
32  import java.text.ParseException;
33  import java.util.Collection;
34  import java.util.Date;
35  import java.util.HashMap;
36  import java.util.Map;
37  import java.util.concurrent.atomic.AtomicInteger;
38  import java.util.logging.Level;
39  import java.util.logging.Logger;
40  
41  import org.apache.commons.httpclient.Header;
42  import org.apache.commons.httpclient.HttpMethodBase;
43  import org.apache.commons.httpclient.HttpStatus;
44  import org.apache.commons.lang.StringUtils;
45  import org.archive.crawler.Heritrix;
46  import org.archive.crawler.datamodel.CoreAttributeConstants;
47  import org.archive.crawler.datamodel.CrawlURI;
48  import org.archive.crawler.datamodel.FetchStatusCodes;
49  import org.archive.crawler.deciderules.recrawl.IdenticalDigestDecideRule;
50  import org.archive.crawler.event.CrawlStatusListener;
51  import org.archive.crawler.extractor.Link;
52  import org.archive.crawler.framework.WriterPoolProcessor;
53  import org.archive.crawler.settings.SimpleType;
54  import org.archive.crawler.settings.Type;
55  import org.archive.io.ReplayInputStream;
56  import org.archive.io.WriterPoolMember;
57  import org.archive.io.WriterPoolSettings;
58  import org.archive.io.warc.WARCWriter;
59  import org.archive.io.warc.WARCConstants;
60  import org.archive.io.warc.WARCWriterPool;
61  import org.archive.uid.GeneratorFactory;
62  import org.archive.util.ArchiveUtils;
63  import org.archive.util.XmlUtils;
64  import org.archive.util.anvl.ANVLRecord;
65  import org.w3c.dom.Document;
66  
67  /***
68   * Experimental WARCWriterProcessor.
69   * Goes against the 0.17 version of the WARC specification. 
70   * See http://crawler.archive.org/warc/0.17/WARC0.17ISO.doc .
71   * 
72   * <p>TODO: Remove ANVLRecord. Rename NameValue or use RFC822
73   * (commons-httpclient?) or find something else.
74   * 
75   * @author stack
76   */
77  public class WARCWriterProcessor extends WriterPoolProcessor
78  implements CoreAttributeConstants, CrawlStatusListener,
79  WriterPoolSettings, FetchStatusCodes, WARCConstants {
80      private static final long serialVersionUID = 6182850087635847443L;
81  
82      private final Logger logger = Logger.getLogger(this.getClass().getName());
83      
84      /***
85       * Key for whether to write 'request' type records where possible
86       */
87      public static final String ATTR_WRITE_REQUESTS =
88          "write-requests";
89      
90      /***
91       * Key for whether to write 'metadata' type records where possible
92       */
93      public static final String ATTR_WRITE_METADATA =
94          "write-metadata";
95      
96      /***
97       * Key for whether to write 'revisit' type records when
98       * consecutive identical digest
99       */
100     public static final String ATTR_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS =
101         "write-revisit-for-identical-digests";
102     
103     /***
104      * Key for whether to write 'revisit' type records for server
105      * "304 not modified" responses
106      */
107     public static final String ATTR_WRITE_REVISIT_FOR_NOT_MODIFIED =
108         "write-revisit-for-not-modified";
109     
110     /***
111      * Default path list.
112      */
113     private static final String [] DEFAULT_PATH = {"warcs"};
114 
115     protected String [] getDefaultPath() {
116         return DEFAULT_PATH;
117     }
118     
119     /***
120      * @param name Name of this writer.
121      */
122     public WARCWriterProcessor(final String name) {
123         super(name, "Experimental WARCWriter processor (Version 0.17)");
124         Type e = addElementToDefinition(
125                 new SimpleType(ATTR_WRITE_REQUESTS,
126                 "Whether to write 'request' type records. " +
127                 "Default is true.", new Boolean(true)));
128         e.setOverrideable(true);
129         e.setExpertSetting(true);
130         e = addElementToDefinition(
131                 new SimpleType(ATTR_WRITE_METADATA,
132                 "Whether to write 'metadata' type records. " +
133                 "Default is true.", new Boolean(true)));
134         e.setOverrideable(true);
135         e.setExpertSetting(true);
136         e = addElementToDefinition(
137                 new SimpleType(ATTR_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS,
138                 "Whether to write 'revisit' type records when a URI's " +
139                 "history indicates the previous fetch had an identical " +
140                 "content digest. " +
141                 "Default is true.", new Boolean(true)));
142         e.setOverrideable(true);
143         e.setExpertSetting(true);
144         e = addElementToDefinition(
145                 new SimpleType(ATTR_WRITE_REVISIT_FOR_NOT_MODIFIED,
146                 "Whether to write 'revisit' type records when a " +
147                 "304-Not Modified response is received. " +
148                 "Default is true.", new Boolean(true)));
149         e.setOverrideable(true);
150         e.setExpertSetting(true);
151     }
152 
153     protected void setupPool(final AtomicInteger serialNo) {
154 		setPool(new WARCWriterPool(serialNo, this, getPoolMaximumActive(),
155             getPoolMaximumWait()));
156     }
157     
158     /***
159      * Writes a CrawlURI and its associated data to store file.
160      * 
161      * Currently this method understands the following uri types: dns, http, and
162      * https.
163      * 
164      * @param curi CrawlURI to process.
165      * 
166      */
167     protected void innerProcess(CrawlURI curi) {
168         // If failure, or we haven't fetched the resource yet, return
169         if (curi.getFetchStatus() <= 0) {
170             return;
171         }
172         
173         // If no recorded content at all, don't write record.
174         long recordLength = curi.getContentSize();
175         if (recordLength <= 0) {
176             // getContentSize() should be > 0 if any material (even just
177             // HTTP headers with zero-length body) is available. 
178         	return;
179         }
180         
181         String scheme = curi.getUURI().getScheme().toLowerCase();
182         try {
183             if (shouldWrite(curi)) {
184                 write(scheme, curi);
185             } else {
186                 logger.info("This writer does not write out scheme " +
187                         scheme + " content");
188             }
189         } catch (IOException e) {
190             curi.addLocalizedError(this.getName(), e, "WriteRecord: " +
191                 curi.toString());
192             logger.log(Level.SEVERE, "Failed write of Record: " +
193                 curi.toString(), e);
194         }
195     }
196     
197     protected void write(final String lowerCaseScheme, final CrawlURI curi)
198     throws IOException {
199         WriterPoolMember writer = getPool().borrowFile();
200         long position = writer.getPosition();
201         // See if we need to open a new file because we've exceeed maxBytes.
202         // Call to checkFileSize will open new file if we're at maximum for
203         // current file.
204         writer.checkSize();
205         if (writer.getPosition() != position) {
206             // We just closed the file because it was larger than maxBytes.
207             // Add to the totalBytesWritten the size of the first record
208             // in the file, if any.
209             setTotalBytesWritten(getTotalBytesWritten() +
210             	(writer.getPosition() - position));
211             position = writer.getPosition();
212         }
213         
214         WARCWriter w = (WARCWriter)writer;
215         try {
216             // Write a request, response, and metadata all in the one
217             // 'transaction'.
218             final URI baseid = getRecordID();
219             final String timestamp =
220                 ArchiveUtils.getLog14Date(curi.getLong(A_FETCH_BEGAN_TIME));
221             if (lowerCaseScheme.startsWith("http")) {
222                 // Add named fields for ip, checksum, and relate the metadata
223                 // and request to the resource field.
224                 // TODO: Use other than ANVL (or rename ANVL as NameValue or
225                 // use RFC822 (commons-httpclient?).
226                 ANVLRecord headers = new ANVLRecord(5);
227                 if (curi.getContentDigest() != null) {
228                     headers.addLabelValue(HEADER_KEY_PAYLOAD_DIGEST,
229                         curi.getContentDigestSchemeString());
230                 }
231                 headers.addLabelValue(HEADER_KEY_IP, getHostAddress(curi));
232                 URI rid;
233                 
234                 if (IdenticalDigestDecideRule.hasIdenticalDigest(curi) && 
235                         ((Boolean)getUncheckedAttribute(curi, 
236                                 ATTR_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS))) {
237                     rid = writeRevisitDigest(w, timestamp, HTTP_RESPONSE_MIMETYPE,
238                             baseid, curi, headers);
239                 } else if (curi.getFetchStatus() == HttpStatus.SC_NOT_MODIFIED && 
240                         ((Boolean)getUncheckedAttribute(curi, 
241                                 ATTR_WRITE_REVISIT_FOR_NOT_MODIFIED))) {
242                     rid = writeRevisitNotModified(w, timestamp,
243                             baseid, curi, headers);
244                 } else {
245                     if (curi.isTruncatedFetch()) {
246                         String value = curi.isTimeTruncatedFetch()?
247                             NAMED_FIELD_TRUNCATED_VALUE_TIME:
248                             curi.isLengthTruncatedFetch()?
249                                 NAMED_FIELD_TRUNCATED_VALUE_LENGTH:
250                                 curi.isHeaderTruncatedFetch()?
251                                     NAMED_FIELD_TRUNCATED_VALUE_HEAD:
252                             // TODO: Add this to spec.
253                             TRUNCATED_VALUE_UNSPECIFIED;
254                         headers.addLabelValue(HEADER_KEY_TRUNCATED, value);
255                     }
256                     rid = writeResponse(w, timestamp, HTTP_RESPONSE_MIMETYPE,
257                     	baseid, curi, headers);
258                 }
259                 
260                 headers = new ANVLRecord(1);
261                 headers.addLabelValue(HEADER_KEY_CONCURRENT_TO,
262                     '<' + rid.toString() + '>');
263 
264                 if(((Boolean)getUncheckedAttribute(curi, ATTR_WRITE_REQUESTS))) {
265                     writeRequest(w, timestamp, HTTP_REQUEST_MIMETYPE,
266                             baseid, curi, headers);
267                 }
268                 if(((Boolean)getUncheckedAttribute(curi, ATTR_WRITE_METADATA))) {
269                     writeMetadata(w, timestamp, baseid, curi, headers);
270                 } 
271             } else if (lowerCaseScheme.equals("dns")) {
272                 ANVLRecord headers = null;
273                 String ip = curi.getString(A_DNS_SERVER_IP_LABEL);
274                 if (ip != null && ip.length() > 0) {
275                     headers = new ANVLRecord(1);
276                     headers.addLabelValue(HEADER_KEY_IP, ip);
277                 }
278                 writeResponse(w, timestamp, curi.getContentType(), baseid,
279                     curi, headers);
280             } else {
281                 logger.warning("No handler for scheme " + lowerCaseScheme);
282             }
283         } catch (IOException e) {
284             // Invalidate this file (It gets a '.invalid' suffix).
285             getPool().invalidateFile(writer);
286             // Set the writer to null otherwise the pool accounting
287             // of how many active writers gets skewed if we subsequently
288             // do a returnWriter call on this object in the finally block.
289             writer = null;
290             throw e;
291         } finally {
292             if (writer != null) {
293             	setTotalBytesWritten(getTotalBytesWritten() +
294             	     (writer.getPosition() - position));
295                 getPool().returnFile(writer);
296             }
297         }
298         checkBytesWritten();
299     }
300     
301     protected URI writeRequest(final WARCWriter w,
302             final String timestamp, final String mimetype,
303             final URI baseid, final CrawlURI curi,
304             final ANVLRecord namedFields) 
305     throws IOException {
306         final URI uid = qualifyRecordID(baseid, TYPE, REQUEST);
307         ReplayInputStream ris =
308             curi.getHttpRecorder().getRecordedOutput().getReplayInputStream();
309         try {
310             w.writeRequestRecord(curi.toString(), timestamp, mimetype, uid,
311                 namedFields, ris,
312                 curi.getHttpRecorder().getRecordedOutput().getSize());
313         } finally {
314             if (ris != null) {
315                 ris.close();
316             }
317         }
318         return uid;
319     }
320     
321     protected URI writeResponse(final WARCWriter w,
322             final String timestamp, final String mimetype,
323             final URI baseid, final CrawlURI curi,
324             final ANVLRecord namedFields) 
325     throws IOException {
326         ReplayInputStream ris =
327             curi.getHttpRecorder().getRecordedInput().getReplayInputStream();
328         try {
329             w.writeResponseRecord(curi.toString(), timestamp, mimetype, baseid,
330                 namedFields, ris,
331                 curi.getHttpRecorder().getRecordedInput().getSize());
332         } finally {
333             if (ris != null) {
334                 ris.close();
335             }
336         }
337         return baseid;
338     }
339     
340     protected URI writeResource(final WARCWriter w,
341             final String timestamp, final String mimetype,
342             final URI baseid, final CrawlURI curi,
343             final ANVLRecord namedFields) 
344     throws IOException {
345         ReplayInputStream ris =
346             curi.getHttpRecorder().getRecordedInput().getReplayInputStream();
347         try {
348             w.writeResourceRecord(curi.toString(), timestamp, mimetype, baseid,
349                 namedFields, ris,
350                 curi.getHttpRecorder().getRecordedInput().getSize());
351         } finally {
352             if (ris != null) {
353                 ris.close();
354             }
355         }
356         return baseid;
357     }
358     
359     protected URI writeRevisitDigest(final WARCWriter w,
360             final String timestamp, final String mimetype,
361             final URI baseid, final CrawlURI curi,
362             final ANVLRecord namedFields) 
363     throws IOException {
364         long revisedLength = curi.getHttpRecorder().getRecordedInput().getContentBegin();
365         revisedLength = revisedLength > 0 
366             ? revisedLength 
367             : curi.getHttpRecorder().getRecordedInput().getSize();
368         namedFields.addLabelValue(
369         		HEADER_KEY_PROFILE, PROFILE_REVISIT_IDENTICAL_DIGEST);
370         namedFields.addLabelValue(
371         		HEADER_KEY_TRUNCATED, NAMED_FIELD_TRUNCATED_VALUE_LENGTH);
372         ReplayInputStream ris =
373             curi.getHttpRecorder().getRecordedInput().getReplayInputStream();
374         try {
375             w.writeRevisitRecord(curi.toString(), timestamp, mimetype, baseid,
376                 namedFields, ris, revisedLength);
377         } finally {
378             if (ris != null) {
379                 ris.close();
380             }
381         }
382         return baseid;
383     }
384     
385     protected URI writeRevisitNotModified(final WARCWriter w,
386             final String timestamp, 
387             final URI baseid, final CrawlURI curi,
388             final ANVLRecord namedFields) 
389     throws IOException {
390         namedFields.addLabelValue(
391         		HEADER_KEY_PROFILE, PROFILE_REVISIT_NOT_MODIFIED);
392         // save just enough context to understand basis of not-modified
393         if(curi.containsKey(A_HTTP_TRANSACTION)) {
394             HttpMethodBase method = 
395                 (HttpMethodBase) curi.getObject(A_HTTP_TRANSACTION);
396             saveHeader(A_ETAG_HEADER,method,namedFields,HEADER_KEY_ETAG);
397             saveHeader(A_LAST_MODIFIED_HEADER,method,namedFields,
398             		HEADER_KEY_LAST_MODIFIED);
399         }
400         // truncate to zero-length (all necessary info is above)
401         namedFields.addLabelValue(HEADER_KEY_TRUNCATED,
402             NAMED_FIELD_TRUNCATED_VALUE_LENGTH);
403         ReplayInputStream ris =
404             curi.getHttpRecorder().getRecordedInput().getReplayInputStream();
405         try {
406             w.writeRevisitRecord(curi.toString(), timestamp, null, baseid,
407                 namedFields, ris, 0);
408         } finally {
409             if (ris !=  null) {
410                 ris.close();
411             }
412         }
413         return baseid;
414     }
415     
416     /***
417      * Save a header from the given HTTP operation into the 
418      * provider headers under a new name
419      * 
420      * @param origName header name to get if present
421      * @param method http operation containing headers
422      */
423     protected void saveHeader(String origName, HttpMethodBase method, 
424     		ANVLRecord headers, String newName) {
425         Header header = method.getResponseHeader(origName);
426         if(header!=null) {
427             headers.addLabelValue(newName, header.getValue());
428         }
429     }
430 
431 	protected URI writeMetadata(final WARCWriter w,
432             final String timestamp,
433             final URI baseid, final CrawlURI curi,
434             final ANVLRecord namedFields) 
435     throws IOException {
436         final URI uid = qualifyRecordID(baseid, TYPE, METADATA);
437         // Get some metadata from the curi.
438         // TODO: Get all curi metadata.
439         // TODO: Use other than ANVL (or rename ANVL as NameValue or use
440         // RFC822 (commons-httpclient?).
441         ANVLRecord r = new ANVLRecord();
442         if (curi.isSeed()) {
443             r.addLabel("seed");
444         } else {
445         	if (curi.forceFetch()) {
446         		r.addLabel("force-fetch");
447         	}
448             r.addLabelValue("via", curi.flattenVia());
449             r.addLabelValue("hopsFromSeed", curi.getPathFromSeed());
450             if (curi.containsKey(A_SOURCE_TAG)) {
451                 r.addLabelValue("sourceTag", curi.getString(A_SOURCE_TAG));
452             }
453         }
454         long duration = curi.getFetchDuration();
455         if(duration>-1) {
456             r.addLabelValue("fetchTimeMs", Long.toString(duration));
457         }
458         
459         // Add outlinks though they are effectively useless without anchor text.
460         Collection<Link> links = curi.getOutLinks();
461         if (links != null && links.size() > 0) {
462             for (Link link: links) {
463                 r.addLabelValue("outlink", link.toString());
464             }
465         }
466         
467         // TODO: Other curi fields to write to metadata.
468         // 
469         // Credentials
470         // 
471         // fetch-began-time: 1154569278774
472         // fetch-completed-time: 1154569281816
473         //
474         // Annotations.
475         
476         byte [] b = r.getUTF8Bytes();
477         w.writeMetadataRecord(curi.toString(), timestamp, ANVLRecord.MIMETYPE,
478             uid, namedFields, new ByteArrayInputStream(b), b.length);
479         return uid;
480     }
481     
482     protected URI getRecordID() throws IOException {
483         URI result;
484         try {
485             result = GeneratorFactory.getFactory().getRecordID();
486         } catch (URISyntaxException e) {
487             throw new IOException(e.toString());
488         }
489         return result;
490     }
491     
492     protected URI qualifyRecordID(final URI base, final String key,
493             final String value)
494     throws IOException {
495         URI result;
496         Map<String, String> qualifiers = new HashMap<String, String>(1);
497         qualifiers.put(key, value);
498         try {
499             result = GeneratorFactory.getFactory().
500                 qualifyRecordID(base, qualifiers);
501         } catch (URISyntaxException e) {
502             throw new IOException(e.toString());
503         }
504         return result;
505     }  
506     
507     @Override
508     protected String getFirstrecordStylesheet() {
509         return "/warcinfobody.xsl";
510     }
511 
512     /***
513      * Return relevant values as header-like fields (here ANVLRecord, but 
514      * spec-defined "application/warc-fields" type when written). Field
515      * names from from DCMI Terms and the WARC/0.17 specification.
516      * 
517      * @see org.archive.crawler.framework.WriterPoolProcessor#getFirstrecordBody(java.io.File)
518      */
519     @Override
520     protected String getFirstrecordBody(File orderFile) {
521         ANVLRecord record = new ANVLRecord(7);
522         record.addLabelValue("software", "Heritrix/" +
523                 Heritrix.getVersion() + " http://crawler.archive.org");
524         try {
525             InetAddress host = InetAddress.getLocalHost();
526             record.addLabelValue("ip", host.getHostAddress());
527             record.addLabelValue("hostname", host.getHostName());
528         } catch (UnknownHostException e) {
529             logger.log(Level.WARNING,"unable top obtain local crawl engine host",e);
530         }
531         record.addLabelValue("format","WARC File Format 0.17");
532         record.addLabelValue("conformsTo","http://crawler.archive.org/warc/0.17/WARC0.17ISO.doc");
533         // Get other values from order.xml 
534         try {
535             Document doc = XmlUtils.getDocument(orderFile);
536             addIfNotBlank(record,"operator",
537                     XmlUtils.xpathOrNull(doc,"//meta/operator"));
538             addIfNotBlank(record,"publisher",
539                     XmlUtils.xpathOrNull(doc,"//meta/organization"));
540             addIfNotBlank(record,"audience",
541                     XmlUtils.xpathOrNull(doc,"//meta/audience"));
542             addIfNotBlank(record,"isPartOf",
543                     XmlUtils.xpathOrNull(doc,"//meta/name"));
544             String rawDate = XmlUtils.xpathOrNull(doc,"//meta/date");
545             if(StringUtils.isNotBlank(rawDate)) {
546                 Date date;
547                 try {
548                     date = ArchiveUtils.parse14DigitDate(rawDate);
549                     addIfNotBlank(record,"created",ArchiveUtils.getLog14Date(date));
550                 } catch (ParseException e) {
551                     logger.log(Level.WARNING,"obtaining warc created date",e);
552                 }
553             }
554             addIfNotBlank(record,"description",
555                     XmlUtils.xpathOrNull(doc,"//meta/description"));
556             addIfNotBlank(record,"robots",
557                     XmlUtils.xpathOrNull(doc, 
558                             "//newObject[@name='robots-honoring-policy']/string[@name='type']"));
559             addIfNotBlank(record,"http-header-user-agent",
560                     XmlUtils.xpathOrNull(doc, 
561                             "//map[@name='http-headers']/string[@name='user-agent']"));
562             addIfNotBlank(record,"http-header-from",
563                     XmlUtils.xpathOrNull(doc, 
564                             "//map[@name='http-headers']/string[@name='from']"));
565         } catch (IOException e) {
566             logger.log(Level.WARNING,"obtaining warcinfo",e);
567         } 
568         // really ugly to return as string, when it may just be merged with 
569         // a couple other fields at write time, but changing would require 
570         // larger refactoring
571         return record.toString();
572     }
573 
574 
575     protected void addIfNotBlank(ANVLRecord record, String label, String value) {
576         if(StringUtils.isNotBlank(value)) {
577             record.addLabelValue(label, value);
578         }
579     }
580 }