View Javadoc

1   /* FetchHistoryProcessor
2    * 
3    * Created on Feb 12, 2005
4    *
5    * Copyright (C) 2007 Internet Archive.
6    * 
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    * 
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   * 
14   * Heritrix is distributed in the hope that it will be useful, 
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   * 
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.crawler.processor.recrawl;
24  
25  import org.apache.commons.httpclient.Header;
26  import org.apache.commons.httpclient.HttpMethodBase;
27  import org.apache.commons.httpclient.HttpState;
28  import org.apache.commons.httpclient.HttpStatus;
29  import org.archive.crawler.datamodel.CoreAttributeConstants;
30  import org.archive.crawler.datamodel.CrawlURI;
31  import org.archive.crawler.framework.Processor;
32  import org.archive.crawler.settings.SimpleType;
33  
34  import st.ata.util.AList;
35  import st.ata.util.HashtableAList;
36  
37  /***
38   * Maintain a history of fetch information inside the CrawlURI's attributes. 
39   * 
40   * @author gojomo
41   * @version $Date: 2006-09-25 20:19:54 +0000 (Mon, 25 Sep 2006) $, $Revision: 4654 $
42   */
43  public class FetchHistoryProcessor extends Processor implements CoreAttributeConstants {
44      private static final long serialVersionUID = 8476621038669163983L;
45      
46      /*** setting for desired history array length */
47      public static final String ATTR_HISTORY_LENGTH = "history-length";
48      /*** default history array length */ 
49      public static final Integer DEFAULT_HISTORY_LENGTH = 2; 
50      
51      /***
52       * Usual constructor
53       * 
54       * @param name
55       */
56      public FetchHistoryProcessor(String name) {
57          super(name, "FetchHistoryProcessor. Maintain a history of fetch " +
58                  "information inside the CrawlURI's attributes..");
59          
60          addElementToDefinition(new SimpleType(ATTR_HISTORY_LENGTH,
61                  "Number of previous fetch entries to retain in the URI " +
62                  "history. The current fetch becomes a history entry at " +
63                  "this Processor step, so the smallest useful value is " +
64                  "'2' (including the current fetch). Default is '2'.", 
65                  DEFAULT_HISTORY_LENGTH));
66      }
67  
68      @Override
69      protected void innerProcess(CrawlURI curi) throws InterruptedException {
70          AList latestFetch = new HashtableAList();
71          
72          // save status
73          latestFetch.putInt(A_STATUS,curi.getFetchStatus());
74          // save fetch start time
75          latestFetch.putLong(A_FETCH_BEGAN_TIME,curi.getLong(A_FETCH_BEGAN_TIME));
76          // save digest
77          String digest = curi.getContentDigestSchemeString();
78          if(digest!=null) {
79              latestFetch.putString(
80                      A_CONTENT_DIGEST,digest);
81          }
82          // save relevant HTTP headers, if available
83          if(curi.containsKey(A_HTTP_TRANSACTION)) {
84              HttpMethodBase method = 
85                  (HttpMethodBase) curi.getObject(A_HTTP_TRANSACTION);
86              saveHeader(A_ETAG_HEADER,method,latestFetch);
87              saveHeader(A_LAST_MODIFIED_HEADER,method,latestFetch);
88              // save reference length (real or virtual)
89              long referenceLength; 
90              if(curi.containsKey(A_REFERENCE_LENGTH) ) {
91                  // reuse previous length if available (see FetchHTTP#setSizes). 
92                  referenceLength = curi.getLong(A_REFERENCE_LENGTH);
93              } else {
94                  // normally, use content-length
95                  referenceLength = curi.getContentLength();
96              }
97              latestFetch.putLong(A_REFERENCE_LENGTH,referenceLength);
98          }
99          
100         // get or create proper-sized history array
101         int targetHistoryLength = 
102             (Integer) getUncheckedAttribute(curi, ATTR_HISTORY_LENGTH);
103         AList[] history = 
104             curi.getAList().containsKey(A_FETCH_HISTORY) 
105                 ? curi.getAList().getAListArray(A_FETCH_HISTORY) 
106                 : new AList[targetHistoryLength];
107         if(history.length != targetHistoryLength) {
108             AList[] newHistory = new AList[targetHistoryLength];
109             System.arraycopy(
110                     history,0,
111                     newHistory,0,
112                     Math.min(history.length,newHistory.length));
113             history = newHistory; 
114         }
115         
116         // rotate all history entries up one slot, insert new at [0]
117         for(int i = history.length-1; i >0; i--) {
118             history[i] = history[i-1];
119         }
120         history[0]=latestFetch;
121         
122         curi.getAList().putAListArray(A_FETCH_HISTORY,history);
123     }
124 
125     /***
126      * Save a header from the given HTTP operation into the AList.
127      * 
128      * @param name header name to save into history AList
129      * @param method http operation containing headers
130      * @param latestFetch AList to get header
131      */
132     protected void saveHeader(String name, HttpMethodBase method, AList latestFetch) {
133         Header header = method.getResponseHeader(name);
134         if(header!=null) {
135             latestFetch.putString(name, header.getValue());
136         }
137     }
138 
139     @Override
140     protected void initialTasks() {
141         // ensure history info persists across enqueues and recrawls
142         CrawlURI.addAlistPersistentMember(A_FETCH_HISTORY);
143     }
144     
145     
146 }