1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.crawler.processor.recrawl;
24
25 import org.apache.commons.httpclient.Header;
26 import org.apache.commons.httpclient.HttpMethodBase;
27 import org.apache.commons.httpclient.HttpState;
28 import org.apache.commons.httpclient.HttpStatus;
29 import org.archive.crawler.datamodel.CoreAttributeConstants;
30 import org.archive.crawler.datamodel.CrawlURI;
31 import org.archive.crawler.framework.Processor;
32 import org.archive.crawler.settings.SimpleType;
33
34 import st.ata.util.AList;
35 import st.ata.util.HashtableAList;
36
37 /***
38 * Maintain a history of fetch information inside the CrawlURI's attributes.
39 *
40 * @author gojomo
41 * @version $Date: 2006-09-25 20:19:54 +0000 (Mon, 25 Sep 2006) $, $Revision: 4654 $
42 */
43 public class FetchHistoryProcessor extends Processor implements CoreAttributeConstants {
44 private static final long serialVersionUID = 8476621038669163983L;
45
46 /*** setting for desired history array length */
47 public static final String ATTR_HISTORY_LENGTH = "history-length";
48 /*** default history array length */
49 public static final Integer DEFAULT_HISTORY_LENGTH = 2;
50
51 /***
52 * Usual constructor
53 *
54 * @param name
55 */
56 public FetchHistoryProcessor(String name) {
57 super(name, "FetchHistoryProcessor. Maintain a history of fetch " +
58 "information inside the CrawlURI's attributes..");
59
60 addElementToDefinition(new SimpleType(ATTR_HISTORY_LENGTH,
61 "Number of previous fetch entries to retain in the URI " +
62 "history. The current fetch becomes a history entry at " +
63 "this Processor step, so the smallest useful value is " +
64 "'2' (including the current fetch). Default is '2'.",
65 DEFAULT_HISTORY_LENGTH));
66 }
67
68 @Override
69 protected void innerProcess(CrawlURI curi) throws InterruptedException {
70 AList latestFetch = new HashtableAList();
71
72
73 latestFetch.putInt(A_STATUS,curi.getFetchStatus());
74
75 latestFetch.putLong(A_FETCH_BEGAN_TIME,curi.getLong(A_FETCH_BEGAN_TIME));
76
77 String digest = curi.getContentDigestSchemeString();
78 if(digest!=null) {
79 latestFetch.putString(
80 A_CONTENT_DIGEST,digest);
81 }
82
83 if(curi.containsKey(A_HTTP_TRANSACTION)) {
84 HttpMethodBase method =
85 (HttpMethodBase) curi.getObject(A_HTTP_TRANSACTION);
86 saveHeader(A_ETAG_HEADER,method,latestFetch);
87 saveHeader(A_LAST_MODIFIED_HEADER,method,latestFetch);
88
89 long referenceLength;
90 if(curi.containsKey(A_REFERENCE_LENGTH) ) {
91
92 referenceLength = curi.getLong(A_REFERENCE_LENGTH);
93 } else {
94
95 referenceLength = curi.getContentLength();
96 }
97 latestFetch.putLong(A_REFERENCE_LENGTH,referenceLength);
98 }
99
100
101 int targetHistoryLength =
102 (Integer) getUncheckedAttribute(curi, ATTR_HISTORY_LENGTH);
103 AList[] history =
104 curi.getAList().containsKey(A_FETCH_HISTORY)
105 ? curi.getAList().getAListArray(A_FETCH_HISTORY)
106 : new AList[targetHistoryLength];
107 if(history.length != targetHistoryLength) {
108 AList[] newHistory = new AList[targetHistoryLength];
109 System.arraycopy(
110 history,0,
111 newHistory,0,
112 Math.min(history.length,newHistory.length));
113 history = newHistory;
114 }
115
116
117 for(int i = history.length-1; i >0; i--) {
118 history[i] = history[i-1];
119 }
120 history[0]=latestFetch;
121
122 curi.getAList().putAListArray(A_FETCH_HISTORY,history);
123 }
124
125 /***
126 * Save a header from the given HTTP operation into the AList.
127 *
128 * @param name header name to save into history AList
129 * @param method http operation containing headers
130 * @param latestFetch AList to get header
131 */
132 protected void saveHeader(String name, HttpMethodBase method, AList latestFetch) {
133 Header header = method.getResponseHeader(name);
134 if(header!=null) {
135 latestFetch.putString(name, header.getValue());
136 }
137 }
138
139 @Override
140 protected void initialTasks() {
141
142 CrawlURI.addAlistPersistentMember(A_FETCH_HISTORY);
143 }
144
145
146 }