1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler.fetcher;
26
27 import it.unimi.dsi.mg4j.util.MutableString;
28
29 import java.io.File;
30 import java.io.FileNotFoundException;
31 import java.io.FileOutputStream;
32 import java.io.IOException;
33 import java.io.ObjectInputStream;
34 import java.io.ObjectOutputStream;
35 import java.io.RandomAccessFile;
36 import java.net.InetAddress;
37 import java.net.UnknownHostException;
38 import java.security.KeyManagementException;
39 import java.security.KeyStoreException;
40 import java.security.MessageDigest;
41 import java.security.NoSuchAlgorithmException;
42 import java.util.Collection;
43 import java.util.HashSet;
44 import java.util.Iterator;
45 import java.util.List;
46 import java.util.ListIterator;
47 import java.util.Map;
48 import java.util.Set;
49 import java.util.logging.Level;
50 import java.util.logging.Logger;
51
52 import javax.management.AttributeNotFoundException;
53 import javax.management.MBeanException;
54 import javax.management.ReflectionException;
55 import javax.net.ssl.SSLContext;
56 import javax.net.ssl.SSLSocketFactory;
57 import javax.net.ssl.TrustManager;
58
59 import org.apache.commons.httpclient.Cookie;
60 import org.apache.commons.httpclient.Header;
61 import org.apache.commons.httpclient.HostConfiguration;
62 import org.apache.commons.httpclient.HttpClient;
63 import org.apache.commons.httpclient.HttpConnection;
64 import org.apache.commons.httpclient.HttpConnectionManager;
65 import org.apache.commons.httpclient.HttpException;
66 import org.apache.commons.httpclient.HttpMethod;
67 import org.apache.commons.httpclient.HttpMethodBase;
68 import org.apache.commons.httpclient.HttpState;
69 import org.apache.commons.httpclient.HttpStatus;
70 import org.apache.commons.httpclient.HttpVersion;
71 import org.apache.commons.httpclient.auth.AuthChallengeParser;
72 import org.apache.commons.httpclient.auth.AuthScheme;
73 import org.apache.commons.httpclient.auth.BasicScheme;
74 import org.apache.commons.httpclient.auth.DigestScheme;
75 import org.apache.commons.httpclient.auth.MalformedChallengeException;
76 import org.apache.commons.httpclient.cookie.CookiePolicy;
77 import org.apache.commons.httpclient.params.HttpClientParams;
78 import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
79 import org.apache.commons.httpclient.params.HttpMethodParams;
80 import org.apache.commons.httpclient.protocol.Protocol;
81 import org.apache.commons.httpclient.protocol.ProtocolSocketFactory;
82 import org.archive.crawler.Heritrix;
83 import org.archive.crawler.datamodel.CoreAttributeConstants;
84 import org.archive.crawler.datamodel.CrawlHost;
85 import org.archive.crawler.datamodel.CrawlOrder;
86 import org.archive.crawler.datamodel.CrawlServer;
87 import org.archive.crawler.datamodel.CrawlURI;
88 import org.archive.crawler.datamodel.CredentialStore;
89 import org.archive.crawler.datamodel.FetchStatusCodes;
90 import org.archive.crawler.datamodel.ServerCache;
91 import org.archive.crawler.datamodel.credential.Credential;
92 import org.archive.crawler.datamodel.credential.CredentialAvatar;
93 import org.archive.crawler.datamodel.credential.Rfc2617Credential;
94 import org.archive.crawler.deciderules.DecideRule;
95 import org.archive.crawler.deciderules.DecideRuleSequence;
96 import org.archive.crawler.event.CrawlStatusListener;
97 import org.archive.crawler.framework.Processor;
98 import org.archive.crawler.settings.SettingsHandler;
99 import org.archive.crawler.settings.SimpleType;
100 import org.archive.crawler.settings.StringList;
101 import org.archive.crawler.settings.Type;
102 import org.archive.httpclient.ConfigurableX509TrustManager;
103 import org.archive.httpclient.HttpRecorderGetMethod;
104 import org.archive.httpclient.HttpRecorderMethod;
105 import org.archive.httpclient.HttpRecorderPostMethod;
106 import org.archive.httpclient.SingleHttpConnectionManager;
107 import org.archive.io.ObjectPlusFilesInputStream;
108 import org.archive.io.RecorderLengthExceededException;
109 import org.archive.io.RecorderTimeoutException;
110 import org.archive.io.RecorderTooMuchHeaderException;
111 import org.archive.util.ArchiveUtils;
112 import org.archive.util.HttpRecorder;
113 import org.archive.util.bdbje.EnhancedEnvironment;
114
115 import st.ata.util.AList;
116
117 import com.sleepycat.bind.serial.SerialBinding;
118 import com.sleepycat.bind.serial.StoredClassCatalog;
119 import com.sleepycat.bind.tuple.StringBinding;
120 import com.sleepycat.collections.StoredSortedMap;
121 import com.sleepycat.je.Database;
122 import com.sleepycat.je.DatabaseConfig;
123 import com.sleepycat.je.DatabaseException;
124
125 /***
126 * HTTP fetcher that uses <a
127 * href="http://jakarta.apache.org/commons/httpclient/">Apache Jakarta Commons
128 * HttpClient</a> library.
129 *
130 * @author Gordon Mohr
131 * @author Igor Ranitovic
132 * @author others
133 * @version $Id: FetchHTTP.java 5798 2008-03-25 23:22:53Z gojomo $
134 */
135 public class FetchHTTP extends Processor
136 implements CoreAttributeConstants, FetchStatusCodes, CrawlStatusListener {
137
    private static final long serialVersionUID =
        ArchiveUtils.classnameBasedUID(FetchHTTP.class,1);

    private static Logger logger = Logger.getLogger(FetchHTTP.class.getName());

    // Names of the settings attributes registered in the constructor below.
    public static final String ATTR_HTTP_PROXY_HOST = A_HTTP_PROXY_HOST;
    public static final String ATTR_HTTP_PROXY_PORT = A_HTTP_PROXY_PORT;
    public static final String ATTR_TIMEOUT_SECONDS = "timeout-seconds";
    public static final String ATTR_SOTIMEOUT_MS = "sotimeout-ms";
    public static final String ATTR_MAX_LENGTH_BYTES = "max-length-bytes";
    public static final String ATTR_LOAD_COOKIES = "load-cookies-from-file";
    public static final String ATTR_SAVE_COOKIES = "save-cookies-to-file";
    public static final String ATTR_ACCEPT_HEADERS = "accept-headers";
    public static final String ATTR_DEFAULT_ENCODING = "default-encoding";
    public static final String ATTR_DIGEST_CONTENT = "digest-content";
    public static final String ATTR_DIGEST_ALGORITHM = "digest-algorithm";
    public static final String ATTR_FETCH_BANDWIDTH_MAX = "fetch-bandwidth";

    /**
     * SSL trust level setting attribute name.
     */
    public static final String ATTR_TRUST = "trust-level";

    // Defaults for the numeric settings above.
    private static Integer DEFAULT_TIMEOUT_SECONDS = new Integer(1200);
    private static Integer DEFAULT_SOTIMEOUT_MS = new Integer(20000);
    private static Long DEFAULT_MAX_LENGTH_BYTES = new Long(0);
    private static Integer DEFAULT_FETCH_BANDWIDTH_MAX = 0;

    /**
     * This is the default value pre-1.4 (i.e. Long.MAX_VALUE). Needs
     * special handling else treated as negative number doing math later
     * in processing.
     */
    private static long OLD_DEFAULT_MAX_LENGTH_BYTES = 9223372036854775807L;

    /**
     * Default character encoding to use for pages that do not specify.
     */
    private static String DEFAULT_CONTENT_CHARSET = Heritrix.DEFAULT_ENCODING;

    /**
     * Default whether to perform on-the-fly digest hashing of content-bodies.
     */
    static Boolean DEFAULT_DIGEST_CONTENT = new Boolean(true);

    /**
     * The different digest algorithms to choose between,
     * SHA-1 or MD-5 at the moment.
     */
    public static final String SHA1 = "sha1";
    public static final String MD5 = "md5";
    public static String [] DIGEST_ALGORITHMS = {SHA1, MD5};

    /**
     * Default algorithm to use for message digesting.
     */
    public static final String DEFAULT_DIGEST_ALGORITHM = SHA1;

    // Shared HttpClient instance used by innerProcess(); transient so it
    // is not serialized with crawler state.
    private transient HttpClient http = null;

    /**
     * How many 'instant retries' of HttpRecoverableExceptions have occurred.
     *
     * Would like it to be 'long', but longs aren't atomic.
     */
    private int recoveryRetries = 0;

    /**
     * Count of crawl uris handled.
     * Would like to be 'long', but longs aren't atomic.
     */
    private int curisHandled = 0;

    /**
     * Rules to apply mid-fetch, just after receipt of the response
     * headers before we start to download body.
     */
    public static final String ATTR_MIDFETCH_DECIDE_RULES = "midfetch-decide-rules";

    /**
     * Annotation to log if midfetch abort.
     */
    private static final String MIDFETCH_ABORT_LOG = "midFetchAbort";

    public static final String ATTR_SEND_CONNECTION_CLOSE =
        "send-connection-close";
    // Reusable "Connection: close" header added to requests when the
    // send-connection-close setting is on.
    private static final Header HEADER_SEND_CONNECTION_CLOSE =
        new Header("Connection", "close");
    public static final String ATTR_SEND_REFERER = "send-referer";
    public static final String ATTR_SEND_RANGE = "send-range";
    public static final String ATTR_SEND_IF_MODIFIED_SINCE = "send-if-modified-since";
    public static final String ATTR_SEND_IF_NONE_MATCH = "send-if-none-match";
    public static final String REFERER = "Referer";
    public static final String RANGE = "Range";
    // Prefix of the "Range: bytes=0-N" request-header value built in
    // configureMethod().
    public static final String RANGE_PREFIX = "bytes=0-";
    public static final String HTTP_SCHEME = "http";
    public static final String HTTPS_SCHEME = "https";

    public static final String ATTR_IGNORE_COOKIES = "ignore-cookies";
    private static Boolean DEFAULT_IGNORE_COOKIES = new Boolean(false);

    public static final String ATTR_BDB_COOKIES = "use-bdb-for-cookies";
    private static Boolean DEFAULT_BDB_COOKIES = new Boolean(true);

    public static final String ATTR_HTTP_BIND_ADDRESS = A_HTTP_BIND_ADDRESS;

    /**
     * Database backing cookie map, if using BDB.
     */
    protected Database cookieDb;
    /**
     * Name of cookie BDB Database.
     */
    public static final String COOKIEDB_NAME = "http_cookies";
251
252 static {
253 Protocol.registerProtocol("http", new Protocol("http",
254 new HeritrixProtocolSocketFactory(), 80));
255 try {
256 Protocol.registerProtocol("https",
257 new Protocol("https", ((ProtocolSocketFactory)
258 new HeritrixSSLProtocolSocketFactory()), 443));
259 } catch (KeyManagementException e) {
260 e.printStackTrace();
261 } catch (KeyStoreException e) {
262 e.printStackTrace();
263 } catch (NoSuchAlgorithmException e) {
264 e.printStackTrace();
265 }
266 }
    // Keys for stashing shared objects (server cache, SSL socket factory)
    // where the protocol socket factories can find them; presumably looked
    // up via connection/client params -- TODO confirm against the
    // Heritrix*SocketFactory implementations.
    static final String SERVER_CACHE_KEY = "heritrix.server.cache";
    static final String SSL_FACTORY_KEY = "heritrix.ssl.factory";

    /**
     * Socket factory that has the configurable trust manager installed.
     */
    private SSLSocketFactory sslfactory = null;
274
275
276 /***
277 * Constructor.
278 *
279 * @param name Name of this processor.
280 */
281 public FetchHTTP(String name) {
282 super(name, "HTTP Fetcher");
283
284 addElementToDefinition(
285 new DecideRuleSequence(ATTR_MIDFETCH_DECIDE_RULES,
286 "DecideRules which, if final decision is REJECT, " +
287 "abort fetch after headers before all content is" +
288 "read."));
289
290 addElementToDefinition(new SimpleType(ATTR_TIMEOUT_SECONDS,
291 "If the fetch is not completed in this number of seconds, "
292 + "even if it is making progress, give up. The URI will be "
293 + "annotated as timeTrunc. Set to zero for no timeout. "
294 + "(This is not recommended: threads could wait indefinitely "
295 + "for the fetch to end.)",
296 DEFAULT_TIMEOUT_SECONDS));
297 Type e = addElementToDefinition(new SimpleType(ATTR_SOTIMEOUT_MS,
298 "If a socket is unresponsive for this number of milliseconds, " +
299 "give up on that connects/read. (This does not necessarily give " +
300 "up on the fetch immediately; connects are subject to retries " +
301 "and reads will be retried until " + ATTR_TIMEOUT_SECONDS +
302 " have elapsed. Set to zero for no socket timeout. (This is " +
303 "note recommended: a socket operation could hand indefinitely.",
304 DEFAULT_SOTIMEOUT_MS));
305 e.setExpertSetting(true);
306 e = addElementToDefinition(new SimpleType(ATTR_FETCH_BANDWIDTH_MAX,
307 "The maximum KB/sec to use when fetching data from a server. " +
308 "0 means no maximum. Default: "+ DEFAULT_FETCH_BANDWIDTH_MAX
309 + ".", DEFAULT_FETCH_BANDWIDTH_MAX));
310 e.setExpertSetting(true);
311 e.setOverrideable(true);
312 addElementToDefinition(new SimpleType(ATTR_MAX_LENGTH_BYTES,
313 "Maximum length in bytes to fetch.\n" +
314 "Fetch is truncated at this length. A value of 0 means no limit.",
315 DEFAULT_MAX_LENGTH_BYTES));
316 e = addElementToDefinition(new SimpleType(ATTR_IGNORE_COOKIES,
317 "Disable cookie-handling.", DEFAULT_IGNORE_COOKIES));
318 e.setOverrideable(true);
319 e.setExpertSetting(true);
320 e = addElementToDefinition(new SimpleType(ATTR_BDB_COOKIES,
321 "Store cookies in BDB-backed map.", DEFAULT_BDB_COOKIES));
322 e.setExpertSetting(true);
323
324 e = addElementToDefinition(new SimpleType(ATTR_LOAD_COOKIES,
325 "File to preload cookies from", ""));
326 e.setExpertSetting(true);
327 e = addElementToDefinition(new SimpleType(ATTR_SAVE_COOKIES,
328 "When crawl finishes save cookies to this file", ""));
329 e.setExpertSetting(true);
330 e = addElementToDefinition(new SimpleType(ATTR_TRUST,
331 "SSL certificate trust level. Range is from the default 'open'"
332 + " (trust all certs including expired, selfsigned, and those for"
333 + " which we do not have a CA) through 'loose' (trust all valid"
334 + " certificates including selfsigned), 'normal' (all valid"
335 + " certificates not including selfsigned) to 'strict' (Cert is"
336 + " valid and DN must match servername)",
337 ConfigurableX509TrustManager.DEFAULT,
338 ConfigurableX509TrustManager.LEVELS_AS_ARRAY));
339 e.setOverrideable(false);
340 e.setExpertSetting(true);
341 e = addElementToDefinition(new StringList(ATTR_ACCEPT_HEADERS,
342 "Accept Headers to include in each request. Each must be the"
343 + " complete header, e.g., 'Accept-Language: en'"));
344 e.setExpertSetting(true);
345 e = addElementToDefinition(new SimpleType(ATTR_HTTP_PROXY_HOST,
346 "Proxy host IP (set only if needed).", ""));
347 e.setExpertSetting(true);
348 e = addElementToDefinition(new SimpleType(ATTR_HTTP_PROXY_PORT,
349 "Proxy port (set only if needed)", ""));
350 e.setExpertSetting(true);
351 e = addElementToDefinition(new SimpleType(ATTR_DEFAULT_ENCODING,
352 "The character encoding to use for files that do not have one" +
353 " specified in the HTTP response headers. Default: " +
354 DEFAULT_CONTENT_CHARSET + ".",
355 DEFAULT_CONTENT_CHARSET));
356 e.setExpertSetting(true);
357 e = addElementToDefinition(new SimpleType(ATTR_DIGEST_CONTENT,
358 "Whether or not to perform an on-the-fly digest hash of" +
359 " retrieved content-bodies.",
360 DEFAULT_DIGEST_CONTENT));
361 e.setExpertSetting(true);
362 e = addElementToDefinition(new SimpleType(ATTR_DIGEST_ALGORITHM,
363 "Which algorithm (for example MD5 or SHA-1) to use to perform an on-the-fly digest" +
364 " hash of retrieved content-bodies.",
365 DEFAULT_DIGEST_ALGORITHM, DIGEST_ALGORITHMS));
366 e.setExpertSetting(true);
367 e = addElementToDefinition(new SimpleType(ATTR_SEND_IF_MODIFIED_SINCE,
368 "Send 'If-Modified-Since' header, if previous 'Last-Modified' " +
369 "fetch history information is available in URI history.",
370 new Boolean(true)));
371 e.setOverrideable(true);
372 e.setExpertSetting(true);
373 e = addElementToDefinition(new SimpleType(ATTR_SEND_IF_NONE_MATCH,
374 "Send 'If-None-Match' header, if previous 'Etag' fetch " +
375 "history information is available in URI history.",
376 new Boolean(true)));
377 e.setOverrideable(true);
378 e.setExpertSetting(true);
379 e = addElementToDefinition(new SimpleType(ATTR_SEND_CONNECTION_CLOSE,
380 "Send 'Connection: close' header with every request.",
381 new Boolean(true)));
382 e.setOverrideable(true);
383 e.setExpertSetting(true);
384 e = addElementToDefinition(new SimpleType(ATTR_SEND_REFERER,
385 "Send 'Referer' header with every request.\n" +
386 "The 'Referer' header contans the location the crawler came " +
387 " from, " +
388 "the page the current URI was discovered in. The 'Referer' " +
389 "usually is " +
390 "logged on the remote server and can be of assistance to " +
391 "webmasters trying to figure how a crawler got to a " +
392 "particular area on a site.",
393 new Boolean(true)));
394 e.setOverrideable(true);
395 e.setExpertSetting(true);
396 e = addElementToDefinition(new SimpleType(ATTR_SEND_RANGE,
397 "Send 'Range' header when a limit (" + ATTR_MAX_LENGTH_BYTES +
398 ") on document size.\n" +
399 "Be polite to the HTTP servers and send the 'Range' header," +
400 "stating that you are only interested in the first n bytes. " +
401 "Only pertinent if " + ATTR_MAX_LENGTH_BYTES + " > 0. " +
402 "Sending the 'Range' header results in a " +
403 "'206 Partial Content' status response, which is better than " +
404 "just cutting the response mid-download. On rare occasion, " +
405 " sending 'Range' will " +
406 "generate '416 Request Range Not Satisfiable' response.",
407 new Boolean(false)));
408 e.setOverrideable(true);
409 e.setExpertSetting(true);
410 e = addElementToDefinition(new SimpleType(ATTR_HTTP_BIND_ADDRESS,
411 "Local IP address or hostname to use when making connections " +
412 "(binding sockets). When not specified, uses default local" +
413 "address(es).", ""));
414 e.setExpertSetting(true);
415 }
416
    /**
     * Fetch the given CrawlURI over HTTP(S), recording the whole
     * transaction through an HttpRecorder and annotating the CrawlURI
     * with timing, status, content-type, sizes and (optionally) a
     * content digest.
     *
     * @param curi CrawlURI to fetch.
     * @throws InterruptedException declared by the Processor contract;
     *     not thrown directly in this body.
     */
    protected void innerProcess(final CrawlURI curi)
    throws InterruptedException {
        if (!canFetch(curi)) {
            // Cannot fetch this URI (wrong scheme, prior failure, or
            // failed DNS lookup); canFetch() has set any needed status.
            return;
        }

        this.curisHandled++;

        // Note begin time of the fetch.
        curi.putLong(A_FETCH_BEGAN_TIME, System.currentTimeMillis());

        // Recorder for capturing the raw request/response bytes.
        HttpRecorder rec = HttpRecorder.getHttpRecorder();

        // Shall we compute a digest of the content as it downloads?
        boolean digestContent = ((Boolean)getUncheckedAttribute(curi,
            ATTR_DIGEST_CONTENT)).booleanValue();
        String algorithm = null;
        if (digestContent) {
            algorithm = ((String)getUncheckedAttribute(curi,
                ATTR_DIGEST_ALGORITHM));
            rec.getRecordedInput().setDigest(algorithm);
        } else {
            // Clear any digest left over from a previous use of this
            // (per-thread) recorder.
            rec.getRecordedInput().setDigest((MessageDigest)null);
        }

        // Build a GET or POST method whose readResponseBody hook applies
        // the midfetch decide rules and can abort before body download.
        String curiString = curi.getUURI().toString();
        HttpMethodBase method = null;
        if (curi.isPost()) {
            method = new HttpRecorderPostMethod(curiString, rec) {
                protected void readResponseBody(HttpState state,
                        HttpConnection conn)
                throws IOException, HttpException {
                    // Record status/headers first so the midfetch rules
                    // can consult them.
                    addResponseContent(this, curi);
                    if (checkMidfetchAbort(curi, this.httpRecorderMethod, conn)) {
                        doAbort(curi, this, MIDFETCH_ABORT_LOG);
                    } else {
                        super.readResponseBody(state, conn);
                    }
                }
            };
        } else {
            method = new HttpRecorderGetMethod(curiString, rec) {
                protected void readResponseBody(HttpState state,
                        HttpConnection conn)
                throws IOException, HttpException {
                    addResponseContent(this, curi);
                    if (checkMidfetchAbort(curi, this.httpRecorderMethod,
                            conn)) {
                        doAbort(curi, this, MIDFETCH_ABORT_LOG);
                    } else {
                        super.readResponseBody(state, conn);
                    }
                }
            };
        }

        // Per-URI host configuration (proxy, bind address) and headers.
        HostConfiguration customConfigOrNull = configureMethod(curi, method);

        // Attach the recorder so downstream processors can read the
        // captured bytes.
        curi.setHttpRecorder(rec);

        // Populate credentials; only authenticate if some were added.
        boolean addedCredentials = populateCredentials(curi, method);
        method.setDoAuthentication(addedCredentials);

        // Configure the recorder's hard length/time/rate limits.
        long hardMax = getMaxLength(curi);
        long timeoutMs = 1000 * getTimeout(curi);
        long maxRateKBps = getMaxFetchRate(curi);
        rec.getRecordedInput().setLimits(hardMax, timeoutMs, maxRateKBps);

        try {
            this.http.executeMethod(customConfigOrNull, method);
        } catch (RecorderTooMuchHeaderException ex) {
            // Response headers alone exceeded limits: annotate and abort.
            doAbort(curi, method, HEADER_TRUNC);
        } catch (IOException e) {
            failedExecuteCleanup(method, curi, e);
            return;
        } catch (ArrayIndexOutOfBoundsException e) {
            // A defect seen inside HttpClient itself is treated the same
            // as a failed connect -- TODO confirm which HttpClient bug
            // this guards against.
            failedExecuteCleanup(method, curi, e);
            return;
        }

        // Expected body length per the response headers (-1 if unknown).
        long softMax = method.getResponseContentLength();

        try {
            if (!method.isAborted()) {
                // Pull the remainder of the body through the recorder so
                // it is fully captured (subject to the limits set above).
                rec.getRecordedInput().readFullyOrUntil(softMax);
            }
        } catch (RecorderTimeoutException ex) {
            doAbort(curi, method, TIMER_TRUNC);
        } catch (RecorderLengthExceededException ex) {
            doAbort(curi, method, LENGTH_TRUNC);
        } catch (IOException e) {
            cleanup(curi, e, "readFully", S_CONNECT_LOST);
            return;
        } catch (ArrayIndexOutOfBoundsException e) {
            // See note on the same catch around executeMethod above.
            cleanup(curi, e, "readFully", S_CONNECT_LOST);
            return;
        } finally {
            // Ensure recorders are closed and the connection returned to
            // the pool whether or not the read succeeded.
            rec.closeRecorders();
            if (!method.isAborted()) {
                method.releaseConnection();
            }
            // Note completed time and record encoding/sizes.
            curi.putLong(A_FETCH_COMPLETED_TIME, System.currentTimeMillis());
            setCharacterEncoding(rec, method);
            setSizes(curi, rec);
        }

        if (digestContent) {
            curi.setContentDigest(algorithm,
                rec.getRecordedInput().getDigestValue());
        }
        if (logger.isLoggable(Level.INFO)) {
            logger.info((curi.isPost()? "POST": "GET") + " " +
                curi.getUURI().toString() + " " + method.getStatusCode() +
                " " + rec.getRecordedInput().getSize() + " " +
                curi.getContentType());
        }

        if (curi.isSuccess() && addedCredentials) {
            // Fetch succeeded with URI-carried credentials: promote them
            // to the CrawlServer for subsequent URIs.
            promoteCredentials(curi);
            if (logger.isLoggable(Level.FINE)) {
                // Log any Set-Cookie for debugging.
                Header setCookie = method.getResponseHeader("set-cookie");
                if (setCookie != null) {
                    logger.fine(setCookie.toString().trim());
                }
            }
        } else if (method.getStatusCode() == HttpStatus.SC_UNAUTHORIZED) {
            // 401: parse the challenge and queue credentials for retry.
            handle401(method, curi);
        }

        if (rec.getRecordedInput().isOpen()) {
            // Should not happen: the recorded-input stream should have
            // been closed by connection release. Close it here as a
            // second chance.
            logger.severe(curi.toString() + " RIS still open. Should have" +
                " been closed by method release: " +
                Thread.currentThread().getName());
            try {
                rec.getRecordedInput().close();
            } catch (IOException e) {
                logger.log(Level.SEVERE,"second-chance RIS close failed",e);
            }
        }
    }
587
588 /***
589 * Update CrawlURI internal sizes based on current transaction (and
590 * in the case of 304s, history)
591 *
592 * @param curi CrawlURI
593 * @param rec HttpRecorder
594 */
595 protected void setSizes(final CrawlURI curi, HttpRecorder rec) {
596
597 curi.setContentSize(rec.getRecordedInput().getSize());
598
599 if (curi.getFetchStatus() == HttpStatus.SC_NOT_MODIFIED
600 && curi.containsKey(A_FETCH_HISTORY)) {
601 AList history[] = curi.getAList().getAListArray(A_FETCH_HISTORY);
602 if (history[0] != null
603 && history[0]
604 .containsKey(CoreAttributeConstants.A_REFERENCE_LENGTH)) {
605 long referenceLength = history[0].getLong(A_REFERENCE_LENGTH);
606
607 curi.putLong(A_REFERENCE_LENGTH, referenceLength);
608
609 curi.setContentSize(rec.getRecordedInput().getSize()
610 + referenceLength);
611 }
612 }
613 }
614
    /**
     * Abort the fetch in progress: annotate the CrawlURI with the reason,
     * close its recorder, then abort the underlying HTTP method.
     *
     * @param curi CrawlURI being fetched.
     * @param method method to abort.
     * @param annotation reason annotation (e.g. {@link #MIDFETCH_ABORT_LOG}).
     */
    protected void doAbort(CrawlURI curi, HttpMethod method,
            String annotation) {
        curi.addAnnotation(annotation);
        curi.getHttpRecorder().close();
        method.abort();
    }
621
622 protected boolean checkMidfetchAbort(CrawlURI curi,
623 HttpRecorderMethod method, HttpConnection conn) {
624 if (curi.isPrerequisite() || rulesAccept(getMidfetchRule(curi), curi)) {
625 return false;
626 }
627 method.markContentBegin(conn);
628 return true;
629 }
630
631 protected DecideRule getMidfetchRule(Object o) {
632 try {
633 return (DecideRule)getAttribute(o, ATTR_MIDFETCH_DECIDE_RULES);
634 } catch (AttributeNotFoundException e) {
635 throw new RuntimeException(e);
636 }
637 }
638
639 /***
640 * This method populates <code>curi</code> with response status and
641 * content type.
642 * @param curi CrawlURI to populate.
643 * @param method Method to get response status and headers from.
644 */
645 protected void addResponseContent (HttpMethod method, CrawlURI curi) {
646 curi.setFetchStatus(method.getStatusCode());
647 Header ct = method.getResponseHeader("content-type");
648 curi.setContentType((ct == null)? null: ct.getValue());
649
650
651 curi.putObject(A_HTTP_TRANSACTION, method);
652 }
653
654 /***
655 * Set the character encoding based on the result headers or default.
656 *
657 * The HttpClient returns its own default encoding ("ISO-8859-1") if one
658 * isn't specified in the Content-Type response header. We give the user
659 * the option of overriding this, so we need to detect the case where the
660 * default is returned.
661 *
662 * Now, it may well be the case that the default returned by HttpClient
663 * and the default defined by the user are the same.
664 *
665 * @param rec Recorder for this request.
666 * @param method Method used for the request.
667 */
668 private void setCharacterEncoding(final HttpRecorder rec,
669 final HttpMethod method) {
670 String encoding = null;
671
672 try {
673 encoding = ((HttpMethodBase) method).getResponseCharSet();
674 if (encoding == null ||
675 encoding.equals(DEFAULT_CONTENT_CHARSET)) {
676 encoding = (String) getAttribute(ATTR_DEFAULT_ENCODING);
677 }
678 } catch (Exception e) {
679 logger.warning("Failed get default encoding: " +
680 e.getLocalizedMessage());
681 }
682 rec.setCharacterEncoding(encoding);
683 }
684
    /**
     * Cleanup after a failed method execute.
     * @param curi CrawlURI we failed on.
     * @param method Method we failed on.
     * @param exception Exception we failed with.
     */
    private void failedExecuteCleanup(final HttpMethod method,
            final CrawlURI curi, final Exception exception) {
        // Record the failure on the CrawlURI first, then return the
        // connection to the pool.
        cleanup(curi, exception, "executeMethod", S_CONNECT_FAILED);
        method.releaseConnection();
    }
696
    /**
     * Cleanup after a failed method execute.
     * @param curi CrawlURI we failed on.
     * @param exception Exception we failed with.
     * @param message Message to log with failure.
     * @param status Status to set on the fetch.
     */
    private void cleanup(final CrawlURI curi, final Exception exception,
            final String message, final int status) {
        curi.addLocalizedError(this.getName(), exception, message);
        curi.setFetchStatus(status);
        // Release any recorder resources held for this fetch.
        curi.getHttpRecorder().close();
    }
710
711 /***
712 * Can this processor fetch the given CrawlURI. May set a fetch
713 * status if this processor would usually handle the CrawlURI,
714 * but cannot in this instance.
715 *
716 * @param curi
717 * @return True if processor can fetch.
718 */
719 private boolean canFetch(CrawlURI curi) {
720 if(curi.getFetchStatus()<0) {
721
722
723 curi.skipToProcessorChain(getController().getPostprocessorChain());
724 return false;
725 }
726 String scheme = curi.getUURI().getScheme();
727 if (!(scheme.equals("http") || scheme.equals("https"))) {
728
729 return false;
730 }
731 CrawlHost host = getController().getServerCache().getHostFor(curi);
732
733 if (host.getIP() == null && host.hasBeenLookedUp()) {
734 curi.setFetchStatus(S_DOMAIN_PREREQUISITE_FAILURE);
735 return false;
736 }
737 return true;
738 }
739
    /**
     * Configure the HttpMethod setting options and headers.
     *
     * @param curi CrawlURI from which we pull configuration.
     * @param method The Method to configure.
     * @return HostConfiguration copy customized for this CrawlURI
     */
    protected HostConfiguration configureMethod(CrawlURI curi, HttpMethod method) {
        // The crawler follows redirects itself (as discovered links), so
        // HttpClient must not follow them automatically.
        method.setFollowRedirects(false);

        // Cookie policy: ignore cookies entirely, or use lenient
        // browser-compatible parsing, per the ignore-cookies setting.
        method.getParams().setCookiePolicy(
            (((Boolean)getUncheckedAttribute(curi, ATTR_IGNORE_COOKIES)).
                booleanValue())?
                CookiePolicy.IGNORE_COOKIES:
                CookiePolicy.BROWSER_COMPATIBILITY);

        // Issue requests as HTTP/1.0 -- presumably to avoid persistent
        // connections/chunking complications; TODO confirm rationale.
        method.getParams().setVersion(HttpVersion.HTTP_1_0);

        // User-Agent/From come from the CrawlURI when present, else from
        // the crawl order.
        CrawlOrder order = getSettingsHandler().getOrder();
        String userAgent = curi.getUserAgent();
        if (userAgent == null) {
            userAgent = order.getUserAgent(curi);
        }
        method.setRequestHeader("User-Agent", userAgent);
        method.setRequestHeader("From", order.getFrom(curi));

        // Install Heritrix's own retry handler.
        method.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
            new HeritrixHttpMethodRetryHandler());

        // Politely advertise our truncation limit via a Range header,
        // when a max length is set and the send-range setting is on.
        final long maxLength = getMaxLength(curi);
        if(maxLength > 0 &&
                ((Boolean)getUncheckedAttribute(curi, ATTR_SEND_RANGE)).
                booleanValue()) {
            method.addRequestHeader(RANGE,
                RANGE_PREFIX.concat(Long.toString(maxLength - 1)));
        }

        if (((Boolean)getUncheckedAttribute(curi,
                ATTR_SEND_CONNECTION_CLOSE)).booleanValue()) {
            method.addRequestHeader(HEADER_SEND_CONNECTION_CLOSE);
        }

        if (((Boolean)getUncheckedAttribute(curi,
                ATTR_SEND_REFERER)).booleanValue()) {
            // Use the via (discovering page) as Referer -- but never leak
            // an https referrer into a plain-http request.
            String via = curi.flattenVia();
            if (via != null && via.length() > 0 &&
                    !(via.startsWith(HTTPS_SCHEME) &&
                    curi.getUURI().getScheme().equals(HTTP_SCHEME))) {
                method.setRequestHeader(REFERER, via);
            }
        }

        if(!curi.isPrerequisite()) {
            // Conditional-GET headers from prior fetch history, if the
            // corresponding settings are enabled.
            setConditionalGetHeader(curi, method, ATTR_SEND_IF_MODIFIED_SINCE,
                CoreAttributeConstants.A_LAST_MODIFIED_HEADER, "If-Modified-Since");
            setConditionalGetHeader(curi, method, ATTR_SEND_IF_NONE_MATCH,
                CoreAttributeConstants.A_ETAG_HEADER, "If-None-Match");
        }

        // Configured Accept* headers, if any.
        setAcceptHeaders(curi, method);

        // Copy the client's host configuration so per-URI proxy and
        // bind-address customization cannot affect other fetches.
        HostConfiguration config = new HostConfiguration(http.getHostConfiguration());
        configureProxy(curi, config);
        configureBindAddress(curi, config);
        return config;
    }
819
    /**
     * Set the given conditional-GET header, if the setting is enabled and
     * a suitable value is available in the URI history.
     * @param curi source CrawlURI
     * @param method HTTP operation pending
     * @param setting true/false enablement setting name to consult
     * @param sourceHeader header to consult in URI history
     * @param targetHeader header to set if possible
     */
    protected void setConditionalGetHeader(CrawlURI curi, HttpMethod method,
            String setting, String sourceHeader, String targetHeader) {
        if(((Boolean)getUncheckedAttribute(curi,setting))) {
            try {
                int previousStatus = curi.getAList().getAListArray(
                        A_FETCH_HISTORY)[0].getInt(A_STATUS);
                if(previousStatus<=0) {
                    // Do not reuse headers from an unsuccessful fetch.
                    return;
                }
                String previousValue = curi.getAList().getAListArray(
                        A_FETCH_HISTORY)[0].getString(sourceHeader);
                if(previousValue!=null) {
                    method.setRequestHeader(targetHeader, previousValue);
                }
            } catch (RuntimeException e) {
                // Deliberate best-effort: missing/empty/malformed fetch
                // history simply means no conditional header is sent.
            }
        }
    }
849
850 /***
851 * Setup proxy, based on attributes in CrawlURI and settings,
852 * in the given HostConfiguration
853 */
854 private void configureProxy(CrawlURI curi, HostConfiguration config) {
855 String proxy = (String) getAttributeEither(curi, ATTR_HTTP_PROXY_HOST);
856 int port = -1;
857 if(proxy.length()==0) {
858 proxy = null;
859 } else {
860 String portString = (String)getAttributeEither(curi, ATTR_HTTP_PROXY_PORT);
861 port = portString.length()>0 ? Integer.parseInt(portString) : -1;
862 }
863 if(proxy!=null) {
864 config.setProxy(proxy,port);
865 }
866 }
867
868 /***
869 * Setup local bind address, based on attributes in CrawlURI and settings,
870 * in the given HostConfiguration
871 */
872 private void configureBindAddress(CrawlURI curi, HostConfiguration config) {
873 String addressString = (String) getAttributeEither(curi, ATTR_HTTP_BIND_ADDRESS);
874 if(addressString != null && addressString.length() > 0) {
875 try {
876 InetAddress localAddress = InetAddress.getByName(addressString);
877 config.setLocalAddress(localAddress);
878 } catch (UnknownHostException e) {
879
880
881 throw new RuntimeException("Unknown host " + addressString
882 + " in " + ATTR_HTTP_BIND_ADDRESS);
883 }
884 }
885 }
886
887 /***
888 * Get a value either from inside the CrawlURI instance, or from
889 * settings (module attributes).
890 *
891 * @param curi CrawlURI to consult
892 * @param key key to lookup
893 * @return value from either CrawlURI (preferred) or settings
894 */
895 protected Object getAttributeEither(CrawlURI curi, String key) {
896 Object obj = curi!=null ? curi.getObject(key) : null;
897 if(obj==null) {
898 obj = getUncheckedAttribute(curi, key);
899 }
900 return obj;
901 }
902
    /**
     * Add credentials if any to passed <code>method</code>.
     *
     * Do credential handling. Credentials are in two places. 1. Credentials
     * that succeeded are added to the CrawlServer (Or rather, avatars for
     * credentials are whats added because its not safe to keep around
     * references to credentials). 2. Credentials to be tried are in the curi.
     * Returns true if found credentials to be tried.
     *
     * @param curi Current CrawlURI.
     * @param method The method to add to.
     * @return True if prepopulated <code>method</code> with credentials AND the
     * credentials came from the <code>curi</code>, not from the CrawlServer.
     * The former is special in that if the <code>curi</code> credentials
     * succeed, then the caller needs to promote them from the CrawlURI to the
     * CrawlServer so they are available for all subsequent CrawlURIs on this
     * server.
     */
    private boolean populateCredentials(CrawlURI curi, HttpMethod method) {
        // First apply credentials already promoted to the server; only
        // those marked "every time" are re-sent on each request.
        CrawlServer server =
            getController().getServerCache().getServerFor(curi);
        if (server.hasCredentialAvatars()) {
            Set avatars = server.getCredentialAvatars();
            for (Iterator i = avatars.iterator(); i.hasNext();) {
                CredentialAvatar ca = (CredentialAvatar)i.next();
                Credential c = ca.getCredential(getSettingsHandler(), curi);
                if (c.isEveryTime()) {
                    c.populate(curi, this.http, method, ca.getPayload());
                }
            }
        }

        boolean result = false;

        // Then apply any as-yet-untried credentials carried on the
        // CrawlURI itself; a successful populate means the caller must
        // later promote them to the server (see javadoc above).
        if (curi.hasCredentialAvatars()) {
            Set avatars = curi.getCredentialAvatars();
            for (Iterator i = avatars.iterator(); i.hasNext();) {
                CredentialAvatar ca = (CredentialAvatar)i.next();
                Credential c = ca.getCredential(getSettingsHandler(), curi);
                if (c.populate(curi, this.http, method, ca.getPayload())) {
                    result = true;
                }
            }
        }

        return result;
    }
956
/***
 * Promote successful credential to the server.
 *
 * Each avatar is removed from the CrawlURI and re-attached to the
 * CrawlServer looked up by the credential's domain, so subsequent
 * CrawlURIs against that server can reuse it.
 *
 * @param curi CrawlURI whose credentials we are to promote.
 */
private void promoteCredentials(final CrawlURI curi) {
    if (!curi.hasCredentialAvatars()) {
        logger.severe("No credentials to promote when there should be " +
            curi);
    } else {
        Set avatars = curi.getCredentialAvatars();
        for (Iterator i = avatars.iterator(); i.hasNext();) {
            CredentialAvatar ca = (CredentialAvatar)i.next();
            curi.removeCredentialAvatar(ca);
            // NOTE(review): the server promoted to is resolved from the
            // credential's own domain (below), which may differ from the
            // server that hosts this curi — presumably deliberate, so
            // credentials land on the server they authenticate against.
            Credential c = ca.getCredential(getSettingsHandler(), curi);
            String cd = null;
            try {
                cd = c.getCredentialDomain(curi);
            }
            catch (AttributeNotFoundException e) {
                logger.severe("Failed to get cred domain for " + curi +
                    " for " + ca + ": " + e.getMessage());
            }
            if (cd != null) {
                CrawlServer cs
                    = getController().getServerCache().getServerFor(cd);
                if (cs != null) {
                    cs.addCredentialAvatar(ca);
                }
            }
        }
    }
}
995
/***
 * Server is looking for basic/digest auth credentials (RFC2617). If we have
 * any, put them into the CrawlURI and have it come around again. Presence
 * of the credential serves as flag to frontier to requeue promptly. If we
 * already tried this domain and still got a 401, then our credentials are
 * bad. Remove them and let this curi die.
 *
 * @param method Method that got a 401.
 * @param curi CrawlURI that got a 401.
 */
protected void handle401(final HttpMethod method, final CrawlURI curi) {
    AuthScheme authscheme = getAuthScheme(method, curi);
    if (authscheme == null) {
        return;
    }
    String realm = authscheme.getRealm();

    // Did this curi already carry rfc2617 credentials for this realm?
    // If so, they were tried and failed (we got a 401 anyway).
    Set curiRfc2617Credentials = getCredentials(getSettingsHandler(),
        curi, Rfc2617Credential.class);
    Rfc2617Credential extant = Rfc2617Credential.
        getByRealm(curiRfc2617Credentials, realm, curi);
    if (extant != null) {
        // Already tried this credential; detach so the frontier does not
        // requeue, and let the curi die its natural 401 death.
        extant.detachAll(curi);
        logger.warning("Auth failed (401) though supplied realm " +
            realm + " to " + curi.toString());
    } else {
        // Look in the credential store for a credential matching this
        // realm on this server; if found, attach it to the curi so the
        // frontier requeues it and the credential gets tried.
        CredentialStore cs =
            CredentialStore.getCredentialStore(getSettingsHandler());
        if (cs == null) {
            logger.severe("No credential store for " + curi);
        } else {
            CrawlServer server = getController().getServerCache().
                getServerFor(curi);
            // Narrow candidates to rfc2617 credentials for this server.
            Set storeRfc2617Credentials = cs.subset(curi,
                Rfc2617Credential.class, server.getName());
            if (storeRfc2617Credentials == null ||
                    storeRfc2617Credentials.size() <= 0) {
                logger.info("No rfc2617 credentials for " + curi);
            } else {
                Rfc2617Credential found = Rfc2617Credential.
                    getByRealm(storeRfc2617Credentials, realm, curi);
                if (found == null) {
                    logger.info("No rfc2617 credentials for realm " +
                        realm + " in " + curi);
                } else {
                    found.attach(curi, authscheme.getRealm());
                    logger.info("Found credential for realm " + realm +
                        " in store for " + curi.toString());
                }
            }
        }
    }
}
1062
1063 /***
1064 * @param method Method that got a 401.
1065 * @param curi CrawlURI that got a 401.
1066 * @return Returns first wholesome authscheme found else null.
1067 */
1068 protected AuthScheme getAuthScheme(final HttpMethod method,
1069 final CrawlURI curi) {
1070 Header [] headers = method.getResponseHeaders("WWW-Authenticate");
1071 if (headers == null || headers.length <= 0) {
1072 logger.info("We got a 401 but no WWW-Authenticate challenge: " +
1073 curi.toString());
1074 return null;
1075 }
1076
1077 Map authschemes = null;
1078 try {
1079 authschemes = AuthChallengeParser.parseChallenges(headers);
1080 } catch(MalformedChallengeException e) {
1081 logger.info("Failed challenge parse: " + e.getMessage());
1082 }
1083 if (authschemes == null || authschemes.size() <= 0) {
1084 logger.info("We got a 401 and WWW-Authenticate challenge" +
1085 " but failed parse of the header " + curi.toString());
1086 return null;
1087 }
1088
1089 AuthScheme result = null;
1090
1091 for (Iterator i = authschemes.keySet().iterator();
1092 result == null && i.hasNext();) {
1093 String key = (String)i.next();
1094 String challenge = (String)authschemes.get(key);
1095 if (key == null || key.length() <= 0 || challenge == null ||
1096 challenge.length() <= 0) {
1097 logger.warning("Empty scheme: " + curi.toString() +
1098 ": " + headers);
1099 }
1100 AuthScheme authscheme = null;
1101 if (key.equals("basic")) {
1102 authscheme = new BasicScheme();
1103 } else if (key.equals("digest")) {
1104 authscheme = new DigestScheme();
1105 } else {
1106 logger.info("Unsupported scheme: " + key);
1107 continue;
1108 }
1109
1110 try {
1111 authscheme.processChallenge(challenge);
1112 } catch (MalformedChallengeException e) {
1113 logger.info(e.getMessage() + " " + curi + " " + headers);
1114 continue;
1115 }
1116 if (authscheme.isConnectionBased()) {
1117 logger.info("Connection based " + authscheme);
1118 continue;
1119 }
1120
1121 if (authscheme.getRealm() == null ||
1122 authscheme.getRealm().length() <= 0) {
1123 logger.info("Empty realm " + authscheme + " for " + curi);
1124 continue;
1125 }
1126 result = authscheme;
1127 }
1128
1129 return result;
1130 }
1131
1132 /***
1133 * @param handler Settings Handler.
1134 * @param curi CrawlURI that got a 401.
1135 * @param type Class of credential to get from curi.
1136 * @return Set of credentials attached to this curi.
1137 */
1138 private Set<Credential> getCredentials(SettingsHandler handler,
1139 CrawlURI curi, Class type) {
1140 Set<Credential> result = null;
1141
1142 if (curi.hasCredentialAvatars()) {
1143 for (Iterator i = curi.getCredentialAvatars().iterator();
1144 i.hasNext();) {
1145 CredentialAvatar ca = (CredentialAvatar)i.next();
1146 if (ca.match(type)) {
1147 if (result == null) {
1148 result = new HashSet<Credential>();
1149 }
1150 result.add(ca.getCredential(handler, curi));
1151 }
1152 }
1153 }
1154 return result;
1155 }
1156
/***
 * One-time setup before the first fetch: register as crawl-status
 * listener, build the HttpClient, load any configured cookies file,
 * and install the configurable SSL trust manager.
 */
public void initialTasks() {
    super.initialTasks();
    this.getController().addCrawlStatusListener(this);
    configureHttp();

    // Load cookies from the file named in settings, if any.
    loadCookies();

    // Install a trust manager whose strictness is driven by the
    // ATTR_TRUST setting, so https fetches honor the operator's choice.
    try {
        SSLContext context = SSLContext.getInstance("SSL");
        context.init(null, new TrustManager[] {
            new ConfigurableX509TrustManager((String)
                getAttribute(ATTR_TRUST))}, null);
        this.sslfactory = context.getSocketFactory();
    } catch (Exception e) {
        // Non-fatal: https fetches may fail later, but crawl proceeds.
        logger.log(Level.WARNING, "Failed configure of ssl context "
            + e.getMessage(), e);
    }
}
1179
/***
 * End-of-crawl teardown: persist cookies before closing the cookie db,
 * then let the superclass finish.  Order matters.
 */
public void finalTasks() {
    // Must save cookies while the cookie store is still open.
    saveCookies();
    cleanupHttp();
    super.finalTasks();
}
1186
1187 /***
1188 * Perform any final cleanup related to the HttpClient instance.
1189 */
1190 protected void cleanupHttp() {
1191 if(cookieDb!=null) {
1192 try {
1193 cookieDb.sync();
1194 cookieDb.close();
1195 } catch (DatabaseException e) {
1196
1197 e.printStackTrace();
1198 }
1199 }
1200 }
1201
1202 protected void configureHttp() throws RuntimeException {
1203
1204 int timeout = (getSoTimeout(null) > 0)? getSoTimeout(null): 0;
1205
1206
1207 HttpConnectionManager cm = new SingleHttpConnectionManager();
1208
1209
1210
1211 HttpConnectionManagerParams hcmp = cm.getParams();
1212 hcmp.setConnectionTimeout(timeout);
1213 hcmp.setStaleCheckingEnabled(true);
1214
1215
1216
1217 hcmp.setTcpNoDelay(false);
1218
1219 this.http = new HttpClient(cm);
1220 HttpClientParams hcp = this.http.getParams();
1221
1222 hcp.setSoTimeout(timeout);
1223
1224 hcp.setVersion(HttpVersion.HTTP_1_0);
1225
1226 configureHttpCookies();
1227
1228
1229 this.http.getParams().setParameter(
1230 HttpMethodParams.SINGLE_COOKIE_HEADER, new Boolean(true));
1231 this.http.getParams().setParameter(
1232 HttpMethodParams.UNAMBIGUOUS_STATUS_LINE , new Boolean(false));
1233 this.http.getParams().setParameter(
1234 HttpMethodParams.STRICT_TRANSFER_ENCODING, new Boolean(false));
1235 this.http.getParams().setIntParameter(
1236 HttpMethodParams.STATUS_LINE_GARBAGE_LIMIT, 10);
1237
1238
1239 HostConfiguration config = this.http.getHostConfiguration();
1240 configureProxy(null, config);
1241 configureBindAddress(null,config);
1242
1243
1244
1245 final ServerCache cache = getController().getServerCache();
1246 hcmp.setParameter(SERVER_CACHE_KEY, cache);
1247 hcmp.setParameter(SSL_FACTORY_KEY, this.sslfactory);
1248 }
1249
1250 /***
1251 * Set the HttpClient HttpState instance to use a BDB-backed
1252 * StoredSortedMap for cookie storage, if that option is chosen.
1253 */
1254 private void configureHttpCookies() {
1255
1256 if(((Boolean)getUncheckedAttribute(null, ATTR_BDB_COOKIES)).
1257 booleanValue()) {
1258 try {
1259 EnhancedEnvironment env = getController().getBdbEnvironment();
1260 StoredClassCatalog classCatalog = env.getClassCatalog();
1261 DatabaseConfig dbConfig = new DatabaseConfig();
1262 dbConfig.setTransactional(false);
1263 dbConfig.setAllowCreate(true);
1264 dbConfig.setDeferredWrite(true);
1265 cookieDb = env.openDatabase(null, COOKIEDB_NAME, dbConfig);
1266 StoredSortedMap cookiesMap = new StoredSortedMap(cookieDb,
1267 new StringBinding(), new SerialBinding(classCatalog,
1268 Cookie.class), true);
1269 this.http.getState().setCookiesMap(cookiesMap);
1270 } catch (DatabaseException e) {
1271
1272 logger.severe(e.getMessage());
1273 e.printStackTrace();
1274 }
1275 }
1276 }
1277
1278 /***
1279 * @param curi Current CrawlURI. Used to get context.
1280 * @return Socket timeout value.
1281 */
1282 private int getSoTimeout(CrawlURI curi) {
1283 Integer res = null;
1284 try {
1285 res = (Integer) getAttribute(ATTR_SOTIMEOUT_MS, curi);
1286 } catch (Exception e) {
1287 res = DEFAULT_SOTIMEOUT_MS;
1288 }
1289 return res.intValue();
1290 }
1291
1292 /***
1293 * @param curi Current CrawlURI. Used to get context.
1294 * @return Timeout value for total request.
1295 */
1296 private int getTimeout(CrawlURI curi) {
1297 Integer res;
1298 try {
1299 res = (Integer) getAttribute(ATTR_TIMEOUT_SECONDS, curi);
1300 } catch (Exception e) {
1301 res = DEFAULT_TIMEOUT_SECONDS;
1302 }
1303 return res.intValue();
1304 }
1305
1306 private int getMaxFetchRate(CrawlURI curi) {
1307 Integer res;
1308 try {
1309 res = (Integer)getAttribute(ATTR_FETCH_BANDWIDTH_MAX, curi);
1310 }
1311 catch (Exception e) {
1312 res = DEFAULT_FETCH_BANDWIDTH_MAX;
1313 }
1314 return res.intValue();
1315 }
1316
1317 private long getMaxLength(CrawlURI curi) {
1318 Long res;
1319 try {
1320 res = (Long) getAttribute(ATTR_MAX_LENGTH_BYTES, curi);
1321 if (res.longValue() == OLD_DEFAULT_MAX_LENGTH_BYTES) {
1322 res = DEFAULT_MAX_LENGTH_BYTES;
1323 }
1324 } catch (Exception e) {
1325 res = DEFAULT_MAX_LENGTH_BYTES;
1326 }
1327 return res.longValue();
1328 }
1329
1330 /***
1331 * Load cookies from a file before the first fetch.
1332 * <p>
1333 * The file is a text file in the Netscape's 'cookies.txt' file format.<br>
1334 * Example entry of cookies.txt file:<br>
1335 * <br>
1336 * www.archive.org FALSE / FALSE 1074567117 details-visit texts-cralond<br>
1337 * <br>
1338 * Each line has 7 tab-separated fields:<br>
1339 * <li>1. DOMAIN: The domain that created and have access to the cookie
1340 * value.
1341 * <li>2. FLAG: A TRUE or FALSE value indicating if hosts within the given
1342 * domain can access the cookie value.
1343 * <li>3. PATH: The path within the domain that the cookie value is valid
1344 * for.
1345 * <li>4. SECURE: A TRUE or FALSE value indicating if to use a secure
1346 * connection to access the cookie value.
1347 * <li>5. EXPIRATION: The expiration time of the cookie value (unix style.)
1348 * <li>6. NAME: The name of the cookie value
1349 * <li>7. VALUE: The cookie value
1350 *
1351 * @param cookiesFile file in the Netscape's 'cookies.txt' format.
1352 */
1353 public void loadCookies(String cookiesFile) {
1354
1355 if (cookiesFile == null || cookiesFile.length() <= 0) {
1356 return;
1357 }
1358 RandomAccessFile raf = null;
1359 try {
1360 raf = new RandomAccessFile(cookiesFile, "r");
1361 String[] cookieParts;
1362 String line;
1363 Cookie cookie = null;
1364 while ((line = raf.readLine()) != null) {
1365
1366 if (!line.startsWith("#")) {
1367 cookieParts = line.split("//t");
1368 if (cookieParts.length == 7) {
1369
1370
1371 cookie =
1372 new Cookie(cookieParts[0], cookieParts[5],
1373 cookieParts[6], cookieParts[2], -1,
1374 Boolean.valueOf(cookieParts[3]).booleanValue());
1375
1376 if (cookieParts[1].toLowerCase().equals("true")) {
1377 cookie.setDomainAttributeSpecified(true);
1378 } else {
1379 cookie.setDomainAttributeSpecified(false);
1380 }
1381 this.http.getState().addCookie(cookie);
1382 logger.fine(
1383 "Adding cookie: " + cookie.toExternalForm());
1384 }
1385 }
1386 }
1387 } catch (FileNotFoundException e) {
1388
1389 System.out.println("Could not find file: " + cookiesFile
1390 + " (Element: " + ATTR_LOAD_COOKIES + ")");
1391
1392 } catch (IOException e) {
1393
1394 e.printStackTrace();
1395 } finally {
1396 try {
1397 if (raf != null) {
1398 raf.close();
1399 }
1400 } catch (IOException e) {
1401 e.printStackTrace();
1402 }
1403 }
1404 }
1405
1406
1407
1408
1409 public String report() {
1410 StringBuffer ret = new StringBuffer();
1411 ret.append("Processor: org.archive.crawler.fetcher.FetchHTTP\n");
1412 ret.append(" Function: Fetch HTTP URIs\n");
1413 ret.append(" CrawlURIs handled: " + this.curisHandled + "\n");
1414 ret.append(" Recovery retries: " + this.recoveryRetries + "\n\n");
1415
1416 return ret.toString();
1417 }
1418
1419
1420 /***
1421 * Load cookies from the file specified in the order file.
1422 *
1423 * <p>
1424 * The file is a text file in the Netscape's 'cookies.txt' file format.<br>
1425 * Example entry of cookies.txt file:<br>
1426 * <br>
1427 * www.archive.org FALSE / FALSE 1074567117 details-visit texts-cralond<br>
1428 * <br>
1429 * Each line has 7 tab-separated fields:<br>
1430 * <li>1. DOMAIN: The domain that created and have access to the cookie
1431 * value.
1432 * <li>2. FLAG: A TRUE or FALSE value indicating if hosts within the given
1433 * domain can access the cookie value.
1434 * <li>3. PATH: The path within the domain that the cookie value is valid
1435 * for.
1436 * <li>4. SECURE: A TRUE or FALSE value indicating if to use a secure
1437 * connection to access the cookie value.
1438 * <li>5. EXPIRATION: The expiration time of the cookie value (unix style.)
1439 * <li>6. NAME: The name of the cookie value
1440 * <li>7. VALUE: The cookie value
1441 */
1442 public void loadCookies() {
1443 try {
1444 loadCookies((String) getAttribute(ATTR_LOAD_COOKIES));
1445 } catch (MBeanException e) {
1446 logger.warning(e.getLocalizedMessage());
1447 } catch (ReflectionException e) {
1448 logger.warning(e.getLocalizedMessage());
1449 } catch (AttributeNotFoundException e) {
1450 logger.warning(e.getLocalizedMessage());
1451 }
1452 }
1453 /***
1454 * Saves cookies to the file specified in the order file.
1455 *
1456 * Output file is in the Netscape 'cookies.txt' format.
1457 *
1458 */
1459 public void saveCookies() {
1460 try {
1461 saveCookies((String) getAttribute(ATTR_SAVE_COOKIES));
1462 } catch (MBeanException e) {
1463 logger.warning(e.getLocalizedMessage());
1464 } catch (ReflectionException e) {
1465 logger.warning(e.getLocalizedMessage());
1466 } catch (AttributeNotFoundException e) {
1467 logger.warning(e.getLocalizedMessage());
1468 }
1469 }
1470 /***
1471 * Saves cookies to a file.
1472 *
1473 * Output file is in the Netscape 'cookies.txt' format.
1474 *
1475 * @param saveCookiesFile output file.
1476 */
1477 public void saveCookies(String saveCookiesFile) {
1478
1479 if (saveCookiesFile == null || saveCookiesFile.length() <= 0) {
1480 return;
1481 }
1482
1483 FileOutputStream out = null;
1484 try {
1485 out = new FileOutputStream(new File(saveCookiesFile));
1486 @SuppressWarnings("unchecked")
1487 Map<String,Cookie> cookies = http.getState().getCookiesMap();
1488 String tab ="\t";
1489 out.write("# Heritrix Cookie File\n".getBytes());
1490 out.write(
1491 "# This file is the Netscape cookies.txt format\n\n".getBytes());
1492 for (Cookie cookie: cookies.values()) {
1493 MutableString line =
1494 new MutableString(1024 * 2
1495 line.append(cookie.getDomain());
1496 line.append(tab);
1497 line.append(
1498 cookie.isDomainAttributeSpecified() == true
1499 ? "TRUE"
1500 : "FALSE");
1501 line.append(tab);
1502 line.append(cookie.getPath());
1503 line.append(tab);
1504 line.append(
1505 cookie.getSecure() == true ? "TRUE" : "FALSE");
1506 line.append(tab);
1507 line.append(cookie.getName());
1508 line.append(tab);
1509 line.append((null==cookie.getValue())?"":cookie.getValue());
1510 line.append("\n");
1511 out.write(line.toString().getBytes());
1512 }
1513 } catch (FileNotFoundException e) {
1514
1515 System.out.println("Could not find file: " + saveCookiesFile
1516 + " (Element: " + ATTR_SAVE_COOKIES + ")");
1517 } catch (IOException e) {
1518 e.printStackTrace();
1519 } finally {
1520 try {
1521 if (out != null) {
1522 out.close();
1523 }
1524 } catch (IOException e) {
1525 e.printStackTrace();
1526 }
1527 }
1528 }
1529
1530
1531
1532
1533 protected void listUsedFiles(List<String> list) {
1534
1535
1536 try {
1537 String tmp = (String)getAttribute(ATTR_LOAD_COOKIES);
1538 if(tmp != null && tmp.length() > 0){
1539 File file = getSettingsHandler().
1540 getPathRelativeToWorkingDirectory(tmp);
1541 list.add(file.getAbsolutePath());
1542 }
1543 tmp = (String)getAttribute(ATTR_SAVE_COOKIES);
1544 if(tmp != null && tmp.length() > 0){
1545 File file = getSettingsHandler().
1546 getPathRelativeToWorkingDirectory(tmp);
1547 list.add(file.getAbsolutePath());
1548 }
1549 } catch (AttributeNotFoundException e) {
1550
1551 e.printStackTrace();
1552 } catch (MBeanException e) {
1553
1554 e.printStackTrace();
1555 } catch (ReflectionException e) {
1556
1557 e.printStackTrace();
1558 }
1559 }
1560
1561 private void setAcceptHeaders(CrawlURI curi, HttpMethod get) {
1562 try {
1563 StringList accept_headers = (StringList) getAttribute(ATTR_ACCEPT_HEADERS, curi);
1564 if (!accept_headers.isEmpty()) {
1565 for (ListIterator i = accept_headers.listIterator(); i.hasNext();) {
1566 String hdr = (String) i.next();
1567 String[] nvp = hdr.split(": +");
1568 if (nvp.length == 2) {
1569 get.setRequestHeader(nvp[0], nvp[1]);
1570 }
1571 else {
1572 logger.warning("Invalid accept header: " + hdr);
1573 }
1574 }
1575 }
1576 }
1577 catch (AttributeNotFoundException e) {
1578 logger.severe(e.getMessage());
1579 }
1580 }
1581
1582
/***
 * Custom serialization: after the default fields, write the current
 * cookie set as a Cookie array snapshot.  Paired with readObject,
 * which restores these cookies post-deserialization.
 *
 * @param stream stream to serialize to.
 * @throws IOException on write failure.
 */
private void writeObject(ObjectOutputStream stream) throws IOException {
    stream.defaultWriteObject();
    // Snapshot the live cookies map into an array for serialization.
    @SuppressWarnings("unchecked")
    Collection<Cookie> c = http.getState().getCookiesMap().values();
    Cookie[] cookies = c.toArray(new Cookie[c.size()]);
    stream.writeObject(cookies);
}
1591
/***
 * Custom deserialization: read back the Cookie array written by
 * writeObject and schedule re-creation of the HttpClient plus cookie
 * re-add for when the whole object graph has finished restoring.
 *
 * @param stream stream to deserialize from.
 * @throws IOException on read failure.
 * @throws ClassNotFoundException if a serialized class is missing.
 */
private void readObject(ObjectInputStream stream) throws IOException, ClassNotFoundException {
    stream.defaultReadObject();
    Cookie cookies[] = (Cookie[]) stream.readObject();
    ObjectPlusFilesInputStream coistream = (ObjectPlusFilesInputStream)stream;
    // Defer HttpClient setup until restore completes (see PostRestore).
    coistream.registerFinishTask( new PostRestore(cookies) );
}
1598
1599 /***
1600 * @return Returns the http instance.
1601 */
1602 protected HttpClient getHttp() {
1603 return this.http;
1604 }
1605
1606 class PostRestore implements Runnable {
1607 Cookie cookies[];
1608 public PostRestore(Cookie cookies[]) {
1609 this.cookies = cookies;
1610 }
1611 public void run() {
1612 configureHttp();
1613 for(int i = 0; i < cookies.length; i++) {
1614 getHttp().getState().addCookie(cookies[i]);
1615 }
1616 }
1617 }
1618
1619
1620
1621
public void crawlStarted(String message) {
    // CrawlStatusListener callback: nothing to do on crawl start.
}
1625
1626
1627
1628
public void crawlCheckpoint(File checkpointDir) {
    // Flush the deferred-write cookie database so the checkpoint
    // captures the current cookie state.
    if(cookieDb!=null) {
        try {
            cookieDb.sync();
        } catch (DatabaseException e) {
            // A failed sync invalidates the checkpoint; abort it.
            throw new RuntimeException(e);
        }
    }
}
1639
1640
1641
1642
public void crawlEnding(String sExitMessage) {
    // CrawlStatusListener callback: nothing to do while crawl is ending.
}
1646
1647
1648
1649
public void crawlEnded(String sExitMessage) {
    // Release the HttpClient so it (and its connections/state) can be
    // garbage collected once the crawl is over.
    this.http = null;
}
1653
1654
1655
1656
public void crawlPausing(String statusMessage) {
    // CrawlStatusListener callback: nothing to do while pausing.
}
1660
1661
1662
1663
public void crawlPaused(String statusMessage) {
    // CrawlStatusListener callback: nothing to do when paused.
}
1667
1668
1669
1670
public void crawlResuming(String statusMessage) {
    // CrawlStatusListener callback: nothing to do when resuming.
}
1674 }