1   /* FetchHTTP.java
2    *
3    * $Id: FetchHTTP.java 5798 2008-03-25 23:22:53Z gojomo $
4    *
5    * Created on Jun 5, 2003
6    *
7    * Copyright (C) 2003 Internet Archive.
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.crawler.fetcher;
26  
27  import it.unimi.dsi.mg4j.util.MutableString;
28  
29  import java.io.File;
30  import java.io.FileNotFoundException;
31  import java.io.FileOutputStream;
32  import java.io.IOException;
33  import java.io.ObjectInputStream;
34  import java.io.ObjectOutputStream;
35  import java.io.RandomAccessFile;
36  import java.net.InetAddress;
37  import java.net.UnknownHostException;
38  import java.security.KeyManagementException;
39  import java.security.KeyStoreException;
40  import java.security.MessageDigest;
41  import java.security.NoSuchAlgorithmException;
42  import java.util.Collection;
43  import java.util.HashSet;
44  import java.util.Iterator;
45  import java.util.List;
46  import java.util.ListIterator;
47  import java.util.Map;
48  import java.util.Set;
49  import java.util.logging.Level;
50  import java.util.logging.Logger;
51  
52  import javax.management.AttributeNotFoundException;
53  import javax.management.MBeanException;
54  import javax.management.ReflectionException;
55  import javax.net.ssl.SSLContext;
56  import javax.net.ssl.SSLSocketFactory;
57  import javax.net.ssl.TrustManager;
58  
59  import org.apache.commons.httpclient.Cookie;
60  import org.apache.commons.httpclient.Header;
61  import org.apache.commons.httpclient.HostConfiguration;
62  import org.apache.commons.httpclient.HttpClient;
63  import org.apache.commons.httpclient.HttpConnection;
64  import org.apache.commons.httpclient.HttpConnectionManager;
65  import org.apache.commons.httpclient.HttpException;
66  import org.apache.commons.httpclient.HttpMethod;
67  import org.apache.commons.httpclient.HttpMethodBase;
68  import org.apache.commons.httpclient.HttpState;
69  import org.apache.commons.httpclient.HttpStatus;
70  import org.apache.commons.httpclient.HttpVersion;
71  import org.apache.commons.httpclient.auth.AuthChallengeParser;
72  import org.apache.commons.httpclient.auth.AuthScheme;
73  import org.apache.commons.httpclient.auth.BasicScheme;
74  import org.apache.commons.httpclient.auth.DigestScheme;
75  import org.apache.commons.httpclient.auth.MalformedChallengeException;
76  import org.apache.commons.httpclient.cookie.CookiePolicy;
77  import org.apache.commons.httpclient.params.HttpClientParams;
78  import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
79  import org.apache.commons.httpclient.params.HttpMethodParams;
80  import org.apache.commons.httpclient.protocol.Protocol;
81  import org.apache.commons.httpclient.protocol.ProtocolSocketFactory;
82  import org.archive.crawler.Heritrix;
83  import org.archive.crawler.datamodel.CoreAttributeConstants;
84  import org.archive.crawler.datamodel.CrawlHost;
85  import org.archive.crawler.datamodel.CrawlOrder;
86  import org.archive.crawler.datamodel.CrawlServer;
87  import org.archive.crawler.datamodel.CrawlURI;
88  import org.archive.crawler.datamodel.CredentialStore;
89  import org.archive.crawler.datamodel.FetchStatusCodes;
90  import org.archive.crawler.datamodel.ServerCache;
91  import org.archive.crawler.datamodel.credential.Credential;
92  import org.archive.crawler.datamodel.credential.CredentialAvatar;
93  import org.archive.crawler.datamodel.credential.Rfc2617Credential;
94  import org.archive.crawler.deciderules.DecideRule;
95  import org.archive.crawler.deciderules.DecideRuleSequence;
96  import org.archive.crawler.event.CrawlStatusListener;
97  import org.archive.crawler.framework.Processor;
98  import org.archive.crawler.settings.SettingsHandler;
99  import org.archive.crawler.settings.SimpleType;
100 import org.archive.crawler.settings.StringList;
101 import org.archive.crawler.settings.Type;
102 import org.archive.httpclient.ConfigurableX509TrustManager;
103 import org.archive.httpclient.HttpRecorderGetMethod;
104 import org.archive.httpclient.HttpRecorderMethod;
105 import org.archive.httpclient.HttpRecorderPostMethod;
106 import org.archive.httpclient.SingleHttpConnectionManager;
107 import org.archive.io.ObjectPlusFilesInputStream;
108 import org.archive.io.RecorderLengthExceededException;
109 import org.archive.io.RecorderTimeoutException;
110 import org.archive.io.RecorderTooMuchHeaderException;
111 import org.archive.util.ArchiveUtils;
112 import org.archive.util.HttpRecorder;
113 import org.archive.util.bdbje.EnhancedEnvironment;
114 
115 import st.ata.util.AList;
116 
117 import com.sleepycat.bind.serial.SerialBinding;
118 import com.sleepycat.bind.serial.StoredClassCatalog;
119 import com.sleepycat.bind.tuple.StringBinding;
120 import com.sleepycat.collections.StoredSortedMap;
121 import com.sleepycat.je.Database;
122 import com.sleepycat.je.DatabaseConfig;
123 import com.sleepycat.je.DatabaseException;
124 
125 /***
126  * HTTP fetcher that uses <a
127  * href="http://jakarta.apache.org/commons/httpclient/">Apache Jakarta Commons
128  * HttpClient</a> library.
129  *
130  * @author Gordon Mohr
131  * @author Igor Ranitovic
132  * @author others
133  * @version $Id: FetchHTTP.java 5798 2008-03-25 23:22:53Z gojomo $
134  */
135 public class FetchHTTP extends Processor
136 implements CoreAttributeConstants, FetchStatusCodes, CrawlStatusListener {
    // be robust against trivial implementation changes
    private static final long serialVersionUID =
        ArchiveUtils.classnameBasedUID(FetchHTTP.class,1);
    
    private static Logger logger = Logger.getLogger(FetchHTTP.class.getName());

    // Names of operator-configurable attributes exposed in the settings UI.
    public static final String ATTR_HTTP_PROXY_HOST = A_HTTP_PROXY_HOST;
    public static final String ATTR_HTTP_PROXY_PORT = A_HTTP_PROXY_PORT;
    public static final String ATTR_TIMEOUT_SECONDS = "timeout-seconds";
    public static final String ATTR_SOTIMEOUT_MS = "sotimeout-ms";
    public static final String ATTR_MAX_LENGTH_BYTES = "max-length-bytes";
    public static final String ATTR_LOAD_COOKIES = "load-cookies-from-file";
    public static final String ATTR_SAVE_COOKIES = "save-cookies-to-file";
    public static final String ATTR_ACCEPT_HEADERS = "accept-headers";
    public static final String ATTR_DEFAULT_ENCODING = "default-encoding";
    public static final String ATTR_DIGEST_CONTENT = "digest-content";
    public static final String ATTR_DIGEST_ALGORITHM = "digest-algorithm";
    public static final String ATTR_FETCH_BANDWIDTH_MAX = "fetch-bandwidth";
   
    /***
     * SSL trust level setting attribute name.
     */
    public static final String ATTR_TRUST = "trust-level";
    
    // Default values for the attributes above.
    private static Integer DEFAULT_TIMEOUT_SECONDS = new Integer(1200);
    private static Integer DEFAULT_SOTIMEOUT_MS = new Integer(20000);
    private static Long DEFAULT_MAX_LENGTH_BYTES = new Long(0);
    private static Integer DEFAULT_FETCH_BANDWIDTH_MAX = 0;

    /***
     * This is the default value pre-1.4. Needs special handling else
     * treated as negative number doing math later in processing.
     */
    private static long OLD_DEFAULT_MAX_LENGTH_BYTES = 9223372036854775807L;

    /***
     * Default character encoding to use for pages that do not specify.
     */
    private static String DEFAULT_CONTENT_CHARSET = Heritrix.DEFAULT_ENCODING;

    /***
     * Default whether to perform on-the-fly digest hashing of content-bodies.
     */
    static Boolean DEFAULT_DIGEST_CONTENT = new Boolean(true);
          
    /***
     * The different digest algorithms to choose between, 
     * SHA-1 or MD-5 at the moment. 
     */
    public static final String SHA1 = "sha1";
    public static final String MD5 = "md5";
    public static String [] DIGEST_ALGORITHMS = {SHA1, MD5};
   
    /***
     * Default algorithm to use for message digesting.
     */
    public static final String  DEFAULT_DIGEST_ALGORITHM = SHA1; 
    /***
     * HttpClient instance used to execute fetches.
     * Transient: not serialized with crawler state (rebuilt elsewhere —
     * presumably on (re)start; initialization is not visible in this chunk).
     */
    private transient HttpClient http = null;

    /***
     * How many 'instant retries' of HttpRecoverableExceptions have occurred
     * 
     * Would like it to be 'long', but longs aren't atomic
     */
    private int recoveryRetries = 0;

    /***
     * Count of crawl uris handled.
     * Would like to be 'long', but longs aren't atomic
     */
    private int curisHandled = 0;
        
    /***
     * Rules to apply mid-fetch, just after receipt of the response
     * headers before we start to download body.
     */
    public static final String ATTR_MIDFETCH_DECIDE_RULES = "midfetch-decide-rules";
    
    /***
     * What to log if midfetch abort.
     */
    private static final String MIDFETCH_ABORT_LOG = "midFetchAbort";
    
    // Request-header related settings and literal header names/values.
    public static final String ATTR_SEND_CONNECTION_CLOSE =
        "send-connection-close";
    private static final Header HEADER_SEND_CONNECTION_CLOSE =
        new Header("Connection", "close");
    public static final String ATTR_SEND_REFERER = "send-referer";
    public static final String ATTR_SEND_RANGE = "send-range";
    public static final String ATTR_SEND_IF_MODIFIED_SINCE = "send-if-modified-since";
    public static final String ATTR_SEND_IF_NONE_MATCH = "send-if-none-match";
    public static final String REFERER = "Referer";
    public static final String RANGE = "Range";
    public static final String RANGE_PREFIX = "bytes=0-";
    public static final String HTTP_SCHEME = "http";
    public static final String HTTPS_SCHEME = "https";
    
    // Cookie-handling settings.
    public static final String ATTR_IGNORE_COOKIES = "ignore-cookies";
    private static Boolean DEFAULT_IGNORE_COOKIES = new Boolean(false);

    public static final String ATTR_BDB_COOKIES = "use-bdb-for-cookies";
    private static Boolean DEFAULT_BDB_COOKIES = new Boolean(true);
    
    public static final String ATTR_HTTP_BIND_ADDRESS = A_HTTP_BIND_ADDRESS;
    
    /***
     * Database backing cookie map, if using BDB
     */
    protected Database cookieDb; 
    /***
     * Name of cookie BDB Database
     */
    public static final String COOKIEDB_NAME = "http_cookies";
251     
252     static {
253     	Protocol.registerProtocol("http", new Protocol("http",
254             new HeritrixProtocolSocketFactory(), 80));
255     	try {
256 			Protocol.registerProtocol("https",
257 			    new Protocol("https", ((ProtocolSocketFactory)
258 			        new HeritrixSSLProtocolSocketFactory()), 443));
259 		} catch (KeyManagementException e) {
260 			e.printStackTrace();
261 		} catch (KeyStoreException e) {
262 			e.printStackTrace();
263 		} catch (NoSuchAlgorithmException e) {
264 			e.printStackTrace();
265 		}
266     }
    // Lookup keys for shared objects (names suggest a server cache and the
    // shared SSL factory; the map they index is not visible in this chunk).
    static final String SERVER_CACHE_KEY = "heritrix.server.cache";
    static final String SSL_FACTORY_KEY = "heritrix.ssl.factory";
    
    /***
     * Socket factory that has the configurable trust manager installed.
     */
    private SSLSocketFactory sslfactory = null;
274     
275 
276     /***
277      * Constructor.
278      *
279      * @param name Name of this processor.
280      */
281     public FetchHTTP(String name) {
282         super(name, "HTTP Fetcher");
283 
284         addElementToDefinition(
285             new DecideRuleSequence(ATTR_MIDFETCH_DECIDE_RULES, 
286                 "DecideRules which, if final decision is REJECT, " +
287                 "abort fetch after headers before all content is" +
288                 "read."));
289         
290         addElementToDefinition(new SimpleType(ATTR_TIMEOUT_SECONDS,
291             "If the fetch is not completed in this number of seconds, "
292             + "even if it is making progress, give up. The URI will be "
293             + "annotated as timeTrunc. Set to zero for no timeout. "
294             + "(This is not recommended: threads could wait indefinitely "
295             + "for the fetch to end.)",
296             DEFAULT_TIMEOUT_SECONDS));
297         Type e = addElementToDefinition(new SimpleType(ATTR_SOTIMEOUT_MS,
298             "If a socket is unresponsive for this number of milliseconds, " +
299             "give up on that connects/read. (This does not necessarily give " +
300             "up on the fetch immediately; connects are subject to retries " +
301             "and reads will be retried until " + ATTR_TIMEOUT_SECONDS +
302             " have elapsed. Set to zero for no socket timeout. (This is " +
303             "note recommended: a socket operation could hand indefinitely.",
304                 DEFAULT_SOTIMEOUT_MS));
305         e.setExpertSetting(true);
306         e = addElementToDefinition(new SimpleType(ATTR_FETCH_BANDWIDTH_MAX,
307             "The maximum KB/sec to use when fetching data from a server. " +
308             "0 means no maximum.  Default: "+ DEFAULT_FETCH_BANDWIDTH_MAX
309              + ".", DEFAULT_FETCH_BANDWIDTH_MAX));
310         e.setExpertSetting(true);
311         e.setOverrideable(true);
312         addElementToDefinition(new SimpleType(ATTR_MAX_LENGTH_BYTES,
313             "Maximum length in bytes to fetch.\n" +
314             "Fetch is truncated at this length. A value of 0 means no limit.",
315             DEFAULT_MAX_LENGTH_BYTES));
316         e = addElementToDefinition(new SimpleType(ATTR_IGNORE_COOKIES,
317             "Disable cookie-handling.", DEFAULT_IGNORE_COOKIES));
318         e.setOverrideable(true);
319         e.setExpertSetting(true);
320         e = addElementToDefinition(new SimpleType(ATTR_BDB_COOKIES,
321                 "Store cookies in BDB-backed map.", DEFAULT_BDB_COOKIES));
322         e.setExpertSetting(true);
323 
324         e = addElementToDefinition(new SimpleType(ATTR_LOAD_COOKIES,
325             "File to preload cookies from", ""));
326         e.setExpertSetting(true);
327         e = addElementToDefinition(new SimpleType(ATTR_SAVE_COOKIES,
328             "When crawl finishes save cookies to this file", ""));
329         e.setExpertSetting(true);
330         e = addElementToDefinition(new SimpleType(ATTR_TRUST,
331             "SSL certificate trust level.  Range is from the default 'open'"
332             + " (trust all certs including expired, selfsigned, and those for"
333             + " which we do not have a CA) through 'loose' (trust all valid"
334             + " certificates including selfsigned), 'normal' (all valid"
335             + " certificates not including selfsigned) to 'strict' (Cert is"
336             + " valid and DN must match servername)",
337             ConfigurableX509TrustManager.DEFAULT,
338             ConfigurableX509TrustManager.LEVELS_AS_ARRAY));
339         e.setOverrideable(false);
340         e.setExpertSetting(true);
341         e = addElementToDefinition(new StringList(ATTR_ACCEPT_HEADERS,
342             "Accept Headers to include in each request. Each must be the"
343             + " complete header, e.g., 'Accept-Language: en'"));
344         e.setExpertSetting(true);
345         e = addElementToDefinition(new SimpleType(ATTR_HTTP_PROXY_HOST,
346             "Proxy host IP (set only if needed).", ""));
347         e.setExpertSetting(true);
348         e = addElementToDefinition(new SimpleType(ATTR_HTTP_PROXY_PORT,
349             "Proxy port (set only if needed)", ""));
350         e.setExpertSetting(true);
351         e = addElementToDefinition(new SimpleType(ATTR_DEFAULT_ENCODING,
352             "The character encoding to use for files that do not have one" +
353             " specified in the HTTP response headers.  Default: " +
354             DEFAULT_CONTENT_CHARSET + ".",
355             DEFAULT_CONTENT_CHARSET));
356         e.setExpertSetting(true);
357         e = addElementToDefinition(new SimpleType(ATTR_DIGEST_CONTENT,
358                 "Whether or not to perform an on-the-fly digest hash of" +
359                 " retrieved content-bodies.",
360                 DEFAULT_DIGEST_CONTENT));
361         e.setExpertSetting(true);
362         e = addElementToDefinition(new SimpleType(ATTR_DIGEST_ALGORITHM,
363                 "Which algorithm (for example MD5 or SHA-1) to use to perform an on-the-fly digest" +
364                 " hash of retrieved content-bodies.",
365                 DEFAULT_DIGEST_ALGORITHM, DIGEST_ALGORITHMS));
366         e.setExpertSetting(true);
367         e = addElementToDefinition(new SimpleType(ATTR_SEND_IF_MODIFIED_SINCE,
368                 "Send 'If-Modified-Since' header, if previous 'Last-Modified' " +
369                 "fetch history information is available in URI history.",
370                  new Boolean(true)));
371         e.setOverrideable(true);
372         e.setExpertSetting(true);
373         e = addElementToDefinition(new SimpleType(ATTR_SEND_IF_NONE_MATCH,
374                 "Send 'If-None-Match' header, if previous 'Etag' fetch " +
375                 "history information is available in URI history.",
376                  new Boolean(true)));
377         e.setOverrideable(true);
378         e.setExpertSetting(true);
379         e = addElementToDefinition(new SimpleType(ATTR_SEND_CONNECTION_CLOSE,
380             "Send 'Connection: close' header with every request.",
381              new Boolean(true)));
382         e.setOverrideable(true);
383         e.setExpertSetting(true);
384         e = addElementToDefinition(new SimpleType(ATTR_SEND_REFERER,
385              "Send 'Referer' header with every request.\n" +
386              "The 'Referer' header contans the location the crawler came " +
387              " from, " +
388              "the page the current URI was discovered in. The 'Referer' " +
389              "usually is " +
390              "logged on the remote server and can be of assistance to " +
391              "webmasters trying to figure how a crawler got to a " +
392              "particular area on a site.",
393              new Boolean(true)));
394         e.setOverrideable(true);
395         e.setExpertSetting(true);
396         e = addElementToDefinition(new SimpleType(ATTR_SEND_RANGE,
397               "Send 'Range' header when a limit (" + ATTR_MAX_LENGTH_BYTES +
398               ") on document size.\n" +
399               "Be polite to the HTTP servers and send the 'Range' header," +
400               "stating that you are only interested in the first n bytes. " +
401               "Only pertinent if " + ATTR_MAX_LENGTH_BYTES + " > 0. " +
402               "Sending the 'Range' header results in a " +
403               "'206 Partial Content' status response, which is better than " +
404               "just cutting the response mid-download. On rare occasion, " +
405               " sending 'Range' will " +
406               "generate '416 Request Range Not Satisfiable' response.",
407               new Boolean(false)));
408            e.setOverrideable(true);
409            e.setExpertSetting(true);
410            e = addElementToDefinition(new SimpleType(ATTR_HTTP_BIND_ADDRESS,
411                "Local IP address or hostname to use when making connections " +
412                "(binding sockets). When not specified, uses default local" +
413                "address(es).", ""));
414            e.setExpertSetting(true);
415     }
416 
    /***
     * Fetch the given CrawlURI via HTTP/HTTPS, recording the whole
     * transaction (headers and body) with the ToeThread's HttpRecorder.
     *
     * Stamps fetch begin/complete times, optionally digests the body,
     * enforces operator length/time/rate limits, and afterwards handles
     * credential promotion or 401 responses.
     *
     * @param curi CrawlURI to fetch; status, content-type, sizes and the
     * HttpRecorder are set into it as side effects.
     * @throws InterruptedException if the fetch is interrupted.
     */
    protected void innerProcess(final CrawlURI curi)
    throws InterruptedException {
        if (!canFetch(curi)) {
            // Cannot fetch this, due to protocol, retries, or other problems
            return;
        }

        this.curisHandled++;

        // Note begin time
        curi.putLong(A_FETCH_BEGAN_TIME, System.currentTimeMillis());

        // Get a reference to the HttpRecorder that is set into this ToeThread.
        HttpRecorder rec = HttpRecorder.getHttpRecorder();
        
        // Shall we get a digest on the content downloaded?
        boolean digestContent  = ((Boolean)getUncheckedAttribute(curi,
                ATTR_DIGEST_CONTENT)).booleanValue();
        String algorithm = null;
        if (digestContent) {
            algorithm = ((String)getUncheckedAttribute(curi,
                ATTR_DIGEST_ALGORITHM));
            rec.getRecordedInput().setDigest(algorithm);
        } else {
            // clear any digest left over from a previous use of this recorder
            rec.getRecordedInput().setDigest((MessageDigest)null);
        }        
        
        // Below we do two inner classes that add check of midfetch
        // filters just as we're about to receive the response body.
        String curiString = curi.getUURI().toString();
        HttpMethodBase method = null;
        if (curi.isPost()) {
            method = new HttpRecorderPostMethod(curiString, rec) {
                protected void readResponseBody(HttpState state,
                        HttpConnection conn)
                throws IOException, HttpException {
                    // Status/content-type must be in curi before the
                    // midfetch rules are consulted.
                    addResponseContent(this, curi);
                    if (checkMidfetchAbort(curi, this.httpRecorderMethod, conn)) {
                        doAbort(curi, this, MIDFETCH_ABORT_LOG);
                    } else {
                        super.readResponseBody(state, conn);
                    }
                }
            };
        } else {
            method = new HttpRecorderGetMethod(curiString, rec) {
                protected void readResponseBody(HttpState state,
                        HttpConnection conn)
                throws IOException, HttpException {
                    addResponseContent(this, curi);
                    if (checkMidfetchAbort(curi, this.httpRecorderMethod,
                            conn)) {
                        doAbort(curi, this, MIDFETCH_ABORT_LOG);
                    } else {
                        super.readResponseBody(state, conn);
                    }
                }
            };
        }

        HostConfiguration customConfigOrNull = configureMethod(curi, method);
        
        // Set httpRecorder into curi. Subsequent code both here and later
        // in extractors expects to find the HttpRecorder in the CrawlURI.
        curi.setHttpRecorder(rec);
        
        // Populate credentials. Set config so auth. is not automatic.
        boolean addedCredentials = populateCredentials(curi, method);
        method.setDoAuthentication(addedCredentials);
        
        // set hardMax on bytes (if set by operator)
        long hardMax = getMaxLength(curi);
        // set overall timeout (if set by operator)
        long timeoutMs = 1000 * getTimeout(curi);
        // Get max fetch rate (bytes/ms). It comes in in KB/sec
        long maxRateKBps = getMaxFetchRate(curi);
        rec.getRecordedInput().setLimits(hardMax, timeoutMs, maxRateKBps);
        
        try {
            this.http.executeMethod(customConfigOrNull, method);
        } catch (RecorderTooMuchHeaderException ex) {
            // when too much header material, abort like other truncations
            doAbort(curi, method, HEADER_TRUNC);
        } catch (IOException e) {
        	failedExecuteCleanup(method, curi, e);
        	return;
        } catch (ArrayIndexOutOfBoundsException e) {
            // For weird windows-only ArrayIndex exceptions in native
            // code... see
            // http://forum.java.sun.com/thread.jsp?forum=11&thread=378356
            // treating as if it were an IOException
            failedExecuteCleanup(method, curi, e);
            return;
        }
        
        // set softMax on bytes to get (if implied by content-length) 
        long softMax = method.getResponseContentLength();

        try {
            if (!method.isAborted()) {
                // Force read-to-end, so that any socket hangs occur here,
                // not in later modules.
                rec.getRecordedInput().readFullyOrUntil(softMax);
            }
        } catch (RecorderTimeoutException ex) {
            doAbort(curi, method, TIMER_TRUNC);
        } catch (RecorderLengthExceededException ex) {
            doAbort(curi, method, LENGTH_TRUNC);
        } catch (IOException e) {
            cleanup(curi, e, "readFully", S_CONNECT_LOST);
            return;
        } catch (ArrayIndexOutOfBoundsException e) {
            // For weird windows-only ArrayIndex exceptions from native code
            // see http://forum.java.sun.com/thread.jsp?forum=11&thread=378356
            // treating as if it were an IOException
            cleanup(curi, e, "readFully", S_CONNECT_LOST);
            return;
        } finally {
            // ensure recording has stopped
            rec.closeRecorders();
            if (!method.isAborted()) {
                method.releaseConnection();
            }
            // Note completion time
            curi.putLong(A_FETCH_COMPLETED_TIME, System.currentTimeMillis());
            // Set the response charset into the HttpRecord if available.
            setCharacterEncoding(rec, method);
            setSizes(curi, rec);
        }
 
        if (digestContent) {
            curi.setContentDigest(algorithm,
                rec.getRecordedInput().getDigestValue());
        }
        if (logger.isLoggable(Level.INFO)) {
            logger.info((curi.isPost()? "POST": "GET") + " " +
                curi.getUURI().toString() + " " + method.getStatusCode() +
                " " + rec.getRecordedInput().getSize() + " " +
                curi.getContentType());
        }

        if (curi.isSuccess() && addedCredentials) {
            // Promote the credentials from the CrawlURI to the CrawlServer
            // so they are available for all subsequent CrawlURIs on this
            // server.
            promoteCredentials(curi);
            if (logger.isLoggable(Level.FINE)) {
                // Print out the cookie.  Might help with the debugging.
                Header setCookie = method.getResponseHeader("set-cookie");
                if (setCookie != null) {
                    logger.fine(setCookie.toString().trim());
                }
            }
        } else if (method.getStatusCode() == HttpStatus.SC_UNAUTHORIZED) {
            // 401 is not 'success'.
            handle401(method, curi);
        }
        
        if (rec.getRecordedInput().isOpen()) {
            // Should not happen: releasing the method's connection above is
            // expected to have closed the recorded input stream.
            logger.severe(curi.toString() + " RIS still open. Should have" +
                " been closed by method release: " +
                Thread.currentThread().getName());
            try {
                rec.getRecordedInput().close();
            } catch (IOException e) {
                logger.log(Level.SEVERE,"second-chance RIS close failed",e);
            }
        }
    }
587 
588     /***
589      * Update CrawlURI internal sizes based on current transaction (and
590      * in the case of 304s, history) 
591      * 
592      * @param curi CrawlURI
593      * @param rec HttpRecorder
594      */
595     protected void setSizes(final CrawlURI curi, HttpRecorder rec) {
596         // set reporting size
597         curi.setContentSize(rec.getRecordedInput().getSize());
598         // special handling for 304-not modified
599         if (curi.getFetchStatus() == HttpStatus.SC_NOT_MODIFIED
600                 && curi.containsKey(A_FETCH_HISTORY)) {
601             AList history[] = curi.getAList().getAListArray(A_FETCH_HISTORY);
602             if (history[0] != null
603                     && history[0]
604                             .containsKey(CoreAttributeConstants.A_REFERENCE_LENGTH)) {
605                 long referenceLength = history[0].getLong(A_REFERENCE_LENGTH);
606                 // carry-forward previous 'reference-length' for future
607                 curi.putLong(A_REFERENCE_LENGTH, referenceLength);
608                 // increase content-size to virtual-size for reporting
609                 curi.setContentSize(rec.getRecordedInput().getSize()
610                         + referenceLength);
611             }
612         }
613     }
614     
615     protected void doAbort(CrawlURI curi, HttpMethod method,
616             String annotation) {
617         curi.addAnnotation(annotation);
618         curi.getHttpRecorder().close();
619         method.abort();
620     }
621     
622     protected boolean checkMidfetchAbort(CrawlURI curi,
623             HttpRecorderMethod method, HttpConnection conn) {
624         if (curi.isPrerequisite() || rulesAccept(getMidfetchRule(curi), curi)) {
625             return false;
626         }
627         method.markContentBegin(conn);
628         return true;
629     }
630     
631     protected DecideRule getMidfetchRule(Object o) {
632         try {
633             return (DecideRule)getAttribute(o, ATTR_MIDFETCH_DECIDE_RULES);
634         } catch (AttributeNotFoundException e) {
635             throw new RuntimeException(e);
636         }
637     }
638     
639     /***
640      * This method populates <code>curi</code> with response status and
641      * content type.
642      * @param curi CrawlURI to populate.
643      * @param method Method to get response status and headers from.
644      */
645     protected void addResponseContent (HttpMethod method, CrawlURI curi) {
646         curi.setFetchStatus(method.getStatusCode());
647         Header ct = method.getResponseHeader("content-type");
648         curi.setContentType((ct == null)? null: ct.getValue());
649         // Save method into curi too.  Midfetch filters may want to leverage
650         // info in here.
651         curi.putObject(A_HTTP_TRANSACTION, method);
652     }
653 
654     /***
655      * Set the character encoding based on the result headers or default.
656      *
657      * The HttpClient returns its own default encoding ("ISO-8859-1") if one
658      * isn't specified in the Content-Type response header. We give the user
659      * the option of overriding this, so we need to detect the case where the
660      * default is returned.
661      *
662      * Now, it may well be the case that the default returned by HttpClient
663      * and the default defined by the user are the same.
664      * 
665      * @param rec Recorder for this request.
666      * @param method Method used for the request.
667      */
668     private void setCharacterEncoding(final HttpRecorder rec,
669         final HttpMethod method) {
670         String encoding = null;
671 
672         try {
673             encoding = ((HttpMethodBase) method).getResponseCharSet();
674             if (encoding == null ||
675                     encoding.equals(DEFAULT_CONTENT_CHARSET)) {
676                 encoding = (String) getAttribute(ATTR_DEFAULT_ENCODING);
677             }
678         } catch (Exception e) {
679             logger.warning("Failed get default encoding: " +
680                 e.getLocalizedMessage());
681         }
682         rec.setCharacterEncoding(encoding);
683     }
684 
685     /***
686      * Cleanup after a failed method execute.
687      * @param curi CrawlURI we failed on.
688      * @param method Method we failed on.
689      * @param exception Exception we failed with.
690      */
691     private void failedExecuteCleanup(final HttpMethod method,
692             final CrawlURI curi, final Exception exception) {
693         cleanup(curi, exception, "executeMethod", S_CONNECT_FAILED);
694         method.releaseConnection();
695     }
696     
697     /***
698      * Cleanup after a failed method execute.
699      * @param curi CrawlURI we failed on.
700      * @param exception Exception we failed with.
701      * @param message Message to log with failure.
702      * @param status Status to set on the fetch.
703      */
704     private void cleanup(final CrawlURI curi, final Exception exception,
705             final String message, final int status) {
706         curi.addLocalizedError(this.getName(), exception, message);
707         curi.setFetchStatus(status);
708         curi.getHttpRecorder().close();
709     }
710 
711     /***
712      * Can this processor fetch the given CrawlURI. May set a fetch
713      * status if this processor would usually handle the CrawlURI,
714      * but cannot in this instance.
715      *
716      * @param curi
717      * @return True if processor can fetch.
718      */
719     private boolean canFetch(CrawlURI curi) {
720         if(curi.getFetchStatus()<0) {
721             // already marked as errored, this pass through
722             // skip to end
723             curi.skipToProcessorChain(getController().getPostprocessorChain());
724             return false;             
725         }
726         String scheme = curi.getUURI().getScheme();
727          if (!(scheme.equals("http") || scheme.equals("https"))) {
728              // handles only plain http and https
729              return false;
730          }
731          CrawlHost host = getController().getServerCache().getHostFor(curi);
732          // make sure the dns lookup succeeded
733          if (host.getIP() == null && host.hasBeenLookedUp()) {
734              curi.setFetchStatus(S_DOMAIN_PREREQUISITE_FAILURE);
735              return false;
736          }
737         return true;
738     }
739 
740     /***
741      * Configure the HttpMethod setting options and headers.
742      *
743      * @param curi CrawlURI from which we pull configuration.
744      * @param method The Method to configure.
745      * @return HostConfiguration copy customized for this CrawlURI
746      */
747     protected HostConfiguration configureMethod(CrawlURI curi, HttpMethod method) {
748         // Don't auto-follow redirects
749         method.setFollowRedirects(false);
750         
751 //        // set soTimeout
752 //        method.getParams().setSoTimeout(
753 //                ((Integer) getUncheckedAttribute(curi, ATTR_SOTIMEOUT_MS))
754 //                        .intValue());
755         
756         // Set cookie policy.
757         method.getParams().setCookiePolicy(
758             (((Boolean)getUncheckedAttribute(curi, ATTR_IGNORE_COOKIES)).
759                 booleanValue())?
760                     CookiePolicy.IGNORE_COOKIES:
761                 CookiePolicy.BROWSER_COMPATIBILITY);
762 
763         // Use only HTTP/1.0 (to avoid receiving chunked responses)
764         method.getParams().setVersion(HttpVersion.HTTP_1_0);
765 
766         CrawlOrder order = getSettingsHandler().getOrder();
767         String userAgent = curi.getUserAgent();
768         if (userAgent == null) {
769             userAgent = order.getUserAgent(curi);
770         }
771         method.setRequestHeader("User-Agent", userAgent);
772         method.setRequestHeader("From", order.getFrom(curi));
773         
774         // Set retry handler.
775         method.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
776             new HeritrixHttpMethodRetryHandler());
777         
778         final long maxLength = getMaxLength(curi);
779         if(maxLength > 0 &&
780                 ((Boolean)getUncheckedAttribute(curi, ATTR_SEND_RANGE)).
781                     booleanValue()) {
782             method.addRequestHeader(RANGE,
783                 RANGE_PREFIX.concat(Long.toString(maxLength - 1)));
784         }
785         
786         if (((Boolean)getUncheckedAttribute(curi,
787                 ATTR_SEND_CONNECTION_CLOSE)).booleanValue()) {
788             method.addRequestHeader(HEADER_SEND_CONNECTION_CLOSE);
789         }
790         
791         if (((Boolean)getUncheckedAttribute(curi,
792                 ATTR_SEND_REFERER)).booleanValue()) {
793             // RFC2616 says no referer header if referer is https and the url
794             // is not
795             String via = curi.flattenVia();
796             if (via != null && via.length() > 0 &&
797                 !(via.startsWith(HTTPS_SCHEME) &&
798                     curi.getUURI().getScheme().equals(HTTP_SCHEME))) {
799                 method.setRequestHeader(REFERER, via);
800             }
801         }
802         
803         if(!curi.isPrerequisite()) {
804             setConditionalGetHeader(curi, method, ATTR_SEND_IF_MODIFIED_SINCE, 
805                     CoreAttributeConstants.A_LAST_MODIFIED_HEADER, "If-Modified-Since");
806             setConditionalGetHeader(curi, method, ATTR_SEND_IF_NONE_MATCH, 
807                     CoreAttributeConstants.A_ETAG_HEADER, "If-None-Match");
808         }
809         
810         // TODO: What happens if below method adds a header already
811         // added above: e.g. Connection, Range, or Referer?
812         setAcceptHeaders(curi, method);
813         
814         HostConfiguration config = new HostConfiguration(http.getHostConfiguration());
815         configureProxy(curi, config);
816         configureBindAddress(curi, config);
817         return config;
818     }
819 
820     /***
821      * Set the given conditional-GET header, if the setting is enabled and
822      * a suitable value is available in the URI history. 
823      * @param curi source CrawlURI
824      * @param method HTTP operation pending
825      * @param setting true/false enablement setting name to consult
826      * @param sourceHeader header to consult in URI history
827      * @param targetHeader header to set if possible
828      */
829     protected void setConditionalGetHeader(CrawlURI curi, HttpMethod method, 
830             String setting, String sourceHeader, String targetHeader) {
831         if(((Boolean)getUncheckedAttribute(curi,setting))) {
832             try {
833                 int previousStatus = curi.getAList().getAListArray(
834                         A_FETCH_HISTORY)[0].getInt(A_STATUS);
835                 if(previousStatus<=0) {
836                     // do not reuse headers from any broken fetch
837                     return; 
838                 }
839                 String previousValue = curi.getAList().getAListArray(
840                         A_FETCH_HISTORY)[0].getString(sourceHeader);
841                 if(previousValue!=null) {
842                     method.setRequestHeader(targetHeader, previousValue);
843                 }
844             } catch (RuntimeException e) {
845                 // for absent key, bad index, etc. just do nothing
846             }
847         }
848     }
849 
850     /***
851      * Setup proxy, based on attributes in CrawlURI and settings, 
852      * in the given HostConfiguration
853      */
854     private void configureProxy(CrawlURI curi, HostConfiguration config) {
855         String proxy = (String) getAttributeEither(curi, ATTR_HTTP_PROXY_HOST);
856         int port = -1; 
857         if(proxy.length()==0) {
858             proxy = null; 
859         } else {
860             String portString = (String)getAttributeEither(curi, ATTR_HTTP_PROXY_PORT);
861             port = portString.length()>0 ? Integer.parseInt(portString) : -1; 
862         }
863         if(proxy!=null) {
864             config.setProxy(proxy,port);
865         }
866     }
867 
868     /***
869      * Setup local bind address, based on attributes in CrawlURI and settings, 
870      * in the given HostConfiguration 
871      */
872     private void configureBindAddress(CrawlURI curi, HostConfiguration config) {
873         String addressString = (String) getAttributeEither(curi, ATTR_HTTP_BIND_ADDRESS);
874         if(addressString != null && addressString.length() > 0) {
875             try {
876                 InetAddress localAddress = InetAddress.getByName(addressString);
877                 config.setLocalAddress(localAddress);
878             } catch (UnknownHostException e) {
879                 // Convert all to RuntimeException so get an exception out
880                 // if initialization fails.
881                 throw new RuntimeException("Unknown host " + addressString
882                     + " in " + ATTR_HTTP_BIND_ADDRESS);
883             }
884         }
885     }
886     
887     /***
888      * Get a value either from inside the CrawlURI instance, or from 
889      * settings (module attributes). 
890      * 
891      * @param curi CrawlURI to consult
892      * @param key key to lookup
893      * @return value from either CrawlURI (preferred) or settings
894      */
895     protected Object getAttributeEither(CrawlURI curi, String key) {
896         Object obj = curi!=null ? curi.getObject(key) : null;
897         if(obj==null) {
898             obj = getUncheckedAttribute(curi, key);
899         }
900         return obj;
901     }
902 
903     /***
904      * Add credentials if any to passed <code>method</code>.
905      *
906      * Do credential handling.  Credentials are in two places.  1. Credentials
907      * that succeeded are added to the CrawlServer (Or rather, avatars for
908      * credentials are whats added because its not safe to keep around
909      * references to credentials).  2. Credentials to be tried are in the curi.
910      * Returns true if found credentials to be tried.
911      *
912      * @param curi Current CrawlURI.
913      * @param method The method to add to.
914      * @return True if prepopulated <code>method</code> with credentials AND the
915      * credentials came from the <code>curi</code>, not from the CrawlServer.
916      * The former is  special in that if the <code>curi</curi> credentials
917      * succeed, then the caller needs to promote them from the CrawlURI to the
918      * CrawlServer so they are available for all subsequent CrawlURIs on this
919      * server.
920      */
921     private boolean populateCredentials(CrawlURI curi, HttpMethod method) {
922         // First look at the server avatars. Add any that are to be volunteered
923         // on every request (e.g. RFC2617 credentials).  Every time creds will
924         // return true when we call 'isEveryTime().
925         CrawlServer server =
926             getController().getServerCache().getServerFor(curi);
927         if (server.hasCredentialAvatars()) {
928             Set avatars = server.getCredentialAvatars();
929             for (Iterator i = avatars.iterator(); i.hasNext();) {
930                 CredentialAvatar ca = (CredentialAvatar)i.next();
931                 Credential c = ca.getCredential(getSettingsHandler(), curi);
932                 if (c.isEveryTime()) {
933                     c.populate(curi, this.http, method, ca.getPayload());
934                 }
935             }
936         }
937 
938         boolean result = false;
939 
940         // Now look in the curi.  The Curi will have credentials loaded either
941         // by the handle401 method if its a rfc2617 or it'll have been set into
942         // the curi by the preconditionenforcer as this login uri came through.
943         if (curi.hasCredentialAvatars()) {
944             Set avatars = curi.getCredentialAvatars();
945             for (Iterator i = avatars.iterator(); i.hasNext();) {
946                 CredentialAvatar ca = (CredentialAvatar)i.next();
947                 Credential c = ca.getCredential(getSettingsHandler(), curi);
948                 if (c.populate(curi, this.http, method, ca.getPayload())) {
949                     result = true;
950                 }
951             }
952         }
953 
954         return result;
955     }
956 
957     /***
958      * Promote successful credential to the server.
959      *
960      * @param curi CrawlURI whose credentials we are to promote.
961      */
962     private void promoteCredentials(final CrawlURI curi) {
963         if (!curi.hasCredentialAvatars()) {
964             logger.severe("No credentials to promote when there should be " +
965                 curi);
966         } else {
967             Set avatars = curi.getCredentialAvatars();
968             for (Iterator i = avatars.iterator(); i.hasNext();) {
969                 CredentialAvatar ca = (CredentialAvatar)i.next();
970                 curi.removeCredentialAvatar(ca);
971                 // The server to attach too may not be the server that hosts
972                 // this passed curi.  It might be of another subdomain.
973                 // The avatar needs to be added to the server that is dependent
974                 // on this precondition.  Find it by name.  Get the name from
975                 // the credential this avatar represents.
976                 Credential c = ca.getCredential(getSettingsHandler(), curi);
977                 String cd = null;
978                 try {
979                     cd = c.getCredentialDomain(curi);
980                 }
981                 catch (AttributeNotFoundException e) {
982                     logger.severe("Failed to get cred domain for " + curi +
983                         " for " + ca + ": " + e.getMessage());
984                 }
985                 if (cd != null) {
986                     CrawlServer cs
987                         = getController().getServerCache().getServerFor(cd);
988                     if (cs != null) {
989                         cs.addCredentialAvatar(ca);
990                     }
991                 }
992             }
993         }
994     }
995 
996     /***
997      * Server is looking for basic/digest auth credentials (RFC2617). If we have
998      * any, put them into the CrawlURI and have it come around again. Presence
999      * of the credential serves as flag to frontier to requeue promptly. If we
1000      * already tried this domain and still got a 401, then our credentials are
1001      * bad. Remove them and let this curi die.
1002      *
1003      * @param method Method that got a 401.
1004      * @param curi CrawlURI that got a 401.
1005      */
1006     protected void handle401(final HttpMethod method, final CrawlURI curi) {
1007         AuthScheme authscheme = getAuthScheme(method, curi);
1008         if (authscheme == null) {
1009         	return;
1010         }
1011         String realm = authscheme.getRealm();
1012         
1013         // Look to see if this curi had rfc2617 avatars loaded.  If so, are
1014         // any of them for this realm?  If so, then the credential failed
1015         // if we got a 401 and it should be let die a natural 401 death.
1016         Set curiRfc2617Credentials = getCredentials(getSettingsHandler(),
1017         		curi, Rfc2617Credential.class);
1018         Rfc2617Credential extant = Rfc2617Credential.
1019 		    getByRealm(curiRfc2617Credentials, realm, curi);
1020         if (extant != null) {
1021         	// Then, already tried this credential.  Remove ANY rfc2617
1022         	// credential since presence of a rfc2617 credential serves
1023         	// as flag to frontier to requeue this curi and let the curi
1024         	// die a natural death.
1025         	extant.detachAll(curi);
1026         	logger.warning("Auth failed (401) though supplied realm " +
1027         			realm + " to " + curi.toString());
1028         } else {
1029         	// Look see if we have a credential that corresponds to this
1030         	// realm in credential store.  Filter by type and credential
1031         	// domain.  If not, let this curi die. Else, add it to the
1032         	// curi and let it come around again. Add in the AuthScheme
1033         	// we got too.  Its needed when we go to run the Auth on
1034         	// second time around.
1035         	CredentialStore cs =
1036         		CredentialStore.getCredentialStore(getSettingsHandler());
1037         	if (cs == null) {
1038         		logger.severe("No credential store for " + curi);
1039         	} else {
1040                 CrawlServer server = getController().getServerCache().
1041                     getServerFor(curi);
1042         		Set storeRfc2617Credentials = cs.subset(curi,
1043         		    Rfc2617Credential.class, server.getName());
1044         		if (storeRfc2617Credentials == null ||
1045         				storeRfc2617Credentials.size() <= 0) {
1046         			logger.info("No rfc2617 credentials for " + curi);
1047         		} else {
1048         			Rfc2617Credential found = Rfc2617Credential.
1049 					    getByRealm(storeRfc2617Credentials, realm, curi);
1050         			if (found == null) {
1051         				logger.info("No rfc2617 credentials for realm " +
1052         						realm + " in " + curi);
1053         			} else {
1054         				found.attach(curi, authscheme.getRealm());
1055         				logger.info("Found credential for realm " + realm +
1056         				    " in store for " + curi.toString());
1057         			}
1058         		}
1059         	}
1060         }
1061     }
1062     
1063     /***
1064      * @param method Method that got a 401.
1065      * @param curi CrawlURI that got a 401.
1066      * @return Returns first wholesome authscheme found else null.
1067      */
1068     protected AuthScheme getAuthScheme(final HttpMethod method,
1069             final CrawlURI curi) {
1070         Header [] headers = method.getResponseHeaders("WWW-Authenticate");
1071         if (headers == null || headers.length <= 0) {
1072             logger.info("We got a 401 but no WWW-Authenticate challenge: " +
1073                 curi.toString());
1074             return null;
1075         }
1076 
1077         Map authschemes = null;
1078         try {
1079             authschemes = AuthChallengeParser.parseChallenges(headers);
1080         } catch(MalformedChallengeException e) {
1081             logger.info("Failed challenge parse: " + e.getMessage());
1082         }
1083         if (authschemes == null || authschemes.size() <= 0) {
1084             logger.info("We got a 401 and WWW-Authenticate challenge" +
1085                 " but failed parse of the header " + curi.toString());
1086             return null;
1087         }            
1088          
1089         AuthScheme result = null;
1090         // Use the first auth found.
1091         for (Iterator i = authschemes.keySet().iterator();
1092                 result == null && i.hasNext();) {
1093         	String key = (String)i.next();
1094             String challenge = (String)authschemes.get(key);
1095             if (key == null || key.length() <= 0 || challenge == null ||
1096                   challenge.length() <= 0) {
1097             	logger.warning("Empty scheme: " + curi.toString() +
1098                   ": " + headers);
1099             }
1100         	AuthScheme authscheme = null;
1101         	if (key.equals("basic")) {
1102         		authscheme = new BasicScheme();
1103         	} else if (key.equals("digest")) {
1104         		authscheme = new DigestScheme();
1105         	} else {
1106         		logger.info("Unsupported scheme: " + key);
1107         		continue;
1108         	}
1109             
1110             try {
1111 				authscheme.processChallenge(challenge);
1112 			} catch (MalformedChallengeException e) {
1113 				logger.info(e.getMessage() + " " + curi + " " + headers);
1114                 continue;
1115 			}
1116         	if (authscheme.isConnectionBased()) {
1117         		logger.info("Connection based " + authscheme);
1118         		continue;
1119         	}
1120         	
1121         	if (authscheme.getRealm() == null ||
1122         			authscheme.getRealm().length() <= 0) {
1123         		logger.info("Empty realm " + authscheme + " for " + curi);
1124         		continue;
1125         	}
1126         	result = authscheme;
1127         }
1128         
1129         return result;
1130     }
1131         
1132     /***
1133      * @param handler Settings Handler.
1134      * @param curi CrawlURI that got a 401.
1135      * @param type Class of credential to get from curi.
1136      * @return Set of credentials attached to this curi.
1137      */
1138     private Set<Credential> getCredentials(SettingsHandler handler, 
1139             CrawlURI curi, Class type) {
1140         Set<Credential> result = null;
1141 
1142         if (curi.hasCredentialAvatars()) {
1143             for (Iterator i = curi.getCredentialAvatars().iterator();
1144                     i.hasNext();) {
1145                 CredentialAvatar ca = (CredentialAvatar)i.next();
1146                 if (ca.match(type)) {
1147                     if (result == null) {
1148                         result = new HashSet<Credential>();
1149                     }
1150                     result.add(ca.getCredential(handler, curi));
1151                 }
1152             }
1153         }
1154         return result;
1155     }
1156 
    /***
     * One-time setup: register as crawl-status listener, build the
     * HttpClient instance, load any configured cookies, and prepare the
     * SSL socket factory with the configured trust manager.
     */
    public void initialTasks() {
        super.initialTasks();
        this.getController().addCrawlStatusListener(this);
        configureHttp();

        // load cookies from a file if specified in the order file.
        loadCookies();

        // I tried to get the default KeyManagers but doesn't work unless you
        // point at a physical keystore. Passing null seems to do the right
        // thing so we'll go w/ that.
        try {
        	SSLContext context = SSLContext.getInstance("SSL");
			context.init(null, new TrustManager[] {
			    new ConfigurableX509TrustManager((String)
			        getAttribute(ATTR_TRUST))}, null);
	        this.sslfactory = context.getSocketFactory();
		} catch (Exception e) {
			// Crawl proceeds without https support if this fails.
			logger.log(Level.WARNING, "Failed configure of ssl context "
			    + e.getMessage(), e);
		}
    }
1179     
    /***
     * End-of-crawl teardown: persist cookies (if so configured) and close
     * the cookie database before superclass teardown runs.
     */
    public void finalTasks() {
        // At the end save cookies to the file specified in the order file.
        saveCookies();
        cleanupHttp();
        super.finalTasks();
    }
1186 
1187     /***
1188      * Perform any final cleanup related to the HttpClient instance.
1189      */
1190     protected void cleanupHttp() {
1191         if(cookieDb!=null) {
1192             try {
1193                 cookieDb.sync();
1194                 cookieDb.close();
1195             } catch (DatabaseException e) {
1196                 // TODO Auto-generated catch block
1197                 e.printStackTrace();
1198             }
1199         }
1200     }
1201 
    /***
     * Build and configure the shared HttpClient instance: connection
     * manager, timeouts, HTTP version, cookie storage, lenient parsing
     * parameters, global proxy/bind-address, and the server-cache and
     * ssl-factory parameters consumed by our custom protocol factory.
     */
    protected void configureHttp() throws RuntimeException {
        // Get timeout.  Use it for socket and for connection timeout.
        int timeout = (getSoTimeout(null) > 0)? getSoTimeout(null): 0;
        
        // HttpConnectionManager cm = new ThreadLocalHttpConnectionManager();
        HttpConnectionManager cm = new SingleHttpConnectionManager();
        
        // TODO: The following settings should be made in the corresponding
        // HttpConnectionManager, not here.
        HttpConnectionManagerParams hcmp = cm.getParams();
        hcmp.setConnectionTimeout(timeout);
        hcmp.setStaleCheckingEnabled(true);
        // Minimizes bandwidth usage.  Setting to true disables Nagle's
        // algorithm.  IBM JVMs < 142 give an NPE setting this boolean
        // on ssl sockets.
        hcmp.setTcpNoDelay(false);
        
        this.http = new HttpClient(cm);
        HttpClientParams hcp = this.http.getParams();
        // Set default socket timeout.
        hcp.setSoTimeout(timeout);
        // Set client to be version 1.0.
        hcp.setVersion(HttpVersion.HTTP_1_0);

        // Swap in BDB-backed cookie storage if so configured.
		configureHttpCookies();
        
        // Configure how we want the method to act.  Lenient settings: real
        // servers emit ambiguous status lines, odd transfer encodings, and
        // leading garbage; tolerate them rather than fail the fetch.
        this.http.getParams().setParameter(
            HttpMethodParams.SINGLE_COOKIE_HEADER, new Boolean(true));
        this.http.getParams().setParameter(
            HttpMethodParams.UNAMBIGUOUS_STATUS_LINE , new Boolean(false));
        this.http.getParams().setParameter(
            HttpMethodParams.STRICT_TRANSFER_ENCODING, new Boolean(false));
        this.http.getParams().setIntParameter(
            HttpMethodParams.STATUS_LINE_GARBAGE_LIMIT, 10);
        
        // modify the default config with any global settings
        HostConfiguration config = this.http.getHostConfiguration();
        configureProxy(null, config);
        configureBindAddress(null,config);
        
        // Use our own protocol factory, one that gets IP to use from
        // heritrix cache (They're cached in CrawlHost instances).
        final ServerCache cache = getController().getServerCache();
        hcmp.setParameter(SERVER_CACHE_KEY, cache);
        hcmp.setParameter(SSL_FACTORY_KEY, this.sslfactory);
	}
1249 
1250     /***
1251      * Set the HttpClient HttpState instance to use a BDB-backed
1252      * StoredSortedMap for cookie storage, if that option is chosen.
1253      */
1254     private void configureHttpCookies() {
1255         // If Bdb-backed cookies chosen, replace map in HttpState
1256         if(((Boolean)getUncheckedAttribute(null, ATTR_BDB_COOKIES)).
1257                 booleanValue()) {
1258             try {
1259                 EnhancedEnvironment env = getController().getBdbEnvironment();
1260                 StoredClassCatalog classCatalog = env.getClassCatalog();
1261                 DatabaseConfig dbConfig = new DatabaseConfig();
1262                 dbConfig.setTransactional(false);
1263                 dbConfig.setAllowCreate(true);
1264                 dbConfig.setDeferredWrite(true);
1265                 cookieDb = env.openDatabase(null, COOKIEDB_NAME, dbConfig);
1266                 StoredSortedMap cookiesMap = new StoredSortedMap(cookieDb,
1267                         new StringBinding(), new SerialBinding(classCatalog,
1268                                 Cookie.class), true);
1269                 this.http.getState().setCookiesMap(cookiesMap);
1270             } catch (DatabaseException e) {
1271                 // TODO Auto-generated catch block
1272                 logger.severe(e.getMessage());
1273                 e.printStackTrace();
1274             }
1275         }
1276     }
1277 
1278     /***
1279      * @param curi Current CrawlURI.  Used to get context.
1280      * @return Socket timeout value.
1281      */
1282     private int getSoTimeout(CrawlURI curi) {
1283         Integer res = null;
1284         try {
1285             res = (Integer) getAttribute(ATTR_SOTIMEOUT_MS, curi);
1286         } catch (Exception e) {
1287             res = DEFAULT_SOTIMEOUT_MS;
1288         }
1289         return res.intValue();
1290     }
1291 
1292     /***
1293      * @param curi Current CrawlURI.  Used to get context.
1294      * @return Timeout value for total request.
1295      */
1296     private int getTimeout(CrawlURI curi) {
1297         Integer res;
1298         try {
1299             res = (Integer) getAttribute(ATTR_TIMEOUT_SECONDS, curi);
1300         } catch (Exception e) {
1301             res = DEFAULT_TIMEOUT_SECONDS;
1302         }
1303         return res.intValue();
1304     }
1305 
1306     private int getMaxFetchRate(CrawlURI curi) {
1307         Integer res;
1308         try {
1309             res = (Integer)getAttribute(ATTR_FETCH_BANDWIDTH_MAX, curi);
1310         }
1311         catch (Exception e) {
1312             res = DEFAULT_FETCH_BANDWIDTH_MAX;
1313         }
1314         return res.intValue();
1315     }
1316 
1317     private long getMaxLength(CrawlURI curi) {
1318         Long res;
1319         try {
1320             res = (Long) getAttribute(ATTR_MAX_LENGTH_BYTES, curi);
1321             if (res.longValue() == OLD_DEFAULT_MAX_LENGTH_BYTES) {
1322                 res = DEFAULT_MAX_LENGTH_BYTES;
1323             }
1324         } catch (Exception e) {
1325             res = DEFAULT_MAX_LENGTH_BYTES;
1326         }
1327         return res.longValue();
1328     }
1329 
1330     /***
1331      * Load cookies from a file before the first fetch.
1332      * <p>
1333      * The file is a text file in the Netscape's 'cookies.txt' file format.<br>
1334      * Example entry of cookies.txt file:<br>
1335      * <br>
1336      * www.archive.org FALSE / FALSE 1074567117 details-visit texts-cralond<br>
1337      * <br>
1338      * Each line has 7 tab-separated fields:<br>
1339      * <li>1. DOMAIN: The domain that created and have access to the cookie
1340      * value.
1341      * <li>2. FLAG: A TRUE or FALSE value indicating if hosts within the given
1342      * domain can access the cookie value.
1343      * <li>3. PATH: The path within the domain that the cookie value is valid
1344      * for.
1345      * <li>4. SECURE: A TRUE or FALSE value indicating if to use a secure
1346      * connection to access the cookie value.
1347      * <li>5. EXPIRATION: The expiration time of the cookie value (unix style.)
1348      * <li>6. NAME: The name of the cookie value
1349      * <li>7. VALUE: The cookie value
1350      *
1351      * @param cookiesFile file in the Netscape's 'cookies.txt' format.
1352      */
1353     public void loadCookies(String cookiesFile) {
1354         // Do nothing if cookiesFile is not specified.
1355         if (cookiesFile == null || cookiesFile.length() <= 0) {
1356             return;
1357         }
1358         RandomAccessFile raf = null;
1359         try {
1360             raf = new RandomAccessFile(cookiesFile, "r");
1361             String[] cookieParts;
1362             String line;
1363             Cookie cookie = null;
1364             while ((line = raf.readLine()) != null) {
1365                 // Line that starts with # is commented line, therefore skip it.
1366                 if (!line.startsWith("#")) {
1367                     cookieParts = line.split("//t");
1368                     if (cookieParts.length == 7) {
1369                         // Create cookie with not expiration date (-1 value).
1370                         // TODO: add this as an option.
1371                         cookie =
1372                             new Cookie(cookieParts[0], cookieParts[5],
1373                                 cookieParts[6], cookieParts[2], -1,
1374                                 Boolean.valueOf(cookieParts[3]).booleanValue());
1375 
1376                         if (cookieParts[1].toLowerCase().equals("true")) {
1377                             cookie.setDomainAttributeSpecified(true);
1378                         } else {
1379                             cookie.setDomainAttributeSpecified(false);
1380                         }
1381                         this.http.getState().addCookie(cookie);
1382                         logger.fine(
1383                             "Adding cookie: " + cookie.toExternalForm());
1384                     }
1385                 }
1386             }
1387         } catch (FileNotFoundException e) {
1388             // We should probably throw FatalConfigurationException.
1389             System.out.println("Could not find file: " + cookiesFile
1390                     + " (Element: " + ATTR_LOAD_COOKIES + ")");
1391 
1392         } catch (IOException e) {
1393             // We should probably throw FatalConfigurationException.
1394             e.printStackTrace();
1395         } finally {
1396             try {
1397                 if (raf != null) {
1398                     raf.close();
1399                 }
1400             } catch (IOException e) {
1401                 e.printStackTrace();
1402             }
1403         }
1404     }
1405 
1406     /* (non-Javadoc)
1407      * @see org.archive.crawler.framework.Processor#report()
1408      */
1409     public String report() {
1410         StringBuffer ret = new StringBuffer();
1411         ret.append("Processor: org.archive.crawler.fetcher.FetchHTTP\n");
1412         ret.append("  Function:          Fetch HTTP URIs\n");
1413         ret.append("  CrawlURIs handled: " + this.curisHandled + "\n");
1414         ret.append("  Recovery retries:   " + this.recoveryRetries + "\n\n");
1415 
1416         return ret.toString();
1417     }
1418 
1419 
1420     /***
1421      * Load cookies from the file specified in the order file.
1422      *
1423      * <p>
1424      * The file is a text file in the Netscape's 'cookies.txt' file format.<br>
1425      * Example entry of cookies.txt file:<br>
1426      * <br>
1427      * www.archive.org FALSE / FALSE 1074567117 details-visit texts-cralond<br>
1428      * <br>
1429      * Each line has 7 tab-separated fields:<br>
1430      * <li>1. DOMAIN: The domain that created and have access to the cookie
1431      * value.
1432      * <li>2. FLAG: A TRUE or FALSE value indicating if hosts within the given
1433      * domain can access the cookie value.
1434      * <li>3. PATH: The path within the domain that the cookie value is valid
1435      * for.
1436      * <li>4. SECURE: A TRUE or FALSE value indicating if to use a secure
1437      * connection to access the cookie value.
1438      * <li>5. EXPIRATION: The expiration time of the cookie value (unix style.)
1439      * <li>6. NAME: The name of the cookie value
1440      * <li>7. VALUE: The cookie value
1441      */
1442     public void loadCookies() {
1443         try {
1444             loadCookies((String) getAttribute(ATTR_LOAD_COOKIES));
1445         } catch (MBeanException e) {
1446             logger.warning(e.getLocalizedMessage());
1447         } catch (ReflectionException e) {
1448             logger.warning(e.getLocalizedMessage());
1449         } catch (AttributeNotFoundException e) {
1450             logger.warning(e.getLocalizedMessage());
1451         }
1452     }
1453     /***
1454      * Saves cookies to the file specified in the order file.
1455      *
1456      * Output file is in the Netscape 'cookies.txt' format.
1457      *
1458      */
1459     public void saveCookies() {
1460         try {
1461             saveCookies((String) getAttribute(ATTR_SAVE_COOKIES));
1462         } catch (MBeanException e) {
1463             logger.warning(e.getLocalizedMessage());
1464         } catch (ReflectionException e) {
1465             logger.warning(e.getLocalizedMessage());
1466         } catch (AttributeNotFoundException e) {
1467             logger.warning(e.getLocalizedMessage());
1468         }
1469     }
1470     /***
1471      * Saves cookies to a file.
1472      *
1473      * Output file is in the Netscape 'cookies.txt' format.
1474      *
1475      * @param saveCookiesFile output file.
1476      */
1477     public void saveCookies(String saveCookiesFile) {
1478         // Do nothing if cookiesFile is not specified.
1479         if (saveCookiesFile == null || saveCookiesFile.length() <= 0) {
1480             return;
1481         }
1482 
1483         FileOutputStream out = null;
1484         try {
1485             out = new FileOutputStream(new File(saveCookiesFile));
1486             @SuppressWarnings("unchecked")
1487             Map<String,Cookie> cookies = http.getState().getCookiesMap();
1488             String tab ="\t";
1489             out.write("# Heritrix Cookie File\n".getBytes());
1490             out.write(
1491                 "# This file is the Netscape cookies.txt format\n\n".getBytes());
1492             for (Cookie cookie: cookies.values()) {
1493                 MutableString line =
1494                     new MutableString(1024 * 2 /*Guess an initial size*/);
1495                 line.append(cookie.getDomain());
1496                 line.append(tab);
1497                 line.append(
1498                     cookie.isDomainAttributeSpecified() == true
1499                         ? "TRUE"
1500                         : "FALSE");
1501                 line.append(tab);
1502                 line.append(cookie.getPath());
1503                 line.append(tab);
1504                 line.append(
1505                     cookie.getSecure() == true ? "TRUE" : "FALSE");
1506                 line.append(tab);
1507                 line.append(cookie.getName());
1508                 line.append(tab);
1509                 line.append((null==cookie.getValue())?"":cookie.getValue());
1510                 line.append("\n");
1511                 out.write(line.toString().getBytes());
1512             }
1513         } catch (FileNotFoundException e) {
1514             // We should probably throw FatalConfigurationException.
1515             System.out.println("Could not find file: " + saveCookiesFile
1516                     + " (Element: " + ATTR_SAVE_COOKIES + ")");
1517         } catch (IOException e) {
1518             e.printStackTrace();
1519         } finally {
1520             try {
1521                 if (out != null) {
1522                     out.close();
1523                 }
1524             } catch (IOException e) {
1525                 e.printStackTrace();
1526             }
1527         }
1528     }
1529 
1530     /* (non-Javadoc)
1531      * @see org.archive.crawler.settings.ModuleType#listUsedFiles(java.util.List)
1532      */
1533     protected void listUsedFiles(List<String> list) {
1534         // List the cookies files
1535         // Add seed file
1536         try {
1537             String tmp = (String)getAttribute(ATTR_LOAD_COOKIES);
1538             if(tmp != null && tmp.length() > 0){
1539                 File file = getSettingsHandler().
1540                         getPathRelativeToWorkingDirectory(tmp);
1541                 list.add(file.getAbsolutePath());
1542             }
1543             tmp = (String)getAttribute(ATTR_SAVE_COOKIES);
1544             if(tmp != null && tmp.length() > 0){
1545                 File file = getSettingsHandler().
1546                         getPathRelativeToWorkingDirectory(tmp);
1547                 list.add(file.getAbsolutePath());
1548             }
1549         } catch (AttributeNotFoundException e) {
1550             // TODO Auto-generated catch block
1551             e.printStackTrace();
1552         } catch (MBeanException e) {
1553             // TODO Auto-generated catch block
1554             e.printStackTrace();
1555         } catch (ReflectionException e) {
1556             // TODO Auto-generated catch block
1557             e.printStackTrace();
1558         }
1559     }
1560     
1561     private void setAcceptHeaders(CrawlURI curi, HttpMethod get) {
1562         try {
1563             StringList accept_headers = (StringList) getAttribute(ATTR_ACCEPT_HEADERS, curi);
1564             if (!accept_headers.isEmpty()) {
1565                 for (ListIterator i = accept_headers.listIterator(); i.hasNext();) {
1566                     String hdr = (String) i.next();
1567                     String[] nvp = hdr.split(": +");
1568                     if (nvp.length == 2) {
1569                         get.setRequestHeader(nvp[0], nvp[1]);
1570                     }
1571                     else {
1572                         logger.warning("Invalid accept header: " + hdr);
1573                     }
1574                 }
1575             }
1576         }
1577         catch (AttributeNotFoundException e) {
1578             logger.severe(e.getMessage());
1579         }
1580     }
1581 
1582     // custom serialization
1583     private void writeObject(ObjectOutputStream stream) throws IOException {
1584         stream.defaultWriteObject();
1585         // save cookies
1586         @SuppressWarnings("unchecked")
1587         Collection<Cookie> c = http.getState().getCookiesMap().values();
1588         Cookie[] cookies = c.toArray(new Cookie[c.size()]);
1589         stream.writeObject(cookies);
1590     }
1591     
    // Custom deserialization: reads the cookie array written by
    // writeObject() (order must match), then defers re-applying it until
    // the stream's finish tasks run, when the HttpClient can be rebuilt.
    // NOTE(review): the cast assumes the stream is always an
    // ObjectPlusFilesInputStream — appears to be a framework invariant;
    // confirm against the checkpoint/restore caller.
    private void readObject(ObjectInputStream stream) throws IOException, ClassNotFoundException {
        stream.defaultReadObject();
        Cookie cookies[] = (Cookie[]) stream.readObject();
        ObjectPlusFilesInputStream coistream = (ObjectPlusFilesInputStream)stream;
        coistream.registerFinishTask( new PostRestore(cookies) );
    }
1598     
1599     /***
1600      * @return Returns the http instance.
1601      */
1602     protected HttpClient getHttp() {
1603         return this.http;
1604     }
1605     
1606     class PostRestore implements Runnable {
1607         Cookie cookies[];
1608         public PostRestore(Cookie cookies[]) {
1609             this.cookies = cookies;
1610         }
1611     	public void run() {
1612             configureHttp();
1613             for(int i = 0; i < cookies.length; i++) {
1614                 getHttp().getState().addCookie(cookies[i]);
1615             }
1616         }
1617     }
1618 
    /* (non-Javadoc)
     * @see org.archive.crawler.event.CrawlStatusListener#crawlStarted(java.lang.String)
     */
    public void crawlStarted(String message) {
        // No action needed when the crawl starts.
    }
1625     
    /* Flush pending cookie-database writes so the checkpoint captures a
     * consistent state.
     * (non-Javadoc)
     * @see org.archive.crawler.event.CrawlStatusListener#crawlCheckpoint(java.io.File)
     */
    public void crawlCheckpoint(File checkpointDir) {
        if(cookieDb!=null) {
            try {
                cookieDb.sync();
            } catch (DatabaseException e) {
                // A failed sync would leave an inconsistent checkpoint;
                // abort by rethrowing (cause preserved).
                throw new RuntimeException(e);
            }
        }
    }
1639 
    /* (non-Javadoc)
     * @see org.archive.crawler.event.CrawlStatusListener#crawlEnding(java.lang.String)
     */
    public void crawlEnding(String sExitMessage) {
        // No action needed while the crawl is ending; cleanup happens in
        // crawlEnded(String).
    }
1646 
    /* (non-Javadoc)
     * @see org.archive.crawler.event.CrawlStatusListener#crawlEnded(java.lang.String)
     */
    public void crawlEnded(String sExitMessage) {
        // Drop the HttpClient reference so it (and its state) can be
        // garbage-collected once the crawl is over.
        this.http = null;
    }
1653 
    /* (non-Javadoc)
     * @see org.archive.crawler.event.CrawlStatusListener#crawlPausing(java.lang.String)
     */
    public void crawlPausing(String statusMessage) {
        // No action needed when the crawl is pausing.
    }
1660 
    /* (non-Javadoc)
     * @see org.archive.crawler.event.CrawlStatusListener#crawlPaused(java.lang.String)
     */
    public void crawlPaused(String statusMessage) {
        // No action needed while the crawl is paused.
    }
1667 
    /* (non-Javadoc)
     * @see org.archive.crawler.event.CrawlStatusListener#crawlResuming(java.lang.String)
     */
    public void crawlResuming(String statusMessage) {
        // No action needed when the crawl resumes.
    }
1674 }