/* Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * ExtractorHTTP.java
 * Created on Jul 3, 2003
 *
 * $Header$
 */
package org.archive.crawler.extractor;

import java.util.logging.Logger;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.framework.Processor;

/**
 * Extracts URIs from HTTP response headers.
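 * <p>
 * Illustrative example (not from the original source): a fetch of
 * http://example.com/old answered with
 * <pre>
 * HTTP/1.1 301 Moved Permanently
 * Location: http://example.com/new
 * </pre>
 * records http://example.com/new as one extracted out-link on the
 * CrawlURI, with the {@link Link#REFER_HOP} hop type.
 *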
 * @author gojomo
 */
public class ExtractorHTTP extends Processor
implements CoreAttributeConstants {

    private static final long serialVersionUID = 8499072198570554647L;

    private static final Logger LOGGER =
        Logger.getLogger(ExtractorHTTP.class.getName());
    protected long numberOfCURIsHandled = 0;
    protected long numberOfLinksExtracted = 0;

    public ExtractorHTTP(String name) {
        super(name,
            "HTTP extractor. Extracts URIs from HTTP response headers.");
    }

    public void innerProcess(CrawlURI curi) {
        if (!curi.isHttpTransaction() || curi.getFetchStatus() <= 0) {
            // Skip anything that is not an HTTP transaction, or whose
            // fetch failed outright (status <= 0).
            return;
        }
        numberOfCURIsHandled++;
        HttpMethod method = (HttpMethod)curi.getObject(A_HTTP_TRANSACTION);
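        // Redirects announce their target in "Location"; "Content-Location",
        // when present, names the specific variant that was served. Both are
        // worth capturing as discovered URIs.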
        addHeaderLink(curi, method.getResponseHeader("Location"));
        addHeaderLink(curi, method.getResponseHeader("Content-Location"));
    }

    protected void addHeaderLink(CrawlURI curi, Header loc) {
        if (loc == null) {
            // If null, return without adding anything.
            return;
        }
        // TODO: consider possibility of multiple headers
        try {
            curi.createAndAddLink(loc.getValue(), loc.getName() + ":",
                Link.REFER_HOP);
            numberOfLinksExtracted++;
        } catch (URIException e) {
            // There may not be a controller (e.g. if we're being run
            // by the extractor tool).
            if (getController() != null) {
                getController().logUriError(e, curi.getUURI(), loc.getValue());
            } else {
                LOGGER.info(curi + ", " + loc.getValue() + ": " +
                    e.getMessage());
            }
        }
    }
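
    /**
     * Sketch addressing the multiple-header TODO above; not part of the
     * original class. commons-httpclient exposes every instance of a
     * repeated response header via HttpMethod#getResponseHeaders(String),
     * so a caller could pass that array here to extract them all.
     */
    protected void addHeaderLinks(CrawlURI curi, Header[] headers) {
        for (int i = 0; i < headers.length; i++) {
            addHeaderLink(curi, headers[i]);
        }
    }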

    public String report() {
        StringBuffer ret = new StringBuffer();
        ret.append("Processor: org.archive.crawler.extractor.ExtractorHTTP\n");
        ret.append("  Function:          " +
            "Extracts URIs from HTTP response headers\n");
        ret.append("  CrawlURIs handled: " + numberOfCURIsHandled + "\n");
        ret.append("  Links extracted:   " + numberOfLinksExtracted + "\n\n");
        return ret.toString();
    }
}