1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 package org.archive.crawler.extractor;
25
26 import java.util.logging.Logger;
27
28 import org.apache.commons.httpclient.Header;
29 import org.apache.commons.httpclient.HttpMethod;
30 import org.apache.commons.httpclient.URIException;
31 import org.archive.crawler.datamodel.CoreAttributeConstants;
32 import org.archive.crawler.datamodel.CrawlURI;
33 import org.archive.crawler.framework.Processor;
34
35 /***
36 * Extracts URIs from HTTP response headers.
37 * @author gojomo
38 */
39 public class ExtractorHTTP extends Processor
40 implements CoreAttributeConstants {
41
42 private static final long serialVersionUID = 8499072198570554647L;
43
44 private static final Logger LOGGER =
45 Logger.getLogger(ExtractorHTTP.class.getName());
46 protected long numberOfCURIsHandled = 0;
47 protected long numberOfLinksExtracted = 0;
48
49 public ExtractorHTTP(String name) {
50 super(name,
51 "HTTP extractor. Extracts URIs from HTTP response headers.");
52 }
53
54 public void innerProcess(CrawlURI curi) {
55 if (!curi.isHttpTransaction() || curi.getFetchStatus() <= 0) {
56
57 return;
58 }
59 numberOfCURIsHandled++;
60 HttpMethod method = (HttpMethod)curi.getObject(A_HTTP_TRANSACTION);
61 addHeaderLink(curi, method.getResponseHeader("Location"));
62 addHeaderLink(curi, method.getResponseHeader("Content-Location"));
63 }
64
65 protected void addHeaderLink(CrawlURI curi, Header loc) {
66 if (loc == null) {
67
68 return;
69 }
70
71 try {
72 curi.createAndAddLink(loc.getValue(), loc.getName() + ":",
73 Link.REFER_HOP);
74 numberOfLinksExtracted++;
75 } catch (URIException e) {
76
77
78 if (getController() != null) {
79 getController().logUriError(e, curi.getUURI(), loc.getValue());
80 } else {
81 LOGGER.info(curi + ", " + loc.getValue() + ": " +
82 e.getMessage());
83 }
84 }
85
86 }
87
88 public String report() {
89 StringBuffer ret = new StringBuffer();
90 ret.append("Processor: org.archive.crawler.extractor.ExtractorHTTP\n");
91 ret.append(" Function: " +
92 "Extracts URIs from HTTP response headers\n");
93 ret.append(" CrawlURIs handled: " + numberOfCURIsHandled + "\n");
94 ret.append(" Links extracted: " + numberOfLinksExtracted + "\n\n");
95 return ret.toString();
96 }
97 }