1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.crawler.postprocessor;
24
25
26 import java.util.logging.Logger;
27
28 import org.apache.commons.httpclient.URIException;
29 import org.archive.crawler.datamodel.CoreAttributeConstants;
30 import org.archive.crawler.datamodel.CrawlHost;
31 import org.archive.crawler.datamodel.CrawlServer;
32 import org.archive.crawler.datamodel.CrawlURI;
33 import org.archive.crawler.datamodel.FetchStatusCodes;
34 import org.archive.crawler.framework.Processor;
35 import org.archive.crawler.framework.Frontier.FrontierGroup;
36
37
38 /***
39 * A step, late in the processing of a CrawlURI, for updating the per-host
40 * information that may have been affected by the fetch. This will initially
41 * be robots and ip address info; it could include other per-host stats that
42 * would affect the crawl (like total pages visited at the site) as well.
43 *
44 * @author gojomo
45 * @version $Date: 2007-08-28 05:15:25 +0000 (Tue, 28 Aug 2007) $, $Revision: 5439 $
46 */
47 public class CrawlStateUpdater extends Processor implements
48 CoreAttributeConstants, FetchStatusCodes {
49
50 private static final long serialVersionUID = -1072728147960180091L;
51
52 private static final Logger logger =
53 Logger.getLogger(CrawlStateUpdater.class.getName());
54
55 public CrawlStateUpdater(String name) {
56 super(name, "Crawl state updater");
57 }
58
59 protected void innerProcess(CrawlURI curi) {
60 CrawlServer server =
61 getController().getServerCache().getServerFor(curi);
62
63 String scheme = curi.getUURI().getScheme().toLowerCase();
64 if (scheme.equals("http") || scheme.equals("https") &&
65 server != null) {
66
67 if(curi.getFetchStatus() == S_CONNECT_FAILED) {
68 server.incrementConsecutiveConnectionErrors();
69 } else if (curi.getFetchStatus() > 0){
70 server.resetConsecutiveConnectionErrors();
71 }
72
73
74 try {
75 if (curi.getUURI().getPath() != null &&
76 curi.getUURI().getPath().equals("/robots.txt")) {
77
78 server.updateRobots(curi);
79 }
80 }
81 catch (URIException e) {
82 logger.severe("Failed get path on " + curi.getUURI());
83 }
84 }
85 }
86 }