1   /* CrawlStateUpdater
2    *
3    * Created on Jun 5, 2003
4    *
5    * Copyright (C) 2003 Internet Archive.
6    *
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    *
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   *
14   * Heritrix is distributed in the hope that it will be useful,
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   *
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.crawler.postprocessor;
24  
25  
26  import java.util.logging.Logger;
27  
28  import org.apache.commons.httpclient.URIException;
29  import org.archive.crawler.datamodel.CoreAttributeConstants;
30  import org.archive.crawler.datamodel.CrawlHost;
31  import org.archive.crawler.datamodel.CrawlServer;
32  import org.archive.crawler.datamodel.CrawlURI;
33  import org.archive.crawler.datamodel.FetchStatusCodes;
34  import org.archive.crawler.framework.Processor;
35  import org.archive.crawler.framework.Frontier.FrontierGroup;
36  
37  
38  /***
39   * A step, late in the processing of a CrawlURI, for updating the per-host
40   * information that may have been affected by the fetch. This will initially
41   * be robots and ip address info; it could include other per-host stats that
42   * would affect the crawl (like total pages visited at the site) as well.
43   *
44   * @author gojomo
45   * @version $Date: 2007-08-28 05:15:25 +0000 (Tue, 28 Aug 2007) $, $Revision: 5439 $
46   */
47  public class CrawlStateUpdater extends Processor implements
48          CoreAttributeConstants, FetchStatusCodes {
49  
50      private static final long serialVersionUID = -1072728147960180091L;
51  
52      private static final Logger logger =
53          Logger.getLogger(CrawlStateUpdater.class.getName());
54  
55      public CrawlStateUpdater(String name) {
56          super(name, "Crawl state updater");
57      }
58  
59      protected void innerProcess(CrawlURI curi) {
60          CrawlServer server =
61              getController().getServerCache().getServerFor(curi);
62          
63          String scheme = curi.getUURI().getScheme().toLowerCase();
64          if (scheme.equals("http") || scheme.equals("https") &&
65                  server != null) {
66              // Update connection problems counter
67              if(curi.getFetchStatus() == S_CONNECT_FAILED) {
68                  server.incrementConsecutiveConnectionErrors();
69              } else if (curi.getFetchStatus() > 0){
70                  server.resetConsecutiveConnectionErrors();
71              }
72  
73              // Update robots info
74              try {
75                  if (curi.getUURI().getPath() != null &&
76                          curi.getUURI().getPath().equals("/robots.txt")) {
77                      // Update server with robots info
78                      server.updateRobots(curi);
79                  }
80              }
81              catch (URIException e) {
82                  logger.severe("Failed get path on " + curi.getUURI());
83              }
84          }
85      }
86  }