1   /* CrawlSubstats
2   *
3   * $Id: CrawlSubstats.java 5439 2007-08-28 05:15:25Z gojomo $
4   *
5   * Created on Nov 4, 2005
6   *
7   * Copyright (C) 2005 Internet Archive.
8   *
9   * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24  */ 
25  package org.archive.crawler.datamodel;
26  
27  import java.io.Serializable;
28  
29  import org.apache.commons.httpclient.HttpStatus;
30  
31  /***
32   * Collector of statististics for a 'subset' of a crawl,
33   * such as a server (host:port), host, or frontier group 
34   * (eg queue). 
35   * 
36   * @author gojomo
37   */
38  public class CrawlSubstats implements Serializable, FetchStatusCodes {
39      private static final long serialVersionUID = 8624425657056569036L;
40  
41      public enum Stage {SCHEDULED, SUCCEEDED, RETRIED, DISREGARDED, FAILED};
42      
43      public interface HasCrawlSubstats {
44          public CrawlSubstats getSubstats();
45      }
46      
47      long totalScheduled;   // anything initially scheduled
48                             // (totalScheduled - (fetchSuccesses + fetchFailures)
49      long fetchSuccesses;   // anything disposed-success 
50                             // (HTTP 2XX response codes, other non-errors)
51      long fetchFailures;    // anything disposed-failure
52      long fetchDisregards;  // anything disposed-disregard
53      long fetchResponses;   // all positive responses (incl. 3XX, 4XX, 5XX)
54      long robotsDenials;    // all robots-precluded failures
55      long successBytes;     // total size of all success responses
56      long totalBytes;       // total size of all responses
57      long fetchNonResponses; // processing attempts resulting in no response
58                             // (both failures and temp deferrals)
59      
60      /***
61       * Examing the CrawlURI and based on its status and internal values,
62       * update tallies. 
63       * 
64       * @param curi
65       * @deprecated
66       */
67      public synchronized void tally(CrawlURI curi) {
68          if(curi.getFetchStatus()<=0) {
69              fetchNonResponses++;
70              return;
71          }
72          fetchResponses++;
73          totalBytes += curi.getContentSize();
74          if(curi.getFetchStatus()>=HttpStatus.SC_OK && 
75                  curi.getFetchStatus()<300) {
76              fetchSuccesses++;
77              successBytes += curi.getContentSize();
78          }
79      }
80      
81      public synchronized void tally(CrawlURI curi, Stage stage) {
82          switch(stage) {
83              case SCHEDULED:
84                  totalScheduled++;
85                  break;
86              case RETRIED:
87                  if(curi.getFetchStatus()<=0) {
88                      fetchNonResponses++;
89                      return;
90                  }
91                  break;
92              case SUCCEEDED:
93                  fetchSuccesses++;
94                  totalBytes += curi.getContentSize();
95                  successBytes += curi.getContentSize();
96                  break;
97              case DISREGARDED:
98                  fetchDisregards++;
99                  if(curi.getFetchStatus()==S_ROBOTS_PRECLUDED) {
100                     robotsDenials++;
101                 }
102                 break;
103             case FAILED:
104                 if(curi.getFetchStatus()<=0) {
105                     fetchNonResponses++;
106                 } else {
107                     totalBytes += curi.getContentSize();
108                 }
109                 fetchFailures++;
110                 break;
111         }
112     }
113     
114     public long getFetchSuccesses() {
115         return fetchSuccesses;
116     }
117     public long getFetchResponses() {
118         return fetchResponses;
119     }
120     public long getSuccessBytes() {
121         return successBytes;
122     }
123     public long getTotalBytes() {
124         return totalBytes;
125     }
126     public long getFetchNonResponses() {
127         return fetchNonResponses;
128     }
129     public long getTotalScheduled() {
130         return totalScheduled;
131     }
132     public long getFetchDisregards() {
133         return fetchDisregards;
134     }
135     public long getRobotsDenials() {
136         return robotsDenials;
137     }
138     
139     public long getRemaining() {
140         return totalScheduled - (fetchSuccesses + fetchFailures + fetchDisregards);
141     }
142     public long getRecordedFinishes() {
143         return fetchSuccesses + fetchFailures;
144     }
145 }