1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler.datamodel;
26
27 import java.io.Serializable;
28
29 import org.apache.commons.httpclient.HttpStatus;
30
31 /***
32 * Collector of statististics for a 'subset' of a crawl,
33 * such as a server (host:port), host, or frontier group
34 * (eg queue).
35 *
36 * @author gojomo
37 */
38 public class CrawlSubstats implements Serializable, FetchStatusCodes {
39 private static final long serialVersionUID = 8624425657056569036L;
40
41 public enum Stage {SCHEDULED, SUCCEEDED, RETRIED, DISREGARDED, FAILED};
42
43 public interface HasCrawlSubstats {
44 public CrawlSubstats getSubstats();
45 }
46
47 long totalScheduled;
48
49 long fetchSuccesses;
50
51 long fetchFailures;
52 long fetchDisregards;
53 long fetchResponses;
54 long robotsDenials;
55 long successBytes;
56 long totalBytes;
57 long fetchNonResponses;
58
59
60 /***
61 * Examing the CrawlURI and based on its status and internal values,
62 * update tallies.
63 *
64 * @param curi
65 * @deprecated
66 */
67 public synchronized void tally(CrawlURI curi) {
68 if(curi.getFetchStatus()<=0) {
69 fetchNonResponses++;
70 return;
71 }
72 fetchResponses++;
73 totalBytes += curi.getContentSize();
74 if(curi.getFetchStatus()>=HttpStatus.SC_OK &&
75 curi.getFetchStatus()<300) {
76 fetchSuccesses++;
77 successBytes += curi.getContentSize();
78 }
79 }
80
81 public synchronized void tally(CrawlURI curi, Stage stage) {
82 switch(stage) {
83 case SCHEDULED:
84 totalScheduled++;
85 break;
86 case RETRIED:
87 if(curi.getFetchStatus()<=0) {
88 fetchNonResponses++;
89 return;
90 }
91 break;
92 case SUCCEEDED:
93 fetchSuccesses++;
94 totalBytes += curi.getContentSize();
95 successBytes += curi.getContentSize();
96 break;
97 case DISREGARDED:
98 fetchDisregards++;
99 if(curi.getFetchStatus()==S_ROBOTS_PRECLUDED) {
100 robotsDenials++;
101 }
102 break;
103 case FAILED:
104 if(curi.getFetchStatus()<=0) {
105 fetchNonResponses++;
106 } else {
107 totalBytes += curi.getContentSize();
108 }
109 fetchFailures++;
110 break;
111 }
112 }
113
114 public long getFetchSuccesses() {
115 return fetchSuccesses;
116 }
117 public long getFetchResponses() {
118 return fetchResponses;
119 }
120 public long getSuccessBytes() {
121 return successBytes;
122 }
123 public long getTotalBytes() {
124 return totalBytes;
125 }
126 public long getFetchNonResponses() {
127 return fetchNonResponses;
128 }
129 public long getTotalScheduled() {
130 return totalScheduled;
131 }
132 public long getFetchDisregards() {
133 return fetchDisregards;
134 }
135 public long getRobotsDenials() {
136 return robotsDenials;
137 }
138
139 public long getRemaining() {
140 return totalScheduled - (fetchSuccesses + fetchFailures + fetchDisregards);
141 }
142 public long getRecordedFinishes() {
143 return fetchSuccesses + fetchFailures;
144 }
145 }