1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 package org.archive.crawler.datamodel;
25
26 import java.io.BufferedReader;
27 import java.io.IOException;
28 import java.io.InputStreamReader;
29 import java.io.ObjectInputStream;
30 import java.io.Serializable;
31 import java.io.StringReader;
32 import java.util.HashSet;
33 import java.util.Set;
34 import java.util.zip.Checksum;
35
36 import org.apache.commons.httpclient.URIException;
37 import org.archive.crawler.datamodel.credential.CredentialAvatar;
38 import org.archive.crawler.framework.Checkpointer;
39 import org.archive.crawler.framework.ToeThread;
40 import org.archive.crawler.settings.CrawlerSettings;
41 import org.archive.crawler.settings.SettingsHandler;
42 import org.archive.io.ReplayInputStream;
43 import org.archive.net.UURIFactory;
44
45 /***
46 * Represents a single remote "server".
47 *
48 * A server is a service on a host. There might be more than one service on a
49 * host differentiated by a port number.
50 *
51 * @author gojomo
52 */
53 public class CrawlServer implements Serializable, CrawlSubstats.HasCrawlSubstats {
54
55 private static final long serialVersionUID = -989714570750970369L;
56
57 public static final long ROBOTS_NOT_FETCHED = -1;
58 /*** only check if robots-fetch is perhaps superfluous
59 * after this many tries */
60 public static final long MIN_ROBOTS_RETRIES = 2;
61
62 private final String server;
63 private int port;
64 private transient SettingsHandler settingsHandler;
65 private RobotsExclusionPolicy robots;
66 long robotsFetched = ROBOTS_NOT_FETCHED;
67 boolean validRobots = false;
68 Checksum robotstxtChecksum;
69 CrawlSubstats substats = new CrawlSubstats();
70
71
72
73
74 protected int consecutiveConnectionErrors = 0;
75
76 /***
77 * Set of credential avatars.
78 */
79 private transient Set<CredentialAvatar> avatars = null;
80
81 /***
82 * Creates a new CrawlServer object.
83 *
84 * @param h the host string for the server.
85 */
86 public CrawlServer(String h) {
87
88 server = h;
89 int colonIndex = server.lastIndexOf(":");
90 if (colonIndex < 0) {
91 port = -1;
92 } else {
93 try {
94 port = Integer.parseInt(server.substring(colonIndex + 1));
95 } catch (NumberFormatException e) {
96 port = -1;
97 }
98 }
99 }
100
101 /*** Get the robots exclusion policy for this server.
102 *
103 * @return the robots exclusion policy for this server.
104 */
105 public RobotsExclusionPolicy getRobots() {
106 return robots;
107 }
108
109 /*** Set the robots exclusion policy for this server.
110 *
111 * @param policy the policy to set.
112 */
113 public void setRobots(RobotsExclusionPolicy policy) {
114 robots = policy;
115 }
116
117 public String toString() {
118 return "CrawlServer("+server+")";
119 }
120
121 /*** Update the robots exclusion policy.
122 *
123 * @param curi the crawl URI containing the fetched robots.txt
124 * @throws IOException
125 */
126 public void updateRobots(CrawlURI curi) {
127 RobotsHonoringPolicy honoringPolicy =
128 settingsHandler.getOrder().getRobotsHonoringPolicy();
129
130 robotsFetched = System.currentTimeMillis();
131
132 boolean gotSomething = curi.getFetchStatus() > 0
133 && curi.isHttpTransaction();
134 if (!gotSomething && curi.getFetchAttempts() < MIN_ROBOTS_RETRIES) {
135
136 validRobots = false;
137 return;
138 }
139
140 CrawlerSettings settings = getSettings(curi);
141 int type = honoringPolicy.getType(settings);
142 if (type == RobotsHonoringPolicy.IGNORE) {
143
144 robots = RobotsExclusionPolicy.ALLOWALL;
145 validRobots = true;
146 return;
147 }
148
149 if(!gotSomething) {
150
151 validRobots = false;
152 return;
153 }
154
155 if (!curi.is2XXSuccess()) {
156
157
158
159
160
161
162
163 robots = RobotsExclusionPolicy.ALLOWALL;
164 validRobots = true;
165 return;
166 }
167
168 ReplayInputStream contentBodyStream = null;
169 try {
170 try {
171 BufferedReader reader;
172 if (type == RobotsHonoringPolicy.CUSTOM) {
173 reader = new BufferedReader(new StringReader(honoringPolicy
174 .getCustomRobots(settings)));
175 } else {
176 contentBodyStream = curi.getHttpRecorder()
177 .getRecordedInput().getContentReplayInputStream();
178
179 contentBodyStream.setToResponseBodyStart();
180 reader = new BufferedReader(new InputStreamReader(
181 contentBodyStream));
182 }
183 robots = RobotsExclusionPolicy.policyFor(settings,
184 reader, honoringPolicy);
185 validRobots = true;
186 } finally {
187 if (contentBodyStream != null) {
188 contentBodyStream.close();
189 }
190 }
191 } catch (IOException e) {
192 robots = RobotsExclusionPolicy.ALLOWALL;
193 validRobots = true;
194 curi.addLocalizedError(getName(), e,
195 "robots.txt parsing IOException");
196 }
197 }
198
199 /***
200 * @return Returns the time when robots.txt was fetched.
201 */
202 public long getRobotsFetchedTime() {
203 return robotsFetched;
204 }
205
206 /***
207 * @return The server string which might include a port number.
208 */
209 public String getName() {
210 return server;
211 }
212
213 /*** Get the port number for this server.
214 *
215 * @return the port number or -1 if not known (uses default for protocol)
216 */
217 public int getPort() {
218 return port;
219 }
220
221 /***
222 * Called when object is being deserialized.
223 * In addition to the default java deserialization, this method
224 * re-establishes the references to settings handler and robots honoring
225 * policy.
226 *
227 * @param stream the stream to deserialize from.
228 * @throws IOException if I/O errors occur
229 * @throws ClassNotFoundException If the class for an object being restored
230 * cannot be found.
231 */
232 private void readObject(ObjectInputStream stream)
233 throws IOException, ClassNotFoundException {
234 stream.defaultReadObject();
235 settingsHandler = SettingsHandler.getThreadContextSettingsHandler();
236 postDeserialize();
237 }
238
239 private void postDeserialize() {
240 if (this.robots != null) {
241 RobotsHonoringPolicy honoringPolicy =
242 settingsHandler.getOrder().getRobotsHonoringPolicy();
243 this.robots.honoringPolicy = honoringPolicy;
244 }
245 }
246
247 /*** Get the settings handler.
248 *
249 * @return the settings handler.
250 */
251 public SettingsHandler getSettingsHandler() {
252 return this.settingsHandler;
253 }
254
255 /*** Get the settings object in effect for this server.
256 * @param curi
257 *
258 * @return the settings object in effect for this server.
259 * @throws URIException
260 */
261 private CrawlerSettings getSettings(CandidateURI curi) {
262 try {
263 return this.settingsHandler.
264 getSettings(curi.getUURI().getReferencedHost(),
265 curi.getUURI());
266 } catch (URIException e) {
267 return null;
268 }
269 }
270
271 /*** Set the settings handler to be used by this server.
272 *
273 * @param settingsHandler the settings handler to be used by this server.
274 */
275 public void setSettingsHandler(SettingsHandler settingsHandler) {
276 this.settingsHandler = settingsHandler;
277 }
278
279 public void incrementConsecutiveConnectionErrors() {
280 this.consecutiveConnectionErrors++;
281 }
282
283 public void resetConsecutiveConnectionErrors() {
284 this.consecutiveConnectionErrors = 0;
285 }
286
287 /***
288 * @return Credential avatars for this server. Returns null if none.
289 */
290 public Set getCredentialAvatars() {
291 return this.avatars;
292 }
293
294 /***
295 * @return True if there are avatars attached to this instance.
296 */
297 public boolean hasCredentialAvatars() {
298 return this.avatars != null && this.avatars.size() > 0;
299 }
300
301 /***
302 * Add an avatar.
303 *
304 * @param ca Credential avatar to add to set of avatars.
305 */
306 public void addCredentialAvatar(CredentialAvatar ca) {
307 if (this.avatars == null) {
308 this.avatars = new HashSet<CredentialAvatar>();
309 }
310 this.avatars.add(ca);
311 }
312
313 /***
314 * If true then valid robots.txt information has been retrieved. If false
315 * either no attempt has been made to fetch robots.txt or the attempt
316 * failed.
317 *
318 * @return Returns the validRobots.
319 */
320 public boolean isValidRobots() {
321 return validRobots;
322 }
323
324 /***
325 * Get key to use doing lookup on server instances.
326 * @param cauri CandidateURI we're to get server key for.
327 * @return String to use as server key.
328 * @throws URIException
329 */
330 public static String getServerKey(CandidateURI cauri)
331 throws URIException {
332
333
334
335 String key = cauri.getUURI().getAuthorityMinusUserinfo();
336 if (key == null) {
337
338
339
340 key = cauri.getUURI().getCurrentHierPath();
341 if(key != null && !key.matches("[-_//w//.:]+")) {
342
343
344 key = null;
345 }
346 }
347 if (key != null &&
348 cauri.getUURI().getScheme().equals(UURIFactory.HTTPS)) {
349
350
351 if (!key.matches(".+:[0-9]+")) {
352 key += ":" + UURIFactory.HTTPS_PORT;
353 }
354 }
355 return key;
356 }
357
358
359
360
361 public CrawlSubstats getSubstats() {
362 return substats;
363 }
364 }