/* Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * CrawlServer.java
 * Created on Apr 17, 2003
 *
 * $Header$
 */
package org.archive.crawler.datamodel;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.Serializable;
import java.io.StringReader;
import java.util.HashSet;
import java.util.Set;
import java.util.zip.Checksum;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.credential.CredentialAvatar;
import org.archive.crawler.settings.CrawlerSettings;
import org.archive.crawler.settings.SettingsHandler;
import org.archive.io.ReplayInputStream;
import org.archive.net.UURIFactory;

/**
 * Represents a single remote "server".
 *
 * A server is a service on a host. There may be more than one service on a
 * host, differentiated by port number.
 *
 * @author gojomo
 */
public class CrawlServer implements Serializable,
        CrawlSubstats.HasCrawlSubstats {

    private static final long serialVersionUID = -989714570750970369L;

    public static final long ROBOTS_NOT_FETCHED = -1;
    /** Only check whether a robots fetch is perhaps superfluous
     * after this many tries. */
    public static final long MIN_ROBOTS_RETRIES = 2;

    private final String server; // actually, host+port in the https case
    private int port;
    private transient SettingsHandler settingsHandler;
    private RobotsExclusionPolicy robots;
    long robotsFetched = ROBOTS_NOT_FETCHED;
    boolean validRobots = false;
    Checksum robotstxtChecksum;
    CrawlSubstats substats = new CrawlSubstats();

    // How many consecutive connection errors have been encountered;
    // used to drive an exponentially increasing retry timeout or the
    // decision to 'freeze' an entire class (queue) of URIs.
    protected int consecutiveConnectionErrors = 0;
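    // Illustrative only: the retry policy lives in the caller (e.g. the
    // frontier), not in this class. A caller might back off exponentially,
    // something like
    //   long delayMs = baseRetryDelayMs << Math.min(consecutiveConnectionErrors, 10);
    // where 'baseRetryDelayMs' is a hypothetical caller-side setting.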

    /**
     * Set of credential avatars.
     */
    private transient Set<CredentialAvatar> avatars = null;

    /**
     * Creates a new CrawlServer object.
     *
     * @param h the host string for the server.
     */
    public CrawlServer(String h) {
        // TODO: possibly check for illegal host string
        server = h;
        int colonIndex = server.lastIndexOf(':');
        if (colonIndex < 0) {
            port = -1;
        } else {
            try {
                port = Integer.parseInt(server.substring(colonIndex + 1));
            } catch (NumberFormatException e) {
                port = -1;
            }
        }
    }
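    // Illustrative parses (example host strings, not from any real crawl):
    //   new CrawlServer("example.com")      -> port -1 (protocol default)
    //   new CrawlServer("example.com:8443") -> port 8443
    //   new CrawlServer("example.com:abc")  -> port -1 (unparseable port)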

    /** Get the robots exclusion policy for this server.
     *
     * @return the robots exclusion policy for this server.
     */
    public RobotsExclusionPolicy getRobots() {
        return robots;
    }

    /** Set the robots exclusion policy for this server.
     *
     * @param policy the policy to set.
     */
    public void setRobots(RobotsExclusionPolicy policy) {
        robots = policy;
    }

    @Override
    public String toString() {
        return "CrawlServer(" + server + ")";
    }

    /** Update the robots exclusion policy from a fetched robots.txt.
     *
     * Depending on the honoring policy and the fetch outcome, this either
     * installs a parsed policy, installs ALLOWALL (under an IGNORE policy,
     * on any non-2xx response, or on an unparseable body), or leaves the
     * robots information invalid so the fetch will be retried.
     *
     * @param curi the crawl URI containing the fetched robots.txt
     */
    public void updateRobots(CrawlURI curi) {
        RobotsHonoringPolicy honoringPolicy =
            settingsHandler.getOrder().getRobotsHonoringPolicy();

        robotsFetched = System.currentTimeMillis();

        boolean gotSomething = curi.getFetchStatus() > 0
                && curi.isHttpTransaction();
        if (!gotSomething && curi.getFetchAttempts() < MIN_ROBOTS_RETRIES) {
            // Robots.txt lookup failed; no reason to consider IGNORE yet.
            validRobots = false;
            return;
        }

        CrawlerSettings settings = getSettings(curi);
        int type = honoringPolicy.getType(settings);
        if (type == RobotsHonoringPolicy.IGNORE) {
            // IGNORE = ALLOWALL
            robots = RobotsExclusionPolicy.ALLOWALL;
            validRobots = true;
            return;
        }

        if (!gotSomething) {
            // Robots.txt lookup failed and policy is not IGNORE.
            validRobots = false;
            return;
        }

        if (!curi.is2XXSuccess()) {
            // Not-found, or any status code outside the 2xx range, is
            // treated as giving access to all of a site's content.
            // This is the prevailing practice of Google, since 4xx
            // responses on robots.txt are usually indicative of a
            // misconfiguration or blanket block, not an intentional
            // indicator of partial blocking.
            // TODO: consider handling server errors and redirects differently
            robots = RobotsExclusionPolicy.ALLOWALL;
            validRobots = true;
            return;
        }

        ReplayInputStream contentBodyStream = null;
        try {
            try {
                BufferedReader reader;
                if (type == RobotsHonoringPolicy.CUSTOM) {
                    reader = new BufferedReader(new StringReader(honoringPolicy
                            .getCustomRobots(settings)));
                } else {
                    contentBodyStream = curi.getHttpRecorder()
                            .getRecordedInput().getContentReplayInputStream();

                    contentBodyStream.setToResponseBodyStart();
                    reader = new BufferedReader(new InputStreamReader(
                            contentBodyStream));
                }
                robots = RobotsExclusionPolicy.policyFor(settings,
                        reader, honoringPolicy);
                validRobots = true;
            } finally {
                if (contentBodyStream != null) {
                    contentBodyStream.close();
                }
            }
        } catch (IOException e) {
            robots = RobotsExclusionPolicy.ALLOWALL;
            validRobots = true;
            curi.addLocalizedError(getName(), e,
                    "robots.txt parsing IOException");
        }
    }
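    // Illustrative call site (assumed; the actual wiring lives in the
    // crawler's post-fetch processing, outside this class):
    //   if ("/robots.txt".equals(curi.getUURI().getPath())) {
    //       curi.getServer().updateRobots(curi);
    //   }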

    /**
     * @return Returns the time when robots.txt was fetched.
     */
    public long getRobotsFetchedTime() {
        return robotsFetched;
    }

    /**
     * @return The server string which might include a port number.
     */
    public String getName() {
        return server;
    }

    /** Get the port number for this server.
     *
     * @return the port number, or -1 if not known (uses default for protocol)
     */
    public int getPort() {
        return port;
    }

    /**
     * Called when object is being deserialized.
     * In addition to the default java deserialization, this method
     * re-establishes the references to settings handler and robots honoring
     * policy.
     *
     * @param stream the stream to deserialize from.
     * @throws IOException if I/O errors occur
     * @throws ClassNotFoundException If the class for an object being restored
     *         cannot be found.
     */
    private void readObject(ObjectInputStream stream)
            throws IOException, ClassNotFoundException {
        stream.defaultReadObject();
        settingsHandler = SettingsHandler.getThreadContextSettingsHandler();
        postDeserialize();
    }

    private void postDeserialize() {
        if (this.robots != null) {
            RobotsHonoringPolicy honoringPolicy =
                settingsHandler.getOrder().getRobotsHonoringPolicy();
            this.robots.honoringPolicy = honoringPolicy;
        }
    }

    /** Get the settings handler.
     *
     * @return the settings handler.
     */
    public SettingsHandler getSettingsHandler() {
        return this.settingsHandler;
    }

    /** Get the settings object in effect for this server.
     *
     * @param curi the URI whose context selects the settings.
     * @return the settings object in effect for this server, or null if the
     *         URI's host could not be extracted.
     */
    private CrawlerSettings getSettings(CandidateURI curi) {
        try {
            return this.settingsHandler.
                getSettings(curi.getUURI().getReferencedHost(),
                    curi.getUURI());
        } catch (URIException e) {
            return null;
        }
    }

    /** Set the settings handler to be used by this server.
     *
     * @param settingsHandler the settings handler to be used by this server.
     */
    public void setSettingsHandler(SettingsHandler settingsHandler) {
        this.settingsHandler = settingsHandler;
    }

    public void incrementConsecutiveConnectionErrors() {
        this.consecutiveConnectionErrors++;
    }

    public void resetConsecutiveConnectionErrors() {
        this.consecutiveConnectionErrors = 0;
    }

    /**
     * @return Credential avatars for this server.  Returns null if none.
     */
    public Set<CredentialAvatar> getCredentialAvatars() {
        return this.avatars;
    }

    /**
     * @return True if there are avatars attached to this instance.
     */
    public boolean hasCredentialAvatars() {
        return this.avatars != null && !this.avatars.isEmpty();
    }

    /**
     * Add an avatar.
     *
     * @param ca Credential avatar to add to set of avatars.
     */
    public void addCredentialAvatar(CredentialAvatar ca) {
        if (this.avatars == null) {
            this.avatars = new HashSet<CredentialAvatar>();
        }
        this.avatars.add(ca);
    }

    /**
     * If true then valid robots.txt information has been retrieved. If false
     * either no attempt has been made to fetch robots.txt or the attempt
     * failed.
     *
     * @return Returns the validRobots.
     */
    public boolean isValidRobots() {
        return validRobots;
    }
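    // Illustrative caller pattern (assumed; 'disallows' names whatever
    // check the installed RobotsExclusionPolicy exposes):
    //   if (!server.isValidRobots()) {
    //       // (re)fetch robots.txt before proceeding
    //   } else if (server.getRobots().disallows(curi, userAgent)) {
    //       // skip this URI
    //   }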

    /**
     * Get key to use doing lookup on server instances.
     * @param cauri CandidateURI we're to get server key for.
     * @return String to use as server key.
     * @throws URIException
     */
    public static String getServerKey(CandidateURI cauri)
    throws URIException {
        // TODO: evaluate if this is really necessary -- why not
        // make the server of a dns CandidateURI the looked-up domain,
        // also simplifying FetchDNS?
        String key = cauri.getUURI().getAuthorityMinusUserinfo();
        if (key == null) {
            // Fallback for cases where getAuthority() fails (e.g. 'dns:'
            // URIs: DNS UURIs carry the 'domain' in the path, not in the
            // authority).
            key = cauri.getUURI().getCurrentHierPath();
            if (key != null && !key.matches("[-_\\w\\.:]+")) {
                // Not just word chars, dots, colons, dashes and
                // underscores; throw away.
                key = null;
            }
        }
        if (key != null &&
                cauri.getUURI().getScheme().equals(UURIFactory.HTTPS)) {
            // If https and no port specified, add default https port to
            // distinguish an https from an http server without a port.
            if (!key.matches(".+:[0-9]+")) {
                key += ":" + UURIFactory.HTTPS_PORT;
            }
        }
        return key;
    }
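    // Illustrative key mappings (example URIs, assuming UURIFactory.HTTPS_PORT
    // is the standard 443):
    //   http://example.com/page      -> "example.com"
    //   http://example.com:8080/page -> "example.com:8080"
    //   https://example.com/page     -> "example.com:443"
    //   dns:example.com              -> "example.com" (via the path fallback)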

    /* (non-Javadoc)
     * @see org.archive.crawler.datamodel.CrawlSubstats.HasCrawlSubstats#getSubstats()
     */
    public CrawlSubstats getSubstats() {
        return substats;
    }
}