View Javadoc

1   /* Copyright (C) 2003 Internet Archive.
2    *
3    * This file is part of the Heritrix web crawler (crawler.archive.org).
4    *
5    * Heritrix is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU Lesser Public License as published by
7    * the Free Software Foundation; either version 2.1 of the License, or
8    * any later version.
9    *
10   * Heritrix is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU Lesser Public License for more details.
14   *
15   * You should have received a copy of the GNU Lesser Public License
16   * along with Heritrix; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   *
19   * RobotsExclusionPolicy.java
20   * Created on Apr 17, 2003
21   *
22   * $Header$
23   */
24  package org.archive.crawler.datamodel;
25  
26  import java.io.BufferedReader;
27  import java.io.IOException;
28  import java.io.ObjectInputStream;
29  import java.io.ObjectOutputStream;
30  import java.io.Serializable;
31  import java.util.ArrayList;
32  import java.util.HashMap;
33  import java.util.Iterator;
34  import java.util.LinkedList;
35  import java.util.List;
36  import java.util.logging.Level;
37  import java.util.logging.Logger;
38  
39  import org.apache.commons.httpclient.URIException;
40  import org.archive.crawler.settings.CrawlerSettings;
41  
42  /***
43   * RobotsExclusionPolicy represents the actual policy adopted with 
44   * respect to a specific remote server, usually constructed from 
45   * consulting the robots.txt, if any, the server provided. 
46   * 
47   * (The similarly named RobotsHonoringPolicy, on the other hand, 
48   * describes the strategy used by the crawler to determine to what
49   * extent it respects exclusion rules.)
50   * 
51   * The expiration of policies after a suitable amount of time has
52   * elapsed since last fetch is handled outside this class, in 
53   * CrawlServer itself. 
54   * 
55   * @author gojomo
56   *
57   */
58  public class RobotsExclusionPolicy implements Serializable {
59  
60      private static final long serialVersionUID = 6323907991237383113L;
61  
62      private static final Logger logger =
63          Logger.getLogger(RobotsExclusionPolicy.class.getName());
64  
65      private final static int NORMAL_TYPE = 0;
66      private final static int ALLOWALL_TYPE = 1;
67      private final static int DENYALL_TYPE = 2;
68      private transient int type = NORMAL_TYPE;
69  
70      public static RobotsExclusionPolicy ALLOWALL =
71          new RobotsExclusionPolicy(ALLOWALL_TYPE);
72      public static RobotsExclusionPolicy DENYALL =
73          new RobotsExclusionPolicy(DENYALL_TYPE);
74  
75      private LinkedList<String> userAgents = null;
76      private HashMap<String,List<String>> disallows = null;
77      transient RobotsHonoringPolicy honoringPolicy = null;
78  
79      private String lastUsedUserAgent = null;
80      private List<String> userAgentsToTest = null;
81  
82      /***
83       * @param settings 
84       * @param reader
85       * @param honoringPolicy
86       * @return Robot exclusion policy.
87       * @throws IOException
88       */
89      public static RobotsExclusionPolicy policyFor(CrawlerSettings settings,
90              BufferedReader reader, RobotsHonoringPolicy honoringPolicy)
91      throws IOException {
92          LinkedList<String> userAgents = new LinkedList<String>();
93          HashMap<String,List<String>> disallows
94           = new HashMap<String,List<String>>();
95          Robotstxt.parse(reader, userAgents, disallows);
96          return (disallows.isEmpty())?
97              ALLOWALL:
98              new RobotsExclusionPolicy(settings, userAgents, disallows,
99                  honoringPolicy);
100     }
101 
102 
103 
104     /***
105      * @param settings 
106      * @param u
107      * @param d
108      * @param honoringPolicy
109      */
110     public RobotsExclusionPolicy(CrawlerSettings settings, LinkedList<String> u,
111             HashMap<String,List<String>> d, 
112             RobotsHonoringPolicy honoringPolicy) {
113         userAgents = u;
114         disallows = d;
115         this.honoringPolicy = honoringPolicy;
116 
117         if(honoringPolicy == null) return;
118 
119         // If honoring policy is most favored user agent, all rules should be checked
120         if(honoringPolicy.isType(settings, RobotsHonoringPolicy.MOST_FAVORED)) {
121             userAgentsToTest = userAgents;
122 
123         // IF honoring policy is most favored of set, then make a list with only the set as members
124         } else if(honoringPolicy.isType(settings, RobotsHonoringPolicy.MOST_FAVORED_SET)) {
125             userAgentsToTest = new ArrayList<String>();
126             Iterator userAgentSet = honoringPolicy.getUserAgents(settings).iterator();
127             while(userAgentSet.hasNext()) {
128                 String userAgent = (String) userAgentSet.next();
129 
130                 Iterator iter = userAgents.iterator();
131                 while ( iter.hasNext() ) {
132                     String ua = (String)iter.next();
133                     if (userAgent.indexOf(ua)>-1) {
134                         userAgentsToTest.add(ua);
135                         break;
136                     }
137                 }
138             }
139         }
140     }
141 
142     public RobotsExclusionPolicy(int type) {
143         this(null, null, null, null);
144         this.type = type;
145     }
146 
147     public boolean disallows(CrawlURI curi, String userAgent) {
148         if (this == ALLOWALL)
149             return false;
150         if (this == DENYALL)
151             return true;
152 
153         // In the common case with policy=Classic, the useragent is remembered from uri to uri on
154         // the same server
155         if((honoringPolicy.isType(curi, RobotsHonoringPolicy.CLASSIC) 
156                 || honoringPolicy.isType(curi, RobotsHonoringPolicy.CUSTOM))
157             && (lastUsedUserAgent == null
158             || !lastUsedUserAgent.equals(userAgent))) {
159 
160             lastUsedUserAgent = userAgent;
161             userAgentsToTest = new ArrayList<String>();
162             Iterator iter = userAgents.iterator();
163             String lowerCaseUserAgent = userAgent.toLowerCase();
164             while ( iter.hasNext() ) {
165                 String ua = (String)iter.next();
166                 // ua in below is already lowercase. See Robotstxt.java line 60. 
167                 if (lowerCaseUserAgent.indexOf(ua)>-1) {
168                     userAgentsToTest.add(ua);
169                     break; // consider no more sections
170                 }
171             }
172         }
173 
174         boolean disallow = false;
175         boolean examined = false;
176         String ua = null;
177 
178         // Go thru list of all user agents we might act as
179         Iterator uas = userAgentsToTest.iterator();
180         while(uas.hasNext() && examined == false) {
181             disallow = false;
182             ua = (String) uas.next();
183             Iterator dis = ((List) disallows.get(ua)).iterator();
184 
185             // Check if the current user agent is allowed to crawl
186             while(dis.hasNext() && examined == false && disallow == false) {
187                 String disallowedPath = (String) dis.next();
188                 if(disallowedPath.length() == 0) {
189                     // blanket allow
190                     examined = true;
191                     disallow = false;
192                     break;
193                 }
194                 try {
195                     String p = curi.getUURI().getPathQuery();
196                     if (p != null && p.startsWith(disallowedPath) ) {
197                         // the user agent tested isn't allowed to get this uri
198                         disallow = true;
199                     }
200                 }
201                 catch (URIException e) {
202                     logger.log(Level.SEVERE,"Failed getPathQuery from " + curi, e);
203                 }
204             }
205             if(disallow == false) {
206                 // the user agent tested is allowed
207                 examined = true;
208             }
209         }
210 
211         // Are we supposed to masquerade as the user agent to which restrictions
212         // we follow?
213         if(honoringPolicy.shouldMasquerade(curi) && ua != null && !ua.equals("")) {
214             curi.setUserAgent(ua);
215         }
216         return disallow;
217     }
218 
219     // Methods for object serialization.
220 
221     /*** If object is DENYALL or ALLOWALL, only the object identity and type
222      * is written in the serialization stream.
223      *
224      * @param stream the serialization stream.
225      * @throws IOException 
226      */
227     private void writeObject(ObjectOutputStream stream) throws IOException {
228         stream.writeInt(type);
229         if (type == NORMAL_TYPE) {
230             stream.defaultWriteObject();
231         }
232     }
233 
234     /*** If object is DENYALL or ALLOWALL, only the object identity and type
235      * is read from the serialization stream.
236      *
237      * @param stream the serialization stream.
238      * @throws IOException 
239      * @throws ClassNotFoundException 
240      */
241     private void readObject(ObjectInputStream stream)
242             throws IOException, ClassNotFoundException {
243         type = stream.readInt();
244         if (type == NORMAL_TYPE) {
245             stream.defaultReadObject();
246         }
247     }
248 
249     /*** If object is DENYALL or ALLOWALL, the object is replaced by constants
250      * so that check for object equality works.
251      * @return Object.
252      */
253     private Object readResolve() {
254         if (type == NORMAL_TYPE) {
255             return this;
256         } else if (type == ALLOWALL_TYPE) {
257             return ALLOWALL;
258         } else if (type == DENYALL_TYPE) {
259             return DENYALL;
260         }
261         return null;
262     }
263 
264 }