1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 package org.archive.crawler.datamodel;
25
26 import java.io.BufferedReader;
27 import java.io.IOException;
28 import java.io.ObjectInputStream;
29 import java.io.ObjectOutputStream;
30 import java.io.Serializable;
31 import java.util.ArrayList;
32 import java.util.HashMap;
33 import java.util.Iterator;
34 import java.util.LinkedList;
35 import java.util.List;
36 import java.util.logging.Level;
37 import java.util.logging.Logger;
38
39 import org.apache.commons.httpclient.URIException;
40 import org.archive.crawler.settings.CrawlerSettings;
41
42 /***
43 * RobotsExclusionPolicy represents the actual policy adopted with
44 * respect to a specific remote server, usually constructed from
45 * consulting the robots.txt, if any, the server provided.
46 *
47 * (The similarly named RobotsHonoringPolicy, on the other hand,
48 * describes the strategy used by the crawler to determine to what
49 * extent it respects exclusion rules.)
50 *
51 * The expiration of policies after a suitable amount of time has
52 * elapsed since last fetch is handled outside this class, in
53 * CrawlServer itself.
54 *
55 * @author gojomo
56 *
57 */
58 public class RobotsExclusionPolicy implements Serializable {
59
60 private static final long serialVersionUID = 6323907991237383113L;
61
62 private static final Logger logger =
63 Logger.getLogger(RobotsExclusionPolicy.class.getName());
64
65 private final static int NORMAL_TYPE = 0;
66 private final static int ALLOWALL_TYPE = 1;
67 private final static int DENYALL_TYPE = 2;
68 private transient int type = NORMAL_TYPE;
69
70 public static RobotsExclusionPolicy ALLOWALL =
71 new RobotsExclusionPolicy(ALLOWALL_TYPE);
72 public static RobotsExclusionPolicy DENYALL =
73 new RobotsExclusionPolicy(DENYALL_TYPE);
74
75 private LinkedList<String> userAgents = null;
76 private HashMap<String,List<String>> disallows = null;
77 transient RobotsHonoringPolicy honoringPolicy = null;
78
79 private String lastUsedUserAgent = null;
80 private List<String> userAgentsToTest = null;
81
82 /***
83 * @param settings
84 * @param reader
85 * @param honoringPolicy
86 * @return Robot exclusion policy.
87 * @throws IOException
88 */
89 public static RobotsExclusionPolicy policyFor(CrawlerSettings settings,
90 BufferedReader reader, RobotsHonoringPolicy honoringPolicy)
91 throws IOException {
92 LinkedList<String> userAgents = new LinkedList<String>();
93 HashMap<String,List<String>> disallows
94 = new HashMap<String,List<String>>();
95 Robotstxt.parse(reader, userAgents, disallows);
96 return (disallows.isEmpty())?
97 ALLOWALL:
98 new RobotsExclusionPolicy(settings, userAgents, disallows,
99 honoringPolicy);
100 }
101
102
103
104 /***
105 * @param settings
106 * @param u
107 * @param d
108 * @param honoringPolicy
109 */
110 public RobotsExclusionPolicy(CrawlerSettings settings, LinkedList<String> u,
111 HashMap<String,List<String>> d,
112 RobotsHonoringPolicy honoringPolicy) {
113 userAgents = u;
114 disallows = d;
115 this.honoringPolicy = honoringPolicy;
116
117 if(honoringPolicy == null) return;
118
119
120 if(honoringPolicy.isType(settings, RobotsHonoringPolicy.MOST_FAVORED)) {
121 userAgentsToTest = userAgents;
122
123
124 } else if(honoringPolicy.isType(settings, RobotsHonoringPolicy.MOST_FAVORED_SET)) {
125 userAgentsToTest = new ArrayList<String>();
126 Iterator userAgentSet = honoringPolicy.getUserAgents(settings).iterator();
127 while(userAgentSet.hasNext()) {
128 String userAgent = (String) userAgentSet.next();
129
130 Iterator iter = userAgents.iterator();
131 while ( iter.hasNext() ) {
132 String ua = (String)iter.next();
133 if (userAgent.indexOf(ua)>-1) {
134 userAgentsToTest.add(ua);
135 break;
136 }
137 }
138 }
139 }
140 }
141
142 public RobotsExclusionPolicy(int type) {
143 this(null, null, null, null);
144 this.type = type;
145 }
146
147 public boolean disallows(CrawlURI curi, String userAgent) {
148 if (this == ALLOWALL)
149 return false;
150 if (this == DENYALL)
151 return true;
152
153
154
155 if((honoringPolicy.isType(curi, RobotsHonoringPolicy.CLASSIC)
156 || honoringPolicy.isType(curi, RobotsHonoringPolicy.CUSTOM))
157 && (lastUsedUserAgent == null
158 || !lastUsedUserAgent.equals(userAgent))) {
159
160 lastUsedUserAgent = userAgent;
161 userAgentsToTest = new ArrayList<String>();
162 Iterator iter = userAgents.iterator();
163 String lowerCaseUserAgent = userAgent.toLowerCase();
164 while ( iter.hasNext() ) {
165 String ua = (String)iter.next();
166
167 if (lowerCaseUserAgent.indexOf(ua)>-1) {
168 userAgentsToTest.add(ua);
169 break;
170 }
171 }
172 }
173
174 boolean disallow = false;
175 boolean examined = false;
176 String ua = null;
177
178
179 Iterator uas = userAgentsToTest.iterator();
180 while(uas.hasNext() && examined == false) {
181 disallow = false;
182 ua = (String) uas.next();
183 Iterator dis = ((List) disallows.get(ua)).iterator();
184
185
186 while(dis.hasNext() && examined == false && disallow == false) {
187 String disallowedPath = (String) dis.next();
188 if(disallowedPath.length() == 0) {
189
190 examined = true;
191 disallow = false;
192 break;
193 }
194 try {
195 String p = curi.getUURI().getPathQuery();
196 if (p != null && p.startsWith(disallowedPath) ) {
197
198 disallow = true;
199 }
200 }
201 catch (URIException e) {
202 logger.log(Level.SEVERE,"Failed getPathQuery from " + curi, e);
203 }
204 }
205 if(disallow == false) {
206
207 examined = true;
208 }
209 }
210
211
212
213 if(honoringPolicy.shouldMasquerade(curi) && ua != null && !ua.equals("")) {
214 curi.setUserAgent(ua);
215 }
216 return disallow;
217 }
218
219
220
221 /*** If object is DENYALL or ALLOWALL, only the object identity and type
222 * is written in the serialization stream.
223 *
224 * @param stream the serialization stream.
225 * @throws IOException
226 */
227 private void writeObject(ObjectOutputStream stream) throws IOException {
228 stream.writeInt(type);
229 if (type == NORMAL_TYPE) {
230 stream.defaultWriteObject();
231 }
232 }
233
234 /*** If object is DENYALL or ALLOWALL, only the object identity and type
235 * is read from the serialization stream.
236 *
237 * @param stream the serialization stream.
238 * @throws IOException
239 * @throws ClassNotFoundException
240 */
241 private void readObject(ObjectInputStream stream)
242 throws IOException, ClassNotFoundException {
243 type = stream.readInt();
244 if (type == NORMAL_TYPE) {
245 stream.defaultReadObject();
246 }
247 }
248
249 /*** If object is DENYALL or ALLOWALL, the object is replaced by constants
250 * so that check for object equality works.
251 * @return Object.
252 */
253 private Object readResolve() {
254 if (type == NORMAL_TYPE) {
255 return this;
256 } else if (type == ALLOWALL_TYPE) {
257 return ALLOWALL;
258 } else if (type == DENYALL_TYPE) {
259 return DENYALL;
260 }
261 return null;
262 }
263
264 }