1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26 package org.archive.crawler.postprocessor;
27
28 import java.util.Collection;
29 import java.util.HashSet;
30 import java.util.Iterator;
31 import java.util.logging.Level;
32 import java.util.logging.Logger;
33
34 import javax.management.AttributeNotFoundException;
35
36 import org.apache.commons.httpclient.URIException;
37 import org.archive.crawler.datamodel.CandidateURI;
38 import org.archive.crawler.datamodel.CrawlURI;
39 import org.archive.crawler.datamodel.FetchStatusCodes;
40 import org.archive.crawler.deciderules.DecideRule;
41 import org.archive.crawler.deciderules.DecideRuleSequence;
42 import org.archive.crawler.extractor.Link;
43 import org.archive.crawler.framework.Filter;
44 import org.archive.crawler.framework.Scoper;
45 import org.archive.crawler.settings.MapType;
46 import org.archive.crawler.settings.SimpleType;
47 import org.archive.crawler.settings.Type;
48
/**
 * Determine which extracted links are within scope.
 * TODO: Testing scope currently requires that a Link be converted to
 * a CandidateURI. Make it so a CandidateURI does not have to be created
 * just to test whether a Link is in scope.
 * <p>Since this scoper has to create CandidateURIs, there is no sense in
 * discarding them: later in the processing chain, CandidateURIs rather
 * than Links are what is needed to schedule extracted links with the
 * Frontier (Frontier#schedule expects a CandidateURI, not a Link). This
 * class therefore replaces each Link in the CrawlURI with the CandidateURI
 * that wraps that Link.
 *
 * @author gojomo
 * @author stack
 */
63 public class LinksScoper extends Scoper
64 implements FetchStatusCodes {
65
66 private static final long serialVersionUID = -4074442117992496793L;
67
68 private static Logger LOGGER =
69 Logger.getLogger(LinksScoper.class.getName());
70
71 private final static String ATTR_SEED_REDIRECTS_NEW_SEEDS =
72 "seed-redirects-new-seed";
73
74 private final static Boolean DEFAULT_SEED_REDIRECTS_NEW_SEEDS =
75 new Boolean(true);
76
77 public static final String ATTR_REJECTLOG_DECIDE_RULES =
78 "scope-rejected-url-rules";
79
80 public static final String ATTR_PREFERENCE_DEPTH_HOPS =
81 "preference-depth-hops";
82
83 private final static Integer DEFAULT_PREFERENCE_DEPTH_HOPS =
84 new Integer(-1);
85
86 /***
87 * Instance of rejected uris log filters.
88 */
89 private MapType rejectLogFilters = null;
90
91 /***
92 * @param name Name of this filter.
93 */
94 public LinksScoper(String name) {
95 super(name, "LinksScoper. Rules on which extracted links " +
96 "are within configured scope.");
97
98 Type t;
99 t = addElementToDefinition(
100 new SimpleType(ATTR_SEED_REDIRECTS_NEW_SEEDS,
101 "If enabled, any URL found because a seed redirected to it " +
102 "(original seed returned 301 or 302), will also be treated " +
103 "as a seed.", DEFAULT_SEED_REDIRECTS_NEW_SEEDS));
104 t.setExpertSetting(true);
105
106 t = addElementToDefinition(new SimpleType(ATTR_PREFERENCE_DEPTH_HOPS,
107 "Number of hops (of any sort) from a seed up to which a URI has higher " +
108 "priority scheduling than any remaining seed. For example, if set to 1 items one " +
109 "hop (link, embed, redirect, etc.) away from a seed will be scheduled " +
110 "with HIGH priority. If set to -1, no " +
111 "preferencing will occur, and a breadth-first search with seeds " +
112 "processed before discovered links will proceed. If set to zero, a " +
113 "purely depth-first search will proceed, with all discovered links processed " +
114 "before remaining seeds. Seed redirects are treated as one hop from a seed.",
115 DEFAULT_PREFERENCE_DEPTH_HOPS));
116 t.setExpertSetting(true);
117
118 addElementToDefinition(
119 new DecideRuleSequence(ATTR_REJECTLOG_DECIDE_RULES,
120 "DecideRules which, if their final decision on a link is " +
121 "not REJECT, cause the otherwise scope-rejected links to " +
122 "be logged"));
123
124 }
125
126 protected void innerProcess(final CrawlURI curi) {
127 if (LOGGER.isLoggable(Level.FINEST)) {
128 LOGGER.finest(getName() + " processing " + curi);
129 }
130
131
132 if (curi.hasPrerequisiteUri()) {
133 handlePrerequisite(curi);
134 return;
135 }
136
137
138 if (curi.getFetchStatus() < 200 || curi.getFetchStatus() >= 400) {
139 curi.clearOutlinks();
140 return;
141 }
142
143 if (curi.outlinksSize() <= 0) {
144
145 return;
146 }
147
148 final boolean redirectsNewSeeds = ((Boolean)getUncheckedAttribute(curi,
149 ATTR_SEED_REDIRECTS_NEW_SEEDS)).booleanValue();
150 int preferenceDepthHops = ((Integer)getUncheckedAttribute(curi,
151 ATTR_PREFERENCE_DEPTH_HOPS)).intValue();
152 Collection<CandidateURI> inScopeLinks = new HashSet<CandidateURI>();
153 for (final Iterator i = curi.getOutObjects().iterator(); i.hasNext();) {
154 Object o = i.next();
155 if(o instanceof Link){
156 final Link wref = (Link)o;
157 try {
158 final int directive = getSchedulingFor(curi, wref,
159 preferenceDepthHops);
160 final CandidateURI caURI =
161 curi.createCandidateURI(curi.getBaseURI(), wref,
162 directive,
163 considerAsSeed(curi, wref, redirectsNewSeeds));
164 if (isInScope(caURI)) {
165 inScopeLinks.add(caURI);
166 }
167 } catch (URIException e) {
168 getController().logUriError(e, curi.getUURI(),
169 wref.getDestination().toString());
170 }
171 } else if(o instanceof CandidateURI){
172 CandidateURI caURI = (CandidateURI)o;
173 if(isInScope(caURI)){
174 inScopeLinks.add(caURI);
175 }
176 } else {
177 LOGGER.severe("Unexpected type: " + o);
178 }
179 }
180
181
182 curi.replaceOutlinks(inScopeLinks);
183 }
184
185 /***
186 * The CrawlURI has a prerequisite; apply scoping and update
187 * Link to CandidateURI in manner analogous to outlink handling.
188 * @param curi CrawlURI with prereq to consider
189 */
190 protected void handlePrerequisite(CrawlURI curi) {
191 try {
192
193 CandidateURI caUri =
194 curi.createCandidateURI(curi.getBaseURI(),
195 (Link) curi.getPrerequisiteUri());
196 int prereqPriority = curi.getSchedulingDirective() - 1;
197 if (prereqPriority < 0) {
198 prereqPriority = 0;
199 LOGGER.severe("Unable to promote prerequisite " + caUri +
200 " above " + curi);
201 }
202 caUri.setSchedulingDirective(prereqPriority);
203 caUri.setForceFetch(true);
204 if(isInScope(caUri)) {
205
206 curi.setPrerequisiteUri(caUri);
207 } else {
208
209
210 curi.setFetchStatus(S_PREREQUISITE_UNSCHEDULABLE_FAILURE);
211 }
212 } catch (URIException ex) {
213 Object[] array = {curi, curi.getPrerequisiteUri()};
214 getController().uriErrors.log(Level.INFO,ex.getMessage(), array);
215 } catch (NumberFormatException e) {
216
217 Object[] array = {curi, curi.getPrerequisiteUri()};
218 getController().uriErrors.log(Level.INFO,e.getMessage(), array);
219 }
220 }
221
222 protected void outOfScope(CandidateURI caUri) {
223 super.outOfScope(caUri);
224 if (!LOGGER.isLoggable(Level.INFO)) {
225 return;
226 }
227
228 CrawlURI curi = (caUri instanceof CrawlURI)?
229 (CrawlURI)caUri:
230 new CrawlURI(caUri.getUURI());
231 if (rulesAccept(getRejectLogRules(curi), curi)) {
232 LOGGER.info(curi.getUURI().toString());
233 }
234 }
235
236 protected DecideRule getRejectLogRules(Object o) {
237 try {
238 return (DecideRule)getAttribute(o, ATTR_REJECTLOG_DECIDE_RULES);
239 } catch (AttributeNotFoundException e) {
240 throw new RuntimeException(e);
241 }
242 }
243
244 private boolean considerAsSeed(final CrawlURI curi, final Link wref,
245 final boolean redirectsNewSeeds) {
246
247 if (curi.isSeed()
248 && (curi.getFetchStatus() == 301 ||
249 curi.getFetchStatus() == 302)
250 && wref.getHopType() == Link.REFER_HOP) {
251
252 if (redirectsNewSeeds) {
253 return true;
254 }
255 }
256 return false;
257 }
258
259 /***
260 * Determine scheduling for the <code>curi</code>.
261 * As with the LinksScoper in general, this only handles extracted links,
262 * seeds do not pass through here, but are given MEDIUM priority.
263 * Imports into the frontier similarly do not pass through here,
264 * but are given NORMAL priority.
265 */
266 protected int getSchedulingFor(final CrawlURI curi, final Link wref,
267 final int preferenceDepthHops) {
268 final char c = wref.getHopType();
269 if (LOGGER.isLoggable(Level.FINEST)) {
270 LOGGER.finest(curi + " with path=" + curi.getPathFromSeed() +
271 " isSeed=" + curi.isSeed() + " with fetchStatus=" +
272 curi.getFetchStatus() + " -> " + wref.getDestination() +
273 " type " + c + " with context=" + wref.getContext());
274 }
275
276 switch (c) {
277 case Link.REFER_HOP:
278
279
280 return (preferenceDepthHops >= 0 ? CandidateURI.HIGH :
281 CandidateURI.MEDIUM);
282 default:
283 if (preferenceDepthHops == 0)
284 return CandidateURI.HIGH;
285
286
287
288
289 if (preferenceDepthHops > 0 &&
290 curi.getPathFromSeed().length() + 1 <= preferenceDepthHops)
291 return CandidateURI.HIGH;
292
293 return CandidateURI.NORMAL;
294 }
295 }
296 }