1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26 package org.archive.crawler.postprocessor;
27
28 import java.util.Collection;
29 import java.util.HashSet;
30 import java.util.logging.Level;
31 import java.util.logging.Logger;
32
33 import javax.management.AttributeNotFoundException;
34
35 import org.archive.crawler.datamodel.CandidateURI;
36 import org.archive.crawler.datamodel.CrawlURI;
37 import org.archive.crawler.deciderules.DecideRule;
38 import org.archive.crawler.deciderules.DecideRuleSequence;
39 import org.archive.crawler.framework.Filter;
40 import org.archive.crawler.framework.Scoper;
41 import org.archive.crawler.settings.MapType;
42
43 /***
44 * Run CandidateURI links carried in the passed CrawlURI through a filter
45 * and 'handle' rejections.
46 * Used to do supplementary processing of links after they've been scope
47 * processed and ruled 'in-scope' by LinkScoper. An example of
48 * 'supplementary processing' would check that a Link is intended for
49 * this host to crawl in a multimachine crawl setting. Configure filters to
50 * rule on links. Default handler writes rejected URLs to disk. Subclass
51 * to handle rejected URLs otherwise.
52 * @author stack
53 */
54 public class SupplementaryLinksScoper extends Scoper {
55
56 private static final long serialVersionUID = -775819977752790418L;
57
58 private static Logger LOGGER =
59 Logger.getLogger(SupplementaryLinksScoper.class.getName());
60
61 public static final String ATTR_LINKS_DECIDE_RULES = "link-rules";
62
63 /***
64 * @param name Name of this filter.
65 */
66 public SupplementaryLinksScoper(String name) {
67 super(name, "SupplementaryLinksScoper. Use to do supplementary " +
68 "processing of in-scope links. Will run each link through " +
69 "configured filters. Must be run after LinkScoper and " +
70 "before FrontierScheduler. " +
71 "Optionally logs rejected links (Enable " +
72 ATTR_OVERRIDE_LOGGER_ENABLED + " and set logger level " +
73 "at INFO or above).");
74
75 addElementToDefinition(
76 new DecideRuleSequence(ATTR_LINKS_DECIDE_RULES,
77 "DecideRules which if their final decision on a link is " +
78 "REJECT, cause the link to be ruled out-of-scope, even " +
79 "if it had previously been accepted by the main scope."));
80 }
81
82 protected void innerProcess(final CrawlURI curi) {
83
84 if (curi.hasPrerequisiteUri() || curi.outlinksSize() <= 0) {
85 return;
86 }
87
88 Collection<CandidateURI> inScopeLinks = new HashSet<CandidateURI>();
89 for (CandidateURI cauri: curi.getOutCandidates()) {
90 if (isInScope(cauri)) {
91 inScopeLinks.add(cauri);
92 }
93 }
94
95
96 curi.replaceOutlinks(inScopeLinks);
97 }
98
99 protected boolean isInScope(CandidateURI caUri) {
100
101 CrawlURI curi = (caUri instanceof CrawlURI)?
102 (CrawlURI)caUri:
103 new CrawlURI(caUri.getUURI());
104 boolean result = false;
105 if (rulesAccept(getLinkRules(curi), curi)) {
106 result = true;
107 if (LOGGER.isLoggable(Level.FINER)) {
108 LOGGER.finer("Accepted: " + caUri);
109 }
110 } else {
111 outOfScope(caUri);
112 }
113 return result;
114 }
115
116 protected DecideRule getLinkRules(Object o) {
117 try {
118 return (DecideRule)getAttribute(o, ATTR_LINKS_DECIDE_RULES);
119 } catch (AttributeNotFoundException e) {
120 throw new RuntimeException(e);
121 }
122 }
123
124 /***
125 * Called when a CandidateUri is ruled out of scope.
126 * @param caUri CandidateURI that is out of scope.
127 */
128 protected void outOfScope(CandidateURI caUri) {
129 if (!LOGGER.isLoggable(Level.INFO)) {
130 return;
131 }
132 LOGGER.info(caUri.getUURI().toString());
133 }
134 }