1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 package org.archive.crawler.framework;
25
import java.lang.reflect.Constructor;
import java.util.Iterator;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;

import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.deciderules.DecideRule;
import org.archive.crawler.deciderules.DecideRuleSequence;
import org.archive.crawler.settings.MapType;
import org.archive.crawler.settings.ModuleType;
import org.archive.crawler.settings.SimpleType;
39
40 /***
41 * Base class for URI processing classes.
42 *
43 * <p> Each URI is processed by a user defined series of processors. This class
44 * provides the basic infrastructure for these but does not actually do
45 * anything. New processors can be easily created by subclassing this class.
46 *
47 * <p> Classes subclassing this one should not trap InterruptedExceptions.
48 * They should be allowed to propagate to the ToeThread executing the processor.
49 * Also they should immediately exit their main method (<tt>innerProcess()</tt>)
50 * if the <tt>interrupted</tt> flag is set.
51 *
52 * @author Gordon Mohr
53 *
54 * @see org.archive.crawler.framework.ToeThread
55 */
56 public class Processor extends ModuleType {
57
58 private static final long serialVersionUID = 6248563827413710226L;
59
60 /***
61 * Key to use asking settings for decide-rules value.
62 */
63 public static final String ATTR_DECIDE_RULES = "decide-rules";
64 /*** local name for decide-rules */
65 protected String attrDecideRules;
66
67 /***
68 * Key to use asking settings for enabled value.
69 */
70 public final static String ATTR_ENABLED = "enabled";
71
72 private Processor defaultNextProcessor = null;
73
74 private static Logger logger =
75 Logger.getLogger("org.archive.crawler.framework.Processor");
76
77 /***
78 * @param name
79 * @param description
80 */
81 public Processor(String name, String description) {
82 super(name, description);
83 addElementToDefinition(new SimpleType(ATTR_ENABLED,
84 "Is processor enabled", new Boolean(true)));
85 attrDecideRules = getName()+"#"+ATTR_DECIDE_RULES;
86 addElementToDefinition(
87 new DecideRuleSequence(attrDecideRules,
88 "DecideRules which, if their final decision is REJECT, " +
89 "prevent this Processor from running."));
90 }
91
92 /***
93 * Perform processing on the given CrawlURI.
94 *
95 * @param curi
96 * @throws InterruptedException
97 */
98 public final void process(CrawlURI curi) throws InterruptedException {
99
100 curi.setNextProcessor(getDefaultNextProcessor(curi));
101
102
103 try {
104 if (!((Boolean) getAttribute(ATTR_ENABLED, curi)).booleanValue()) {
105 return;
106 }
107 } catch (AttributeNotFoundException e) {
108 logger.severe(e.getMessage());
109 }
110
111 if(rulesAccept(curi)) {
112 innerProcess(curi);
113 } else {
114 innerRejectProcess(curi);
115 }
116 }
117
118 protected void checkForInterrupt() throws InterruptedException {
119 if (Thread.interrupted()) {
120 throw new InterruptedException("interrupted");
121 }
122 }
123
124 /***
125 * @param curi CrawlURI instance.
126 * @throws InterruptedException
127 */
128 protected void innerRejectProcess(CrawlURI curi)
129 throws InterruptedException {
130
131 }
132
133 /***
134 * Classes subclassing this one should override this method to perform
135 * their custom actions on the CrawlURI.
136 *
137 * @param curi The CrawlURI being processed.
138 * @throws InterruptedException
139 */
140 protected void innerProcess(CrawlURI curi)
141 throws InterruptedException {
142
143 }
144
145 /***
146 * Classes subclassing this one should override this method to perform
147 * processor specific actions.
148 * <p>
149 *
150 * This method is garanteed to be called after the crawl is set up, but
151 * before any URI-processing has occured.
152 */
153 protected void initialTasks () {
154
155 }
156
157 /***
158 * Classes subclassing this one should override this method to perform
159 * processor specific actions.
160 *
161 */
162 protected void finalTasks () {
163
164 }
165
166 protected DecideRule getDecideRule(Object o) {
167 try {
168 return (DecideRule)getAttribute(o, attrDecideRules);
169 } catch (AttributeNotFoundException e) {
170 throw new RuntimeException(e);
171 }
172 }
173
174 protected boolean rulesAccept(Object o) {
175 return rulesAccept(getDecideRule(o),o);
176 }
177
178 protected boolean rulesAccept(DecideRule rule, Object o) {
179 return rule.decisionFor(o) != DecideRule.REJECT;
180 }
181 /***
182 * Returns the next processor for the given CrawlURI in the processor chain.
183 * @param curi The CrawlURI that we want to find the next processor for.
184 * @return The next processor for the given CrawlURI in the processor chain.
185 */
186 public Processor getDefaultNextProcessor(CrawlURI curi) {
187 return defaultNextProcessor;
188 }
189
190 /*** Set the default next processor in the chain.
191 *
192 * @param nextProcessor the default next processor in the chain.
193 */
194 public void setDefaultNextProcessor(Processor nextProcessor) {
195 defaultNextProcessor = nextProcessor;
196 }
197
198 /***
199 * Get the controller object.
200 *
201 * @return the controller object.
202 */
203 public CrawlController getController() {
204 return getSettingsHandler().getOrder().getController();
205 }
206
207 public Processor spawn(int serialNum) {
208 Processor newInst = null;
209 try {
210 Constructor co =
211 getClass().getConstructor(new Class[] { String.class });
212 newInst =
213 (Processor) co.newInstance(new Object[] {
214 getName() + serialNum
215 });
216 getParent().setAttribute(newInst);
217 newInst.setTransient(true);
218 } catch (Exception e) {
219
220 e.printStackTrace();
221 }
222 return newInst;
223 }
224
225 /***
226 * Compiles and returns a report (in human readable form) about the status
227 * of the processor. The processor's name (of implementing class) should
228 * always be included.
229 * <p>
230 * Examples of stats declared would include:<br>
231 * * Number of CrawlURIs handled.<br>
232 * * Number of links extracted (for link extractors)<br>
233 * etc.
234 *
235 * @return A human readable report on the processor's state.
236 */
237 public String report(){
238 return "";
239 }
240
241 /***
242 * @param curi CrawlURI to examine.
243 * @return True if content to process -- content length is > 0
244 * -- and links have not yet been extracted.
245 */
246 protected boolean isContentToProcess(CrawlURI curi) {
247 return !curi.hasBeenLinkExtracted() && curi.getContentLength() > 0;
248 }
249
250 /***
251 * @param curi CrawlURI to examine.
252 * @return True if {@link #isContentToProcess(CrawlURI)} and
253 * the CrawlURI represents a successful http transaction.
254 */
255 protected boolean isHttpTransactionContentToProcess(CrawlURI curi) {
256 return isContentToProcess(curi) &&
257 curi.isHttpTransaction() &&
258 curi.isSuccess();
259 }
260
261 /***
262 * @param contentType Found content type.
263 * @param expectedPrefix String to find at start of contenttype: e.g.
264 * <code>text/html</code>.
265 * @return True if passed content-type begins with
266 * expected mimetype.
267 */
268 protected boolean isExpectedMimeType(String contentType,
269 String expectedPrefix) {
270 return contentType != null &&
271 contentType.toLowerCase().startsWith(expectedPrefix);
272 }
273
274 public void kickUpdate() {
275
276 }
277 }