1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler.extractor;
26
27 import java.util.logging.Level;
28 import java.util.logging.Logger;
29
30 import org.archive.crawler.datamodel.CrawlURI;
31 import org.archive.crawler.framework.Processor;
32
33 /***
34 * Convenience shared superclass for Extractor Processors.
35 *
36 * Currently only wraps Extractor-specific extract() action with
37 * a StackOverflowError catch/log/proceed handler, so that any
38 * extractors that recurse too deep on problematic input will
39 * only suffer a local error, and other normal CrawlURI processing
40 * can continue. See:
41 * [ 1122836 ] Localize StackOverflowError in Extractors
42 * http://sourceforge.net/tracker/index.php?func=detail&aid=1122836&group_id=73833&atid=539099
43 *
44 * This class could also become home to common utility features
45 * of extractors, like a running tally of the URIs examined/discovered,
46 * etc.
47 *
48 * @author gojomo
49 */
50 public abstract class Extractor extends Processor {
51 private static final Logger logger = Logger
52 .getLogger(Extractor.class.getName());
53
54 /***
55 * Passthrough constructor.
56 *
57 * @param name
58 * @param description
59 */
60 public Extractor(String name, String description) {
61 super(name, description);
62
63 }
64
65 public void innerProcess(CrawlURI curi) {
66 try {
67 extract(curi);
68 } catch (NullPointerException npe) {
69
70 curi.addAnnotation("err=" + npe.getClass().getName());
71 curi.addLocalizedError(getName(), npe, "");
72
73 logger.log(Level.WARNING, getName() + ": NullPointerException",
74 npe);
75 } catch (StackOverflowError soe) {
76
77 curi.addAnnotation("err=" + soe.getClass().getName());
78 curi.addLocalizedError(getName(), soe, "");
79
80 logger.log(Level.WARNING, getName() + ": StackOverflowError", soe);
81 } catch (java.nio.charset.CoderMalfunctionError cme) {
82
83
84 curi.addAnnotation("err=" + cme.getClass().getName());
85 curi.addLocalizedError(getName(), cme, "");
86 logger.log(Level.WARNING, getName() + ": CoderMalfunctionError",
87 cme);
88 }
89 }
90
91 protected abstract void extract(CrawlURI curi);
92 }