1   /* Extractor
2   *
3   * $Id: Extractor.java 4497 2006-08-15 01:31:35Z stack-sf $
4   *
5   * Created on Sep 22, 2005
6   *
7   * Copyright (C) 2005 Internet Archive.
8   *
9   * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24  */ 
25  package org.archive.crawler.extractor;
26  
27  import java.util.logging.Level;
28  import java.util.logging.Logger;
29  
30  import org.archive.crawler.datamodel.CrawlURI;
31  import org.archive.crawler.framework.Processor;
32  
33  /***
34   * Convenience shared superclass for Extractor Processors.
35   * 
36   * Currently only wraps Extractor-specific extract() action with
37   * a StackOverflowError catch/log/proceed handler, so that any
38   * extractors that recurse too deep on problematic input will
39   * only suffer a local error, and other normal CrawlURI processing
40   * can continue. See:
41   *  [ 1122836 ] Localize StackOverflowError in Extractors
42   *  http://sourceforge.net/tracker/index.php?func=detail&aid=1122836&group_id=73833&atid=539099
43   * 
44   * This class could also become home to common utility features
45   * of extractors, like a running tally of the URIs examined/discovered,
46   * etc.
47   * 
48   * @author gojomo
49   */
50  public abstract class Extractor extends Processor {
51      private static final Logger logger = Logger
52          .getLogger(Extractor.class.getName());
53  
54      /***
55       * Passthrough constructor.
56       * 
57       * @param name
58       * @param description
59       */
60      public Extractor(String name, String description) {
61          super(name, description);
62          // TODO Auto-generated constructor stub
63      }
64  
65      public void innerProcess(CrawlURI curi) {
66          try {
67              extract(curi);
68          } catch (NullPointerException npe) {
69              // both annotate (to highlight in crawl log) & add as local-error
70              curi.addAnnotation("err=" + npe.getClass().getName());
71              curi.addLocalizedError(getName(), npe, "");
72              // also log as warning
73              logger.log(Level.WARNING, getName() + ": NullPointerException",
74                  npe);
75          } catch (StackOverflowError soe) {
76              // both annotate (to highlight in crawl log) & add as local-error
77              curi.addAnnotation("err=" + soe.getClass().getName());
78              curi.addLocalizedError(getName(), soe, "");
79              // also log as warning
80              logger.log(Level.WARNING, getName() + ": StackOverflowError", soe);
81          } catch (java.nio.charset.CoderMalfunctionError cme) {
82              // See http://sourceforge.net/tracker/index.php?func=detail&aid=1540222&group_id=73833&atid=539099
83              // Both annotate (to highlight in crawl log) & add as local-error
84              curi.addAnnotation("err=" + cme.getClass().getName());
85              curi.addLocalizedError(getName(), cme, ""); // <-- Message field ignored when logging.
86              logger.log(Level.WARNING, getName() + ": CoderMalfunctionError",
87                  cme);
88          }
89      }
90  
91      protected abstract void extract(CrawlURI curi);
92  }