1   /* Copyright (C) 2003 Internet Archive.
2    *
3    * This file is part of the Heritrix web crawler (crawler.archive.org).
4    *
5    * Heritrix is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU Lesser Public License as published by
7    * the Free Software Foundation; either version 2.1 of the License, or
8    * any later version.
9    *
10   * Heritrix is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU Lesser Public License for more details.
14   *
15   * You should have received a copy of the GNU Lesser Public License
16   * along with Heritrix; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   *
19   * Processor.java
20   * Created on Apr 16, 2003
21   *
22   * $Header$
23   */
24  package org.archive.crawler.framework;
25  
import java.lang.reflect.Constructor;
import java.util.Iterator;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;

import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.deciderules.DecideRule;
import org.archive.crawler.deciderules.DecideRuleSequence;
import org.archive.crawler.settings.MapType;
import org.archive.crawler.settings.ModuleType;
import org.archive.crawler.settings.SimpleType;
39  
40  /***
41   * Base class for URI processing classes.
42   *
43   * <p> Each URI is processed by a user defined series of processors. This class
44   * provides the basic infrastructure for these but does not actually do
45   * anything. New processors can be easily created by subclassing this class.
46   *
47   * <p> Classes subclassing this one should not trap InterruptedExceptions.
48   * They should be allowed to propagate to the ToeThread executing the processor.
49   * Also they should immediately exit their main method (<tt>innerProcess()</tt>)
50   * if the <tt>interrupted</tt> flag is set.
51   *
52   * @author Gordon Mohr
53   *
54   * @see org.archive.crawler.framework.ToeThread
55   */
56  public class Processor extends ModuleType {
57  
58      private static final long serialVersionUID = 6248563827413710226L;
59  
60      /***
61       * Key to use asking settings for decide-rules value.
62       */
63      public static final String ATTR_DECIDE_RULES = "decide-rules";
64      /*** local name for decide-rules */
65      protected String attrDecideRules; 
66  
67      /***
68       * Key to use asking settings for enabled value.
69       */
70      public final static String ATTR_ENABLED = "enabled";
71  
72      private Processor defaultNextProcessor = null;
73  
74      private static Logger logger =
75          Logger.getLogger("org.archive.crawler.framework.Processor");
76  
77      /***
78       * @param name
79       * @param description
80       */
81      public Processor(String name, String description) {
82          super(name, description);
83          addElementToDefinition(new SimpleType(ATTR_ENABLED,
84              "Is processor enabled", new Boolean(true)));
85          attrDecideRules = getName()+"#"+ATTR_DECIDE_RULES;
86          addElementToDefinition(
87              new DecideRuleSequence(attrDecideRules,
88                  "DecideRules which, if their final decision is REJECT, " +
89                  "prevent this Processor from running."));
90      }
91  
92      /***
93       * Perform processing on the given CrawlURI.
94       *
95       * @param curi
96       * @throws InterruptedException
97       */
98      public final void process(CrawlURI curi) throws InterruptedException {
99          // by default, arrange for curi to proceed to next processor
100         curi.setNextProcessor(getDefaultNextProcessor(curi));
101 
102         // Check if this processor is enabled before processing
103         try {
104             if (!((Boolean) getAttribute(ATTR_ENABLED, curi)).booleanValue()) {
105                 return;
106             }
107         } catch (AttributeNotFoundException e) {
108             logger.severe(e.getMessage());
109         }
110 
111         if(rulesAccept(curi)) {
112             innerProcess(curi);
113         } else {
114             innerRejectProcess(curi);
115         }
116     }
117 
118     protected void checkForInterrupt() throws InterruptedException {
119         if (Thread.interrupted()) {
120             throw new InterruptedException("interrupted");
121         }
122     }
123 
124     /***
125      * @param curi CrawlURI instance.
126      * @throws InterruptedException
127      */
128     protected void innerRejectProcess(CrawlURI curi)
129     throws InterruptedException {
130         // by default do nothing
131     }
132 
133     /***
134      * Classes subclassing this one should override this method to perform
135      * their custom actions on the CrawlURI.
136      *
137      * @param curi The CrawlURI being processed.
138      * @throws InterruptedException
139      */
140     protected void innerProcess(CrawlURI curi)
141     throws InterruptedException {
142         // by default do nothing
143     }
144 
145     /***
146      * Classes subclassing this one should override this method to perform
147      * processor specific actions.
148      * <p>
149      *
150      * This method is garanteed to be called after the crawl is set up, but
151      * before any URI-processing has occured.
152      */
153     protected void initialTasks () {
154         // by default do nothing
155     }
156 
157     /***
158      * Classes subclassing this one should override this method to perform
159      * processor specific actions.
160      *
161      */
162     protected void finalTasks () {
163         // by default do nothing
164     }
165 
166     protected DecideRule getDecideRule(Object o) {
167         try {
168             return (DecideRule)getAttribute(o, attrDecideRules);
169         } catch (AttributeNotFoundException e) {
170             throw new RuntimeException(e);
171         }
172     }
173 
174     protected boolean rulesAccept(Object o) {
175         return rulesAccept(getDecideRule(o),o);
176     }
177 
178     protected boolean rulesAccept(DecideRule rule, Object o) {
179         return rule.decisionFor(o) != DecideRule.REJECT;
180     }
181     /***
182      * Returns the next processor for the given CrawlURI in the processor chain.
183      * @param curi The CrawlURI that we want to find the next processor for.
184      * @return The next processor for the given CrawlURI in the processor chain.
185      */
186     public Processor getDefaultNextProcessor(CrawlURI curi) {
187         return defaultNextProcessor;
188     }
189 
190     /*** Set the default next processor in the chain.
191      *
192      * @param nextProcessor the default next processor in the chain.
193      */
194     public void setDefaultNextProcessor(Processor nextProcessor) {
195         defaultNextProcessor = nextProcessor;
196     }
197 
198     /*** 
199      * Get the controller object.
200      *
201      * @return the controller object.
202      */
203     public CrawlController getController() {
204         return getSettingsHandler().getOrder().getController();
205     }
206 
207     public Processor spawn(int serialNum) {
208         Processor newInst = null;
209         try {
210             Constructor co =
211                 getClass().getConstructor(new Class[] { String.class });
212             newInst =
213                 (Processor) co.newInstance(new Object[] {
214                     getName() + serialNum
215                     });
216             getParent().setAttribute(newInst);
217             newInst.setTransient(true);
218         } catch (Exception e) {
219             // TODO Auto-generated catch block
220             e.printStackTrace();
221         }
222         return newInst;
223     }
224 
225     /***
226      * Compiles and returns a report (in human readable form) about the status
227      * of the processor.  The processor's name (of implementing class) should
228      * always be included.
229      * <p>
230      * Examples of stats declared would include:<br>
231      * * Number of CrawlURIs handled.<br>
232      * * Number of links extracted (for link extractors)<br>
233      * etc.
234      *
235      * @return A human readable report on the processor's state.
236      */
237     public String report(){
238         return ""; // Default behavior.
239     }
240     
241     /***
242      * @param curi CrawlURI to examine.
243      * @return True if content to process -- content length is > 0 
244      * -- and links have not yet been extracted.
245      */
246     protected boolean isContentToProcess(CrawlURI curi) {
247         return !curi.hasBeenLinkExtracted() && curi.getContentLength() > 0;
248     }
249     
250     /***
251      * @param curi CrawlURI to examine.
252      * @return True if {@link #isContentToProcess(CrawlURI)} and
253      * the CrawlURI represents a successful http transaction.
254      */
255     protected boolean isHttpTransactionContentToProcess(CrawlURI curi) {
256         return isContentToProcess(curi) &&
257             curi.isHttpTransaction() &&
258             curi.isSuccess();
259     }
260     
261     /***
262      * @param contentType Found content type.
263      * @param expectedPrefix String to find at start of contenttype: e.g.
264      * <code>text/html</code>.
265      * @return True if passed content-type begins with
266      * expected mimetype.
267      */
268     protected boolean isExpectedMimeType(String contentType,
269             String expectedPrefix) {
270         return contentType != null &&
271             contentType.toLowerCase().startsWith(expectedPrefix);
272     }
273 
274     public void kickUpdate() {
275         // by default do nothing
276     }
277 }