1   /* BeanShellProcessor
2    *
3    * Created on Aug 4, 2006
4    *
5    * Copyright (C) 2006 Internet Archive.
6    *
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    *
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   *
14   * Heritrix is distributed in the hope that it will be useful,
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   *
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
22   */
23  package org.archive.crawler.processor;
24  
25  import java.io.File;
26  import java.io.IOException;
27  import java.util.Collections;
28  import java.util.HashMap;
29  import java.util.Map;
30  import java.util.logging.Level;
31  import java.util.logging.Logger;
32  
33  import org.archive.crawler.datamodel.CrawlURI;
34  import org.archive.crawler.datamodel.FetchStatusCodes;
35  import org.archive.crawler.framework.Processor;
36  import org.archive.crawler.settings.SimpleType;
37  import org.archive.crawler.settings.Type;
38  
39  import bsh.EvalError;
40  import bsh.Interpreter;
41  
42  /***
43   * A processor which runs a BeanShell script on the CrawlURI.
44   *
45   * Script source may be provided via a file
46   * local to the crawler. 
47   * Script source should define
48   * a method with one argument, 'run(curi)'. Each processed CrawlURI is
49   * passed to this script method. 
50   * 
51   * Other variables available to the script include 'self' (this 
52   * BeanShellProcessor instance) and 'controller' (the crawl's 
53   * CrawlController instance). 
54   * 
55   * @author gojomo
56   * @version $Date: 2006-09-25 20:19:54 +0000 (Mon, 25 Sep 2006) $, $Revision: 4654 $
57   */
58  public class BeanShellProcessor extends Processor implements FetchStatusCodes {
59  
60      private static final long serialVersionUID = 6926589944337050754L;
61  
62      private static final Logger logger =
63          Logger.getLogger(BeanShellProcessor.class.getName());
64  
65      /*** setting for script file */
66      public final static String ATTR_SCRIPT_FILE = "script-file"; 
67  
68      /*** whether each thread should have its own script runner (true), or
69       * they should share a single script runner with synchronized access */
70      public final static String ATTR_ISOLATE_THREADS = "isolate-threads";
71  
72      protected ThreadLocal<Interpreter> threadInterpreter;
73      protected Interpreter sharedInterpreter;
74      public Map<Object,Object> sharedMap = Collections.synchronizedMap(
75              new HashMap<Object,Object>());
76      
77      /***
78       * Constructor.
79       * @param name Name of this processor.
80       */
81      public BeanShellProcessor(String name) {
82          super(name, "BeanShellProcessor. Runs the BeanShell script source " +
83                  "(supplied directly or via a file path) against the " +
84                  "current URI. Source should define a script method " +
85                  "'process(curi)' which will be passed the current CrawlURI. " +
86                  "The script may also access this BeanShellProcessor via" +
87                  "the 'self' variable and the CrawlController via the " +
88                  "'controller' variable.");
89          Type t = addElementToDefinition(new SimpleType(ATTR_SCRIPT_FILE,
90                  "BeanShell script file", ""));
91          t.setOverrideable(false);
92          t = addElementToDefinition(new SimpleType(ATTR_ISOLATE_THREADS,
93                  "Whether each ToeThread should get its own independent " +
94                  "script context, or they should share synchronized access " +
95                  "to one context. Default is true, meaning each threads " +
96                  "gets its own isolated context.", true));
97          t.setOverrideable(false);
98  
99      }
100 
101     protected synchronized void innerProcess(CrawlURI curi) {
102         // depending on previous configuration, interpreter may 
103         // be local to this thread or shared
104         Interpreter interpreter = getInterpreter(); 
105         synchronized(interpreter) {
106             // synchronization is harmless for local thread interpreter,
107             // necessary for shared interpreter
108             try {
109                 interpreter.set("curi",curi);
110                 interpreter.eval("process(curi)");
111             } catch (EvalError e) {
112                 // TODO Auto-generated catch block
113                 e.printStackTrace();
114             } 
115         }
116     }
117 
118     /***
119      * Get the proper Interpreter instance -- either shared or local 
120      * to this thread. 
121      * @return Interpreter to use
122      */
123     protected Interpreter getInterpreter() {
124         if(sharedInterpreter!=null) {
125             return sharedInterpreter;
126         }
127         Interpreter interpreter = threadInterpreter.get(); 
128         if(interpreter==null) {
129             interpreter = newInterpreter(); 
130             threadInterpreter.set(interpreter);
131         }
132         return interpreter; 
133     }
134 
135     /***
136      * Create a new Interpreter instance, preloaded with any supplied
137      * source code or source file and the variables 'self' (this 
138      * BeanShellProcessor) and 'controller' (the CrawlController). 
139      * 
140      * @return  the new Interpreter instance
141      */
142     protected Interpreter newInterpreter() {
143         Interpreter interpreter = new Interpreter(); 
144         try {
145             interpreter.set("self", this);
146             interpreter.set("controller", getController());
147             
148             String filePath = (String) getUncheckedAttribute(null, ATTR_SCRIPT_FILE);
149             if(filePath.length()>0) {
150                 try {
151                     File file = getSettingsHandler().getPathRelativeToWorkingDirectory(filePath);
152                     interpreter.source(file.getPath());
153                 } catch (IOException e) {
154                     logger.log(Level.SEVERE,"unable to read script file",e);
155                 }
156             }
157         } catch (EvalError e) {
158             // TODO Auto-generated catch block
159             e.printStackTrace();
160         }
161         
162         return interpreter; 
163     }
164 
165     protected void initialTasks() {
166         super.initialTasks();
167         kickUpdate();
168     }
169 
170     /***
171      * Setup (or reset) Intepreter variables, as appropraite based on 
172      * thread-isolation setting. 
173      */
174     public void kickUpdate() {
175         // TODO make it so running state (tallies, etc.) isn't lost on changes
176         // unless unavoidable
177         if((Boolean)getUncheckedAttribute(null,ATTR_ISOLATE_THREADS)) {
178             sharedInterpreter = null; 
179             threadInterpreter = new ThreadLocal<Interpreter>(); 
180         } else {
181             sharedInterpreter = newInterpreter(); 
182             threadInterpreter = null;
183         }
184     }
185 }