View Javadoc

1   /* SettingsHandler
2    *
3    * $Id: SettingsHandler.java 5504 2007-10-03 20:01:02Z gojomo $
4    *
5    * Created on Dec 16, 2003
6    *
7    * Copyright (C) 2004 Internet Archive.
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.crawler.settings;
26  
27  import java.io.File;
28  import java.lang.reflect.Constructor;
29  import java.lang.reflect.InvocationTargetException;
30  import java.text.ParseException;
31  import java.util.Collection;
32  import java.util.Collections;
33  import java.util.HashMap;
34  import java.util.HashSet;
35  import java.util.Iterator;
36  import java.util.List;
37  import java.util.Map;
38  import java.util.Set;
39  import java.util.logging.Level;
40  
41  import javax.management.AttributeNotFoundException;
42  import javax.management.InvalidAttributeValueException;
43  
44  import org.archive.crawler.datamodel.CrawlOrder;
45  import org.archive.crawler.framework.Checkpointer;
46  import org.archive.crawler.framework.ToeThread;
47  import org.archive.crawler.settings.refinements.Refinement;
48  import org.archive.net.UURI;
49  import org.archive.util.ArchiveUtils;
50  
/**
 * An instance of this class holds a hierarchy of settings.
 *
 * <p>More than one instance in memory is allowed so that a new CrawlJob
 * can be configured while another job is running.
 *
 * <p>This class should be subclassed to adapt it to a persistent storage.
 *
 * @author John Erik Halse
 */
60  public abstract class SettingsHandler {
61      /*** Cached CrawlerSettings objects */
62      private SettingsCache settingsCache =
63          new SettingsCache(new CrawlerSettings(this, null));
64  
65      /*** Reference to the order module */
66      private CrawlOrder order;
67  
68      private Set<ValueErrorHandler> valueErrorHandlers 
69       = Collections.synchronizedSet(new HashSet<ValueErrorHandler>());
70      private int errorReportingLevel = Level.ALL.intValue();
71  
72      /*** Datatypes supported by the settings framwork */
73      final static String INTEGER = "integer";
74      final static String LONG = "long";
75      final static String FLOAT = "float";
76      final static String DOUBLE = "double";
77      final static String BOOLEAN = "boolean";
78      final static String STRING = "string";
79      final static String TEXT = "text";
80      final static String OBJECT = "object";
81      final static String TIMESTAMP = "timestamp";
82      final static String MAP = "map";
83      final static String INTEGER_LIST = "integerList";
84      final static String LONG_LIST = "longList";
85      final static String FLOAT_LIST = "floatList";
86      final static String DOUBLE_LIST = "doubleList";
87      final static String STRING_LIST = "stringList";
88      private final static String names[][] = new String[][] {
89              { INTEGER, "java.lang.Integer"},
90              { LONG, "java.lang.Long"},
91              { FLOAT, "java.lang.Float"},
92              { DOUBLE, "java.lang.Double"},
93              { BOOLEAN, "java.lang.Boolean"},
94              { STRING, "java.lang.String"},
95              { TEXT, "org.archive.crawler.settings.TextField"},
96              { OBJECT, "org.archive.crawler.settings.ModuleType"},
97              { TIMESTAMP, "java.util.Date"},
98              { MAP, "org.archive.crawler.settings.MapType"},
99              { INTEGER_LIST,
100                     "org.archive.crawler.settings.IntegerList"},
101             { LONG_LIST, "org.archive.crawler.settings.LongList"},
102             { FLOAT_LIST, "org.archive.crawler.settings.FloatList"},
103             { DOUBLE_LIST, "org.archive.crawler.settings.DoubleList"},
104             { STRING_LIST, "org.archive.crawler.settings.StringList"}};
105     private final static Map<String,String> name2class
106      = new HashMap<String,String>();
107     private final static Map<String,String> class2name
108      = new HashMap<String,String>();
109     static {
110         for (int i = 0; i < names.length; i++) {
111             name2class.put(names[i][0], names[i][1]);
112             class2name.put(names[i][1], names[i][0]);
113         }
114     }
115 
116     /*** Create a new SettingsHandler object.
117      *
118      * @throws InvalidAttributeValueException
119      */
120     public SettingsHandler() throws InvalidAttributeValueException {
121         order = new CrawlOrder();
122         order.setAsOrder(this);
123     }
124 
125     /*** Initialize the SettingsHandler.
126      *
127      * This method reads the default settings from the persistent storage.
128      */
129     public void initialize() {
130         readSettingsObject(settingsCache.getGlobalSettings());
131     }
132     
133     public void cleanup() {
134         this.settingsCache = null;
135         if (this.order != null) {
136             this.order.setController(null);
137         }
138         this.order =  null;
139     }
140 
141     /*** Strip off the leftmost part of a domain name.
142      *
143      * @param scope the domain name.
144      * @return scope with everything before the first dot ripped off.
145      */
146     protected String getParentScope(String scope) {
147         int split = scope.indexOf('.');
148         return (split == -1)? null: scope.substring(split + 1);
149     }
150 
151     /*** Get a module by name.
152      *
153      * All modules in the order should have unique names. This method makes it
154      * possible to get the modules of the order by its name.
155      *
156      * @param name the modules name.
157      * @return the module the name references.
158      */
159     public ModuleType getModule(String name) {
160         return settingsCache.getGlobalSettings().getModule(name);
161     }
162 
163     /*** Get a complex type by its absolute name.
164      *
165      * The absolute name is the complex types name and the path leading to
166      * it.
167      *
168      * @param settings the settings object to query.
169      * @param absoluteName the absolute name of the complex type to get.
170      * @return the complex type referenced by the absolute name or null if
171      *         the complex type could not be found in this settings object.
172      * @throws AttributeNotFoundException is thrown if no ComplexType by this
173      *         name exist.
174      */
175     public ComplexType getComplexTypeByAbsoluteName(
176             CrawlerSettings settings, String absoluteName)
177             throws AttributeNotFoundException {
178 
179         settings = settings == null ? settingsCache.getGlobalSettings() : settings;
180 
181         DataContainer data = settings.getData(absoluteName);
182         if (data == null) {
183             CrawlerSettings parentSettings = settings.getParent();
184             if (parentSettings == null) {
185                 throw new AttributeNotFoundException(absoluteName);
186             }
187             return getComplexTypeByAbsoluteName(parentSettings, absoluteName);
188         }
189         return data.getComplexType();
190     }
191 
192     protected static String getTypeName(String className) {
193         return (String) class2name.get(className);
194     }
195 
196     protected static String getClassName(String typeName) {
197         return (String) name2class.get(typeName);
198     }
199 
200     /*** Convert a String object to an object of <code>typeName</code>.
201      *
202      * @param stringValue string to convert.
203      * @param typeName type to convert to. typeName should be one of the
204      *        supported types represented by constants in this class.
205      * @return the new value object.
206      * @throws ClassCastException is thrown if string could not be converted.
207      */
208     protected static Object StringToType(String stringValue, String typeName) {
209         Object value;
210         if (typeName == SettingsHandler.STRING) {
211             value = stringValue;
212         } else if (typeName == SettingsHandler.TEXT) {
213             value = new TextField(stringValue);
214         } else if (typeName == SettingsHandler.INTEGER) {
215             value = Integer.decode(stringValue);
216         } else if (typeName == SettingsHandler.LONG) {
217             value = Long.decode(stringValue);
218         } else if (typeName == SettingsHandler.BOOLEAN) {
219             value = Boolean.valueOf(stringValue);
220         } else if (typeName == SettingsHandler.DOUBLE) {
221             value = Double.valueOf(stringValue);
222         } else if (typeName == SettingsHandler.FLOAT) {
223             value = Float.valueOf(stringValue);
224         } else if (typeName == SettingsHandler.TIMESTAMP) {
225             try {
226                 value = ArchiveUtils.parse14DigitDate(stringValue);
227             } catch (ParseException e) {
228                 throw new ClassCastException(
229                     "Cannot convert '"
230                         + stringValue
231                         + "' to type '"
232                         + typeName
233                         + "'");
234             }
235         } else {
236             throw new ClassCastException(
237                 "Cannot convert '"
238                     + stringValue
239                     + "' to type '"
240                     + typeName
241                     + "'");
242         }
243         return value;
244     }
245 
246     /*** Get CrawlerSettings object in effect for a host or domain.
247      *
248      * If there is no specific settings for the host/domain, it will recursively
249      * go up the hierarchy to find the settings object that should be used for
250      * this host/domain.
251      *
252      * @param host the host or domain to get the settings for.
253      * @return settings object in effect for the host/domain.
254      * @see #getSettingsObject(String)
255      * @see #getOrCreateSettingsObject(String)
256      */
257     public CrawlerSettings getSettings(String host) {
258         return getRefinementsForSettings(getSettingsForHost(host), null);
259     }
260 
261     /*** Get CrawlerSettings object in effect for a host or domain.
262     *
263     * If there is no specific settings for the host/domain, it will recursively
264     * go up the hierarchy to find the settings object that should be used for
265     * this host/domain.
266     * <p/>
267     * This method passes around a URI that refinement are checked against.
268     *
269     * @param host the host or domain to get the settings for.
270     * @param uuri UURI for context.
271     * @return settings object in effect for the host/domain.
272     * @see #getSettingsObject(String)
273     * @see #getOrCreateSettingsObject(String)
274     */
275     public CrawlerSettings getSettings(String host, UURI uuri) {
276         return getRefinementsForSettings(getSettingsForHost(host), uuri);
277     }
278 
279     protected CrawlerSettings getSettingsForHost(String host) {
280         CrawlerSettings settings = settingsCache.getSettings(host, null);
281 
282         if (settings == null) {
283             String tmpHost = host;
284             settings = getSettingsObject(tmpHost);
285             while (settings == null && tmpHost != null) {
286                 tmpHost = getParentScope(tmpHost);
287                 settings = getSettingsObject(tmpHost);
288             }
289 
290             settingsCache.putSettings(host, settings);
291         }
292 
293         return settings;
294     }
295 
296     private CrawlerSettings getRefinementsForSettings(CrawlerSettings settings,
297             UURI uri) {
298         if (settings.hasRefinements()) {
299             for(Iterator it = settings.refinementsIterator(); it.hasNext();) {
300                 Refinement refinement = (Refinement) it.next();
301                 if (refinement.isWithinRefinementBounds(uri)) {
302                     settings = getSettingsObject(settings.getScope(),
303                             refinement.getReference());
304                 }
305             }
306         }
307 
308         return settings;
309     }
310 
311     /*** Get CrawlerSettings object for a host or domain.
312      *
313      * The difference between this method and the
314      * <code>getSettings(String host)</code> is that this method will return
315      * null if there is no settings for particular host or domain.
316      *
317      * @param scope the host or domain to get the settings for.
318      * @return settings object for the host/domain or null if no
319      *         settings exist for the host/domain.
320      * @see #getSettings(String)
321      * @see #getOrCreateSettingsObject(String)
322      */
323     public CrawlerSettings getSettingsObject(String scope) {
324         return getSettingsObject(scope, null);
325     }
326 
327     /***
328      * Get CrawlerSettings object for a host/domain and a particular refinement.
329      *
330      * @param scope the host or domain to get the settings for.
331      * @param refinement the refinement reference to get.
332      * @return CrawlerSettings object for a host/domain and a particular
333      * refinement or null if no settings exist for the host/domain.
334      */
335     public CrawlerSettings getSettingsObject(String scope, String refinement) {
336         CrawlerSettings settings =
337             settingsCache.getSettingsObject(scope, refinement);
338 
339         if (settings == null) {
340             // Reference not found
341             settings = new CrawlerSettings(this, scope, refinement);
342             // Try to read settings from persisten storage. If its not there
343             // it will be set to null.
344             settings = readSettingsObject(settings);
345             if (settings != null) {
346                 settingsCache.putSettings(scope, settings);
347             }
348         }
349         return settings;
350     }
351 
352     /*** Get or create CrawlerSettings object for a host or domain.
353      *
354      * This method is similar to {@link #getSettingsObject(String)} except that
355      * if there is no settings for this particular host or domain a new settings
356      * object will be returned.
357      *
358      * @param scope the host or domain to get or create the settings for.
359      * @return settings object for the host/domain.
360      * @see #getSettings(String)
361      * @see #getSettingsObject(String)
362      */
363     public CrawlerSettings getOrCreateSettingsObject(String scope) {
364         return getOrCreateSettingsObject(scope, null);
365     }
366 
367     public CrawlerSettings getOrCreateSettingsObject(String scope,
368             String refinement) {
369         CrawlerSettings settings;
370         settings = getSettingsObject(scope, refinement);
371         if (settings == null) {
372             scope = scope.intern();
373 
374             // No existing settings object found, create one
375             settings = new CrawlerSettings(this, scope, refinement);
376             settingsCache.refreshHostToSettings();
377             settingsCache.putSettings(scope, settings);
378         }
379         return settings;
380     }
381 
382     /*** Write the CrawlerSettings object to persistent storage.
383      *
384      * @param settings the settings object to write.
385      */
386     public abstract void writeSettingsObject(CrawlerSettings settings);
387 
388     /*** Read the CrawlerSettings object from persistent storage.
389      *
390      * @param settings the settings object to be updated with data from the
391      *                 persistent storage.
392      * @return the updated settings object or null if there was no data for this
393      *         in the persistent storage.
394      */
395     protected abstract CrawlerSettings readSettingsObject(CrawlerSettings settings);
396 
397     /*** Delete a settings object from persistent storage.
398      *
399      * @param settings the settings object to delete.
400      */
401     public void deleteSettingsObject(CrawlerSettings settings) {
402         settingsCache.deleteSettingsObject(settings);
403     }
404 
405     /*** Get the CrawlOrder.
406      *
407      * @return the CrawlOrder
408      */
409     public CrawlOrder getOrder() {
410         return order;
411     }
412 
413     /*** Instatiate a new ModuleType given its name and className.
414      *
415      * @param name the name for the new ComplexType.
416      * @param className the class name of the new ComplexType.
417      * @return an instance of the class identified by className.
418      *
419      * @throws InvocationTargetException
420      */
421     public static ModuleType instantiateModuleTypeFromClassName(
422             String name, String className)
423             throws InvocationTargetException {
424 
425         Class cl;
426         try {
427             cl = Class.forName(className);
428         } catch (ClassNotFoundException e) {
429             throw new InvocationTargetException(e);
430         }
431 
432         ModuleType module;
433         try {
434             Constructor co =
435                 cl.getConstructor(new Class[] { String.class });
436             module = (ModuleType) co.newInstance(new Object[] { name });
437         } catch (IllegalArgumentException e) {
438             throw new InvocationTargetException(e);
439         } catch (InstantiationException e) {
440             throw new InvocationTargetException(e);
441         } catch (IllegalAccessException e) {
442             throw new InvocationTargetException(e);
443         } catch (SecurityException e) {
444             throw new InvocationTargetException(e);
445         } catch (NoSuchMethodException e) {
446             throw new InvocationTargetException(e);
447         }
448         return module;
449     }
450 
451     /***
452      * Transforms a relative path so that it is relative to a location that is
453      * regarded as a working dir for these settings. If an absolute path is given,
454      * it will be returned unchanged.
455      * @param path A relative path to a file (or directory)
456      * @return The same path modified so that it is relative to the file level
457      *         location that is considered the working directory for these settings.
458      */
459     public abstract File getPathRelativeToWorkingDirectory(String path);
460 
461     /***
462      * Will return a Collection of strings with domains that contain 'per'
463      * domain overrides (or their subdomains contain them). 
464      * 
465      * The domains considered are
466      * limited to those that are subdomains of the supplied domain. If null or
467      * empty string is supplied the TLDs will be considered.
468      * @param rootDomain The domain to get domain overrides for. Examples:
469      *                   'org', 'archive.org', 'crawler.archive.org' etc.
470      * @return An array of domains that contain overrides. If rootDomain does not
471      *         exist an empty array will be returned.
472      */
473     public abstract Collection getDomainOverrides(String rootDomain);
474 
475     /***
476      * Unregister an instance of {@link ValueErrorHandler}.
477      *
478      * @param errorHandler the <code>CalueErrorHandler</code> to unregister.
479      *
480      * @see ValueErrorHandler
481      * @see #setErrorReportingLevel(Level)
482      * @see #registerValueErrorHandler(ValueErrorHandler)
483      *
484      */
485     public void unregisterValueErrorHandler(ValueErrorHandler errorHandler) {
486         valueErrorHandlers.remove(errorHandler);
487     }
488 
489     /***
490      * Register an instance of {@link ValueErrorHandler}.
491      * <p>
492      * If a ValueErrorHandler is registered, only constraints with level
493      * {@link Level#SEVERE}will throw an {@link InvalidAttributeValueException}.
494      * The ValueErrorHandler will recieve a notification for all failed checks
495      * with level equal or greater than the error reporting level.
496      *
497      * @param errorHandler the <code>CalueErrorHandler</code> to register.
498      *
499      * @see ValueErrorHandler
500      * @see #setErrorReportingLevel(Level)
501      * @see #unregisterValueErrorHandler(ValueErrorHandler)
502      */
503     public void registerValueErrorHandler(ValueErrorHandler errorHandler) {
504         if (errorHandler != null) {
505             valueErrorHandlers.add(errorHandler);
506         }
507     }
508 
509     /***
510      * Fire events on all registered {@link ValueErrorHandler}.
511      *
512      * @param error the failed constraints return value.
513      * @return true if there was any registered ValueErrorHandlers to notify.
514      */
515     boolean fireValueErrorHandlers(Constraint.FailedCheck error) {
516         if (error.getLevel().intValue() >= errorReportingLevel) {
517             for (Iterator it = valueErrorHandlers.iterator(); it.hasNext();) {
518                 ((ValueErrorHandler) it.next()).handleValueError(error);
519             }
520         }
521         return valueErrorHandlers.size() > 0;
522     }
523 
524     /***
525      * Set the level for which notification of failed constraints will be fired.
526      *
527      * @param level the error reporting level.
528      */
529     public void setErrorReportingLevel(Level level) {
530         errorReportingLevel = level.intValue();
531     }
532 
533     /***
534      * Creates and returns a <tt>List</tt> of all files comprising the current
535      * settings framework.
536      *
537      * <p>The List contains the absolute String path of each file.
538      *
539      * <p>The list should contain any configurable files, including such files
540      * as seed file and any other files use by the various settings modules.
541      *
542      * <p>Implementations of the SettingsHandler that do not use files for
543      * permanent storage should return an empty list.
544      * @return <code>List</code> of framework files.
545      */
546     public abstract List getListOfAllFiles();
547     
548     /***
549      * Clear any per-host settings cached in memory; allows editting of 
550      * per-host settings files on disk, perhaps in bulk/automated fashion,
551      * to take effect in running crawl. 
552      */
553     public void clearPerHostSettingsCache() {
554         settingsCache.clear();
555     }
556 
557     static ThreadLocal<SettingsHandler> threadContextSettingsHandler = 
558         new ThreadLocal<SettingsHandler>();
559     public static void setThreadContextSettingsHandler(SettingsHandler settingsHandler) {
560         threadContextSettingsHandler.set(settingsHandler);
561     }
562     public static SettingsHandler getThreadContextSettingsHandler() {
563         Thread t = Thread.currentThread();
564         if (t instanceof Checkpointer.CheckpointingThread) {
565             return ((Checkpointer.CheckpointingThread)t)
566                 .getController().getSettingsHandler();
567         } 
568         if (t instanceof ToeThread) {
569             return ((ToeThread) Thread.currentThread())
570                 .getController().getSettingsHandler();
571         } 
572         if(threadContextSettingsHandler.get()!=null) {
573             return threadContextSettingsHandler.get();
574         }
575         
576         // TODO: log differently? (if no throw here
577         // NPE is inevitable)
578         throw new RuntimeException(
579                 "No threadContextSettingsHandler available.");
580     }
581 }