View Javadoc

1   /* XMLSettingsHandler
2    *
3    * $Id: XMLSettingsHandler.java 4662 2006-09-25 23:45:21Z paul_jack $
4    *
5    * Created on Dec 18, 2003
6    *
7    * Copyright (C) 2004 Internet Archive.
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.crawler.settings;
26  
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.Attribute;
import javax.management.AttributeNotFoundException;
import javax.management.InvalidAttributeValueException;
import javax.management.MBeanAttributeInfo;
import javax.management.MBeanException;
import javax.management.MBeanInfo;
import javax.management.ReflectionException;
import javax.xml.parsers.FactoryConfigurationError;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.transform.Source;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.stream.StreamResult;

import org.archive.crawler.datamodel.CrawlOrder;
import org.archive.util.ArchiveUtils;
import org.archive.util.FileUtils;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.XMLReader;
62  
63  /*** A SettingsHandler which uses XML files as persistent storage.
64   *
65   * @author John Erik Halse
66   */
67  public class XMLSettingsHandler extends SettingsHandler {
68      private static Logger logger =
69          Logger.getLogger(
70              "org.archive.crawler.settings.XMLSettingsHandler");
71  
72      // XML element name constants
73      protected static final String XML_SCHEMA = "heritrix_settings.xsd";
74      protected static final String XML_ROOT_ORDER = "crawl-order";
75      protected static final String XML_ROOT_HOST_SETTINGS = "crawl-settings";
76      protected static final String XML_ROOT_REFINEMENT = "crawl-refinement";
77      protected static final String XML_ELEMENT_CONTROLLER = "controller";
78      protected static final String XML_ELEMENT_META = "meta";
79      protected static final String XML_ELEMENT_NAME = "name";
80      protected static final String XML_ELEMENT_DESCRIPTION = "description";
81      protected static final String XML_ELEMENT_OPERATOR = "operator";
82      protected static final String XML_ELEMENT_ORGANIZATION = "organization";
83      protected static final String XML_ELEMENT_AUDIENCE = "audience";
84      protected static final String XML_ELEMENT_DATE = "date";
85      protected static final String XML_ELEMENT_REFINEMENTLIST = "refinement-list";
86      protected static final String XML_ELEMENT_REFINEMENT = "refinement";
87      protected static final String XML_ELEMENT_REFERENCE = "reference";
88      protected static final String XML_ELEMENT_LIMITS = "limits";
89      protected static final String XML_ELEMENT_TIMESPAN = "timespan";
90      protected static final String XML_ELEMENT_PORTNUMBER = "portnumber";
91      protected static final String XML_ELEMENT_URIMATCHES = "uri-matches";
92      protected static final String XML_ELEMENT_CONTENTMATCHES = "content-type-matches";
93      protected static final String XML_ELEMENT_OBJECT = "object";
94      protected static final String XML_ELEMENT_NEW_OBJECT = "newObject";
95      protected static final String XML_ATTRIBUTE_NAME = "name";
96      protected static final String XML_ATTRIBUTE_CLASS = "class";
97      protected static final String XML_ATTRIBUTE_FROM = "from";
98      protected static final String XML_ATTRIBUTE_TO = "to";
99  
100     private File orderFile;
101     private final static String settingsFilename = "settings";
102     private final static String settingsFilenameSuffix = "xml";
103     private final static String REFINEMENT_DIR = "_refinements";
104 
105     /*** Create a new XMLSettingsHandler object.
106      *
107      * @param orderFile where the order file is located.
108      * @throws InvalidAttributeValueException
109      */
110     public XMLSettingsHandler(File orderFile)
111     throws InvalidAttributeValueException {
112         super();
113         this.orderFile = orderFile.getAbsoluteFile();
114     }
115 
116     /*** Initialize the SettingsHandler.
117      *
118      * This method builds the settings data structure and initializes it with
119      * settings from the order file given to the constructor.
120      */
121     public void initialize() {
122         super.initialize();
123     }
124 
125     /*** 
126      * Initialize the SettingsHandler from a source.
127      *
128      * This method builds the settings data structure and initializes it with
129      * settings from the order file given as a parameter. The intended use is
130      * to create a new order file based on a default (template) order file.
131      *
132      * @param source the order file to initialize from.
133      */
134     public void initialize(File source) {
135         File tmpOrderFile = orderFile;
136         orderFile = source.getAbsoluteFile();
137         this.initialize();
138         orderFile = tmpOrderFile;
139     }
140 
141     private File getSettingsDirectory() {
142         String settingsDirectoryName = null;
143         try {
144             settingsDirectoryName =
145                     (String) getOrder().getAttribute(
146                         CrawlOrder.ATTR_SETTINGS_DIRECTORY);
147         } catch (AttributeNotFoundException e) {
148             e.printStackTrace();
149         } catch (MBeanException e) {
150             e.printStackTrace();
151         } catch (ReflectionException e) {
152             e.printStackTrace();
153         }
154 
155         return getPathRelativeToWorkingDirectory(settingsDirectoryName);
156     }
157 
158     /*** Resolves the filename for a settings object into a file path.
159      *
160      * It will also create the directory structure leading to this file
161      * if it doesn't exist.
162      *
163      * @param settings the settings object to get file path for.
164      * @return the file path for this settings object.
165      */
166     protected final File settingsToFilename(CrawlerSettings settings) {
167         File file;
168 
169         if (settings.getScope() == null || settings.getScope().equals("")) {
170             if (settings.isRefinement()) {
171                 file = new File(getSettingsDirectory(), File.separatorChar
172                         + REFINEMENT_DIR + File.separatorChar
173                         + settings.getName() + '.' + settingsFilenameSuffix);
174             } else {
175                 file = orderFile;
176             }
177         } else {
178             String elements[] = settings.getScope().split("//.");
179             if (elements.length == 0) {
180                 return orderFile;
181             }
182 
183             StringBuffer path = new StringBuffer();
184             for (int i = elements.length - 1; i > 0; i--) {
185                 path.append(elements[i]);
186                 path.append(File.separatorChar);
187             }
188             path.append(elements[0]);
189 
190             if (settings.isRefinement()) {
191                 file = new File(getSettingsDirectory(), path.toString()
192                         + File.separatorChar + REFINEMENT_DIR
193                         + File.separatorChar + settings.getName() + '.'
194                         + settingsFilenameSuffix);
195             } else {
196                 file = new File(getSettingsDirectory(), path.toString()
197                         + File.separatorChar + settingsFilename + "."
198                         + settingsFilenameSuffix);
199             }
200         }
201         return file;
202     }
203 
204     public final void writeSettingsObject(CrawlerSettings settings) {
205         File filename = settingsToFilename(settings);
206         writeSettingsObject(settings, filename);
207     }
208 
209     /*** Write a CrawlerSettings object to a specified file.
210      *
211      * This method is similar to {@link #writeSettingsObject(CrawlerSettings)}
212      * except that it uses the submitted File object instead of trying to
213      * resolve where the file should be written.
214      *
215      * @param settings the settings object to be serialized.
216      * @param filename the file to which the settings object should be written.
217      */
218     public final void writeSettingsObject(
219             CrawlerSettings settings, File filename) {
220 
221         logger.fine("Writing " + filename.getAbsolutePath());
222         filename.getParentFile().mkdirs();
223 
224         try {
225             long lastSaved = 0L;
226             File backup = null;
227             if (getOrder().getController() != null && filename.exists()) {
228                 // The crawler is running and file exists - make backup first.
229                 String name = filename.getName();
230                 lastSaved = settings.getLastSavedTime().getTime();
231                 name = name.substring(0, name.lastIndexOf('.')) + '_'
232                         + ArchiveUtils.get14DigitDate(lastSaved) + "."
233                         + settingsFilenameSuffix;
234                 backup = new File(filename.getParentFile(), name);
235                 FileUtils.copyFiles(filename, backup);
236             }
237 
238             StreamResult result =
239                 new StreamResult(
240                     new BufferedOutputStream(new FileOutputStream(filename)));
241             Transformer transformer =
242                 TransformerFactory.newInstance().newTransformer();
243             Source source = new CrawlSettingsSAXSource(settings);
244             transformer.transform(source, result);
245 
246             // Hack to get rid of unnesessary backupfiles.
247             // What happens is that the WUI often saves settings files
248             // several times during a settings change. This code removes the
249             // last backup file if its no more than 2 minutes old.
250             if (lastSaved > (System.currentTimeMillis() - 2 * 60 * 1000)) {
251                 backup.delete();
252             }
253         } catch (Exception e) {
254             e.printStackTrace();
255         }
256     }
257 
258     /*** Read the CrawlerSettings object from a specific file.
259      *
260      * @param settings the settings object to be updated with data from the
261      *                 persistent storage.
262      * @param f the file to read from.
263      * @return the updated settings object or null if there was no data for this
264      *         in the persistent storage.
265      */    
266     protected final CrawlerSettings readSettingsObject(CrawlerSettings settings,
267             File f) {
268         CrawlerSettings result = null;
269         try {
270             InputStream is = null;
271             if (!f.exists()) {
272                 // Perhaps the file we're looking for is on the CLASSPATH.
273                 // DON'T look on the CLASSPATH for 'settings.xml' files.  The
274                 // look for 'settings.xml' files happens frequently. Not looking
275                 // on classpath for 'settings.xml' is an optimization based on
276                 // ASSUMPTION that there will never be a 'settings.xml' saved
277                 // on classpath.
278                 if (!f.getName().startsWith(settingsFilename)) {
279                     is = XMLSettingsHandler.class.
280                         getResourceAsStream(f.getPath());
281                 }
282             } else {
283                 is = new FileInputStream(f);
284             }
285             if (is != null) {
286                 XMLReader parser = SAXParserFactory.newInstance()
287                     .newSAXParser().getXMLReader();
288                 InputStream file = new BufferedInputStream(is);
289                 parser.setContentHandler(new CrawlSettingsSAXHandler(settings));
290                 InputSource source = new InputSource(file);
291                 source.setSystemId(f.toURL().toExternalForm());
292                 parser.parse(source);
293                 result = settings;
294             }
295         } catch (SAXParseException e) {
296             logger.warning(e.getMessage() + " in '" + e.getSystemId()
297                 + "', line: " + e.getLineNumber() + ", column: "
298                 + e.getColumnNumber());
299         } catch (SAXException e) {
300             logger.warning(e.getMessage() + ": "
301                 + e.getException().getMessage());
302         } catch (ParserConfigurationException e) {
303             logger.warning(e.getMessage() + ": "
304                 + e.getCause().getMessage());
305         } catch (FactoryConfigurationError e) {
306             logger.warning(e.getMessage() + ": "
307                 + e.getException().getMessage());
308         } catch (IOException e) {
309             logger.warning("Could not access file '"
310                 + f.getAbsolutePath() + "': " + e.getMessage());
311         }
312         return result;
313     }
314 
315     protected final CrawlerSettings readSettingsObject(CrawlerSettings settings) {
316         File filename = settingsToFilename(settings);
317         return readSettingsObject(settings, filename);
318     }
319 
320     /*** Get the <code>File</code> object pointing to the order file.
321      *
322      * @return File object for the order file.
323      */
324     public File getOrderFile() {
325         return orderFile;
326     }
327 
328     /*** Creates a replica of the settings file structure in another directory
329      * (fully recursive, includes all per host settings). The SettingsHandler
330      * will then refer to the new files.
331      *
332      * Observe that this method should only be called after the SettingsHandler
333      * has been initialized.
334      *
335      * @param newOrderFileName where the new order file should be saved.
336      * @param newSettingsDirectory the top level directory of the per host/domain
337      *                          settings files.
338      * @throws IOException
339      */
340     public void copySettings(File newOrderFileName, String newSettingsDirectory)
341       throws IOException {
342         File oldSettingsDirectory = getSettingsDirectory();
343 
344         // Write new orderfile and point the settingshandler to it
345         orderFile = newOrderFileName;
346         try {
347             getOrder().setAttribute(
348                 new Attribute(
349                     CrawlOrder.ATTR_SETTINGS_DIRECTORY, newSettingsDirectory));
350         } catch (Exception e) {
351             throw new IOException("Could not update settings with new location: "
352                 + e.getMessage());
353         }
354         writeSettingsObject(getSettingsObject(null));
355 
356         File newDir = getPathRelativeToWorkingDirectory(newSettingsDirectory);
357 
358         // Copy the per host files if src and dest directories are different.
359         if (oldSettingsDirectory.compareTo(newDir) != 0) {
360             FileUtils.copyFiles(oldSettingsDirectory, newDir);
361         }
362     }
363 
364     /***
365      * Transforms a relative path so that it is relative to the location of the
366      * order file. If an absolute path is given, it will be returned unchanged.<p>
367      * The location of it's order file is always considered as the 'working'
368      * directory for any given settings.
369      * @param path A relative path to a file (or directory)
370      * @return The same path modified so that it is relative to the file level
371      *         location of the order file for the settings handler.
372      */
373     public File getPathRelativeToWorkingDirectory(String path) {
374         File f = new File(path);
375         // If path is not absolute, set f's directory
376         // relative to the path of the order file
377         if (!f.isAbsolute()) {
378             f = new File(this.getOrderFile().getParent(), path);
379         }
380         return f;
381     }
382 
383     public Collection getDomainOverrides(String rootDomain) {
384         File settingsDir = getSettingsDirectory();
385 
386         //Find the right start directory.
387         ArrayList<String> domains = new ArrayList<String>();
388         //First we deconstruct the rootDomain string
389         while(rootDomain != null && rootDomain.length()>0){
390             if(rootDomain.indexOf('.')<0){
391                 // Last level.
392                 domains.add(rootDomain);
393                 break; //We're done.
394             } else {
395                 // Got more then one level left.
396                 domains.add(rootDomain.substring(0,rootDomain.indexOf('.')));
397                 // Strip down rootDomain.
398                 rootDomain = rootDomain.substring(rootDomain.indexOf('.')+1);
399             }
400         }
401         //Build up a proper path
402         //Since the domains are right to left, we start at the end of the array.
403         StringBuffer subDir = new StringBuffer();
404         for(int i=(domains.size()-1) ; i>=0 ; i--){
405             subDir.append(File.separator+domains.get(i));
406         }
407         //Then we move to the approprite directory.
408         settingsDir = new File(settingsDir.getPath()+subDir);
409         TreeSet<String> confirmedSubDomains = new TreeSet<String>();
410         if(settingsDir.exists()){
411             // Found our place! Search through it's subdirs.
412             File[] possibleSubDomains = settingsDir.listFiles();
413             for (int i = 0; i < possibleSubDomains.length; i++) {
414                 if (possibleSubDomains[i].isDirectory()
415                     && isOverride(possibleSubDomains[i])) {
416                     // Found one!
417                     confirmedSubDomains.add(possibleSubDomains[i].getName());
418                 }
419             }
420         }
421         return confirmedSubDomains;
422     }
423 
424     /***
425      * Checks if a file is a a 'per host' override or if it's a directory if it
426      * or it's subdirectories  contains a 'per host' override file.
427      * @param f The file or directory to check
428      * @return True if the file is an override or it's a directory that contains
429      *         such a file.
430      */
431     private boolean isOverride(File f){
432         if(f.isDirectory()){
433             // Have a directory, check it's contents.
434             File[] subs = f.listFiles();
435             for(int i=0 ; i < subs.length ; i++){
436                 if(isOverride(subs[i])){
437                     // Found one. Can stop looking.
438                     return true;
439                 }
440             }
441         } else if (f.getName().equals(
442                 settingsFilename + "." + settingsFilenameSuffix)) {
443             // This is an override file (or sure looks like one in any case).
444             return true;
445         }
446         // Didn't find an override.
447         return false;
448     }
449 
450     /*** Delete a settings object from persistent storage.
451      *
452      * Deletes the file represented by the submitted settings object. All empty
453      * directories that are parents to the files path are also deleted.
454      *
455      * @param settings the settings object to delete.
456      */
457     public void deleteSettingsObject(CrawlerSettings settings) {
458         super.deleteSettingsObject(settings);
459         File settingsDirectory = getSettingsDirectory();
460         File settingsFile = settingsToFilename(settings);
461 
462         settingsFile.delete();
463         settingsFile = settingsFile.getParentFile();
464         while (settingsFile.isDirectory() && settingsFile.list().length == 0
465                 && !settingsFile.equals(settingsDirectory)) {
466             settingsFile.delete();
467             settingsFile = settingsFile.getParentFile();
468         }
469     }
470 
471     /* (non-Javadoc)
472      * @see org.archive.crawler.settings.SettingsHandler#getListOfAllFiles()
473      */
474     public List<String> getListOfAllFiles() {
475         ArrayList<String> list = new ArrayList<String>();
476         // Add CrawlOrder.
477         list.add(getOrderFile().getAbsolutePath());
478         // Iterate through the entire override hierarchy
479         if (getSettingsDirectory().exists()) {
480             recursiveFindFiles(getSettingsDirectory(),list);
481         }
482         // Get files used by settings modules.
483         recursiveFindSecondaryFiles(getOrder(),list);
484         return list;
485     }
486 
487     /***
488      * Add any files being used by any of the Modules making up the settings to
489      * the list.
490      *
491      * @param mbean A ModuleType to interrogate for files. Any child modules
492      *           will be recursively interrogated.
493      * @param list The list to add found files to.
494      */
495     private void recursiveFindSecondaryFiles(ComplexType mbean, 
496             ArrayList<String> list) {
497         MBeanInfo info = mbean.getMBeanInfo();
498         MBeanAttributeInfo[] a = info.getAttributes();
499         // Interrogate the current module
500         if(mbean instanceof ModuleType){
501             ((ModuleType)mbean).listUsedFiles(list);
502         }
503 
504         // Recursively interrogate all sub modules that are of ModuleType
505         for(int n=0; n<a.length; n++) {
506             if(a[n] == null) {
507                 // Error null attribute.
508             } else {
509                 ModuleAttributeInfo att = (ModuleAttributeInfo)a[n];
510                 Object currentAttribute;
511                 try {
512                     currentAttribute = mbean.getAttribute(att.getName());
513                     if(currentAttribute instanceof ComplexType) {
514                         recursiveFindSecondaryFiles((ComplexType)currentAttribute,list);
515                     }
516                 } catch (AttributeNotFoundException e) {
517                     // TODO Auto-generated catch block
518                     e.printStackTrace();
519                 } catch (MBeanException e) {
520                     // TODO Auto-generated catch block
521                     e.printStackTrace();
522                 } catch (ReflectionException e) {
523                     // TODO Auto-generated catch block
524                     e.printStackTrace();
525                 }
526             }
527         }
528     }
529 
530     /***
531      * Starting at the specific directory this method will iterate through all
532      * sub directories and add each file (as absolute name, with path as a
533      * string) to the provided ArrayList. Any file found under the settings
534      * directory with the proper suffix will be considered valid and added to
535      * the list.
536      * @param dir Starting directory
537      * @param list The list to add to
538      */
539     private void recursiveFindFiles(File dir, ArrayList<String> list){
540         File[] subs = dir.listFiles();
541         if (subs != null) {
542             for(int i=0 ; i < subs.length ; i++){
543                 if(subs[i].isDirectory()){
544                     recursiveFindFiles(subs[i],list);
545                 } else {
546                     if(subs[i].getName().endsWith(settingsFilenameSuffix)){
547                         // Add it to list
548                         list.add(subs[i].getAbsolutePath());
549                     }
550                 }
551             }
552         }
553     }
554 }