1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler.settings;
26
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.Attribute;
import javax.management.AttributeNotFoundException;
import javax.management.InvalidAttributeValueException;
import javax.management.MBeanAttributeInfo;
import javax.management.MBeanException;
import javax.management.MBeanInfo;
import javax.management.ReflectionException;
import javax.xml.parsers.FactoryConfigurationError;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.transform.Source;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.stream.StreamResult;

import org.archive.crawler.datamodel.CrawlOrder;
import org.archive.util.ArchiveUtils;
import org.archive.util.FileUtils;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.XMLReader;
62
63 /*** A SettingsHandler which uses XML files as persistent storage.
64 *
65 * @author John Erik Halse
66 */
67 public class XMLSettingsHandler extends SettingsHandler {
68 private static Logger logger =
69 Logger.getLogger(
70 "org.archive.crawler.settings.XMLSettingsHandler");
71
72
73 protected static final String XML_SCHEMA = "heritrix_settings.xsd";
74 protected static final String XML_ROOT_ORDER = "crawl-order";
75 protected static final String XML_ROOT_HOST_SETTINGS = "crawl-settings";
76 protected static final String XML_ROOT_REFINEMENT = "crawl-refinement";
77 protected static final String XML_ELEMENT_CONTROLLER = "controller";
78 protected static final String XML_ELEMENT_META = "meta";
79 protected static final String XML_ELEMENT_NAME = "name";
80 protected static final String XML_ELEMENT_DESCRIPTION = "description";
81 protected static final String XML_ELEMENT_OPERATOR = "operator";
82 protected static final String XML_ELEMENT_ORGANIZATION = "organization";
83 protected static final String XML_ELEMENT_AUDIENCE = "audience";
84 protected static final String XML_ELEMENT_DATE = "date";
85 protected static final String XML_ELEMENT_REFINEMENTLIST = "refinement-list";
86 protected static final String XML_ELEMENT_REFINEMENT = "refinement";
87 protected static final String XML_ELEMENT_REFERENCE = "reference";
88 protected static final String XML_ELEMENT_LIMITS = "limits";
89 protected static final String XML_ELEMENT_TIMESPAN = "timespan";
90 protected static final String XML_ELEMENT_PORTNUMBER = "portnumber";
91 protected static final String XML_ELEMENT_URIMATCHES = "uri-matches";
92 protected static final String XML_ELEMENT_CONTENTMATCHES = "content-type-matches";
93 protected static final String XML_ELEMENT_OBJECT = "object";
94 protected static final String XML_ELEMENT_NEW_OBJECT = "newObject";
95 protected static final String XML_ATTRIBUTE_NAME = "name";
96 protected static final String XML_ATTRIBUTE_CLASS = "class";
97 protected static final String XML_ATTRIBUTE_FROM = "from";
98 protected static final String XML_ATTRIBUTE_TO = "to";
99
100 private File orderFile;
101 private final static String settingsFilename = "settings";
102 private final static String settingsFilenameSuffix = "xml";
103 private final static String REFINEMENT_DIR = "_refinements";
104
105 /*** Create a new XMLSettingsHandler object.
106 *
107 * @param orderFile where the order file is located.
108 * @throws InvalidAttributeValueException
109 */
110 public XMLSettingsHandler(File orderFile)
111 throws InvalidAttributeValueException {
112 super();
113 this.orderFile = orderFile.getAbsoluteFile();
114 }
115
116 /*** Initialize the SettingsHandler.
117 *
118 * This method builds the settings data structure and initializes it with
119 * settings from the order file given to the constructor.
120 */
121 public void initialize() {
122 super.initialize();
123 }
124
125 /***
126 * Initialize the SettingsHandler from a source.
127 *
128 * This method builds the settings data structure and initializes it with
129 * settings from the order file given as a parameter. The intended use is
130 * to create a new order file based on a default (template) order file.
131 *
132 * @param source the order file to initialize from.
133 */
134 public void initialize(File source) {
135 File tmpOrderFile = orderFile;
136 orderFile = source.getAbsoluteFile();
137 this.initialize();
138 orderFile = tmpOrderFile;
139 }
140
141 private File getSettingsDirectory() {
142 String settingsDirectoryName = null;
143 try {
144 settingsDirectoryName =
145 (String) getOrder().getAttribute(
146 CrawlOrder.ATTR_SETTINGS_DIRECTORY);
147 } catch (AttributeNotFoundException e) {
148 e.printStackTrace();
149 } catch (MBeanException e) {
150 e.printStackTrace();
151 } catch (ReflectionException e) {
152 e.printStackTrace();
153 }
154
155 return getPathRelativeToWorkingDirectory(settingsDirectoryName);
156 }
157
158 /*** Resolves the filename for a settings object into a file path.
159 *
160 * It will also create the directory structure leading to this file
161 * if it doesn't exist.
162 *
163 * @param settings the settings object to get file path for.
164 * @return the file path for this settings object.
165 */
166 protected final File settingsToFilename(CrawlerSettings settings) {
167 File file;
168
169 if (settings.getScope() == null || settings.getScope().equals("")) {
170 if (settings.isRefinement()) {
171 file = new File(getSettingsDirectory(), File.separatorChar
172 + REFINEMENT_DIR + File.separatorChar
173 + settings.getName() + '.' + settingsFilenameSuffix);
174 } else {
175 file = orderFile;
176 }
177 } else {
178 String elements[] = settings.getScope().split("//.");
179 if (elements.length == 0) {
180 return orderFile;
181 }
182
183 StringBuffer path = new StringBuffer();
184 for (int i = elements.length - 1; i > 0; i--) {
185 path.append(elements[i]);
186 path.append(File.separatorChar);
187 }
188 path.append(elements[0]);
189
190 if (settings.isRefinement()) {
191 file = new File(getSettingsDirectory(), path.toString()
192 + File.separatorChar + REFINEMENT_DIR
193 + File.separatorChar + settings.getName() + '.'
194 + settingsFilenameSuffix);
195 } else {
196 file = new File(getSettingsDirectory(), path.toString()
197 + File.separatorChar + settingsFilename + "."
198 + settingsFilenameSuffix);
199 }
200 }
201 return file;
202 }
203
204 public final void writeSettingsObject(CrawlerSettings settings) {
205 File filename = settingsToFilename(settings);
206 writeSettingsObject(settings, filename);
207 }
208
209 /*** Write a CrawlerSettings object to a specified file.
210 *
211 * This method is similar to {@link #writeSettingsObject(CrawlerSettings)}
212 * except that it uses the submitted File object instead of trying to
213 * resolve where the file should be written.
214 *
215 * @param settings the settings object to be serialized.
216 * @param filename the file to which the settings object should be written.
217 */
218 public final void writeSettingsObject(
219 CrawlerSettings settings, File filename) {
220
221 logger.fine("Writing " + filename.getAbsolutePath());
222 filename.getParentFile().mkdirs();
223
224 try {
225 long lastSaved = 0L;
226 File backup = null;
227 if (getOrder().getController() != null && filename.exists()) {
228
229 String name = filename.getName();
230 lastSaved = settings.getLastSavedTime().getTime();
231 name = name.substring(0, name.lastIndexOf('.')) + '_'
232 + ArchiveUtils.get14DigitDate(lastSaved) + "."
233 + settingsFilenameSuffix;
234 backup = new File(filename.getParentFile(), name);
235 FileUtils.copyFiles(filename, backup);
236 }
237
238 StreamResult result =
239 new StreamResult(
240 new BufferedOutputStream(new FileOutputStream(filename)));
241 Transformer transformer =
242 TransformerFactory.newInstance().newTransformer();
243 Source source = new CrawlSettingsSAXSource(settings);
244 transformer.transform(source, result);
245
246
247
248
249
250 if (lastSaved > (System.currentTimeMillis() - 2 * 60 * 1000)) {
251 backup.delete();
252 }
253 } catch (Exception e) {
254 e.printStackTrace();
255 }
256 }
257
258 /*** Read the CrawlerSettings object from a specific file.
259 *
260 * @param settings the settings object to be updated with data from the
261 * persistent storage.
262 * @param f the file to read from.
263 * @return the updated settings object or null if there was no data for this
264 * in the persistent storage.
265 */
266 protected final CrawlerSettings readSettingsObject(CrawlerSettings settings,
267 File f) {
268 CrawlerSettings result = null;
269 try {
270 InputStream is = null;
271 if (!f.exists()) {
272
273
274
275
276
277
278 if (!f.getName().startsWith(settingsFilename)) {
279 is = XMLSettingsHandler.class.
280 getResourceAsStream(f.getPath());
281 }
282 } else {
283 is = new FileInputStream(f);
284 }
285 if (is != null) {
286 XMLReader parser = SAXParserFactory.newInstance()
287 .newSAXParser().getXMLReader();
288 InputStream file = new BufferedInputStream(is);
289 parser.setContentHandler(new CrawlSettingsSAXHandler(settings));
290 InputSource source = new InputSource(file);
291 source.setSystemId(f.toURL().toExternalForm());
292 parser.parse(source);
293 result = settings;
294 }
295 } catch (SAXParseException e) {
296 logger.warning(e.getMessage() + " in '" + e.getSystemId()
297 + "', line: " + e.getLineNumber() + ", column: "
298 + e.getColumnNumber());
299 } catch (SAXException e) {
300 logger.warning(e.getMessage() + ": "
301 + e.getException().getMessage());
302 } catch (ParserConfigurationException e) {
303 logger.warning(e.getMessage() + ": "
304 + e.getCause().getMessage());
305 } catch (FactoryConfigurationError e) {
306 logger.warning(e.getMessage() + ": "
307 + e.getException().getMessage());
308 } catch (IOException e) {
309 logger.warning("Could not access file '"
310 + f.getAbsolutePath() + "': " + e.getMessage());
311 }
312 return result;
313 }
314
315 protected final CrawlerSettings readSettingsObject(CrawlerSettings settings) {
316 File filename = settingsToFilename(settings);
317 return readSettingsObject(settings, filename);
318 }
319
320 /*** Get the <code>File</code> object pointing to the order file.
321 *
322 * @return File object for the order file.
323 */
324 public File getOrderFile() {
325 return orderFile;
326 }
327
328 /*** Creates a replica of the settings file structure in another directory
329 * (fully recursive, includes all per host settings). The SettingsHandler
330 * will then refer to the new files.
331 *
332 * Observe that this method should only be called after the SettingsHandler
333 * has been initialized.
334 *
335 * @param newOrderFileName where the new order file should be saved.
336 * @param newSettingsDirectory the top level directory of the per host/domain
337 * settings files.
338 * @throws IOException
339 */
340 public void copySettings(File newOrderFileName, String newSettingsDirectory)
341 throws IOException {
342 File oldSettingsDirectory = getSettingsDirectory();
343
344
345 orderFile = newOrderFileName;
346 try {
347 getOrder().setAttribute(
348 new Attribute(
349 CrawlOrder.ATTR_SETTINGS_DIRECTORY, newSettingsDirectory));
350 } catch (Exception e) {
351 throw new IOException("Could not update settings with new location: "
352 + e.getMessage());
353 }
354 writeSettingsObject(getSettingsObject(null));
355
356 File newDir = getPathRelativeToWorkingDirectory(newSettingsDirectory);
357
358
359 if (oldSettingsDirectory.compareTo(newDir) != 0) {
360 FileUtils.copyFiles(oldSettingsDirectory, newDir);
361 }
362 }
363
364 /***
365 * Transforms a relative path so that it is relative to the location of the
366 * order file. If an absolute path is given, it will be returned unchanged.<p>
367 * The location of it's order file is always considered as the 'working'
368 * directory for any given settings.
369 * @param path A relative path to a file (or directory)
370 * @return The same path modified so that it is relative to the file level
371 * location of the order file for the settings handler.
372 */
373 public File getPathRelativeToWorkingDirectory(String path) {
374 File f = new File(path);
375
376
377 if (!f.isAbsolute()) {
378 f = new File(this.getOrderFile().getParent(), path);
379 }
380 return f;
381 }
382
383 public Collection getDomainOverrides(String rootDomain) {
384 File settingsDir = getSettingsDirectory();
385
386
387 ArrayList<String> domains = new ArrayList<String>();
388
389 while(rootDomain != null && rootDomain.length()>0){
390 if(rootDomain.indexOf('.')<0){
391
392 domains.add(rootDomain);
393 break;
394 } else {
395
396 domains.add(rootDomain.substring(0,rootDomain.indexOf('.')));
397
398 rootDomain = rootDomain.substring(rootDomain.indexOf('.')+1);
399 }
400 }
401
402
403 StringBuffer subDir = new StringBuffer();
404 for(int i=(domains.size()-1) ; i>=0 ; i--){
405 subDir.append(File.separator+domains.get(i));
406 }
407
408 settingsDir = new File(settingsDir.getPath()+subDir);
409 TreeSet<String> confirmedSubDomains = new TreeSet<String>();
410 if(settingsDir.exists()){
411
412 File[] possibleSubDomains = settingsDir.listFiles();
413 for (int i = 0; i < possibleSubDomains.length; i++) {
414 if (possibleSubDomains[i].isDirectory()
415 && isOverride(possibleSubDomains[i])) {
416
417 confirmedSubDomains.add(possibleSubDomains[i].getName());
418 }
419 }
420 }
421 return confirmedSubDomains;
422 }
423
424 /***
425 * Checks if a file is a a 'per host' override or if it's a directory if it
426 * or it's subdirectories contains a 'per host' override file.
427 * @param f The file or directory to check
428 * @return True if the file is an override or it's a directory that contains
429 * such a file.
430 */
431 private boolean isOverride(File f){
432 if(f.isDirectory()){
433
434 File[] subs = f.listFiles();
435 for(int i=0 ; i < subs.length ; i++){
436 if(isOverride(subs[i])){
437
438 return true;
439 }
440 }
441 } else if (f.getName().equals(
442 settingsFilename + "." + settingsFilenameSuffix)) {
443
444 return true;
445 }
446
447 return false;
448 }
449
450 /*** Delete a settings object from persistent storage.
451 *
452 * Deletes the file represented by the submitted settings object. All empty
453 * directories that are parents to the files path are also deleted.
454 *
455 * @param settings the settings object to delete.
456 */
457 public void deleteSettingsObject(CrawlerSettings settings) {
458 super.deleteSettingsObject(settings);
459 File settingsDirectory = getSettingsDirectory();
460 File settingsFile = settingsToFilename(settings);
461
462 settingsFile.delete();
463 settingsFile = settingsFile.getParentFile();
464 while (settingsFile.isDirectory() && settingsFile.list().length == 0
465 && !settingsFile.equals(settingsDirectory)) {
466 settingsFile.delete();
467 settingsFile = settingsFile.getParentFile();
468 }
469 }
470
471
472
473
474 public List<String> getListOfAllFiles() {
475 ArrayList<String> list = new ArrayList<String>();
476
477 list.add(getOrderFile().getAbsolutePath());
478
479 if (getSettingsDirectory().exists()) {
480 recursiveFindFiles(getSettingsDirectory(),list);
481 }
482
483 recursiveFindSecondaryFiles(getOrder(),list);
484 return list;
485 }
486
487 /***
488 * Add any files being used by any of the Modules making up the settings to
489 * the list.
490 *
491 * @param mbean A ModuleType to interrogate for files. Any child modules
492 * will be recursively interrogated.
493 * @param list The list to add found files to.
494 */
495 private void recursiveFindSecondaryFiles(ComplexType mbean,
496 ArrayList<String> list) {
497 MBeanInfo info = mbean.getMBeanInfo();
498 MBeanAttributeInfo[] a = info.getAttributes();
499
500 if(mbean instanceof ModuleType){
501 ((ModuleType)mbean).listUsedFiles(list);
502 }
503
504
505 for(int n=0; n<a.length; n++) {
506 if(a[n] == null) {
507
508 } else {
509 ModuleAttributeInfo att = (ModuleAttributeInfo)a[n];
510 Object currentAttribute;
511 try {
512 currentAttribute = mbean.getAttribute(att.getName());
513 if(currentAttribute instanceof ComplexType) {
514 recursiveFindSecondaryFiles((ComplexType)currentAttribute,list);
515 }
516 } catch (AttributeNotFoundException e) {
517
518 e.printStackTrace();
519 } catch (MBeanException e) {
520
521 e.printStackTrace();
522 } catch (ReflectionException e) {
523
524 e.printStackTrace();
525 }
526 }
527 }
528 }
529
530 /***
531 * Starting at the specific directory this method will iterate through all
532 * sub directories and add each file (as absolute name, with path as a
533 * string) to the provided ArrayList. Any file found under the settings
534 * directory with the proper suffix will be considered valid and added to
535 * the list.
536 * @param dir Starting directory
537 * @param list The list to add to
538 */
539 private void recursiveFindFiles(File dir, ArrayList<String> list){
540 File[] subs = dir.listFiles();
541 if (subs != null) {
542 for(int i=0 ; i < subs.length ; i++){
543 if(subs[i].isDirectory()){
544 recursiveFindFiles(subs[i],list);
545 } else {
546 if(subs[i].getName().endsWith(settingsFilenameSuffix)){
547
548 list.add(subs[i].getAbsolutePath());
549 }
550 }
551 }
552 }
553 }
554 }