/*
 * CrawlOrder
 *
 * $Header$
 *
 * Created on May 15, 2003
 *
 * Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */

package org.archive.crawler.datamodel;

import java.io.File;
import java.io.Serializable;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;

import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.CrawlScope;
import org.archive.crawler.framework.Frontier;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.settings.MapType;
import org.archive.crawler.settings.ModuleType;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
import org.archive.crawler.url.canonicalize.BaseRule;

/**
 * Represents the 'root' of the settings hierarchy. Contains those settings
 * that do not belong to any specific module, but rather relate to the crawl
 * as a whole (much of this is used by the CrawlController directly or
 * indirectly).
 *
 * @see org.archive.crawler.settings.ModuleType
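 *
 * <p>A minimal usage sketch (how such an instance is obtained from the
 * settings framework is omitted here):
 * <pre>
 * CrawlOrder order = ...;                // root of the settings hierarchy
 * int maxToes = order.getMaxToes();      // 'max-toe-threads', default 100
 * String ua = order.getUserAgent(null);  // global 'user-agent' header value
 * </pre>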
 */
public class CrawlOrder extends ModuleType implements Serializable {

    private static final long serialVersionUID = -6715840285961511669L;

    private static Logger logger =
        Logger.getLogger("org.archive.crawler.datamodel.CrawlOrder");

    public static final String ATTR_NAME = "crawl-order";
    public static final String ATTR_SETTINGS_DIRECTORY = "settings-directory";
    public static final String ATTR_DISK_PATH = "disk-path";
    public static final String ATTR_LOGS_PATH = "logs-path";
    public static final String ATTR_CHECKPOINTS_PATH = "checkpoints-path";
    public static final String ATTR_STATE_PATH = "state-path";
    public static final String ATTR_SCRATCH_PATH = "scratch-path";
    public static final String ATTR_RECOVER_PATH = "recover-path";
    public static final String ATTR_RECOVER_RETAIN_FAILURES =
        "recover-retain-failures";
    public static final String ATTR_RECOVER_SCOPE_INCLUDES =
        "recover-scope-includes";
    public static final String ATTR_RECOVER_SCOPE_ENQUEUES =
        "recover-scope-enqueues";
    public static final String ATTR_MAX_BYTES_DOWNLOAD = "max-bytes-download";
    public static final String ATTR_MAX_DOCUMENT_DOWNLOAD =
        "max-document-download";
    public static final String ATTR_MAX_TIME_SEC = "max-time-sec";
    public static final String ATTR_MAX_TOE_THREADS = "max-toe-threads";
    public static final String ATTR_HTTP_HEADERS = "http-headers";
    public static final String ATTR_USER_AGENT = "user-agent";
    public static final String ATTR_FROM = "from";
    public static final String ATTR_PRE_FETCH_PROCESSORS =
        "pre-fetch-processors";
    public static final String ATTR_FETCH_PROCESSORS = "fetch-processors";
    public static final String ATTR_EXTRACT_PROCESSORS = "extract-processors";
    public static final String ATTR_WRITE_PROCESSORS = "write-processors";
    public static final String ATTR_POST_PROCESSORS = "post-processors";
    public static final String ATTR_LOGGERS = "loggers";
    public static final String ATTR_RULES = "uri-canonicalization-rules";
    public static final String ATTR_RECORDER_OUT_BUFFER =
        "recorder-out-buffer-bytes";
    public static final String ATTR_RECORDER_IN_BUFFER =
        "recorder-in-buffer-bytes";

    /** Percentage of heap to allocate to bdb cache. */
    public static final String ATTR_BDB_CACHE_PERCENT =
        "bdb-cache-percent";

    /**
     * When checkpointing, copy the bdb logs.
     * Default is true. If false, we do not copy logs on checkpoint AND
     * we tell bdbje never to delete log files; instead it renames
     * files-to-delete with a '.del' extension. The assumption is that when
     * this setting is false, an external process is managing the removal
     * of bdbje log files and that, come time to recover from a checkpoint,
     * the files that comprise a checkpoint are manually assembled.
     */
    public static final String ATTR_CHECKPOINT_COPY_BDBJE_LOGS =
        "checkpoint-copy-bdbje-logs";
    public static final Boolean DEFAULT_CHECKPOINT_COPY_BDBJE_LOGS =
        Boolean.TRUE;

    /**
     * Default bdb cache percentage.
     */
    private final static Integer DEFAULT_BDB_CACHE_PERCENT = new Integer(0);

    private transient MapType httpHeaders;
    private transient MapType loggers;

    private transient CrawlController controller;

    /**
     * Regex for acceptable user-agent format.
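     * E.g. "Mozilla/5.0 (compatible; heritrix/1.4 +http://archive.org)"
     * is acceptable (the version number here is illustrative).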
     */
    private static String ACCEPTABLE_USER_AGENT =
        "\\S+.*\\(.*\\+http(s)?://\\S+\\.\\S+.*\\).*";

    /**
     * Regex for acceptable from address.
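     * E.g. "webmaster@loc.gov" is acceptable.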
     */
    private static String ACCEPTABLE_FROM = "\\S+@\\S+\\.\\S+";

    /** Construct a CrawlOrder.
     */
    public CrawlOrder() {
        super(ATTR_NAME, "Heritrix crawl order. This forms the root of " +
                "the settings framework.");
        Type e;

        e = addElementToDefinition(new SimpleType(ATTR_SETTINGS_DIRECTORY,
                "Directory where override settings are kept. The settings " +
                "for many modules can be overridden based on the domain or " +
                "subdomain of the URI being processed. This setting specifies" +
                " a file-level directory to store those settings. The path" +
                " is relative to 'disk-path' unless" +
                " an absolute path is provided.", "settings"));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_DISK_PATH,
                "Directory where logs, arcs and other runtime files will " +
                "be kept. If this path is a relative path, it will be " +
                "relative to the crawl order.", ""));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_LOGS_PATH,
                "Directory where crawler log files will be kept. If this " +
                "path is a relative path, it will be relative to the " +
                "'disk-path'.", "logs"));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_CHECKPOINTS_PATH,
                "Directory where crawler checkpoint files will be kept. " +
                "If this path is a relative path, it will be relative to " +
                "the 'disk-path'.", "checkpoints"));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_STATE_PATH,
                "Directory where crawler-state files will be kept. If this " +
                "path is a relative path, it will be relative to the " +
                "'disk-path'.", "state"));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_SCRATCH_PATH,
                "Directory where discardable temporary files will be kept. " +
                "If this path is a relative path, it will be relative to " +
                "the 'disk-path'.", "scratch"));
        e.setOverrideable(false);
        e.setExpertSetting(true);

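        // Illustrative example (hypothetical values): with disk-path
        // '/crawls/job1' and logs-path 'logs', crawler logs end up under
        // '/crawls/job1/logs'.
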
        e = addElementToDefinition(new SimpleType(ATTR_MAX_BYTES_DOWNLOAD,
                "Maximum number of bytes to download. Once this number is" +
                " exceeded the crawler will stop. " +
                "A value of zero means no upper limit.", new Long(0)));
        e.setOverrideable(false);

        e = addElementToDefinition(new SimpleType(ATTR_MAX_DOCUMENT_DOWNLOAD,
                "Maximum number of documents to download. Once this number" +
                " is exceeded the crawler will stop. " +
                "A value of zero means no upper limit.", new Long(0)));
        e.setOverrideable(false);

        e = addElementToDefinition(new SimpleType(ATTR_MAX_TIME_SEC,
                "Maximum amount of time to crawl (in seconds). Once this" +
                " much time has elapsed the crawler will stop. A value of" +
                " zero means no upper limit.",
                new Long(0)));
        e.setOverrideable(false);

        e = addElementToDefinition(new SimpleType(ATTR_MAX_TOE_THREADS,
                "Maximum number of threads processing URIs at the same time.",
                new Integer(100)));
        e.setOverrideable(false);

        e = addElementToDefinition(new SimpleType(ATTR_RECORDER_OUT_BUFFER,
                "Size in bytes of in-memory buffer to record outbound " +
                "traffic. One such buffer is reserved for every ToeThread.",
                new Integer(4096)));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_RECORDER_IN_BUFFER,
                "Size in bytes of in-memory buffer to record inbound " +
                "traffic. One such buffer is reserved for every ToeThread.",
                new Integer(65536)));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_BDB_CACHE_PERCENT,
                "Percentage of heap to allocate to BerkeleyDB JE cache. " +
                "Default of zero means no preference (accept BDB's default, " +
                "usually 60%, or the je.maxMemoryPercent property value).",
                DEFAULT_BDB_CACHE_PERCENT));
        e.setExpertSetting(true);
        e.setOverrideable(false);

        addElementToDefinition(new CrawlScope());

        httpHeaders = (MapType) addElementToDefinition(new MapType(
                ATTR_HTTP_HEADERS, "HTTP headers. Information that will " +
                        "be used when constructing the HTTP headers of " +
                        "the crawler's HTTP requests."));

        e = httpHeaders.addElementToDefinition(new SimpleType(ATTR_USER_AGENT,
                "User agent to act as. Field must contain a valid URL " +
                "that links to the website of the person or organization " +
                "running the crawl. Replace 'PROJECT_URL_HERE' in the " +
                "initial template. E.g., if the organization " +
                "is the Library of Congress, a valid user agent would be: " +
                "'Mozilla/5.0 (compatible; loc-crawler/0.11.0 " +
                "+http://loc.gov)'. " +
                "Note, you must preserve the '+' before the 'http'.",
          "Mozilla/5.0 (compatible; heritrix/@VERSION@ +PROJECT_URL_HERE)"));

        e = httpHeaders.addElementToDefinition(new SimpleType(ATTR_FROM,
                "Contact information. This field must contain a valid " +
                "e-mail address for the person or organization responsible " +
                "for this crawl: e.g. 'webmaster@loc.gov'.",
                "CONTACT_EMAIL_ADDRESS_HERE"));

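        // These two values surface in each HTTP request as the standard
        // 'User-Agent' and 'From' request headers.
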
        addElementToDefinition(new RobotsHonoringPolicy());

        e = addElementToDefinition(new ModuleType(
                Frontier.ATTR_NAME, "Frontier"));
        e.setLegalValueType(Frontier.class);

        e = (MapType) addElementToDefinition(new MapType(ATTR_RULES,
            "Ordered list of URL canonicalization rules. " +
            "Rules are applied in the order listed, from top to bottom.",
            BaseRule.class));
        e.setOverrideable(true);
        e.setExpertSetting(true);

        e = addElementToDefinition(new MapType(
                ATTR_PRE_FETCH_PROCESSORS, "Processors to run prior to" +
                        " fetching anything from the network.",
                        Processor.class));
        e.setOverrideable(false);

        e = addElementToDefinition(new MapType(
                ATTR_FETCH_PROCESSORS, "Processors that fetch documents.",
                Processor.class));
        e.setOverrideable(false);

        e = addElementToDefinition(new MapType(
                ATTR_EXTRACT_PROCESSORS, "Processors that extract new URIs" +
                        " from fetched documents.", Processor.class));
        e.setOverrideable(false);

        e = addElementToDefinition(new MapType(
                ATTR_WRITE_PROCESSORS, "Processors that write documents" +
                        " to archives.", Processor.class));
        e.setOverrideable(false);

        e = addElementToDefinition(new MapType(
                ATTR_POST_PROCESSORS, "Processors that do cleanup and feed" +
                        " the frontier with new URIs.", Processor.class));
        e.setOverrideable(false);

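        // URIs pass through the processor chains in the order defined
        // above: pre-fetch, fetch, extract, write, post-processing.
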
        loggers = (MapType) addElementToDefinition(new MapType(ATTR_LOGGERS,
                "Statistics tracking modules. Any number of specialized " +
                "statistics trackers that monitor a crawl and write logs, " +
                "reports and/or provide information to the user interface."));

        e = addElementToDefinition(new SimpleType(ATTR_RECOVER_PATH,
                "Optional. Points at a recover log (or recover.gz log) OR " +
                "the checkpoint directory to use when recovering a crawl.",
                ""));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(
            ATTR_CHECKPOINT_COPY_BDBJE_LOGS,
            "When true, on a checkpoint, we copy off the bdbje log files to " +
            "the checkpoint directory. To recover a checkpoint, just " +
            "set the " + ATTR_RECOVER_PATH + " to point at the checkpoint " +
            "directory to recover. This is the default setting. " +
            "But if the crawl is large, " +
            "copying bdbje log files can take tens of minutes and even " +
            "upwards of an hour (copying bdbje log files will consume the " +
            "bulk of the time spent checkpointing). If this setting is " +
            "false, we do NOT copy bdbje logs on checkpoint AND we set " +
            "bdbje to NEVER delete log files (instead we have it rename " +
            "files-to-delete with a '.del' extension). The assumption is " +
            "that when this setting is false, an external process is " +
            "managing the removal of bdbje log files and that, come time " +
            "to recover from a checkpoint, the files that comprise a " +
            "checkpoint are manually assembled. This is an expert setting.",
            DEFAULT_CHECKPOINT_COPY_BDBJE_LOGS));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_RECOVER_RETAIN_FAILURES,
                "When recovering via the recover.log, should failures " +
                "in the log be retained in the recovered crawl, " +
                "preventing the corresponding URIs from being retried. " +
                "Default is false, meaning failures are forgotten, and " +
                "the corresponding URIs will be retried in the recovered " +
                "crawl.", Boolean.FALSE));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_RECOVER_SCOPE_INCLUDES,
                "When recovering via the recover.log, should URIs " +
                "be checked against scope before being considered included " +
                "during the first phase, which primes the already-seen " +
                "set. " +
                "Default is true, meaning scope changes in a recovered " +
                "crawl can slim the already-seen size.", Boolean.TRUE));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_RECOVER_SCOPE_ENQUEUES,
                "When recovering via the recover.log, should URIs " +
                "be checked against scope before being re-enqueued during " +
                "the second phase, which fills the to-be-fetched queues. " +
                "Default is true, meaning scope changes in a recovered " +
                "crawl can slim the pending queues.", Boolean.TRUE));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(
           new CredentialStore(CredentialStore.ATTR_NAME));
        e.setOverrideable(true);
        e.setExpertSetting(true);
    }

    /**
     * @param curi CrawlURI context; null returns the global setting.
     * @return user-agent header value to use
     */
    public String getUserAgent(CrawlURI curi) {
        return (String) httpHeaders.getUncheckedAttribute(curi,
            ATTR_USER_AGENT);
    }

    /**
     * @param curi CrawlURI context; null returns the global setting.
     * @return from header value to use
     */
    public String getFrom(CrawlURI curi) {
        String res = null;
        try {
            res = (String) httpHeaders.getAttribute(ATTR_FROM, curi);
        } catch (AttributeNotFoundException e) {
            logger.severe(e.getMessage());
        }
        return res;
    }

    /**
     * Returns the configured maximum number of toe threads.
     * @return Maximum number of toe threads
     */
    public int getMaxToes() {
        Integer res = null;
        try {
            res = (Integer) getAttribute(null, ATTR_MAX_TOE_THREADS);
        } catch (AttributeNotFoundException e) {
            logger.severe(e.getMessage());
        }
        return res.intValue();
    }

    /**
     * Gets the RobotsHonoringPolicy object from the order file.
     *
     * @return the RobotsHonoringPolicy
     */
    public RobotsHonoringPolicy getRobotsHonoringPolicy() {
        try {
            return (RobotsHonoringPolicy) getAttribute(null,
                RobotsHonoringPolicy.ATTR_NAME);
        } catch (AttributeNotFoundException e) {
            logger.severe(e.getMessage());
            return null;
        }
    }

    /** Get the name of the order file.
     *
     * @return the name of the order file.
     */
    public String getCrawlOrderName() {
        return getSettingsHandler().getSettingsObject(null).getName();
    }

    /**
     * @return The crawl controller.
     */
    public CrawlController getController() {
        return controller;
    }

    /**
     * @param controller The crawl controller driving this crawl.
     */
    public void setController(CrawlController controller) {
        this.controller = controller;
    }

    /**
     * Returns the map of StatisticsTracking modules that are included in
     * the configuration this instance represents.
     * @return Map of the StatisticsTracking modules
     */
    public MapType getLoggers() {
        return loggers;
    }

    /**
     * Checks if the user-agent and from fields are set 'correctly' in
     * this crawl order.
     *
     * @throws FatalConfigurationException
     */
    public void checkUserAgentAndFrom() throws FatalConfigurationException {
        // Don't start the crawl if they're using the default user-agent.
        String userAgent = this.getUserAgent(null);
        String from = this.getFrom(null);
        if (!(userAgent.matches(ACCEPTABLE_USER_AGENT)
            && from.matches(ACCEPTABLE_FROM))) {
            throw new FatalConfigurationException("unacceptable user-agent" +
                    " or from (re-edit your order file).");
        }
    }

    /**
     * @return Checkpoint directory.
     */
    public File getCheckpointsDirectory() {
        try {
            return getDirectoryRelativeToDiskPath((String) getAttribute(null,
                    CrawlOrder.ATTR_CHECKPOINTS_PATH));
        } catch (AttributeNotFoundException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
            return null;
        }
    }

    private File getDirectoryRelativeToDiskPath(String subpath) {
        File disk;
        try {
            disk = getSettingsHandler().getPathRelativeToWorkingDirectory(
                    (String) getAttribute(null, CrawlOrder.ATTR_DISK_PATH));
            return new File(disk, subpath);
        } catch (AttributeNotFoundException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Return the full path to the directory named by <code>key</code>
     * in settings.
     * If the directory does not exist, it and all intermediary directories
     * will be created.
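     * <p>For example, a sketch of a lookup by one of the path attributes
     * defined above (the variable name is illustrative):
     * <pre>
     * File logsDir = order.getSettingsDir(CrawlOrder.ATTR_LOGS_PATH);
     * </pre>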
     * @param key Attribute key naming a directory path in settings.
     * @return Full path to the directory named by <code>key</code>.
     * @throws AttributeNotFoundException
     */
    public File getSettingsDir(String key)
    throws AttributeNotFoundException {
        String path = (String)getAttribute(null, key);
        File f = new File(path);
        if (!f.isAbsolute()) {
            f = getDirectoryRelativeToDiskPath(path);
        }
        if (!f.exists()) {
            f.mkdirs();
        }
        return f;
    }
}