package org.archive.crawler.datamodel;

import java.io.File;
import java.io.Serializable;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;

import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.CrawlScope;
import org.archive.crawler.framework.Frontier;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.settings.MapType;
import org.archive.crawler.settings.ModuleType;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
import org.archive.crawler.url.canonicalize.BaseRule;

/**
 * Represents the 'root' of the settings hierarchy. Contains those settings that
 * do not belong to any specific module, but rather relate to the crawl as a
 * whole (much of this is used by the CrawlController directly or indirectly).
 *
 * @see org.archive.crawler.settings.ModuleType
 */
public class CrawlOrder extends ModuleType implements Serializable {

    private static final long serialVersionUID = -6715840285961511669L;

    private static Logger logger =
        Logger.getLogger("org.archive.crawler.datamodel.CrawlOrder");

    public static final String ATTR_NAME = "crawl-order";
    public static final String ATTR_SETTINGS_DIRECTORY = "settings-directory";
    public static final String ATTR_DISK_PATH = "disk-path";
    public static final String ATTR_LOGS_PATH = "logs-path";
    public static final String ATTR_CHECKPOINTS_PATH = "checkpoints-path";
    public static final String ATTR_STATE_PATH = "state-path";
    public static final String ATTR_SCRATCH_PATH = "scratch-path";
    public static final String ATTR_RECOVER_PATH = "recover-path";
    public static final String ATTR_RECOVER_RETAIN_FAILURES =
        "recover-retain-failures";
    public static final String ATTR_RECOVER_SCOPE_INCLUDES =
        "recover-scope-includes";
    public static final String ATTR_RECOVER_SCOPE_ENQUEUES =
        "recover-scope-enqueues";
    public static final String ATTR_MAX_BYTES_DOWNLOAD = "max-bytes-download";
    public static final String ATTR_MAX_DOCUMENT_DOWNLOAD =
        "max-document-download";
    public static final String ATTR_MAX_TIME_SEC = "max-time-sec";
    public static final String ATTR_MAX_TOE_THREADS = "max-toe-threads";
    public static final String ATTR_HTTP_HEADERS = "http-headers";
    public static final String ATTR_USER_AGENT = "user-agent";
    public static final String ATTR_FROM = "from";
    public static final String ATTR_PRE_FETCH_PROCESSORS =
        "pre-fetch-processors";
    public static final String ATTR_FETCH_PROCESSORS = "fetch-processors";
    public static final String ATTR_EXTRACT_PROCESSORS = "extract-processors";
    public static final String ATTR_WRITE_PROCESSORS = "write-processors";
    public static final String ATTR_POST_PROCESSORS = "post-processors";
    public static final String ATTR_LOGGERS = "loggers";
    public static final String ATTR_RULES = "uri-canonicalization-rules";
    public static final String ATTR_RECORDER_OUT_BUFFER =
        "recorder-out-buffer-bytes";
    public static final String ATTR_RECORDER_IN_BUFFER =
        "recorder-in-buffer-bytes";

    /** Percentage of heap to allocate to bdb cache */
    public static final String ATTR_BDB_CACHE_PERCENT =
        "bdb-cache-percent";

    /**
     * When checkpointing, copy the bdb logs.
     * Default is true. If false, then we do not copy logs on checkpoint AND
     * we tell bdbje never to delete log files; instead it renames
     * files-to-delete with a '.del' extension. The assumption is that when
     * this setting is false, an external process is managing the removal of
     * bdbje log files and that, come time to recover from a checkpoint, the
     * files that comprise a checkpoint are manually assembled.
     */
    public static final String ATTR_CHECKPOINT_COPY_BDBJE_LOGS =
        "checkpoint-copy-bdbje-logs";
    public static final Boolean DEFAULT_CHECKPOINT_COPY_BDBJE_LOGS =
        Boolean.TRUE;
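
    // The flag above is read like any other boolean setting on this module.
    // A minimal sketch of a hypothetical caller ('order' is assumed), using
    // the getAttribute(Object, String) pattern seen elsewhere in this file:
    //
    //   boolean copyLogs = ((Boolean) order.getAttribute(null,
    //       CrawlOrder.ATTR_CHECKPOINT_COPY_BDBJE_LOGS)).booleanValue();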

    /**
     * Default size of bdb cache.
     */
    private final static Integer DEFAULT_BDB_CACHE_PERCENT = new Integer(0);

    private transient MapType httpHeaders;
    private transient MapType loggers;

    private transient CrawlController controller;

    /**
     * Regex for acceptable user-agent format.
     */
    private static String ACCEPTABLE_USER_AGENT =
        "\\S+.*\\(.*\\+http(s)?://\\S+\\.\\S+.*\\).*";

    /**
     * Regex for acceptable from address.
     */
    private static String ACCEPTABLE_FROM = "\\S+@\\S+\\.\\S+";
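
    // Illustrative values satisfying the patterns above, drawn from the
    // user-agent and from setting descriptions later in this file:
    //
    //   user-agent: "Mozilla/5.0 (compatible; loc-crawler/0.11.0 +http://loc.gov)"
    //   from:       "webmaster@loc.gov"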

    /** Construct a CrawlOrder. */
    public CrawlOrder() {
        super(ATTR_NAME, "Heritrix crawl order. This forms the root of " +
            "the settings framework.");
        Type e;

        e = addElementToDefinition(new SimpleType(ATTR_SETTINGS_DIRECTORY,
            "Directory where override settings are kept. The settings " +
            "for many modules can be overridden based on the domain or " +
            "subdomain of the URI being processed. This setting specifies" +
            " a file level directory to store those settings. The path" +
            " is relative to 'disk-path' unless" +
            " an absolute path is provided.", "settings"));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_DISK_PATH,
            "Directory where logs, arcs and other run time files will " +
            "be kept. If this path is a relative path, it will be " +
            "relative to the crawl order.", ""));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_LOGS_PATH,
            "Directory where crawler log files will be kept. If this path " +
            "is a relative path, it will be relative to the 'disk-path'.",
            "logs"));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_CHECKPOINTS_PATH,
            "Directory where crawler checkpoint files will be kept. " +
            "If this path " +
            "is a relative path, it will be relative to the 'disk-path'.",
            "checkpoints"));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_STATE_PATH,
            "Directory where crawler-state files will be kept. If this path " +
            "is a relative path, it will be relative to the 'disk-path'.",
            "state"));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_SCRATCH_PATH,
            "Directory where discardable temporary files will be kept. " +
            "If this path " +
            "is a relative path, it will be relative to the 'disk-path'.",
            "scratch"));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_MAX_BYTES_DOWNLOAD,
            "Maximum number of bytes to download. Once this number is" +
            " exceeded the crawler will stop. " +
            "A value of zero means no upper limit.", new Long(0)));
        e.setOverrideable(false);

        e = addElementToDefinition(new SimpleType(ATTR_MAX_DOCUMENT_DOWNLOAD,
            "Maximum number of documents to download. Once this number" +
            " is exceeded the crawler will stop. " +
            "A value of zero means no upper limit.", new Long(0)));
        e.setOverrideable(false);

        e = addElementToDefinition(new SimpleType(ATTR_MAX_TIME_SEC,
            "Maximum amount of time to crawl (in seconds). Once this" +
            " much time has elapsed the crawler will stop. A value of" +
            " zero means no upper limit.",
            new Long(0)));
        e.setOverrideable(false);

        e = addElementToDefinition(new SimpleType(ATTR_MAX_TOE_THREADS,
            "Maximum number of threads processing URIs at the same time.",
            new Integer(100)));
        e.setOverrideable(false);

        e = addElementToDefinition(new SimpleType(ATTR_RECORDER_OUT_BUFFER,
            "Size in bytes of in-memory buffer to record outbound " +
            "traffic. One such buffer is reserved for every ToeThread.",
            new Integer(4096)));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_RECORDER_IN_BUFFER,
            "Size in bytes of in-memory buffer to record inbound " +
            "traffic. One such buffer is reserved for every ToeThread.",
            new Integer(65536)));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_BDB_CACHE_PERCENT,
            "Percentage of heap to allocate to BerkeleyDB JE cache. " +
            "Default of zero means no preference (accept BDB's default, " +
            "usually 60%, or the je.maxMemoryPercent property value).",
            DEFAULT_BDB_CACHE_PERCENT));
        e.setExpertSetting(true);
        e.setOverrideable(false);

        addElementToDefinition(new CrawlScope());

        httpHeaders = (MapType) addElementToDefinition(new MapType(
            ATTR_HTTP_HEADERS, "HTTP headers. Information that will " +
            "be used when constructing the HTTP headers of " +
            "the crawler's HTTP requests."));

        e = httpHeaders.addElementToDefinition(new SimpleType(ATTR_USER_AGENT,
            "User agent to act as. Field must contain valid URL " +
            "that links to website of person or organization " +
            "running the crawl. Replace 'PROJECT_URL_HERE' in " +
            "initial template. E.g. if the organization " +
            "is the Library of Congress, a valid user agent would be: " +
            "'Mozilla/5.0 (compatible; loc-crawler/0.11.0 " +
            "+http://loc.gov)'. " +
            "Note, you must preserve the '+' before the 'http'.",
            "Mozilla/5.0 (compatible; heritrix/@VERSION@ +PROJECT_URL_HERE)"));

        e = httpHeaders.addElementToDefinition(new SimpleType(ATTR_FROM,
            "Contact information. This field must contain a valid " +
            "e-mail address for the person or organization responsible " +
            "for this crawl: e.g. 'webmaster@loc.gov'.",
            "CONTACT_EMAIL_ADDRESS_HERE"));

        addElementToDefinition(new RobotsHonoringPolicy());

        e = addElementToDefinition(new ModuleType(
            Frontier.ATTR_NAME, "Frontier"));
        e.setLegalValueType(Frontier.class);

        e = (MapType) addElementToDefinition(new MapType(ATTR_RULES,
            "Ordered list of URL canonicalization rules. " +
            "Rules are applied in the order listed from top to bottom.",
            BaseRule.class));
        e.setOverrideable(true);
        e.setExpertSetting(true);

        e = addElementToDefinition(new MapType(
            ATTR_PRE_FETCH_PROCESSORS, "Processors to run prior to" +
            " fetching anything from the network.",
            Processor.class));
        e.setOverrideable(false);

        e = addElementToDefinition(new MapType(
            ATTR_FETCH_PROCESSORS, "Processors that fetch documents.",
            Processor.class));
        e.setOverrideable(false);

        e = addElementToDefinition(new MapType(
            ATTR_EXTRACT_PROCESSORS, "Processors that extract new URIs" +
            " from fetched documents.", Processor.class));
        e.setOverrideable(false);

        e = addElementToDefinition(new MapType(
            ATTR_WRITE_PROCESSORS, "Processors that write documents" +
            " to archives.", Processor.class));
        e.setOverrideable(false);

        e = addElementToDefinition(new MapType(
            ATTR_POST_PROCESSORS, "Processors that do cleanup and feed" +
            " the frontier with new URIs.", Processor.class));
        e.setOverrideable(false);

        loggers = (MapType) addElementToDefinition(new MapType(ATTR_LOGGERS,
            "Statistics tracking modules. Any number of specialized " +
            "statistics trackers that monitor a crawl and write logs, " +
            "reports and/or provide information to the user interface."));

        e = addElementToDefinition(new SimpleType(ATTR_RECOVER_PATH,
            "Optional. Points at a recover log (or recover.gz log) OR " +
            "the checkpoint directory to use when recovering a crawl.", ""));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(
            ATTR_CHECKPOINT_COPY_BDBJE_LOGS,
            "When true, on a checkpoint, we copy off the bdbje log files to " +
            "the checkpoint directory. To recover a checkpoint, just " +
            "set the " + ATTR_RECOVER_PATH + " to point at the checkpoint " +
            "directory to recover. This is the default setting. " +
            "But if the crawl is large, " +
            "copying bdbje log files can take tens of minutes and even " +
            "upwards of an hour (copying bdbje log files consumes the bulk " +
            "of checkpointing time). If this setting is false, we do NOT " +
            "copy bdbje logs on checkpoint AND we set bdbje to NEVER delete " +
            "log files (instead we have it rename files-to-delete with a " +
            "'.del' extension). The assumption is that when this setting is " +
            "false, an external process is managing the removal of bdbje " +
            "log files and that, come time to recover from a checkpoint, " +
            "the files that comprise a checkpoint are manually assembled. " +
            "This is an expert setting.",
            DEFAULT_CHECKPOINT_COPY_BDBJE_LOGS));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_RECOVER_RETAIN_FAILURES,
            "When recovering via the recover.log, should failures " +
            "in the log be retained in the recovered crawl, " +
            "preventing the corresponding URIs from being retried. " +
            "Default is false, meaning failures are forgotten, and " +
            "the corresponding URIs will be retried in the recovered " +
            "crawl.", Boolean.FALSE));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_RECOVER_SCOPE_INCLUDES,
            "When recovering via the recover.log, should URIs " +
            "be checked against scope before being considered included " +
            "during the first phase, which primes the already-seen " +
            "set. " +
            "Default is true, meaning scope changes in a recovered " +
            "crawl can slim the already-seen size.", Boolean.TRUE));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_RECOVER_SCOPE_ENQUEUES,
            "When recovering via the recover.log, should URIs " +
            "be checked against scope before being re-enqueued during " +
            "the second phase, which fills the to-be-fetched queues. " +
            "Default is true, meaning scope changes in a recovered " +
            "crawl can slim the pending queues.", Boolean.TRUE));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(
            new CredentialStore(CredentialStore.ATTR_NAME));
        e.setOverrideable(true);
        e.setExpertSetting(true);
    }
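
    // Attributes defined in the constructor are read back through the
    // settings framework at crawl time. A minimal sketch (hypothetical
    // caller 'order', following the getAttribute(Object, String) usage in
    // the accessors below):
    //
    //   long maxBytes = ((Long) order.getAttribute(null,
    //       CrawlOrder.ATTR_MAX_BYTES_DOWNLOAD)).longValue();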

    /**
     * @param curi
     * @return user-agent header value to use
     */
    public String getUserAgent(CrawlURI curi) {
        return (String) httpHeaders.getUncheckedAttribute(curi,
            ATTR_USER_AGENT);
    }

    /**
     * @param curi
     * @return from header value to use
     */
    public String getFrom(CrawlURI curi) {
        String res = null;
        try {
            res = (String) httpHeaders.getAttribute(ATTR_FROM, curi);
        } catch (AttributeNotFoundException e) {
            logger.severe(e.getMessage());
        }
        return res;
    }

    /**
     * Returns the set number of maximum toe threads.
     * @return Number of maximum toe threads
     */
    public int getMaxToes() {
        Integer res = null;
        try {
            res = (Integer) getAttribute(null, ATTR_MAX_TOE_THREADS);
        } catch (AttributeNotFoundException e) {
            logger.severe(e.getMessage());
        }
        return res.intValue();
    }

    /**
     * This method gets the RobotsHonoringPolicy object from the order file.
     *
     * @return the RobotsHonoringPolicy
     */
    public RobotsHonoringPolicy getRobotsHonoringPolicy() {
        try {
            return (RobotsHonoringPolicy) getAttribute(null,
                RobotsHonoringPolicy.ATTR_NAME);
        } catch (AttributeNotFoundException e) {
            logger.severe(e.getMessage());
            return null;
        }
    }

    /** Get the name of the order file.
     *
     * @return the name of the order file.
     */
    public String getCrawlOrderName() {
        return getSettingsHandler().getSettingsObject(null).getName();
    }

    /**
     * @return The crawl controller.
     */
    public CrawlController getController() {
        return controller;
    }

    /**
     * @param controller
     */
    public void setController(CrawlController controller) {
        this.controller = controller;
    }

    /**
     * Returns the Map of the StatisticsTracking modules that are included
     * in the configuration represented by this instance.
     * @return Map of the StatisticsTracking modules
     */
    public MapType getLoggers() {
        return loggers;
    }

    /**
     * Checks if the User Agent and From fields are set 'correctly' in
     * the specified Crawl Order.
     *
     * @throws FatalConfigurationException
     */
    public void checkUserAgentAndFrom() throws FatalConfigurationException {
        String userAgent = this.getUserAgent(null);
        String from = this.getFrom(null);
        if (!(userAgent.matches(ACCEPTABLE_USER_AGENT)
                && from.matches(ACCEPTABLE_FROM))) {
            throw new FatalConfigurationException("unacceptable user-agent " +
                "or from (re-edit your order file).");
        }
    }
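
    // A minimal sketch of a hypothetical caller exercising this check
    // before a crawl is allowed to start:
    //
    //   try {
    //       order.checkUserAgentAndFrom();
    //   } catch (FatalConfigurationException e) {
    //       // Abort launch; the operator must fix user-agent/from first.
    //   }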

    /**
     * @return Checkpoint directory.
     */
    public File getCheckpointsDirectory() {
        try {
            return getDirectoryRelativeToDiskPath((String) getAttribute(null,
                CrawlOrder.ATTR_CHECKPOINTS_PATH));
        } catch (AttributeNotFoundException e) {
            // The attribute is defined in the constructor; log and return
            // null if it is unexpectedly missing.
            e.printStackTrace();
            return null;
        }
    }

    private File getDirectoryRelativeToDiskPath(String subpath) {
        File disk;
        try {
            disk = getSettingsHandler().getPathRelativeToWorkingDirectory(
                (String) getAttribute(null, CrawlOrder.ATTR_DISK_PATH));
            return new File(disk, subpath);
        } catch (AttributeNotFoundException e) {
            // 'disk-path' is defined in the constructor; log and return
            // null if it is unexpectedly missing.
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Return the full path to the directory named by <code>key</code>
     * in settings.
     * If the directory does not exist, it and all intermediary dirs
     * will be created.
     * @param key Key of the directory setting to look up.
     * @return Full path to directory named by <code>key</code>.
     * @throws AttributeNotFoundException
     */
    public File getSettingsDir(String key)
    throws AttributeNotFoundException {
        String path = (String) getAttribute(null, key);
        File f = new File(path);
        if (!f.isAbsolute()) {
            f = getDirectoryRelativeToDiskPath(path);
        }
        if (!f.exists()) {
            f.mkdirs();
        }
        return f;
    }
}