1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 package org.archive.crawler.framework;
25
26 import java.io.File;
27 import java.io.FileOutputStream;
28 import java.io.FilenameFilter;
29 import java.io.IOException;
30 import java.io.ObjectInputStream;
31 import java.io.PrintWriter;
32 import java.io.Serializable;
33 import java.util.ArrayList;
34 import java.util.Arrays;
35 import java.util.Collections;
36 import java.util.EventObject;
37 import java.util.HashMap;
38 import java.util.HashSet;
39 import java.util.Hashtable;
40 import java.util.Iterator;
41 import java.util.LinkedList;
42 import java.util.List;
43 import java.util.Map;
44 import java.util.Set;
45 import java.util.TreeSet;
46 import java.util.concurrent.locks.ReentrantLock;
47 import java.util.logging.FileHandler;
48 import java.util.logging.Formatter;
49 import java.util.logging.Level;
50 import java.util.logging.Logger;
51
52 import javax.management.AttributeNotFoundException;
53 import javax.management.InvalidAttributeValueException;
54 import javax.management.MBeanException;
55 import javax.management.ReflectionException;
56
57 import org.apache.commons.httpclient.URIException;
58 import org.archive.crawler.admin.CrawlJob;
59 import org.archive.crawler.admin.StatisticsTracker;
60 import org.archive.crawler.datamodel.Checkpoint;
61 import org.archive.crawler.datamodel.CrawlOrder;
62 import org.archive.crawler.datamodel.CrawlURI;
63 import org.archive.crawler.datamodel.ServerCache;
64 import org.archive.crawler.event.CrawlStatusListener;
65 import org.archive.crawler.event.CrawlURIDispositionListener;
66 import org.archive.crawler.framework.exceptions.FatalConfigurationException;
67 import org.archive.crawler.framework.exceptions.InitializationException;
68 import org.archive.crawler.io.LocalErrorFormatter;
69 import org.archive.crawler.io.RuntimeErrorFormatter;
70 import org.archive.crawler.io.StatisticsLogFormatter;
71 import org.archive.crawler.io.UriErrorFormatter;
72 import org.archive.crawler.io.UriProcessingFormatter;
73 import org.archive.crawler.settings.MapType;
74 import org.archive.crawler.settings.SettingsHandler;
75 import org.archive.crawler.util.CheckpointUtils;
76 import org.archive.io.GenerationFileHandler;
77 import org.archive.net.UURI;
78 import org.archive.net.UURIFactory;
79 import org.archive.util.ArchiveUtils;
80 import org.archive.util.CachedBdbMap;
81 import org.archive.util.FileUtils;
82 import org.archive.util.Reporter;
83 import org.archive.util.bdbje.EnhancedEnvironment;
84 import org.xbill.DNS.DClass;
85 import org.xbill.DNS.Lookup;
86
87 import com.sleepycat.bind.serial.StoredClassCatalog;
88 import com.sleepycat.je.CheckpointConfig;
89 import com.sleepycat.je.Database;
90 import com.sleepycat.je.DatabaseException;
91 import com.sleepycat.je.DbInternal;
92 import com.sleepycat.je.EnvironmentConfig;
93 import com.sleepycat.je.dbi.EnvironmentImpl;
94 import com.sleepycat.je.utilint.DbLsn;
95
96 /***
97 * CrawlController collects all the classes which cooperate to
98 * perform a crawl and provides a high-level interface to the
99 * running crawl.
100 *
101 * As the "global context" for a crawl, subcomponents will
102 * often reach each other through the CrawlController.
103 *
104 * @author Gordon Mohr
105 */
106 public class CrawlController implements Serializable, Reporter {
107
private static final long serialVersionUID =
    ArchiveUtils.classnameBasedUID(CrawlController.class,1);

/***
 * Messages from the crawlcontroller.
 *
 * They appear on console.
 */
private final static Logger LOGGER =
    Logger.getLogger(CrawlController.class.getName());

/*** abbreviation label for config files in manifest */
public static final char MANIFEST_CONFIG_FILE = 'C';
/*** abbreviation label for report files in manifest */
public static final char MANIFEST_REPORT_FILE = 'R';
/*** abbreviation label for log files in manifest */
public static final char MANIFEST_LOG_FILE = 'L';

// Base names of the standard crawl logs created in setupLogs().
private static final String LOGNAME_PROGRESS_STATISTICS =
    "progress-statistics";
private static final String LOGNAME_URI_ERRORS = "uri-errors";
private static final String LOGNAME_RUNTIME_ERRORS = "runtime-errors";
private static final String LOGNAME_LOCAL_ERRORS = "local-errors";
private static final String LOGNAME_CRAWL = "crawl";
134
135
// Core crawl components.  All transient: they are reconstructed during
// initialize()/checkpoint-recover rather than Java-serialized.
private transient CrawlOrder order;
private transient CrawlScope scope;
private transient ProcessorChainList processorChains;

// The frontier: store/scheduler of discovered URIs for this crawl.
private transient Frontier frontier;

// Pool of worker ("toe") threads; created by setupToePool().
private transient ToePool toePool;

// Per-crawl server/host cache -- see ServerCache for exact semantics.
private transient ServerCache serverCache;

// Access to crawl-order settings.
private transient SettingsHandler settingsHandler;

// Support for single-threaded ("slow") operation mode.
private volatile transient boolean singleThreadMode = false;
private transient ReentrantLock singleThreadLock = null;
153
154
155 private transient LinkedList<char[]> reserveMemory;
156 private static final int RESERVE_BLOCKS = 1;
157 private static final int RESERVE_BLOCK_SIZE = 6*2^20;
158
159
160
/***
 * Crawl exit status.
 */
private transient String sExit;

// Crawl-state tokens, compared by identity elsewhere (e.g. state == PAUSED).
// Note: Java string literals are interned by the JVM already, so the
// explicit intern() calls are redundant but harmless.
private static final Object NASCENT = "NASCENT".intern();
private static final Object RUNNING = "RUNNING".intern();
private static final Object PAUSED = "PAUSED".intern();
private static final Object PAUSING = "PAUSING".intern();
private static final Object CHECKPOINTING = "CHECKPOINTING".intern();
private static final Object STOPPING = "STOPPING".intern();
private static final Object FINISHED = "FINISHED".intern();
private static final Object STARTED = "STARTED".intern();
private static final Object PREPARING = "PREPARING".intern();

// Current crawl state; always one of the tokens above.
transient private Object state = NASCENT;

// Crawl working directories, resolved in setupDisk().
private transient File disk;
private transient File logsDisk;

/***
 * For temp files representing state of crawler (eg queues)
 */
private transient File stateDisk;

/***
 * For discardable temp files (eg fetch buffers).
 */
private transient File scratchDisk;

/***
 * Directory that holds checkpoint.
 */
private transient File checkpointsDisk;

/***
 * Checkpointer.
 * Knows if checkpoint in progress and what name of checkpoint is. Also runs
 * checkpoints.
 */
private Checkpointer checkpointer;

/***
 * Gets set to checkpoint we're in recovering if in checkpoint recover
 * mode. Gets setup by {@link #getCheckpointRecover()}.
 */
private transient Checkpoint checkpointRecover = null;

// Crawl limits; 0 means unlimited.  Populated by setThresholds().
private long maxBytes;
private long maxDocument;
private long maxTime;

/***
 * A manifest of all files used/created during this crawl. Written to file
 * at the end of the crawl (the absolutely last thing done).
 */
private StringBuffer manifest;

/***
 * Record of fileHandlers established for loggers,
 * assisting file rotation.
 */
transient private Map<Logger,FileHandler> fileHandlers;

/*** suffix to use on active logs */
public static final String CURRENT_LOG_SUFFIX = ".log";

/***
 * Crawl progress logger.
 *
 * No exceptions. Logs summary result of each url processing.
 */
public transient Logger uriProcessing;

/***
 * This logger contains unexpected runtime errors.
 *
 * Would contain errors trying to set up a job or failures inside
 * processors that they are not prepared to recover from.
 */
public transient Logger runtimeErrors;

/***
 * This logger is for job-scoped logging, specifically errors which
 * happen and are handled within a particular processor.
 *
 * Examples would be socket timeouts, exceptions thrown by extractors, etc.
 */
public transient Logger localErrors;

/***
 * Special log for URI format problems, wherever they may occur.
 */
public transient Logger uriErrors;

/***
 * Statistics tracker writes here at regular intervals.
 */
private transient Logger progressStats;

/***
 * Logger to hold job summary report.
 *
 * Large state reports made at infrequent intervals (e.g. job ending) go
 * here.
 */
public transient Logger reports;

// Primary statistics tracker; if several are configured the first one
// initialized in setupStatTracking() is kept here.
protected StatisticsTracking statistics = null;

/***
 * List of crawl status listeners.
 *
 * All iterations need to synchronize on this object if they're to avoid
 * concurrent modification exceptions.
 * See {@link java.util.Collections#synchronizedList(List)}.
 */
private transient List<CrawlStatusListener> registeredCrawlStatusListeners =
    Collections.synchronizedList(new ArrayList<CrawlStatusListener>());

// Fast path: when exactly one disposition listener is registered it is
// kept here for direct dispatch; see addCrawlURIDispositionListener().
private transient CrawlURIDispositionListener
    registeredCrawlURIDispositionListener;

// All registered disposition listeners (includes the one above).
protected transient ArrayList<CrawlURIDispositionListener>
    registeredCrawlURIDispositionListeners;

/*** Shared bdb Environment for Frontier subcomponents */
private transient EnhancedEnvironment bdbEnvironment = null;

/***
 * Keep a list of all BigMap instance made -- shouldn't be many -- so that
 * we can checkpoint.
 */
private transient Map<String,CachedBdbMap<?,?>> bigmaps = null;
302
/***
 * Default constructor.  Intentionally empty: all real setup happens
 * later in {@link #initialize(SettingsHandler)}.
 */
public CrawlController() {
    super();
}
310
311 /***
312 * Starting from nothing, set up CrawlController and associated
313 * classes to be ready for a first crawl.
314 *
315 * @param sH Settings handler.
316 * @throws InitializationException
317 */
318 public void initialize(SettingsHandler sH)
319 throws InitializationException {
320 sendCrawlStateChangeEvent(PREPARING, CrawlJob.STATUS_PREPARING);
321
322 this.singleThreadLock = new ReentrantLock();
323 this.settingsHandler = sH;
324 installThreadContextSettingsHandler();
325 this.order = settingsHandler.getOrder();
326 this.order.setController(this);
327 this.bigmaps = new Hashtable<String,CachedBdbMap<?,?>>();
328 sExit = "";
329 this.manifest = new StringBuffer();
330 String onFailMessage = "";
331 try {
332 onFailMessage = "You must set the User-Agent and From HTTP" +
333 " header values to acceptable strings. \n" +
334 " User-Agent: [software-name](+[info-url])[misc]\n" +
335 " From: [email-address]\n";
336 order.checkUserAgentAndFrom();
337
338 onFailMessage = "Unable to setup disk";
339 if (disk == null) {
340 setupDisk();
341 }
342
343 onFailMessage = "Unable to create log file(s)";
344 setupLogs();
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362 onFailMessage = "Unable to test/run checkpoint recover";
363 this.checkpointRecover = getCheckpointRecover();
364 if (this.checkpointRecover == null) {
365 this.checkpointer =
366 new Checkpointer(this, this.checkpointsDisk);
367 } else {
368 setupCheckpointRecover();
369 }
370
371 onFailMessage = "Unable to setup bdb environment.";
372 setupBdb();
373
374 onFailMessage = "Unable to setup statistics";
375 setupStatTracking();
376
377 onFailMessage = "Unable to setup crawl modules";
378 setupCrawlModules();
379 } catch (Exception e) {
380 String tmp = "On crawl: "
381 + settingsHandler.getSettingsObject(null).getName() + " " +
382 onFailMessage;
383 LOGGER.log(Level.SEVERE, tmp, e);
384 throw new InitializationException(tmp, e);
385 }
386
387
388
389 Lookup.getDefaultCache(DClass.IN).setMaxEntries(1);
390
391
392 setupToePool();
393 setThresholds();
394
395 reserveMemory = new LinkedList<char[]>();
396 for(int i = 1; i < RESERVE_BLOCKS; i++) {
397 reserveMemory.add(new char[RESERVE_BLOCK_SIZE]);
398 }
399 }
400
/***
 * Utility method to install this crawl's SettingsHandler into the
 * 'global' (for this thread) holder, so that any subsequent
 * deserialization operations in this thread can find it.
 * (Takes no parameters; uses the handler stored by initialize().)
 */
public void installThreadContextSettingsHandler() {
    SettingsHandler.setThreadContextSettingsHandler(settingsHandler);
}
411
412 /***
413 * Does setup of checkpoint recover.
414 * Copies bdb log files into state dir.
415 * @throws IOException
416 */
417 protected void setupCheckpointRecover()
418 throws IOException {
419 long started = System.currentTimeMillis();;
420 if (LOGGER.isLoggable(Level.FINE)) {
421 LOGGER.fine("Starting recovery setup -- copying into place " +
422 "bdbje log files -- for checkpoint named " +
423 this.checkpointRecover.getDisplayName());
424 }
425
426 this.checkpointer.recover(this);
427 this.progressStats.info("CHECKPOINT RECOVER " +
428 this.checkpointRecover.getDisplayName());
429
430
431
432
433
434
435 File bdbSubDir = CheckpointUtils.
436 getBdbSubDirectory(this.checkpointRecover.getDirectory());
437 List<IOException> errs = new ArrayList<IOException>();
438 FileUtils.copyFiles(bdbSubDir, CheckpointUtils.getJeLogsFilter(),
439 getStateDisk(), true, false, errs);
440 for (IOException ioe : errs) {
441 LOGGER.log(Level.SEVERE, "Problem copying checkpoint files: "
442 +"checkpoint may be corrupt",ioe);
443 }
444 if (LOGGER.isLoggable(Level.INFO)) {
445 LOGGER.info("Finished recovery setup for checkpoint named " +
446 this.checkpointRecover.getDisplayName() + " in " +
447 (System.currentTimeMillis() - started) + "ms.");
448 }
449 }
450
451 protected boolean getCheckpointCopyBdbjeLogs() {
452 return ((Boolean)this.order.getUncheckedAttribute(null,
453 CrawlOrder.ATTR_CHECKPOINT_COPY_BDBJE_LOGS)).booleanValue();
454 }
455
456 private void setupBdb()
457 throws FatalConfigurationException, AttributeNotFoundException {
458 EnvironmentConfig envConfig = new EnvironmentConfig();
459 envConfig.setAllowCreate(true);
460 int bdbCachePercent = ((Integer)this.order.
461 getAttribute(null, CrawlOrder.ATTR_BDB_CACHE_PERCENT)).intValue();
462 if(bdbCachePercent > 0) {
463
464
465 envConfig.setCachePercent(bdbCachePercent);
466 }
467 envConfig.setLockTimeout(5000000);
468 if (LOGGER.isLoggable(Level.FINEST)) {
469 envConfig.setConfigParam("java.util.logging.level", "SEVERE");
470 envConfig.setConfigParam("java.util.logging.level.evictor",
471 "SEVERE");
472 envConfig.setConfigParam("java.util.logging.ConsoleHandler.on",
473 "true");
474 }
475
476 if (!getCheckpointCopyBdbjeLogs()) {
477
478
479
480
481 envConfig.setConfigParam("je.cleaner.expunge", "false");
482 }
483
484 try {
485 this.bdbEnvironment = new EnhancedEnvironment(getStateDisk(), envConfig);
486 if (LOGGER.isLoggable(Level.FINE)) {
487
488 envConfig = bdbEnvironment.getConfig();
489 LOGGER.fine("BdbConfiguration: Cache percentage " +
490 envConfig.getCachePercent() +
491 ", cache size " + envConfig.getCacheSize());
492 }
493 } catch (DatabaseException e) {
494 e.printStackTrace();
495 throw new FatalConfigurationException(e.getMessage());
496 }
497 }
498
499 /***
500 * @return the shared EnhancedEnvironment
501 */
502 public EnhancedEnvironment getBdbEnvironment() {
503 return this.bdbEnvironment;
504 }
505
506 /***
507 * @deprecated use EnhancedEnvironment's getClassCatalog() instead
508 */
509 public StoredClassCatalog getClassCatalog() {
510 return this.bdbEnvironment.getClassCatalog();
511 }
512
513 /***
514 * Register for CrawlStatus events.
515 *
516 * @param cl a class implementing the CrawlStatusListener interface
517 *
518 * @see CrawlStatusListener
519 */
520 public void addCrawlStatusListener(CrawlStatusListener cl) {
521 synchronized (this.registeredCrawlStatusListeners) {
522 this.registeredCrawlStatusListeners.add(cl);
523 }
524 }
525
/***
 * Register for CrawlURIDisposition events.
 *
 * @param cl a class implementing the CrawlURIDispostionListener interface
 *
 * @see CrawlURIDispositionListener
 */
public void addCrawlURIDispositionListener(CrawlURIDispositionListener cl) {
    // Clear the single-listener fast-path field; it is only valid while
    // exactly one listener is registered.
    registeredCrawlURIDispositionListener = null;
    if (registeredCrawlURIDispositionListeners == null) {
        // First registration: remember the lone listener for direct
        // dispatch in the fire* methods...
        registeredCrawlURIDispositionListener = cl;
        // ...and lazily create the listener list.
        registeredCrawlURIDispositionListeners
            = new ArrayList<CrawlURIDispositionListener>(1);
    }
    // Every listener also goes in the list; once a second listener is
    // added, the fast-path field stays null (cleared above) and the
    // fire* methods iterate the list instead.
    registeredCrawlURIDispositionListeners.add(cl);
}
545
546 /***
547 * Allows an external class to raise a CrawlURIDispostion
548 * crawledURISuccessful event that will be broadcast to all listeners that
549 * have registered with the CrawlController.
550 *
551 * @param curi - The CrawlURI that will be sent with the event notification.
552 *
553 * @see CrawlURIDispositionListener#crawledURISuccessful(CrawlURI)
554 */
555 public void fireCrawledURISuccessfulEvent(CrawlURI curi) {
556 if (registeredCrawlURIDispositionListener != null) {
557
558 registeredCrawlURIDispositionListener.crawledURISuccessful(curi);
559 } else {
560
561 if (registeredCrawlURIDispositionListeners != null
562 && registeredCrawlURIDispositionListeners.size() > 0) {
563 Iterator it = registeredCrawlURIDispositionListeners.iterator();
564 while (it.hasNext()) {
565 (
566 (CrawlURIDispositionListener) it
567 .next())
568 .crawledURISuccessful(
569 curi);
570 }
571 }
572 }
573 }
574
575 /***
576 * Allows an external class to raise a CrawlURIDispostion
577 * crawledURINeedRetry event that will be broadcast to all listeners that
578 * have registered with the CrawlController.
579 *
580 * @param curi - The CrawlURI that will be sent with the event notification.
581 *
582 * @see CrawlURIDispositionListener#crawledURINeedRetry(CrawlURI)
583 */
584 public void fireCrawledURINeedRetryEvent(CrawlURI curi) {
585 if (registeredCrawlURIDispositionListener != null) {
586
587 registeredCrawlURIDispositionListener.crawledURINeedRetry(curi);
588 return;
589 }
590
591
592 if (registeredCrawlURIDispositionListeners != null
593 && registeredCrawlURIDispositionListeners.size() > 0) {
594 for (Iterator i = registeredCrawlURIDispositionListeners.iterator();
595 i.hasNext();) {
596 ((CrawlURIDispositionListener)i.next()).crawledURINeedRetry(curi);
597 }
598 }
599 }
600
601 /***
602 * Allows an external class to raise a CrawlURIDispostion
603 * crawledURIDisregard event that will be broadcast to all listeners that
604 * have registered with the CrawlController.
605 *
606 * @param curi -
607 * The CrawlURI that will be sent with the event notification.
608 *
609 * @see CrawlURIDispositionListener#crawledURIDisregard(CrawlURI)
610 */
611 public void fireCrawledURIDisregardEvent(CrawlURI curi) {
612 if (registeredCrawlURIDispositionListener != null) {
613
614 registeredCrawlURIDispositionListener.crawledURIDisregard(curi);
615 } else {
616
617 if (registeredCrawlURIDispositionListeners != null
618 && registeredCrawlURIDispositionListeners.size() > 0) {
619 Iterator it = registeredCrawlURIDispositionListeners.iterator();
620 while (it.hasNext()) {
621 (
622 (CrawlURIDispositionListener) it
623 .next())
624 .crawledURIDisregard(
625 curi);
626 }
627 }
628 }
629 }
630
631 /***
632 * Allows an external class to raise a CrawlURIDispostion crawledURIFailure event
633 * that will be broadcast to all listeners that have registered with the CrawlController.
634 *
635 * @param curi - The CrawlURI that will be sent with the event notification.
636 *
637 * @see CrawlURIDispositionListener#crawledURIFailure(CrawlURI)
638 */
639 public void fireCrawledURIFailureEvent(CrawlURI curi) {
640 if (registeredCrawlURIDispositionListener != null) {
641
642 registeredCrawlURIDispositionListener.crawledURIFailure(curi);
643 } else {
644
645 if (registeredCrawlURIDispositionListeners != null
646 && registeredCrawlURIDispositionListeners.size() > 0) {
647 Iterator it = registeredCrawlURIDispositionListeners.iterator();
648 while (it.hasNext()) {
649 ((CrawlURIDispositionListener)it.next())
650 .crawledURIFailure(curi);
651 }
652 }
653 }
654 }
655
656 private void setupCrawlModules() throws FatalConfigurationException,
657 AttributeNotFoundException, MBeanException, ReflectionException {
658 if (scope == null) {
659 scope = (CrawlScope) order.getAttribute(CrawlScope.ATTR_NAME);
660 scope.initialize(this);
661 }
662 try {
663 this.serverCache = new ServerCache(this);
664 } catch (Exception e) {
665 throw new FatalConfigurationException("Unable to" +
666 " initialize frontier (Failed setup of ServerCache) " + e);
667 }
668
669 if (this.frontier == null) {
670 this.frontier = (Frontier)order.getAttribute(Frontier.ATTR_NAME);
671 try {
672 frontier.initialize(this);
673 frontier.pause();
674
675
676
677 if (!isCheckpointRecover()) {
678 runFrontierRecover((String)order.
679 getAttribute(CrawlOrder.ATTR_RECOVER_PATH));
680 }
681 } catch (IOException e) {
682 throw new FatalConfigurationException(
683 "unable to initialize frontier: " + e);
684 }
685 }
686
687
688 if (processorChains == null) {
689 processorChains = new ProcessorChainList(order);
690 }
691 }
692
/***
 * Replay a frontier recover.log, if one is configured, into the frontier.
 * Silently does nothing when no path is configured or the path is not a
 * regular file; logs (but still returns) when the path does not exist.
 * @param recoverPath path to the recover log, possibly null/empty.
 */
protected void runFrontierRecover(String recoverPath)
throws AttributeNotFoundException, MBeanException,
        ReflectionException, FatalConfigurationException {
    // No recover path configured: nothing to do.
    if (recoverPath == null || recoverPath.length() <= 0) {
        return;
    }
    File f = new File(recoverPath);
    if (!f.exists()) {
        LOGGER.severe("Recover file does not exist " + recoverPath);
        return;
    }
    if (!f.isFile()) {
        // Not a regular file (e.g. a directory): skip silently.
        return;
    }
    boolean retainFailures = ((Boolean)order.
        getAttribute(CrawlOrder.ATTR_RECOVER_RETAIN_FAILURES)).booleanValue();
    try {
        frontier.importRecoverLog(recoverPath, retainFailures);
    } catch (IOException e) {
        e.printStackTrace();
        // Wrap as fatal configuration problem, preserving the cause.
        throw (FatalConfigurationException) new FatalConfigurationException(
            "Recover.log " + recoverPath + " problem: " + e).initCause(e);
    }
}
718
719 private void setupDisk() throws AttributeNotFoundException {
720 String diskPath
721 = (String) order.getAttribute(null, CrawlOrder.ATTR_DISK_PATH);
722 this.disk = getSettingsHandler().
723 getPathRelativeToWorkingDirectory(diskPath);
724 this.disk.mkdirs();
725 this.logsDisk = getSettingsDir(CrawlOrder.ATTR_LOGS_PATH);
726 this.checkpointsDisk = getSettingsDir(CrawlOrder.ATTR_CHECKPOINTS_PATH);
727 this.stateDisk = getSettingsDir(CrawlOrder.ATTR_STATE_PATH);
728 this.scratchDisk = getSettingsDir(CrawlOrder.ATTR_SCRATCH_PATH);
729 }
730
731 /***
732 * @return The logging directory or null if problem reading the settings.
733 */
734 public File getLogsDir() {
735 File f = null;
736 try {
737 f = getSettingsDir(CrawlOrder.ATTR_LOGS_PATH);
738 } catch (AttributeNotFoundException e) {
739 LOGGER.severe("Failed get of logs directory: " + e.getMessage());
740 }
741 return f;
742 }
743
744 /***
745 * Return fullpath to the directory named by <code>key</code>
746 * in settings.
747 * If directory does not exist, it and all intermediary dirs
748 * will be created.
749 * @param key Key to use going to settings.
750 * @return Full path to directory named by <code>key</code>.
751 * @throws AttributeNotFoundException
752 */
753 public File getSettingsDir(String key)
754 throws AttributeNotFoundException {
755 String path = (String)order.getAttribute(null, key);
756 File f = new File(path);
757 if (!f.isAbsolute()) {
758 f = new File(disk.getPath(), path);
759 }
760 if (!f.exists()) {
761 f.mkdirs();
762 }
763 return f;
764 }
765
/***
 * Setup the statistics tracker.
 * The statistics object must be created before modules can use it.
 * Do it here now so that when modules retrieve the object from the
 * controller during initialization (which some do), its in place.
 * @throws InvalidAttributeValueException
 * @throws FatalConfigurationException
 */
private void setupStatTracking()
throws InvalidAttributeValueException, FatalConfigurationException {
    MapType loggers = order.getLoggers();
    final String cstName = "crawl-statistics";
    if (loggers.isEmpty(null)) {
        // No trackers configured: install a default StatisticsTracker
        // (unless one already exists, e.g. from a checkpoint recover).
        if (!isCheckpointRecover() && this.statistics == null) {
            this.statistics = new StatisticsTracker(cstName);
        }
        loggers.addElement(null, (StatisticsTracker)this.statistics);
    }

    if (isCheckpointRecover()) {
        // Swap the recovered tracker in for the configured one.
        restoreStatisticsTracker(loggers, cstName);
    }

    for (Iterator it = loggers.iterator(null); it.hasNext();) {
        StatisticsTracking tracker = (StatisticsTracking)it.next();
        tracker.initialize(this);
        // First configured tracker becomes the controller's primary one.
        if (this.statistics == null) {
            this.statistics = tracker;
        }
    }
}
797
798 protected void restoreStatisticsTracker(MapType loggers,
799 String replaceName)
800 throws FatalConfigurationException {
801 try {
802
803 loggers.removeElement(loggers.globalSettings(), replaceName);
804 loggers.addElement(loggers.globalSettings(),
805 (StatisticsTracker)this.statistics);
806 } catch (Exception e) {
807 throw convertToFatalConfigurationException(e);
808 }
809 }
810
811 protected FatalConfigurationException
812 convertToFatalConfigurationException(Exception e) {
813 FatalConfigurationException fce =
814 new FatalConfigurationException("Converted exception: " +
815 e.getMessage());
816 fce.setStackTrace(e.getStackTrace());
817 return fce;
818 }
819
/***
 * Create the five standard crawl loggers and attach a rotating file
 * handler for each, recording every handler in fileHandlers.
 * @throws IOException if a log file cannot be created.
 */
private void setupLogs() throws IOException {
    String logsPath = logsDisk.getAbsolutePath() + File.separatorChar;
    // Logger names embed the logs path -- presumably so different jobs
    // get distinct Logger instances; confirm.
    uriProcessing = Logger.getLogger(LOGNAME_CRAWL + "." + logsPath);
    runtimeErrors = Logger.getLogger(LOGNAME_RUNTIME_ERRORS + "." +
        logsPath);
    localErrors = Logger.getLogger(LOGNAME_LOCAL_ERRORS + "." + logsPath);
    uriErrors = Logger.getLogger(LOGNAME_URI_ERRORS + "." + logsPath);
    progressStats = Logger.getLogger(LOGNAME_PROGRESS_STATISTICS + "." +
        logsPath);

    this.fileHandlers = new HashMap<Logger,FileHandler>();

    setupLogFile(uriProcessing,
        logsPath + LOGNAME_CRAWL + CURRENT_LOG_SUFFIX,
        new UriProcessingFormatter(), true);

    setupLogFile(runtimeErrors,
        logsPath + LOGNAME_RUNTIME_ERRORS + CURRENT_LOG_SUFFIX,
        new RuntimeErrorFormatter(), true);

    setupLogFile(localErrors,
        logsPath + LOGNAME_LOCAL_ERRORS + CURRENT_LOG_SUFFIX,
        new LocalErrorFormatter(), true);

    setupLogFile(uriErrors,
        logsPath + LOGNAME_URI_ERRORS + CURRENT_LOG_SUFFIX,
        new UriErrorFormatter(), true);

    setupLogFile(progressStats,
        logsPath + LOGNAME_PROGRESS_STATISTICS + CURRENT_LOG_SUFFIX,
        new StatisticsLogFormatter(), true);
}
853
854 private void setupLogFile(Logger logger, String filename, Formatter f,
855 boolean shouldManifest) throws IOException, SecurityException {
856 GenerationFileHandler fh = new GenerationFileHandler(filename, true,
857 shouldManifest);
858 fh.setFormatter(f);
859 logger.addHandler(fh);
860 addToManifest(filename, MANIFEST_LOG_FILE, shouldManifest);
861 logger.setUseParentHandlers(false);
862 this.fileHandlers.put(logger, fh);
863 }
864
865 protected void rotateLogFiles(String generationSuffix)
866 throws IOException {
867 if (this.state != PAUSED && this.state != CHECKPOINTING) {
868 throw new IllegalStateException("Pause crawl before requesting " +
869 "log rotation.");
870 }
871 for (Iterator i = fileHandlers.keySet().iterator(); i.hasNext();) {
872 Logger l = (Logger)i.next();
873 GenerationFileHandler gfh =
874 (GenerationFileHandler)fileHandlers.get(l);
875 GenerationFileHandler newGfh =
876 gfh.rotate(generationSuffix, CURRENT_LOG_SUFFIX);
877 if (gfh.shouldManifest()) {
878 addToManifest((String) newGfh.getFilenameSeries().get(1),
879 MANIFEST_LOG_FILE, newGfh.shouldManifest());
880 }
881 l.removeHandler(gfh);
882 l.addHandler(newGfh);
883 fileHandlers.put(l, newGfh);
884 }
885 }
886
887 /***
888 * Close all log files and remove handlers from loggers.
889 */
890 public void closeLogFiles() {
891 for (Iterator i = fileHandlers.keySet().iterator(); i.hasNext();) {
892 Logger l = (Logger)i.next();
893 GenerationFileHandler gfh =
894 (GenerationFileHandler)fileHandlers.get(l);
895 gfh.close();
896 l.removeHandler(gfh);
897 }
898 }
899
900 /***
901 * Sets the values for max bytes, docs and time based on crawl order.
902 */
903 private void setThresholds() {
904 try {
905 maxBytes =
906 ((Long) order.getAttribute(CrawlOrder.ATTR_MAX_BYTES_DOWNLOAD))
907 .longValue();
908 } catch (Exception e) {
909 maxBytes = 0;
910 }
911 try {
912 maxDocument =
913 ((Long) order
914 .getAttribute(CrawlOrder.ATTR_MAX_DOCUMENT_DOWNLOAD))
915 .longValue();
916 } catch (Exception e) {
917 maxDocument = 0;
918 }
919 try {
920 maxTime =
921 ((Long) order.getAttribute(CrawlOrder.ATTR_MAX_TIME_SEC))
922 .longValue();
923 } catch (Exception e) {
924 maxTime = 0;
925 }
926 }
927
/***
 * @return Object this controller is using to track crawl statistics
 */
public StatisticsTracking getStatistics() {
    // NOTE(review): when no tracker has been set, this constructs a
    // fresh, uninitialized StatisticsTracker on EVERY call rather than
    // caching one -- confirm intended.
    return statistics==null ?
        new StatisticsTracker("crawl-statistics"): this.statistics;
}
935
/***
 * Send crawl change event to all listeners.
 * @param newState State change we're to tell listeners' about.
 * @param message Message on state change.
 * @see #sendCheckpointEvent(File) for special case event sending
 * telling listeners to checkpoint.
 */
protected void sendCrawlStateChangeEvent(Object newState, String message) {
    synchronized (this.registeredCrawlStatusListeners) {
        // State is updated under the listeners lock so listeners see a
        // consistent state while being notified.
        this.state = newState;
        for (Iterator i = this.registeredCrawlStatusListeners.iterator();
                i.hasNext();) {
            CrawlStatusListener l = (CrawlStatusListener)i.next();
            if (newState.equals(PAUSED)) {
                l.crawlPaused(message);
            } else if (newState.equals(RUNNING)) {
                l.crawlResuming(message);
            } else if (newState.equals(PAUSING)) {
                l.crawlPausing(message);
            } else if (newState.equals(STARTED)) {
                l.crawlStarted(message);
            } else if (newState.equals(STOPPING)) {
                l.crawlEnding(message);
            } else if (newState.equals(FINISHED)) {
                l.crawlEnded(message);
            } else if (newState.equals(PREPARING)) {
                // NOTE(review): PREPARING is delivered via crawlResuming,
                // the same callback as RUNNING -- confirm intended.
                l.crawlResuming(message);
            } else {
                throw new RuntimeException("Unknown state: " + newState);
            }
            if (LOGGER.isLoggable(Level.FINE)) {
                LOGGER.fine("Sent " + newState + " to " + l);
            }
        }
        LOGGER.fine("Sent " + newState);
    }
}
973
/***
 * Send the checkpoint event.
 * Has its own method apart from
 * {@link #sendCrawlStateChangeEvent(Object, String)} because checkpointing
 * throws an Exception (Didn't want to have to wrap all of the
 * sendCrawlStateChangeEvent in try/catches).
 * @param checkpointDir Where to write checkpoint state to.
 * @throws Exception
 */
protected void sendCheckpointEvent(File checkpointDir) throws Exception {
    synchronized (this.registeredCrawlStatusListeners) {
        // Checkpointing may only begin from the fully PAUSED state.
        if (this.state != PAUSED) {
            throw new IllegalStateException("Crawler must be completly " +
                "paused before checkpointing can start");
        }
        this.state = CHECKPOINTING;
        for (Iterator i = this.registeredCrawlStatusListeners.iterator();
                i.hasNext();) {
            CrawlStatusListener l = (CrawlStatusListener)i.next();
            // Unlike the other events, this callback may throw.
            l.crawlCheckpoint(checkpointDir);
            if (LOGGER.isLoggable(Level.FINE)) {
                LOGGER.fine("Sent " + CHECKPOINTING + " to " + l);
            }
        }
        LOGGER.fine("Sent " + CHECKPOINTING);
    }
}
1001
/***
 * Operator requested crawl begin
 */
public void requestCrawlStart() {
    // Give processors their once-per-crawl setup opportunity first.
    runProcessorInitialTasks();

    sendCrawlStateChangeEvent(STARTED, CrawlJob.STATUS_PENDING);
    String jobState;
    state = RUNNING;
    jobState = CrawlJob.STATUS_RUNNING;
    sendCrawlStateChangeEvent(this.state, jobState);

    // Assume an abnormal exit until a clean shutdown path (see
    // shouldContinueCrawling()) overwrites this.
    this.sExit = CrawlJob.STATUS_FINISHED_ABNORMAL;

    // The statistics tracker runs on its own thread.
    Thread statLogger = new Thread(statistics);
    statLogger.setName("StatLogger");
    statLogger.start();

    // Un-pause the frontier; crawling proceeds from here.
    frontier.start();
}
1023
/***
 * Called when the last toethread exits.
 * Performs final crawl teardown: notifies listeners, closes logs, nulls
 * out nearly all state (to aid garbage collection), syncs and closes the
 * bdb environment, and cleans up the toe pool.  Order matters here.
 */
protected void completeStop() {
    LOGGER.fine("Entered complete stop.");

    // Give processors their once-per-crawl teardown opportunity.
    runProcessorFinalTasks();

    sendCrawlStateChangeEvent(FINISHED, this.sExit);
    synchronized (this.registeredCrawlStatusListeners) {
        // Done with status listeners; release them.
        this.registeredCrawlStatusListeners.
            removeAll(this.registeredCrawlStatusListeners);
        this.registeredCrawlStatusListeners = null;
    }

    closeLogFiles();

    // Release logging references.
    this.fileHandlers = null;
    this.uriErrors = null;
    this.uriProcessing = null;
    this.localErrors = null;
    this.runtimeErrors = null;
    this.progressStats = null;
    this.reports = null;
    this.manifest = null;

    // Release component references so they can be garbage collected.
    this.statistics = null;
    this.frontier = null;
    this.disk = null;
    this.scratchDisk = null;
    this.order = null;
    this.scope = null;
    if (this.settingsHandler != null) {
        this.settingsHandler.cleanup();
    }
    this.settingsHandler = null;
    this.reserveMemory = null;
    this.processorChains = null;
    if (this.serverCache != null) {
        this.serverCache.cleanup();
        this.serverCache = null;
    }
    if (this.checkpointer != null) {
        this.checkpointer.cleanup();
        this.checkpointer = null;
    }
    if (this.bdbEnvironment != null) {
        try {
            // Flush and close the shared bdb environment last among the
            // persistence components.
            this.bdbEnvironment.sync();
            this.bdbEnvironment.close();
        } catch (DatabaseException e) {
            e.printStackTrace();
        }
        this.bdbEnvironment = null;
    }
    this.bigmaps = null;
    if (this.toePool != null) {
        this.toePool.cleanup();
    }
    this.toePool = null;
    LOGGER.fine("Finished crawl.");
}
1095
    /***
     * Complete a pause: wake any threads waiting on this controller's
     * monitor and broadcast the PAUSED state change.
     */
    synchronized void completePause() {
        // Wake threads blocked waiting on this controller for the pause to
        // complete.  NOTE(review): the corresponding wait() is outside this
        // view -- confirm which callers block on the controller monitor.
        notifyAll();
        sendCrawlStateChangeEvent(PAUSED, CrawlJob.STATUS_PAUSED);
    }
1102
1103 private boolean shouldContinueCrawling() {
1104 if (frontier.isEmpty()) {
1105 this.sExit = CrawlJob.STATUS_FINISHED;
1106 return false;
1107 }
1108
1109 if (maxBytes > 0 && frontier.totalBytesWritten() >= maxBytes) {
1110
1111 sExit = CrawlJob.STATUS_FINISHED_DATA_LIMIT;
1112 return false;
1113 } else if (maxDocument > 0
1114 && frontier.succeededFetchCount() >= maxDocument) {
1115
1116 this.sExit = CrawlJob.STATUS_FINISHED_DOCUMENT_LIMIT;
1117 return false;
1118 } else if (maxTime > 0 &&
1119 statistics.crawlDuration() >= maxTime * 1000) {
1120
1121 this.sExit = CrawlJob.STATUS_FINISHED_TIME_LIMIT;
1122 return false;
1123 }
1124 return state == RUNNING;
1125 }
1126
1127 /***
1128 * Request a checkpoint.
1129 * Sets a checkpointing thread running.
1130 * @throws IllegalStateException Thrown if crawl is not in paused state
1131 * (Crawl must be first paused before checkpointing).
1132 */
1133 public synchronized void requestCrawlCheckpoint()
1134 throws IllegalStateException {
1135 if (this.checkpointer == null) {
1136 return;
1137 }
1138 if (this.checkpointer.isCheckpointing()) {
1139 throw new IllegalStateException("Checkpoint already running.");
1140 }
1141 this.checkpointer.checkpoint();
1142 }
1143
1144 /***
1145 * @return True if checkpointing.
1146 */
1147 public boolean isCheckpointing() {
1148 return this.state == CHECKPOINTING;
1149 }
1150
1151 /***
1152 * Run checkpointing.
1153 * CrawlController takes care of managing the checkpointing/serializing
1154 * of bdb, the StatisticsTracker, and the CheckpointContext. Other
1155 * modules that want to revive themselves on checkpoint recovery need to
1156 * save state during their {@link CrawlStatusListener#crawlCheckpoint(File)}
1157 * invocation and then in their #initialize if a module,
1158 * or in their #initialTask if a processor, check with the CrawlController
1159 * if its checkpoint recovery. If it is, read in their old state from the
1160 * pointed to checkpoint directory.
1161 * <p>Default access only to be called by Checkpointer.
1162 * @throws Exception
1163 */
1164 void checkpoint()
1165 throws Exception {
1166
1167 sendCheckpointEvent(this.checkpointer.
1168 getCheckpointInProgressDirectory());
1169
1170
1171 LOGGER.fine("Rotating log files.");
1172 rotateLogFiles(CURRENT_LOG_SUFFIX + "." +
1173 this.checkpointer.getNextCheckpointName());
1174
1175
1176 LOGGER.fine("BigMaps.");
1177 checkpointBigMaps(this.checkpointer.getCheckpointInProgressDirectory());
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187 LOGGER.fine("Bdb environment.");
1188 checkpointBdb(this.checkpointer.getCheckpointInProgressDirectory());
1189
1190
1191 LOGGER.fine("Copying settings.");
1192 copySettings(this.checkpointer.getCheckpointInProgressDirectory());
1193
1194
1195 CheckpointUtils.writeObjectToFile(this,
1196 this.checkpointer.getCheckpointInProgressDirectory());
1197 }
1198
1199 /***
1200 * Copy off the settings.
1201 * @param checkpointDir Directory to write checkpoint to.
1202 * @throws IOException
1203 */
1204 protected void copySettings(final File checkpointDir) throws IOException {
1205 final List files = this.settingsHandler.getListOfAllFiles();
1206 boolean copiedSettingsDir = false;
1207 final File settingsDir = new File(this.disk, "settings");
1208 for (final Iterator i = files.iterator(); i.hasNext();) {
1209 File f = new File((String)i.next());
1210 if (f.getAbsolutePath().startsWith(settingsDir.getAbsolutePath())) {
1211 if (copiedSettingsDir) {
1212
1213
1214 continue;
1215 }
1216
1217 copiedSettingsDir = true;
1218 FileUtils.copyFiles(settingsDir,
1219 new File(checkpointDir, settingsDir.getName()));
1220 continue;
1221 }
1222 FileUtils.copyFiles(f, f.isDirectory()? checkpointDir:
1223 new File(checkpointDir, f.getName()));
1224 }
1225 }
1226
1227 /***
1228 * Checkpoint bdb.
1229 * I used do a call to log cleaning as suggested in je-2.0 javadoc but takes
1230 * way too much time (20minutes for a crawl of 1million items). Assume
1231 * cleaner is keeping up. Below was log cleaning loop .
1232 * <pre>int totalCleaned = 0;
1233 * for (int cleaned = 0; (cleaned = this.bdbEnvironment.cleanLog()) != 0;
1234 * totalCleaned += cleaned) {
1235 * LOGGER.fine("Cleaned " + cleaned + " log files.");
1236 * }
1237 * </pre>
1238 * <p>I also used to do a sync. But, from Mark Hayes, sync and checkpoint
1239 * are effectively same thing only sync is not configurable. He suggests
1240 * doing one or the other:
1241 * <p>MS: Reading code, Environment.sync() is a checkpoint. Looks like
1242 * I don't need to call a checkpoint after calling a sync?
1243 * <p>MH: Right, they're almost the same thing -- just do one or the other,
1244 * not both. With the new API, you'll need to do a checkpoint not a
1245 * sync, because the sync() method has no config parameter. Don't worry
1246 * -- it's fine to do a checkpoint even though you're not using.
1247 * @param checkpointDir Directory to write checkpoint to.
1248 * @throws DatabaseException
1249 * @throws IOException
1250 * @throws RuntimeException Thrown if failed setup of new bdb environment.
1251 */
1252 protected void checkpointBdb(File checkpointDir)
1253 throws DatabaseException, IOException, RuntimeException {
1254 EnvironmentConfig envConfig = this.bdbEnvironment.getConfig();
1255 final List bkgrdThreads = Arrays.asList(new String []
1256 {"je.env.runCheckpointer", "je.env.runCleaner",
1257 "je.env.runINCompressor"});
1258 try {
1259
1260 setBdbjeBkgrdThreads(envConfig, bkgrdThreads, "false");
1261
1262 CheckpointConfig chkptConfig = new CheckpointConfig();
1263 chkptConfig.setForce(true);
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278 chkptConfig.setMinimizeRecoveryTime(true);
1279 this.bdbEnvironment.checkpoint(chkptConfig);
1280 LOGGER.fine("Finished bdb checkpoint.");
1281
1282
1283 EnvironmentImpl envImpl =
1284 DbInternal.envGetEnvironmentImpl(this.bdbEnvironment);
1285 long firstFileInNextSet =
1286 DbLsn.getFileNumber(envImpl.forceLogFileFlip());
1287
1288
1289 final String lastBdbCheckpointLog =
1290 getBdbLogFileName(firstFileInNextSet - 1);
1291 processBdbLogs(checkpointDir, lastBdbCheckpointLog);
1292 LOGGER.fine("Finished processing bdb log files.");
1293 } finally {
1294
1295 setBdbjeBkgrdThreads(envConfig, bkgrdThreads, "true");
1296 }
1297 }
1298
1299 protected void processBdbLogs(final File checkpointDir,
1300 final String lastBdbCheckpointLog) throws IOException {
1301 File bdbDir = CheckpointUtils.getBdbSubDirectory(checkpointDir);
1302 if (!bdbDir.exists()) {
1303 bdbDir.mkdir();
1304 }
1305 PrintWriter pw = new PrintWriter(new FileOutputStream(new File(
1306 checkpointDir, "bdbje-logs-manifest.txt")));
1307 try {
1308
1309
1310 boolean pastLastLogFile = false;
1311 Set<String> srcFilenames = null;
1312 final boolean copyFiles = getCheckpointCopyBdbjeLogs();
1313 do {
1314 FilenameFilter filter = CheckpointUtils.getJeLogsFilter();
1315 srcFilenames =
1316 new HashSet<String>(Arrays.asList(
1317 getStateDisk().list(filter)));
1318 List tgtFilenames = Arrays.asList(bdbDir.list(filter));
1319 if (tgtFilenames != null && tgtFilenames.size() > 0) {
1320 srcFilenames.removeAll(tgtFilenames);
1321 }
1322 if (srcFilenames.size() > 0) {
1323
1324 srcFilenames = new TreeSet<String>(srcFilenames);
1325 int count = 0;
1326 for (final Iterator i = srcFilenames.iterator();
1327 i.hasNext() && !pastLastLogFile;) {
1328 String name = (String) i.next();
1329 if (copyFiles) {
1330 FileUtils.copyFiles(new File(getStateDisk(), name),
1331 new File(bdbDir, name));
1332 }
1333 pw.println(name);
1334 if (name.equals(lastBdbCheckpointLog)) {
1335
1336 pastLastLogFile = true;
1337 }
1338 count++;
1339 }
1340 if (LOGGER.isLoggable(Level.FINE)) {
1341 LOGGER.fine("Copied " + count);
1342 }
1343 }
1344 } while (!pastLastLogFile && srcFilenames != null &&
1345 srcFilenames.size() > 0);
1346 } finally {
1347 pw.close();
1348 }
1349 }
1350
1351 protected String getBdbLogFileName(final long index) {
1352 String lastBdbLogFileHex = Long.toHexString(index);
1353 StringBuffer buffer = new StringBuffer();
1354 for (int i = 0; i < (8 - lastBdbLogFileHex.length()); i++) {
1355 buffer.append('0');
1356 }
1357 buffer.append(lastBdbLogFileHex);
1358 buffer.append(".jdb");
1359 return buffer.toString();
1360 }
1361
1362 protected void setBdbjeBkgrdThreads(final EnvironmentConfig config,
1363 final List threads, final String setting) {
1364 for (final Iterator i = threads.iterator(); i.hasNext();) {
1365 config.setConfigParam((String)i.next(), setting);
1366 }
1367 }
1368
1369 /***
1370 * Get recover checkpoint.
1371 * Returns null if we're NOT in recover mode.
1372 * Looks at ATTR_RECOVER_PATH and if its a directory, assumes checkpoint
1373 * recover. If checkpoint mode, returns Checkpoint instance if
1374 * checkpoint was VALID (else null).
1375 * @return Checkpoint instance if we're in recover checkpoint
1376 * mode and the pointed-to checkpoint was valid.
1377 * @see #isCheckpointRecover()
1378 */
1379 public synchronized Checkpoint getCheckpointRecover() {
1380 if (this.checkpointRecover != null) {
1381 return this.checkpointRecover;
1382 }
1383 return getCheckpointRecover(this.order);
1384 }
1385
1386 public static Checkpoint getCheckpointRecover(final CrawlOrder order) {
1387 String path = (String)order.getUncheckedAttribute(null,
1388 CrawlOrder.ATTR_RECOVER_PATH);
1389 if (path == null || path.length() <= 0) {
1390 return null;
1391 }
1392 File rp = new File(path);
1393
1394 Checkpoint result = null;
1395 if (rp.exists() && rp.isDirectory()) {
1396 Checkpoint cp = new Checkpoint(rp);
1397 if (cp.isValid()) {
1398
1399 result = cp;
1400 }
1401 }
1402 return result;
1403 }
1404
1405 public static boolean isCheckpointRecover(final CrawlOrder order) {
1406 return getCheckpointRecover(order) != null;
1407 }
1408
1409 /***
1410 * @return True if we're in checkpoint recover mode. Call
1411 * {@link #getCheckpointRecover()} to get at Checkpoint instance
1412 * that has info on checkpoint directory being recovered from.
1413 */
1414 public boolean isCheckpointRecover() {
1415 return this.checkpointRecover != null;
1416 }
1417
1418 /***
1419 * Operator requested for crawl to stop.
1420 */
1421 public synchronized void requestCrawlStop() {
1422 requestCrawlStop(CrawlJob.STATUS_ABORTED);
1423 }
1424
1425 /***
1426 * Operator requested for crawl to stop.
1427 * @param message
1428 */
1429 public synchronized void requestCrawlStop(String message) {
1430 if (state == STOPPING || state == FINISHED) {
1431 return;
1432 }
1433 if (message == null) {
1434 throw new IllegalArgumentException("Message cannot be null.");
1435 }
1436 this.sExit = message;
1437 beginCrawlStop();
1438 }
1439
1440 /***
1441 * Start the process of stopping the crawl.
1442 */
1443 public void beginCrawlStop() {
1444 LOGGER.fine("Started.");
1445 sendCrawlStateChangeEvent(STOPPING, this.sExit);
1446 if (this.frontier != null) {
1447 this.frontier.terminate();
1448 this.frontier.unpause();
1449 }
1450 LOGGER.fine("Finished.");
1451 }
1452
1453 /***
1454 * Stop the crawl temporarly.
1455 */
1456 public synchronized void requestCrawlPause() {
1457 if (state == PAUSING || state == PAUSED) {
1458
1459 return;
1460 }
1461 sExit = CrawlJob.STATUS_WAITING_FOR_PAUSE;
1462 frontier.pause();
1463 sendCrawlStateChangeEvent(PAUSING, this.sExit);
1464 if (toePool.getActiveToeCount() == 0) {
1465
1466
1467 completePause();
1468 }
1469 }
1470
1471 /***
1472 * Tell if the controller is paused
1473 * @return true if paused
1474 */
1475 public boolean isPaused() {
1476 return state == PAUSED;
1477 }
1478
1479 public boolean isPausing() {
1480 return state == PAUSING;
1481 }
1482
1483 public boolean isRunning() {
1484 return state == RUNNING;
1485 }
1486
1487 /***
1488 * Resume crawl from paused state
1489 */
1490 public synchronized void requestCrawlResume() {
1491 if (state != PAUSING && state != PAUSED && state != CHECKPOINTING) {
1492
1493
1494 return;
1495 }
1496 multiThreadMode();
1497 frontier.unpause();
1498 LOGGER.fine("Crawl resumed.");
1499 sendCrawlStateChangeEvent(RUNNING, CrawlJob.STATUS_RUNNING);
1500 }
1501
1502 /***
1503 * @return Active toe thread count.
1504 */
1505 public int getActiveToeCount() {
1506 if (toePool == null) {
1507 return 0;
1508 }
1509 return toePool.getActiveToeCount();
1510 }
1511
1512 private void setupToePool() {
1513 toePool = new ToePool(this);
1514
1515 toePool.setSize(order.getMaxToes());
1516 }
1517
1518 /***
1519 * @return The order file instance.
1520 */
1521 public CrawlOrder getOrder() {
1522 return order;
1523 }
1524
1525 /***
1526 * @return The server cache instance.
1527 */
1528 public ServerCache getServerCache() {
1529 return serverCache;
1530 }
1531
1532 /***
1533 * @param o
1534 */
1535 public void setOrder(CrawlOrder o) {
1536 order = o;
1537 }
1538
1539
1540 /***
1541 * @return The frontier.
1542 */
1543 public Frontier getFrontier() {
1544 return frontier;
1545 }
1546
1547 /***
1548 * @return This crawl scope.
1549 */
1550 public CrawlScope getScope() {
1551 return scope;
1552 }
1553
1554 /*** Get the list of processor chains.
1555 *
1556 * @return the list of processor chains.
1557 */
1558 public ProcessorChainList getProcessorChainList() {
1559 return processorChains;
1560 }
1561
1562 /*** Get the first processor chain.
1563 *
1564 * @return the first processor chain.
1565 */
1566 public ProcessorChain getFirstProcessorChain() {
1567 return processorChains.getFirstChain();
1568 }
1569
1570 /*** Get the postprocessor chain.
1571 *
1572 * @return the postprocessor chain.
1573 */
1574 public ProcessorChain getPostprocessorChain() {
1575 return processorChains.getLastChain();
1576 }
1577
1578 /***
1579 * Get the 'working' directory of the current crawl.
1580 * @return the 'working' directory of the current crawl.
1581 */
1582 public File getDisk() {
1583 return disk;
1584 }
1585
1586 /***
1587 * @return Scratch disk location.
1588 */
1589 public File getScratchDisk() {
1590 return scratchDisk;
1591 }
1592
1593 /***
1594 * @return State disk location.
1595 */
1596 public File getStateDisk() {
1597 return stateDisk;
1598 }
1599
1600 /***
1601 * @return The number of ToeThreads
1602 *
1603 * @see ToePool#getToeCount()
1604 */
1605 public int getToeCount() {
1606 return this.toePool == null? 0: this.toePool.getToeCount();
1607 }
1608
1609 /***
1610 * @return The ToePool
1611 */
1612 public ToePool getToePool() {
1613 return toePool;
1614 }
1615
1616 /***
1617 * @return toepool one-line report
1618 */
1619 public String oneLineReportThreads() {
1620
1621 return toePool.singleLineReport();
1622 }
1623
1624 /***
1625 * While many settings will update automatically when the SettingsHandler is
1626 * modified, some settings need to be explicitly changed to reflect new
1627 * settings. This includes, number of toe threads and seeds.
1628 */
1629 public void kickUpdate() {
1630
1631 installThreadContextSettingsHandler();
1632
1633 toePool.setSize(order.getMaxToes());
1634
1635 this.scope.kickUpdate();
1636 this.frontier.kickUpdate();
1637 this.processorChains.kickUpdate();
1638
1639
1640
1641
1642 setThresholds();
1643 }
1644
1645 /***
1646 * @return The settings handler.
1647 */
1648 public SettingsHandler getSettingsHandler() {
1649 return settingsHandler;
1650 }
1651
1652 /***
1653 * This method iterates through processor chains to run processors' initial
1654 * tasks.
1655 *
1656 */
1657 private void runProcessorInitialTasks(){
1658 for (Iterator ic = processorChains.iterator(); ic.hasNext(); ) {
1659 for (Iterator ip = ((ProcessorChain) ic.next()).iterator();
1660 ip.hasNext(); ) {
1661 ((Processor) ip.next()).initialTasks();
1662 }
1663 }
1664 }
1665
1666 /***
1667 * This method iterates through processor chains to run processors' final
1668 * tasks.
1669 *
1670 */
1671 private void runProcessorFinalTasks(){
1672 for (Iterator ic = processorChains.iterator(); ic.hasNext(); ) {
1673 for (Iterator ip = ((ProcessorChain) ic.next()).iterator();
1674 ip.hasNext(); ) {
1675 ((Processor) ip.next()).finalTasks();
1676 }
1677 }
1678 }
1679
1680 /***
1681 * Kills a thread. For details see
1682 * {@link org.archive.crawler.framework.ToePool#killThread(int, boolean)
1683 * ToePool.killThread(int, boolean)}.
1684 * @param threadNumber Thread to kill.
1685 * @param replace Should thread be replaced.
1686 * @see org.archive.crawler.framework.ToePool#killThread(int, boolean)
1687 */
1688 public void killThread(int threadNumber, boolean replace){
1689 toePool.killThread(threadNumber, replace);
1690 }
1691
1692 /***
1693 * Add a file to the manifest of files used/generated by the current
1694 * crawl.
1695 *
1696 * TODO: Its possible for a file to be added twice if reports are
1697 * force generated midcrawl. Fix.
1698 *
1699 * @param file The filename (with absolute path) of the file to add
1700 * @param type The type of the file
1701 * @param bundle Should the file be included in a typical bundling of
1702 * crawler files.
1703 *
1704 * @see #MANIFEST_CONFIG_FILE
1705 * @see #MANIFEST_LOG_FILE
1706 * @see #MANIFEST_REPORT_FILE
1707 */
1708 public void addToManifest(String file, char type, boolean bundle) {
1709 manifest.append(type + (bundle? "+": "-") + " " + file + "\n");
1710 }
1711
1712 /***
1713 * Evaluate if the crawl should stop because it is finished.
1714 */
1715 public void checkFinish() {
1716 if(atFinish()) {
1717 beginCrawlStop();
1718 }
1719 }
1720
1721 /***
1722 * Evaluate if the crawl should stop because it is finished,
1723 * without actually stopping the crawl.
1724 *
1725 * @return true if crawl is at a finish-possible state
1726 */
1727 public boolean atFinish() {
1728 return state == RUNNING && !shouldContinueCrawling();
1729 }
1730
    /***
     * Custom deserialization: after the default field read, rebuild the
     * state that is not carried in the serialized form.
     * @param stream Stream to read this controller from.
     * @throws IOException
     * @throws ClassNotFoundException
     */
    private void readObject(ObjectInputStream stream)
    throws IOException, ClassNotFoundException {
        stream.defaultReadObject();
        // Recreate the listener list empty (presumably a transient field
        // -- confirm against the declaration earlier in this file).
        this.registeredCrawlStatusListeners =
            Collections.synchronizedList(new ArrayList<CrawlStatusListener>());
        // Always come back up in multi-thread mode.
        singleThreadMode = false;
    }
1740
1741 /***
1742 * Go to single thread mode, where only one ToeThread may
1743 * proceed at a time. Also acquires the single lock, so
1744 * no further threads will proceed past an
1745 * acquireContinuePermission. Caller mush be sure to release
1746 * lock to allow other threads to proceed one at a time.
1747 */
1748 public void singleThreadMode() {
1749 this.singleThreadLock.lock();
1750 singleThreadMode = true;
1751 }
1752
1753 /***
1754 * Go to back to regular multi thread mode, where all
1755 * ToeThreads may proceed at once
1756 */
1757 public void multiThreadMode() {
1758 this.singleThreadLock.lock();
1759 singleThreadMode = false;
1760 while(this.singleThreadLock.isHeldByCurrentThread()) {
1761 this.singleThreadLock.unlock();
1762 }
1763 }
1764
1765 /***
1766 * Proceed only if allowed, giving CrawlController a chance
1767 * to enforce single-thread mode.
1768 */
1769 public void acquireContinuePermission() {
1770 if (singleThreadMode) {
1771 this.singleThreadLock.lock();
1772 if(!singleThreadMode) {
1773
1774 while(this.singleThreadLock.isHeldByCurrentThread()) {
1775 this.singleThreadLock.unlock();
1776 }
1777 }
1778 }
1779 }
1780
1781 /***
1782 * Relinquish continue permission at end of processing (allowing
1783 * another thread to proceed if in single-thread mode).
1784 */
1785 public void releaseContinuePermission() {
1786 if (singleThreadMode) {
1787 while(this.singleThreadLock.isHeldByCurrentThread()) {
1788 this.singleThreadLock.unlock();
1789 }
1790 }
1791 }
1792
1793 public void freeReserveMemory() {
1794 if(!reserveMemory.isEmpty()) {
1795 reserveMemory.removeLast();
1796 System.gc();
1797 }
1798 }
1799
1800 /***
1801 * Note that a ToeThread reached paused condition, possibly
1802 * completing the crawl-pause.
1803 */
1804 public synchronized void toePaused() {
1805 releaseContinuePermission();
1806 if (state == PAUSING && toePool.getActiveToeCount() == 0) {
1807 completePause();
1808 }
1809 }
1810
1811 /***
1812 * Note that a ToeThread ended, possibly completing the crawl-stop.
1813 */
1814 public synchronized void toeEnded() {
1815 if (state == STOPPING && toePool.getActiveToeCount() == 0) {
1816 completeStop();
1817 }
1818 }
1819
1820 /***
1821 * Add order file contents to manifest.
1822 * Write configuration files and any files managed by CrawlController to
1823 * it - files managed by other classes, excluding the settings framework,
1824 * are responsible for adding their files to the manifest themselves.
1825 * by calling addToManifest.
1826 * Call before writing out reports.
1827 */
1828 public void addOrderToManifest() {
1829 for (Iterator it = getSettingsHandler().getListOfAllFiles().iterator();
1830 it.hasNext();) {
1831 addToManifest((String)it.next(),
1832 CrawlController.MANIFEST_CONFIG_FILE, true);
1833 }
1834 }
1835
1836 /***
1837 * Log a URIException from deep inside other components to the crawl's
1838 * shared log.
1839 *
1840 * @param e URIException encountered
1841 * @param u CrawlURI where problem occurred
1842 * @param l String which could not be interpreted as URI without exception
1843 */
1844 public void logUriError(URIException e, UURI u, CharSequence l) {
1845 if (e.getReasonCode() == UURIFactory.IGNORED_SCHEME) {
1846
1847 return;
1848 }
1849 Object[] array = {u, l};
1850 uriErrors.log(Level.INFO, e.getMessage(), array);
1851 }
1852
1853
1854
1855
    // Name of the report listing the active processors.
    public final static String PROCESSORS_REPORT = "processors";
    // Name of the report dumping the crawl manifest.
    public final static String MANIFEST_REPORT = "manifest";
    // All report names understood by reportTo(String, PrintWriter).
    protected final static String[] REPORTS = {PROCESSORS_REPORT, MANIFEST_REPORT};
1859
1860
1861
1862
1863 public String[] getReports() {
1864 return REPORTS;
1865 }
1866
1867
1868
1869
1870 public void reportTo(PrintWriter writer) {
1871 reportTo(null,writer);
1872 }
1873
1874 public String singleLineReport() {
1875 return ArchiveUtils.singleLineReport(this);
1876 }
1877
1878 public void reportTo(String name, PrintWriter writer) {
1879 if(PROCESSORS_REPORT.equals(name)) {
1880 reportProcessorsTo(writer);
1881 return;
1882 } else if (MANIFEST_REPORT.equals(name)) {
1883 reportManifestTo(writer);
1884 return;
1885 } else if (name!=null) {
1886 writer.println("requested report unknown: "+name);
1887 }
1888 singleLineReportTo(writer);
1889 }
1890
1891 /***
1892 * @param writer Where to write report to.
1893 */
1894 protected void reportManifestTo(PrintWriter writer) {
1895 writer.print(manifest.toString());
1896 }
1897
1898 /***
1899 * Compiles and returns a human readable report on the active processors.
1900 * @param writer Where to write to.
1901 * @see org.archive.crawler.framework.Processor#report()
1902 */
1903 protected void reportProcessorsTo(PrintWriter writer) {
1904 writer.print(
1905 "Processors report - "
1906 + ArchiveUtils.get12DigitDate()
1907 + "\n");
1908 writer.print(" Job being crawled: " + getOrder().getCrawlOrderName()
1909 + "\n");
1910
1911 writer.print(" Number of Processors: " +
1912 processorChains.processorCount() + "\n");
1913 writer.print(" NOTE: Some processors may not return a report!\n\n");
1914
1915 for (Iterator ic = processorChains.iterator(); ic.hasNext(); ) {
1916 for (Iterator ip = ((ProcessorChain) ic.next()).iterator();
1917 ip.hasNext(); ) {
1918 writer.print(((Processor) ip.next()).report());
1919 }
1920 }
1921 }
1922
1923 public void singleLineReportTo(PrintWriter writer) {
1924
1925 writer.write("[Crawl Controller]\n");
1926 }
1927
1928 public String singleLineLegend() {
1929
1930 return "nothingYet";
1931 }
1932
1933 /***
1934 * Call this method to get instance of the crawler BigMap implementation.
1935 * A "BigMap" is a Map that knows how to manage ever-growing sets of
1936 * key/value pairs. If we're in a checkpoint recovery, this method will
1937 * manage reinstantiation of checkpointed bigmaps.
1938 * @param dbName Name to give any associated database. Also used
1939 * as part of name serializing out bigmap. Needs to be unique to a crawl.
1940 * @param keyClass Class of keys we'll be using.
1941 * @param valueClass Class of values we'll be using.
1942 * @return Map that knows how to carry large sets of key/value pairs or
1943 * if none available, returns instance of HashMap.
1944 * @throws Exception
1945 */
1946 public <K,V> Map<K,V> getBigMap(final String dbName,
1947 final Class<? super K> keyClass,
1948 final Class<? super V> valueClass)
1949 throws Exception {
1950 CachedBdbMap<K,V> result = new CachedBdbMap<K,V>(dbName);
1951 if (isCheckpointRecover()) {
1952 File baseDir = getCheckpointRecover().getDirectory();
1953 @SuppressWarnings("unchecked")
1954 CachedBdbMap<K,V> temp = CheckpointUtils.
1955 readObjectFromFile(result.getClass(), dbName, baseDir);
1956 result = temp;
1957 }
1958 result.initialize(getBdbEnvironment(), keyClass, valueClass,
1959 getBdbEnvironment().getClassCatalog());
1960
1961
1962 this.bigmaps.put(dbName, result);
1963 return result;
1964 }
1965
1966 protected void checkpointBigMaps(final File cpDir)
1967 throws Exception {
1968 for (final Iterator i = this.bigmaps.keySet().iterator(); i.hasNext();) {
1969 Object key = i.next();
1970 Object obj = this.bigmaps.get(key);
1971
1972
1973
1974
1975 ((CachedBdbMap)obj).sync();
1976 CheckpointUtils.writeObjectToFile(obj, (String)key, cpDir);
1977 }
1978 }
1979
1980 /***
1981 * Called whenever progress statistics logging event.
1982 * @param e Progress statistics event.
1983 */
1984 public void progressStatisticsEvent(final EventObject e) {
1985
1986
1987
1988
1989 }
1990
1991 /***
1992 * Log to the progress statistics log.
1993 * @param msg Message to write the progress statistics log.
1994 */
1995 public void logProgressStatistics(final String msg) {
1996 this.progressStats.info(msg);
1997 }
1998
1999 /***
2000 * @return CrawlController state.
2001 */
2002 public Object getState() {
2003 return this.state;
2004 }
2005
2006 public File getCheckpointsDisk() {
2007 return this.checkpointsDisk;
2008 }
2009 }