package org.archive.crawler.frontier;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Serializable;
import java.io.StringWriter;
import java.io.Writer;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.atomic.AtomicLong;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;

import javax.management.AttributeNotFoundException;

import org.apache.commons.httpclient.HttpStatus;
import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlHost;
import org.archive.crawler.datamodel.CrawlOrder;
import org.archive.crawler.datamodel.CrawlServer;
import org.archive.crawler.datamodel.CrawlSubstats;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.datamodel.CrawlSubstats.Stage;
import org.archive.crawler.event.CrawlStatusListener;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.Frontier;
import org.archive.crawler.framework.ToeThread;
import org.archive.crawler.framework.Frontier.FrontierGroup;
import org.archive.crawler.framework.exceptions.EndedException;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.settings.ModuleType;
import org.archive.crawler.settings.RegularExpressionConstraint;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
import org.archive.crawler.url.Canonicalizer;
import org.archive.net.UURI;
import org.archive.util.ArchiveUtils;
/**
 * Shared facilities for Frontier implementations.
 *
 * @author gojomo
 */
public abstract class AbstractFrontier extends ModuleType
        implements CrawlStatusListener, Frontier, FetchStatusCodes,
        CoreAttributeConstants, Serializable {
    private static final long serialVersionUID = -4766504935003203930L;

    private static final Logger logger = Logger
            .getLogger(AbstractFrontier.class.getName());

    protected transient CrawlController controller;

    /** ordinal numbers to assign to created CrawlURIs */
    protected AtomicLong nextOrdinal = new AtomicLong(1);

    /** should the frontier hold any threads asking for URIs? */
    protected boolean shouldPause = false;

    /**
     * should the frontier send an EndedException to any threads asking for
     * URIs?
     */
    protected transient boolean shouldTerminate = false;

    /**
     * how many multiples of last fetch elapsed time to wait before
     * recontacting same server
     */
    public final static String ATTR_DELAY_FACTOR = "delay-factor";

    protected final static Float DEFAULT_DELAY_FACTOR = new Float(5);

    /**
     * always wait this long after one completion before recontacting same
     * server, regardless of multiple
     */
    public final static String ATTR_MIN_DELAY = "min-delay-ms";

    protected final static Integer DEFAULT_MIN_DELAY = new Integer(3000);

    /** never wait more than this long, regardless of multiple */
    public final static String ATTR_MAX_DELAY = "max-delay-ms";

    protected final static Integer DEFAULT_MAX_DELAY = new Integer(30000);

    /** number of hops of embeds (ERX) to bump to front of host queue */
    public final static String ATTR_PREFERENCE_EMBED_HOPS =
        "preference-embed-hops";

    protected final static Integer DEFAULT_PREFERENCE_EMBED_HOPS =
        new Integer(1);

    /** maximum per-host bandwidth usage */
    public final static String ATTR_MAX_HOST_BANDWIDTH_USAGE =
        "max-per-host-bandwidth-usage-KB-sec";

    protected final static Integer DEFAULT_MAX_HOST_BANDWIDTH_USAGE =
        new Integer(0);

    /** maximum overall bandwidth usage */
    public final static String ATTR_MAX_OVERALL_BANDWIDTH_USAGE =
        "total-bandwidth-usage-KB-sec";

    protected final static Integer DEFAULT_MAX_OVERALL_BANDWIDTH_USAGE =
        new Integer(0);

    /** for retryable problems, seconds to wait before a retry */
    public final static String ATTR_RETRY_DELAY = "retry-delay-seconds";

    protected final static Long DEFAULT_RETRY_DELAY = new Long(900);

    /** maximum times to emit a CrawlURI without final disposition */
    public final static String ATTR_MAX_RETRIES = "max-retries";

    protected final static Integer DEFAULT_MAX_RETRIES = new Integer(30);

    public final static String ATTR_QUEUE_ASSIGNMENT_POLICY =
        "queue-assignment-policy";

    /** queue assignment to force onto CrawlURIs; intended to be overridden */
    public final static String ATTR_FORCE_QUEUE = "force-queue-assignment";

    protected final static String DEFAULT_FORCE_QUEUE = "";

    /** acceptable characters in forced queue names: word chars, dash,
     * period, comma, colon */
    protected final static String ACCEPTABLE_FORCE_QUEUE = "[-\\w\\.,:]*";

    /** whether pause, rather than finish, when crawl appears done */
    public final static String ATTR_PAUSE_AT_FINISH = "pause-at-finish";

    protected final static Boolean DEFAULT_PAUSE_AT_FINISH = Boolean.FALSE;

    /** whether to pause at crawl start */
    public final static String ATTR_PAUSE_AT_START = "pause-at-start";
    protected final static Boolean DEFAULT_PAUSE_AT_START = Boolean.FALSE;

    /** whether to tag seeds with their own URI as a heritable 'source' String */
    public final static String ATTR_SOURCE_TAG_SEEDS = "source-tag-seeds";
    protected final static Boolean DEFAULT_SOURCE_TAG_SEEDS = Boolean.FALSE;

    /**
     * Recover log on or off attribute.
     */
    protected final static String ATTR_RECOVERY_ENABLED =
        "recovery-log-enabled";
    protected final static Boolean DEFAULT_ATTR_RECOVERY_ENABLED =
        Boolean.TRUE;

    // snapshots of the live counters below, used only for serialization
    protected long queuedUriCount;
    protected long succeededFetchCount;
    protected long failedFetchCount;
    protected long disregardedUriCount;

    /** total URIs queued to be visited */
    transient protected AtomicLong liveQueuedUriCount = new AtomicLong(0);

    transient protected AtomicLong liveSucceededFetchCount = new AtomicLong(0);

    transient protected AtomicLong liveFailedFetchCount = new AtomicLong(0);
    /** URIs that are disregarded (for example because of robots.txt rules) */
    transient protected AtomicLong liveDisregardedUriCount = new AtomicLong(0);

    /**
     * Used when bandwidth constraints are in effect.
     */
    protected long totalProcessedBytes = 0;

    private transient long nextURIEmitTime = 0;

    protected long processedBytesAfterLastEmittedURI = 0;

    protected int lastMaxBandwidthKB = 0;

    /**
     * Crawl replay logger.
     *
     * Currently captures Frontier/URI transitions.
     * Can be null if user chose not to run a recovery.log.
     */
    private transient FrontierJournal recover = null;

    /** file collecting report of ignored seed-file entries (if any) */
    public static final String IGNORED_SEEDS_FILENAME = "seeds.ignored";

    /**
     * @param name Name of this frontier.
     * @param description Description for this frontier.
     */
    public AbstractFrontier(String name, String description) {
        super(name, description);
        addElementToDefinition(new SimpleType(ATTR_DELAY_FACTOR,
            "How many multiples of last fetch elapsed time to wait before "
                + "recontacting same server", DEFAULT_DELAY_FACTOR));
        addElementToDefinition(new SimpleType(ATTR_MAX_DELAY,
            "Never wait more than this long.", DEFAULT_MAX_DELAY));
        addElementToDefinition(new SimpleType(ATTR_MIN_DELAY,
            "Always wait this long after one completion before recontacting "
                + "same server.", DEFAULT_MIN_DELAY));
        addElementToDefinition(new SimpleType(ATTR_MAX_RETRIES,
            "How often to retry fetching a URI that failed to be retrieved. "
                + "If zero, the crawler will get the robots.txt only.",
            DEFAULT_MAX_RETRIES));
        addElementToDefinition(new SimpleType(ATTR_RETRY_DELAY,
            "How long to wait by default until we retry fetching a"
                + " URI that failed to be retrieved (seconds). ",
            DEFAULT_RETRY_DELAY));
        addElementToDefinition(new SimpleType(
            ATTR_PREFERENCE_EMBED_HOPS,
            "Number of embedded (or redirected) hops up to which "
                + "a URI has higher priority scheduling. For example, if set "
                + "to 1 (the default), items such as inline images (1-hop "
                + "embedded resources) will be scheduled ahead of all regular "
                + "links (or many-hop resources, like nested frames). If set to "
                + "zero, no preferencing will occur, and embeds/redirects are "
                + "scheduled the same as regular links.",
            DEFAULT_PREFERENCE_EMBED_HOPS));
        Type t;
        t = addElementToDefinition(new SimpleType(
            ATTR_MAX_OVERALL_BANDWIDTH_USAGE,
            "The maximum average bandwidth the crawler is allowed to use. "
                + "The actual read speed is not affected by this setting, it only "
                + "holds back new URIs from being processed when the bandwidth "
                + "usage has been too high. 0 means no bandwidth limitation.",
            DEFAULT_MAX_OVERALL_BANDWIDTH_USAGE));
        t.setOverrideable(false);
        t = addElementToDefinition(new SimpleType(
            ATTR_MAX_HOST_BANDWIDTH_USAGE,
            "The maximum average bandwidth the crawler is allowed to use per "
                + "host. The actual read speed is not affected by this setting, "
                + "it only holds back new URIs from being processed when the "
                + "bandwidth usage has been too high. 0 means no bandwidth "
                + "limitation.", DEFAULT_MAX_HOST_BANDWIDTH_USAGE));
        t.setExpertSetting(true);

        // Determine the available queue-assignment policies; the default
        // set may be overridden via a system property.
        String queueStr = System.getProperty(AbstractFrontier.class.getName() +
                "." + ATTR_QUEUE_ASSIGNMENT_POLICY,
            HostnameQueueAssignmentPolicy.class.getName() + " " +
            IPQueueAssignmentPolicy.class.getName() + " " +
            BucketQueueAssignmentPolicy.class.getName() + " " +
            SurtAuthorityQueueAssignmentPolicy.class.getName() + " " +
            TopmostAssignedSurtQueueAssignmentPolicy.class.getName());
        Pattern p = Pattern.compile("\\s*,\\s*|\\s+");
        String [] queues = p.split(queueStr);
        if (queues.length <= 0) {
            throw new RuntimeException("Failed parse of" +
                " assignment queue policy string: " + queueStr);
        }
        t = addElementToDefinition(new SimpleType(ATTR_QUEUE_ASSIGNMENT_POLICY,
            "Defines how to assign URIs to queues. Can assign by host, " +
            "by ip, and into one of a fixed set of buckets (1k).",
            queues[0], queues));
        t.setExpertSetting(true);
        t.setOverrideable(true);

        t = addElementToDefinition(new SimpleType(
            ATTR_FORCE_QUEUE,
            "The queue name into which to force URIs. Should "
                + "be left blank at global level. Specify a "
                + "per-domain/per-host override to force URIs into "
                + "a particular named queue, regardless of the assignment "
                + "policy in effect (domain or ip-based politeness). "
                + "This could be used on domains known to all be from "
                + "the same small set of IPs (eg blogspot, dailykos, etc.) "
                + "to simulate IP-based politeness, or it could be used if "
                + "you wanted to enforce politeness over a whole domain, even "
                + "though the subdomains are split across many IPs.",
            DEFAULT_FORCE_QUEUE));
        t.setOverrideable(true);
        t.setExpertSetting(true);
        t.addConstraint(new RegularExpressionConstraint(ACCEPTABLE_FORCE_QUEUE,
            Level.WARNING, "This field must contain only alphanumeric "
                + "characters plus period, dash, comma, colon, or underscore."));
        t = addElementToDefinition(new SimpleType(
            ATTR_PAUSE_AT_START,
            "Whether to pause when the crawl begins, before any URIs " +
            "are tried. This gives the operator a chance to verify or " +
            "adjust the crawl before actual work begins. " +
            "Default is false.", DEFAULT_PAUSE_AT_START));
        t = addElementToDefinition(new SimpleType(
            ATTR_PAUSE_AT_FINISH,
            "Whether to pause when the crawl appears finished, rather "
                + "than immediately end the crawl. This gives the operator an "
                + "opportunity to view crawl results, and possibly add URIs or "
                + "adjust settings, while the crawl state is still available. "
                + "Default is false.", DEFAULT_PAUSE_AT_FINISH));
        t.setOverrideable(false);

        t = addElementToDefinition(new SimpleType(
            ATTR_SOURCE_TAG_SEEDS,
            "Whether to tag seeds with their own URI as a heritable " +
            "'source' String, which will be carried-forward to all URIs " +
            "discovered on paths originating from that seed. When " +
            "present, such source tags appear in the second-to-last " +
            "crawl.log field.", DEFAULT_SOURCE_TAG_SEEDS));
        t.setOverrideable(false);

        t = addElementToDefinition(new SimpleType(ATTR_RECOVERY_ENABLED,
            "Set to false to disable recovery log writing. Do this if " +
            "you are using the checkpoint feature for recovering " +
            "crashed crawls.", DEFAULT_ATTR_RECOVERY_ENABLED));
        t.setExpertSetting(true);
        t.setOverrideable(false);
    }

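    /**
     * Begin the crawl: honor pause-at-start by requesting an immediate
     * crawl-wide pause, otherwise release threads to begin work.
     */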
    public void start() {
        if (((Boolean)getUncheckedAttribute(null, ATTR_PAUSE_AT_START))
                .booleanValue()) {
            // trigger crawl-wide pause
            controller.requestCrawlPause();
        } else {
            // simply begin
            unpause();
        }
    }

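    /** Hold any threads requesting URIs until unpause() is called. */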
    synchronized public void pause() {
        shouldPause = true;
    }

    /** Resume issuing URIs; wake all threads waiting in preNext(). */
    synchronized public void unpause() {
        shouldPause = false;
        notifyAll();
    }

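    /**
     * Register as a crawl-status listener and, if enabled, open the
     * recovery journal in the crawl's logs directory.
     *
     * @param c the CrawlController for this crawl
     * @throws FatalConfigurationException
     * @throws IOException if the recovery journal cannot be opened
     */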
    public void initialize(CrawlController c)
            throws FatalConfigurationException, IOException {
        c.addCrawlStatusListener(this);
        File logsDisk = null;
        try {
            logsDisk = c.getSettingsDir(CrawlOrder.ATTR_LOGS_PATH);
        } catch (AttributeNotFoundException e) {
            logger.log(Level.SEVERE, "Failed to get logs directory", e);
        }
        if (logsDisk != null) {
            String logsPath = logsDisk.getAbsolutePath() + File.separatorChar;
            if (((Boolean)getUncheckedAttribute(null, ATTR_RECOVERY_ENABLED))
                    .booleanValue()) {
                this.recover = new RecoveryJournal(logsPath,
                    FrontierJournal.LOGNAME_RECOVER);
            }
        }
    }

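    /**
     * Signal all threads asking for URIs to end (via EndedException),
     * closing the recovery journal and releasing any paused threads.
     */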
    synchronized public void terminate() {
        shouldTerminate = true;
        if (this.recover != null) {
            this.recover.close();
            this.recover = null;
        }
        unpause();
    }

    /**
     * Report CrawlURI to each of the three 'substats' accumulators
     * (group/queue, server, host) for a given stage.
     *
     * @param curi
     * @param stage
     */
    protected void tally(CrawlURI curi, Stage stage) {
        // per-server
        CrawlServer server =
            controller.getServerCache().getServerFor(curi);
        if (server != null) {
            server.getSubstats().tally(curi, stage);
        }
        // per-host
        CrawlHost host =
            controller.getServerCache().getHostFor(curi);
        if (host != null) {
            host.getSubstats().tally(curi, stage);
        }
        // per-frontier-group (queue)
        FrontierGroup group =
            controller.getFrontier().getGroup(curi);
        group.getSubstats().tally(curi, stage);
    }

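    // The doJournal* methods tally substats for the given stage and, when
    // a recovery journal is in use, record the corresponding transition.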
    protected void doJournalFinishedSuccess(CrawlURI c) {
        tally(c, CrawlSubstats.Stage.SUCCEEDED);
        if (this.recover != null) {
            this.recover.finishedSuccess(c);
        }
    }

    protected void doJournalAdded(CrawlURI c) {
        tally(c, CrawlSubstats.Stage.SCHEDULED);
        if (this.recover != null) {
            this.recover.added(c);
        }
    }

    protected void doJournalRescheduled(CrawlURI c) {
        tally(c, CrawlSubstats.Stage.RETRIED);
        if (this.recover != null) {
            this.recover.rescheduled(c);
        }
    }

    protected void doJournalFinishedFailure(CrawlURI c) {
        tally(c, CrawlSubstats.Stage.FAILED);
        if (this.recover != null) {
            this.recover.finishedFailure(c);
        }
    }

    protected void doJournalDisregarded(CrawlURI c) {
        tally(c, CrawlSubstats.Stage.DISREGARDED);
        if (this.recover != null) {
            this.recover.finishedDisregard(c);
        }
    }

    protected void doJournalEmitted(CrawlURI c) {
        if (this.recover != null) {
            this.recover.emitted(c);
        }
    }

    /**
     * Frontier is empty only if all queues are empty and no URIs are
     * in-process.
     *
     * @return True if queues are empty.
     */
    public boolean isEmpty() {
        return liveQueuedUriCount.get() == 0;
    }

    /**
     * Increment the running count of queued URIs.
     */
    protected void incrementQueuedUriCount() {
        liveQueuedUriCount.incrementAndGet();
    }

    /**
     * Increment the running count of queued URIs by an arbitrary amount.
     *
     * @param increment
     *            amount to increment the queued count
     */
    protected void incrementQueuedUriCount(long increment) {
        liveQueuedUriCount.addAndGet(increment);
    }

    /**
     * Note that a number of queued Uris have been deleted.
     *
     * @param numberOfDeletes
     */
    protected void decrementQueuedCount(long numberOfDeletes) {
        liveQueuedUriCount.addAndGet(-numberOfDeletes);
    }

    /**
     * (non-Javadoc)
     *
     * @see org.archive.crawler.framework.Frontier#queuedUriCount()
     */
    public long queuedUriCount() {
        return liveQueuedUriCount.get();
    }

    /**
     * (non-Javadoc)
     *
     * @see org.archive.crawler.framework.Frontier#finishedUriCount()
     */
    public long finishedUriCount() {
        return liveSucceededFetchCount.get() + liveFailedFetchCount.get()
            + liveDisregardedUriCount.get();
    }

    /**
     * Increment the running count of successfully fetched URIs.
     */
    protected void incrementSucceededFetchCount() {
        liveSucceededFetchCount.incrementAndGet();
    }

    /**
     * (non-Javadoc)
     *
     * @see org.archive.crawler.framework.Frontier#succeededFetchCount()
     */
    public long succeededFetchCount() {
        return liveSucceededFetchCount.get();
    }

    /**
     * Increment the running count of failed URIs.
     */
    protected void incrementFailedFetchCount() {
        liveFailedFetchCount.incrementAndGet();
    }

    /**
     * (non-Javadoc)
     *
     * @see org.archive.crawler.framework.Frontier#failedFetchCount()
     */
    public long failedFetchCount() {
        return liveFailedFetchCount.get();
    }

    /**
     * Increment the running count of disregarded URIs.
     */
    protected void incrementDisregardedUriCount() {
        liveDisregardedUriCount.incrementAndGet();
    }

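    /** @see org.archive.crawler.framework.Frontier#disregardedUriCount() */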
    public long disregardedUriCount() {
        return liveDisregardedUriCount.get();
    }

    /** @deprecated misnomer; use StatisticsTracking figures instead */
    public long totalBytesWritten() {
        return totalProcessedBytes;
    }

    /**
     * Load up the seeds.
     *
     * This method is called at initialize and by the CrawlController
     * when it wants to force a reload of configuration.
     *
     * @see org.archive.crawler.framework.CrawlController#kickUpdate()
     */
    public void loadSeeds() {
        Writer ignoredWriter = new StringWriter();
        logger.info("beginning");
        // Get the seeds to refresh.
        Iterator iter = this.controller.getScope().seedsIterator(ignoredWriter);
        int count = 0;
        while (iter.hasNext()) {
            UURI u = (UURI)iter.next();
            CandidateURI caUri = CandidateURI.createSeedCandidateURI(u);
            caUri.setSchedulingDirective(CandidateURI.MEDIUM);
            if (((Boolean)getUncheckedAttribute(null, ATTR_SOURCE_TAG_SEEDS))
                    .booleanValue()) {
                caUri.putString(CoreAttributeConstants.A_SOURCE_TAG,
                    caUri.toString());
                caUri.makeHeritable(CoreAttributeConstants.A_SOURCE_TAG);
            }
            schedule(caUri);
            count++;
            if (count % 1000 == 0) {
                logger.info(count + " seeds");
            }
        }
        // Save ignored items (if any) where they can be consulted later.
        saveIgnoredItems(ignoredWriter.toString(), controller.getDisk());
        logger.info("finished");
    }

    /**
     * Dump ignored seed items (if any) to disk; delete file otherwise.
     * Static to allow non-derived sibling classes (frontiers not yet
     * subclassed here) to reuse.
     *
     * @param ignoredItems
     * @param dir
     */
    public static void saveIgnoredItems(String ignoredItems, File dir) {
        File ignoredFile = new File(dir, IGNORED_SEEDS_FILENAME);
        if (ignoredItems != null && ignoredItems.length() > 0) {
            try {
                BufferedWriter bw =
                    new BufferedWriter(new FileWriter(ignoredFile));
                bw.write(ignoredItems);
                bw.close();
            } catch (IOException e) {
                // unlikely to be a problem with a local file
                e.printStackTrace();
            }
        } else {
            // delete any older file from a previous run
            ignoredFile.delete();
        }
    }

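    /**
     * Wrap the given CandidateURI as a CrawlURI (assigning the next
     * ordinal if it is not one already) and set its queue class key.
     *
     * @param caUri the CandidateURI to promote
     * @return the corresponding CrawlURI
     */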
    protected CrawlURI asCrawlUri(CandidateURI caUri) {
        CrawlURI curi;
        if (caUri instanceof CrawlURI) {
            curi = (CrawlURI)caUri;
        } else {
            curi = CrawlURI.from(caUri, nextOrdinal.getAndIncrement());
        }
        curi.setClassKey(getClassKey(curi));
        return curi;
    }

    /**
     * Perform checks common to all next() calls: pause, terminate, or
     * throttle as the crawl state requires.
     *
     * @param now
     * @throws InterruptedException
     * @throws EndedException
     */
    protected synchronized void preNext(long now) throws InterruptedException,
            EndedException {
        if (this.controller == null) {
            return;
        }

        // Check completion conditions.
        if (this.controller.atFinish()) {
            if (((Boolean)getUncheckedAttribute(null, ATTR_PAUSE_AT_FINISH))
                    .booleanValue()) {
                this.controller.requestCrawlPause();
            } else {
                this.controller.beginCrawlStop();
            }
        }

        // Enforce operator pause.
        if (shouldPause) {
            while (shouldPause) {
                this.controller.toePaused();
                wait();
            }
            // On resume, check crawl-end conditions again.
            if (controller != null && controller.atFinish()) {
                this.controller.beginCrawlStop();
            }
        }

        // Enforce operator terminate or thread retirement.
        if (shouldTerminate
                || ((ToeThread)Thread.currentThread()).shouldRetire()) {
            throw new EndedException("terminated");
        }

        enforceBandwidthThrottle(now);
    }

    /**
     * Perform any special handling of the CrawlURI, such as promoting its URI
     * to seed-status, or preferencing it because it is an embed.
     *
     * @param curi
     */
    protected void applySpecialHandling(CrawlURI curi) {
        if (curi.isSeed() && curi.getVia() != null
                && curi.flattenVia().length() > 0) {
            // The only way a seed can have a non-empty via is if it is the
            // result of a seed redirect; add it to the seeds list so the
            // redirect target is treated as a seed in its own right.
            this.controller.getScope().addSeed(curi);
            // And it needs rapid scheduling.
            if (curi.getSchedulingDirective() == CandidateURI.NORMAL)
                curi.setSchedulingDirective(CandidateURI.MEDIUM);
        }

        // Optionally preference embeds up to a configured number of hops.
        int prefHops = ((Integer)getUncheckedAttribute(curi,
            ATTR_PREFERENCE_EMBED_HOPS)).intValue();
        if (prefHops > 0) {
            int embedHops = curi.getTransHops();
            if (embedHops > 0 && embedHops <= prefHops
                    && curi.getSchedulingDirective() == CandidateURI.NORMAL) {
                // An embed or redirect within the preferenced depth, still
                // at normal priority: promote it.
                curi.setSchedulingDirective(CandidateURI.MEDIUM);
            }
        }
    }

    /**
     * Perform fixups on a CrawlURI about to be returned via next().
     *
     * @param curi
     *            CrawlURI about to be returned by next()
     * @param q
     *            the queue from which the CrawlURI came
     */
    protected void noteAboutToEmit(CrawlURI curi, WorkQueue q) {
        curi.setHolder(q);
        // Record the emission in the recovery journal (if any).
        doJournalEmitted(curi);
    }

    /**
     * @param curi
     * @return the CrawlServer to be associated with this CrawlURI
     */
    protected CrawlServer getServer(CrawlURI curi) {
        return this.controller.getServerCache().getServerFor(curi);
    }

    /**
     * Return a suitable value to wait before retrying the given URI.
     *
     * @param curi
     *            CrawlURI to be retried
     * @return millisecond delay before retry
     */
    protected long retryDelayFor(CrawlURI curi) {
        int status = curi.getFetchStatus();
        // Only connectivity-type failures get the configured retry delay;
        // other retryable conditions are eligible for immediate retry.
        return (status == S_CONNECT_FAILED || status == S_CONNECT_LOST ||
                status == S_DOMAIN_UNRESOLVABLE) ?
            ((Long)getUncheckedAttribute(curi, ATTR_RETRY_DELAY)).longValue() :
            0;
    }

    /**
     * Update any scheduling structures with the new information in this
     * CrawlURI. Chiefly means make necessary arrangements for no other URIs at
     * the same host to be visited within the appropriate politeness window.
     *
     * @param curi
     *            The CrawlURI
     * @return millisecond politeness delay
     */
    protected long politenessDelayFor(CrawlURI curi) {
        long durationToWait = 0;
        if (curi.containsKey(A_FETCH_BEGAN_TIME)
                && curi.containsKey(A_FETCH_COMPLETED_TIME)) {

            long completeTime = curi.getLong(A_FETCH_COMPLETED_TIME);
            long durationTaken = (completeTime - curi
                .getLong(A_FETCH_BEGAN_TIME));
            durationToWait = (long)(((Float)getUncheckedAttribute(curi,
                ATTR_DELAY_FACTOR)).floatValue() * durationTaken);

            long minDelay = ((Integer)getUncheckedAttribute(curi,
                ATTR_MIN_DELAY)).longValue();
            if (minDelay > durationToWait) {
                // wait at least the minimum
                durationToWait = minDelay;
            }

            long maxDelay = ((Integer)getUncheckedAttribute(curi,
                ATTR_MAX_DELAY)).longValue();
            if (durationToWait > maxDelay) {
                // wait no more than the maximum
                durationToWait = maxDelay;
            }

            long now = System.currentTimeMillis();
            int maxBandwidthKB = ((Integer)getUncheckedAttribute(curi,
                ATTR_MAX_HOST_BANDWIDTH_USAGE)).intValue();
            if (maxBandwidthKB > 0) {
                // Enforce the per-host bandwidth limit.
                CrawlHost host = controller.getServerCache().getHostFor(curi);
                long minDurationToWait = host.getEarliestNextURIEmitTime()
                    - now;
                float maxBandwidth = maxBandwidthKB * 1.024F; // KB/sec -> bytes/ms
                long processedBytes = curi.getContentSize();
                host.setEarliestNextURIEmitTime(
                    (long)(processedBytes / maxBandwidth) + now);

                if (minDurationToWait > durationToWait) {
                    durationToWait = minDurationToWait;
                }
            }
        }
        return durationToWait;
    }

    /**
     * Ensure that any overall-bandwidth-usage limit is respected, by pausing as
     * long as necessary.
     *
     * @param now
     * @throws InterruptedException
     */
    private void enforceBandwidthThrottle(long now) throws InterruptedException {
        int maxBandwidthKB = ((Integer)getUncheckedAttribute(null,
            ATTR_MAX_OVERALL_BANDWIDTH_USAGE)).intValue();
        if (maxBandwidthKB > 0) {
            // On a change of the bandwidth setting, reset the byte baseline
            // so the new limit only governs traffic from this point on.
            if (maxBandwidthKB != lastMaxBandwidthKB) {
                lastMaxBandwidthKB = maxBandwidthKB;
                processedBytesAfterLastEmittedURI = totalProcessedBytes;
            }

            // Enforce bandwidth limit.
            long sleepTime = nextURIEmitTime - now;
            float maxBandwidth = maxBandwidthKB * 1.024F; // KB/sec -> bytes/ms
            long processedBytes = totalProcessedBytes
                - processedBytesAfterLastEmittedURI;
            long shouldHaveEmittedDiff = nextURIEmitTime == 0 ? 0
                : nextURIEmitTime - now;
            nextURIEmitTime = (long)(processedBytes / maxBandwidth) + now
                + shouldHaveEmittedDiff;
            processedBytesAfterLastEmittedURI = totalProcessedBytes;
            if (sleepTime > 0) {
                long targetTime = now + sleepTime;
                now = System.currentTimeMillis();
                while (now < targetTime) {
                    synchronized (this) {
                        if (logger.isLoggable(Level.FINE)) {
                            logger.fine("Frontier waits for: " + sleepTime
                                + "ms to respect bandwidth limit.");
                        }
                        // Wait toward targetTime; spurious or early wakeups
                        // (e.g. a notifyAll from unpause()) re-test the loop.
                        wait(targetTime - now);
                    }
                    now = System.currentTimeMillis();
                }
            }
        }
    }

    /**
     * Take note of any processor-local errors that have been entered into the
     * CrawlURI.
     *
     * @param curi
     */
    protected void logLocalizedErrors(CrawlURI curi) {
        if (curi.containsKey(A_LOCALIZED_ERRORS)) {
            List localErrors = (List)curi.getObject(A_LOCALIZED_ERRORS);
            Iterator iter = localErrors.iterator();
            while (iter.hasNext()) {
                Object array[] = {curi, iter.next()};
                controller.localErrors.log(Level.WARNING, curi.getUURI()
                    .toString(), array);
            }
            // once logged, discard
            curi.remove(A_LOCALIZED_ERRORS);
        }
    }

    /**
     * Utility method to return a scratch dir for the given key's temp files.
     * Every key gets its own subdir. To avoid having any one directory with
     * thousands of files, there are also two levels of enclosing directory
     * named by the least-significant hex digits of the key string's java
     * hashcode.
     *
     * @param key
     * @return File representing scratch directory
     */
    protected File scratchDirFor(String key) {
        String hex = Integer.toHexString(key.hashCode());
        while (hex.length() < 4) {
            hex = "0" + hex;
        }
        int len = hex.length();
        return new File(this.controller.getStateDisk(), hex.substring(len - 2,
            len)
            + File.separator
            + hex.substring(len - 4, len - 2)
            + File.separator + key);
    }

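    /**
     * @param curi CrawlURI to check
     * @return true if the URI has already been fetch-attempted the
     * configured maximum number of times
     */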
    protected boolean overMaxRetries(CrawlURI curi) {
        // never retry more than the max number of times
        if (curi.getFetchAttempts() >= ((Integer)getUncheckedAttribute(curi,
                ATTR_MAX_RETRIES)).intValue()) {
            return true;
        }
        return false;
    }

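    /**
     * Import URIs from a recovery log into this frontier, resolving a
     * relative path against the crawl's disk directory.
     *
     * @param pathToLog path to the recovery log
     * @param retainFailures passed through to RecoveryJournal.importRecoverLog
     * @throws IOException
     */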
    public void importRecoverLog(String pathToLog, boolean retainFailures)
            throws IOException {
        File source = new File(pathToLog);
        if (!source.isAbsolute()) {
            source = new File(getSettingsHandler().getOrder().getController()
                .getDisk(), pathToLog);
        }
        RecoveryJournal.importRecoverLog(source, controller, retainFailures);
    }

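    /**
     * Called when the crawl configuration is updated; the default
     * implementation does nothing.
     */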
    public void kickUpdate() {
        // by default, do nothing
    }

    /**
     * Log to the main crawl.log.
     *
     * @param curi
     */
    protected void log(CrawlURI curi) {
        curi.aboutToLog();
        Object array[] = {curi};
        this.controller.uriProcessing.log(Level.INFO,
            curi.getUURI().toString(), array);
    }

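    /**
     * @param curi CrawlURI whose fetch status to examine
     * @return true if the fetch status marks the URI as disregarded
     * (precluded, blocked, or out of scope) rather than failed
     */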
    protected boolean isDisregarded(CrawlURI curi) {
        switch (curi.getFetchStatus()) {
        case S_ROBOTS_PRECLUDED:
        case S_BLOCKED_BY_CUSTOM_PROCESSOR:
        case S_OUT_OF_SCOPE:
        case S_BLOCKED_BY_USER:
        case S_TOO_MANY_EMBED_HOPS:
        case S_TOO_MANY_LINK_HOPS:
        case S_DELETED_BY_USER:
            return true;
        default:
            return false;
        }
    }

    /**
     * Checks if a recently completed CrawlURI that did not finish successfully
     * needs to be retried (processed again after some time elapses).
     *
     * @param curi
     *            The CrawlURI to check
     * @return True if we need to retry.
     */
    protected boolean needsRetrying(CrawlURI curi) {
        if (overMaxRetries(curi)) {
            return false;
        }

        switch (curi.getFetchStatus()) {
        case HttpStatus.SC_UNAUTHORIZED:
            // Retry a 401 only if RFC2617 (basic/digest auth) credentials
            // were loaded, on the expectation that the fetch will be
            // reattempted with them; otherwise the 401 is final.
            boolean loaded = curi.hasRfc2617CredentialAvatar();
            if (!loaded && logger.isLoggable(Level.INFO)) {
                logger.info("Have 401 but no creds loaded " + curi);
            }
            return loaded;
        case S_DEFERRED:
        case S_CONNECT_FAILED:
        case S_CONNECT_LOST:
        case S_DOMAIN_UNRESOLVABLE:
            // These are all worth a retry.
            return true;
        default:
            return false;
        }
    }

    /**
     * Canonicalize passed uuri. It would be sweeter if this canonicalize
     * function were encapsulated by that which it canonicalizes, but because
     * settings change with context -- i.e. there may be overrides in operation
     * for a particular URI -- it's not so easy; each CandidateURI would need a
     * reference to the settings system. That's awkward to pass in.
     *
     * @param uuri Candidate URI to canonicalize.
     * @return Canonicalized version of passed <code>uuri</code>.
     */
    protected String canonicalize(UURI uuri) {
        return Canonicalizer.canonicalize(uuri, this.controller.getOrder());
    }

    /**
     * Canonicalize passed CandidateURI. This method differs from
     * {@link #canonicalize(UURI)} in that it takes a look at
     * the CandidateURI context possibly overriding any canonicalization effect
     * if it could make us miss content. If canonicalization produces an URL
     * that was 'alreadyseen', but the entry in the 'alreadyseen' database did
     * nothing but redirect to the current URL, we won't get the current URL;
     * we'll think we've already seen it. Examples would be archive.org
     * redirecting to www.archive.org or the inverse, www.netarkivet.net
     * redirecting to netarkivet.net (assuming stripWWW rule enabled).
     * <p>Note, this method under certain circumstances sets the forceFetch
     * flag.
     *
     * @param cauri CandidateURI to examine.
     * @return Canonicalized <code>cauri</code>.
     */
    protected String canonicalize(CandidateURI cauri) {
        String canon = canonicalize(cauri.getUURI());
        if (cauri.isLocation()) {
            // If this redirect target canonicalizes to the same string as
            // its via, force the fetch so the target isn't wrongly skipped
            // as already-seen.
            if (!cauri.toString().equals(cauri.getVia().toString()) &&
                    canonicalize(cauri.getVia()).equals(canon)) {
                cauri.setForceFetch(true);
            }
        }
        return canon;
    }

    /**
     * @param cauri CrawlURI we're to get a key for.
     * @return a String token representing a queue
     */
    public String getClassKey(CandidateURI cauri) {
        String queueKey = (String)getUncheckedAttribute(cauri,
            ATTR_FORCE_QUEUE);
        if ("".equals(queueKey)) {
            // Typical case: no forced queue, so consult the assignment policy.
            QueueAssignmentPolicy queueAssignmentPolicy =
                getQueueAssignmentPolicy(cauri);
            queueKey =
                queueAssignmentPolicy.getClassKey(this.controller, cauri);
        }
        return queueKey;
    }

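    /**
     * Instantiate the queue-assignment policy configured for the given URI.
     *
     * @param cauri CandidateURI whose settings context to consult
     * @return a new QueueAssignmentPolicy instance
     */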
    protected QueueAssignmentPolicy getQueueAssignmentPolicy(CandidateURI cauri) {
        String clsName = (String)getUncheckedAttribute(cauri,
            ATTR_QUEUE_ASSIGNMENT_POLICY);
        try {
            return (QueueAssignmentPolicy) Class.forName(clsName).newInstance();
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * @return RecoveryJournal instance. May be null.
     */
    public FrontierJournal getFrontierJournal() {
        return this.recover;
    }

    public void crawlEnding(String sExitMessage) {
        // nothing to do
    }

    public void crawlEnded(String sExitMessage) {
        if (logger.isLoggable(Level.INFO)) {
            logger.info("Closing with " + Long.toString(queuedUriCount()) +
                " urls still in queue.");
        }
    }

    public void crawlStarted(String message) {
        // nothing to do
    }

    public void crawlPausing(String statusMessage) {
        // nothing to do
    }

    public void crawlPaused(String statusMessage) {
        // nothing to do
    }

    public void crawlResuming(String statusMessage) {
        // nothing to do
    }

    public void crawlCheckpoint(File checkpointDir)
            throws Exception {
        if (this.recover == null) {
            return;
        }
        this.recover.checkpoint(checkpointDir);
    }

    public String singleLineReport() {
        return ArchiveUtils.singleLineReport(this);
    }

    public void reportTo(PrintWriter writer) {
        reportTo(null, writer);
    }

    // Custom serialization: snapshot the live (transient, atomic) counters
    // into plain fields for writing, and restore them on read.
    private void writeObject(java.io.ObjectOutputStream out)
            throws IOException {
        queuedUriCount = liveQueuedUriCount.get();
        succeededFetchCount = liveSucceededFetchCount.get();
        failedFetchCount = liveFailedFetchCount.get();
        disregardedUriCount = liveDisregardedUriCount.get();
        out.defaultWriteObject();
    }

    private void readObject(java.io.ObjectInputStream in)
            throws IOException, ClassNotFoundException {
        in.defaultReadObject();
        liveQueuedUriCount = new AtomicLong(queuedUriCount);
        liveSucceededFetchCount = new AtomicLong(succeededFetchCount);
        liveFailedFetchCount = new AtomicLong(failedFetchCount);
        liveDisregardedUriCount = new AtomicLong(disregardedUriCount);
    }
}