1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.crawler.frontier;
24
25 import java.io.IOException;
26 import java.io.PrintWriter;
27 import java.io.Serializable;
28 import java.util.ArrayList;
29 import java.util.Collection;
30 import java.util.Collections;
31 import java.util.HashMap;
32 import java.util.Iterator;
33 import java.util.Map;
34 import java.util.Queue;
35 import java.util.SortedSet;
36 import java.util.Timer;
37 import java.util.TimerTask;
38 import java.util.TreeSet;
39 import java.util.concurrent.BlockingQueue;
40 import java.util.concurrent.LinkedBlockingQueue;
41 import java.util.concurrent.Semaphore;
42 import java.util.concurrent.TimeUnit;
43 import java.util.logging.Level;
44 import java.util.logging.Logger;
45
46 import org.apache.commons.collections.Bag;
47 import org.apache.commons.collections.BagUtils;
48 import org.apache.commons.collections.bag.HashBag;
49 import org.apache.commons.lang.StringUtils;
50 import org.archive.crawler.datamodel.CandidateURI;
51 import org.archive.crawler.datamodel.CoreAttributeConstants;
52 import org.archive.crawler.datamodel.CrawlURI;
53 import org.archive.crawler.datamodel.FetchStatusCodes;
54 import org.archive.crawler.datamodel.UriUniqFilter;
55 import org.archive.crawler.datamodel.UriUniqFilter.HasUriReceiver;
56 import org.archive.crawler.framework.CrawlController;
57 import org.archive.crawler.framework.Frontier;
58 import org.archive.crawler.framework.exceptions.EndedException;
59 import org.archive.crawler.framework.exceptions.FatalConfigurationException;
60 import org.archive.crawler.settings.SimpleType;
61 import org.archive.crawler.settings.Type;
62 import org.archive.net.UURI;
63 import org.archive.util.ArchiveUtils;
64
65 import com.sleepycat.collections.StoredIterator;
66
67 /***
68 * A common Frontier base using several queues to hold pending URIs.
69 *
70 * Uses in-memory map of all known 'queues' inside a single database.
71 * Round-robins between all queues.
72 *
73 * @author Gordon Mohr
74 * @author Christian Kohlschuetter
75 */
76 public abstract class WorkQueueFrontier extends AbstractFrontier
77 implements FetchStatusCodes, CoreAttributeConstants, HasUriReceiver,
78 Serializable {
79 private static final long serialVersionUID = 570384305871965843L;
80
81 public class WakeTask extends TimerTask {
82 @Override
83 public void run() {
84 synchronized(snoozedClassQueues) {
85 if(this!=nextWake) {
86
87 return;
88 }
89 wakeQueues();
90 }
91 }
92 }
93
94 /*** truncate reporting of queues at some large but not unbounded number */
95 private static final int REPORT_MAX_QUEUES = 2000;
96
97 /***
98 * If we know that only a small amount of queues is held in memory,
99 * we can avoid using a disk-based BigMap.
100 * This only works efficiently if the WorkQueue does not hold its
101 * entries in memory as well.
102 */
103 private static final int MAX_QUEUES_TO_HOLD_ALLQUEUES_IN_MEMORY = 3000;
104
105 /***
106 * When a snooze target for a queue is longer than this amount, and
107 * there are already ready queues, deactivate rather than snooze
108 * the current queue -- so other more responsive sites get a chance
109 * in active rotation. (As a result, queue's next try may be much
110 * further in the future than the snooze target delay.)
111 */
112 public final static String ATTR_SNOOZE_DEACTIVATE_MS =
113 "snooze-deactivate-ms";
114 public static Long DEFAULT_SNOOZE_DEACTIVATE_MS = new Long(5*60*1000);
115
116 private static final Logger logger =
117 Logger.getLogger(WorkQueueFrontier.class.getName());
118
119 /*** whether to hold queues INACTIVE until needed for throughput */
120 public final static String ATTR_HOLD_QUEUES = "hold-queues";
121 protected final static Boolean DEFAULT_HOLD_QUEUES = new Boolean(true);
122
123 /*** amount to replenish budget on each activation (duty cycle) */
124 public final static String ATTR_BALANCE_REPLENISH_AMOUNT =
125 "balance-replenish-amount";
126 protected final static Integer DEFAULT_BALANCE_REPLENISH_AMOUNT =
127 new Integer(3000);
128
129 /*** whether to hold queues INACTIVE until needed for throughput */
130 public final static String ATTR_ERROR_PENALTY_AMOUNT =
131 "error-penalty-amount";
132 protected final static Integer DEFAULT_ERROR_PENALTY_AMOUNT =
133 new Integer(100);
134
135
136 /*** total expenditure to allow a queue before 'retiring' it */
137 public final static String ATTR_QUEUE_TOTAL_BUDGET = "queue-total-budget";
138 protected final static Long DEFAULT_QUEUE_TOTAL_BUDGET = new Long(-1);
139
140 /*** cost assignment policy to use (by class name) */
141 public final static String ATTR_COST_POLICY = "cost-policy";
142 protected final static String DEFAULT_COST_POLICY =
143 UnitCostAssignmentPolicy.class.getName();
144
145 /*** target size of ready queues backlog */
146 public final static String ATTR_TARGET_READY_QUEUES_BACKLOG =
147 "target-ready-backlog";
148 protected final static Integer DEFAULT_TARGET_READY_QUEUES_BACKLOG =
149 new Integer(50);
150
151 /*** those UURIs which are already in-process (or processed), and
152 thus should not be rescheduled */
153 protected transient UriUniqFilter alreadyIncluded;
154
155 /*** All known queues.
156 */
157 protected transient Map<String,WorkQueue> allQueues = null;
158
159
160 /***
161 * All per-class queues whose first item may be handed out.
162 * Linked-list of keys for the queues.
163 */
164 protected BlockingQueue<String> readyClassQueues;
165
166 /*** Target (minimum) size to keep readyClassQueues */
167 protected int targetSizeForReadyQueues;
168
169 /*** single-thread access to ready-filling code */
170 protected transient Semaphore readyFiller = new Semaphore(1);
171
172 /***
173 * All 'inactive' queues, not yet in active rotation.
174 * Linked-list of keys for the queues.
175 */
176 protected Queue<String> inactiveQueues;
177
178 /***
179 * 'retired' queues, no longer considered for activation.
180 * Linked-list of keys for queues.
181 */
182 protected Queue<String> retiredQueues;
183
184 /*** all per-class queues from whom a URI is outstanding */
185 protected Bag inProcessQueues =
186 BagUtils.synchronizedBag(new HashBag());
187
188 /***
189 * All per-class queues held in snoozed state, sorted by wake time.
190 */
191 protected SortedSet<WorkQueue> snoozedClassQueues;
192
193 /*** Timer for tasks which wake head item of snoozedClassQueues */
194 protected transient Timer wakeTimer;
195
196 /*** Task for next wake */
197 protected transient WakeTask nextWake;
198
199 protected WorkQueue longestActiveQueue = null;
200
201 /*** how long to wait for a ready queue when there's nothing snoozed */
202 private static final long DEFAULT_WAIT = 1000;
203
204 /*** a policy for assigning 'cost' values to CrawlURIs */
205 private transient CostAssignmentPolicy costAssignmentPolicy;
206
207 /*** all policies available to be chosen */
208 String[] AVAILABLE_COST_POLICIES = new String[] {
209 ZeroCostAssignmentPolicy.class.getName(),
210 UnitCostAssignmentPolicy.class.getName(),
211 WagCostAssignmentPolicy.class.getName(),
212 AntiCalendarCostAssignmentPolicy.class.getName()};
213
214 /***
215 * Create the CommonFrontier
216 *
217 * @param name
218 * @param description
219 */
220 public WorkQueueFrontier(String name, String description) {
221
222
223 super(Frontier.ATTR_NAME, description);
224 Type t = addElementToDefinition(new SimpleType(ATTR_HOLD_QUEUES,
225 "Whether to hold newly-created per-host URI work" +
226 " queues until needed to stay busy. If false (default)," +
227 " all queues may contribute URIs for crawling at all" +
228 " times. If true, queues begin (and collect URIs) in" +
229 " an 'inactive' state, and only when the Frontier needs" +
230 " another queue to keep all ToeThreads busy will new" +
231 " queues be activated.", DEFAULT_HOLD_QUEUES));
232 t.setExpertSetting(true);
233 t.setOverrideable(false);
234 t = addElementToDefinition(new SimpleType(ATTR_BALANCE_REPLENISH_AMOUNT,
235 "Amount to replenish a queue's activity balance when it becomes " +
236 "active. Larger amounts mean more URIs will be tried from the " +
237 "queue before it is deactivated in favor of waiting queues. " +
238 "Default is 3000", DEFAULT_BALANCE_REPLENISH_AMOUNT));
239 t.setExpertSetting(true);
240 t.setOverrideable(true);
241 t = addElementToDefinition(new SimpleType(ATTR_ERROR_PENALTY_AMOUNT,
242 "Amount to additionally penalize a queue when one of" +
243 "its URIs fails completely. Accelerates deactivation or " +
244 "full retirement of problem queues and unresponsive sites. " +
245 "Default is 100", DEFAULT_ERROR_PENALTY_AMOUNT));
246 t.setExpertSetting(true);
247 t.setOverrideable(true);
248 t = addElementToDefinition(new SimpleType(ATTR_QUEUE_TOTAL_BUDGET,
249 "Total activity expenditure allowable to a single queue; queues " +
250 "over this expenditure will be 'retired' and crawled no more. " +
251 "Default of -1 means no ceiling on activity expenditures is " +
252 "enforced.", DEFAULT_QUEUE_TOTAL_BUDGET));
253 t.setExpertSetting(true);
254 t.setOverrideable(true);
255
256 t = addElementToDefinition(new SimpleType(ATTR_COST_POLICY,
257 "Policy for calculating the cost of each URI attempted. " +
258 "The default UnitCostAssignmentPolicy considers the cost of " +
259 "each URI to be '1'.", DEFAULT_COST_POLICY, AVAILABLE_COST_POLICIES));
260 t.setExpertSetting(true);
261
262 t = addElementToDefinition(new SimpleType(ATTR_SNOOZE_DEACTIVATE_MS,
263 "Threshold above which any 'snooze' delay will cause the " +
264 "affected queue to go inactive, allowing other queues a " +
265 "chance to rotate into active state. Typically set to be " +
266 "longer than the politeness pauses between successful " +
267 "fetches, but shorter than the connection-failed " +
268 "'retry-delay-seconds'. (Default is 5 minutes.)",
269 DEFAULT_SNOOZE_DEACTIVATE_MS));
270 t.setExpertSetting(true);
271 t.setOverrideable(false);
272 t = addElementToDefinition(new SimpleType(ATTR_TARGET_READY_QUEUES_BACKLOG,
273 "Target size for backlog of ready queues. This many queues " +
274 "will be brought into 'ready' state even if a thread is " +
275 "not waiting. Only has effect if 'hold-queues' is true. " +
276 "Default is 50.", DEFAULT_TARGET_READY_QUEUES_BACKLOG));
277 t.setExpertSetting(true);
278 t.setOverrideable(false);
279 }
280
281 /***
282 * Initializes the Frontier, given the supplied CrawlController.
283 *
284 * @see org.archive.crawler.framework.Frontier#initialize(org.archive.crawler.framework.CrawlController)
285 */
286 public void initialize(CrawlController c)
287 throws FatalConfigurationException, IOException {
288
289 super.initialize(c);
290 this.controller = c;
291
292 initQueuesOfQueues();
293
294 this.targetSizeForReadyQueues = (Integer)getUncheckedAttribute(null,
295 ATTR_TARGET_READY_QUEUES_BACKLOG);
296 if (this.targetSizeForReadyQueues < 1) {
297 this.targetSizeForReadyQueues = 1;
298 }
299 this.wakeTimer = new Timer("waker for " + c.toString());
300
301 try {
302 if (workQueueDataOnDisk()
303 && getQueueAssignmentPolicy(null).maximumNumberOfKeys() >= 0
304 && getQueueAssignmentPolicy(null).maximumNumberOfKeys() <=
305 MAX_QUEUES_TO_HOLD_ALLQUEUES_IN_MEMORY) {
306 this.allQueues = Collections.synchronizedMap(
307 new HashMap<String,WorkQueue>());
308 } else {
309 this.allQueues = c.getBigMap("allqueues",
310 String.class, WorkQueue.class);
311 if (logger.isLoggable(Level.FINE)) {
312 Iterator i = this.allQueues.keySet().iterator();
313 try {
314 for (; i.hasNext();) {
315 logger.fine((String) i.next());
316 }
317 } finally {
318 StoredIterator.close(i);
319 }
320 }
321 }
322 this.alreadyIncluded = createAlreadyIncluded();
323 initQueue();
324 } catch (IOException e) {
325 e.printStackTrace();
326 throw (FatalConfigurationException)
327 new FatalConfigurationException(e.getMessage()).initCause(e);
328 } catch (Exception e) {
329 e.printStackTrace();
330 throw (FatalConfigurationException)
331 new FatalConfigurationException(e.getMessage()).initCause(e);
332 }
333
334 initCostPolicy();
335
336 loadSeeds();
337 }
338
339 /***
340 * Set up the various queues-of-queues used by the frontier. Override
341 * in implementing subclasses to reduce or eliminate risk of queues
342 * growing without bound.
343 */
344 protected void initQueuesOfQueues() {
345
346
347 readyClassQueues = new LinkedBlockingQueue<String>();
348
349
350 inactiveQueues = new LinkedBlockingQueue<String>();
351
352
353 retiredQueues = new LinkedBlockingQueue<String>();
354
355
356
357 snoozedClassQueues = Collections.synchronizedSortedSet(new TreeSet<WorkQueue>());
358 }
359
360 /***
361 * Set (or reset after configuration change) the cost policy in effect.
362 *
363 * @throws FatalConfigurationException
364 */
365 private void initCostPolicy() throws FatalConfigurationException {
366 try {
367 costAssignmentPolicy = (CostAssignmentPolicy) Class.forName(
368 (String) getUncheckedAttribute(null, ATTR_COST_POLICY))
369 .newInstance();
370 } catch (Exception e) {
371 e.printStackTrace();
372 throw new FatalConfigurationException(e.getMessage());
373 }
374 }
375
376
377
378
379 public void crawlEnded(String sExitMessage) {
380
381
382 if (this.alreadyIncluded != null) {
383 this.alreadyIncluded.close();
384 this.alreadyIncluded = null;
385 }
386
387 try {
388 closeQueue();
389 } catch (IOException e) {
390
391 e.printStackTrace();
392 }
393 this.wakeTimer.cancel();
394
395 this.allQueues.clear();
396 this.allQueues = null;
397 this.inProcessQueues = null;
398 this.readyClassQueues = null;
399 this.snoozedClassQueues = null;
400 this.inactiveQueues = null;
401 this.retiredQueues = null;
402
403 this.costAssignmentPolicy = null;
404
405
406 super.crawlEnded(sExitMessage);
407 this.controller = null;
408 }
409
410 /***
411 * Create a UriUniqFilter that will serve as record
412 * of already seen URIs.
413 *
414 * @return A UURISet that will serve as a record of already seen URIs
415 * @throws IOException
416 */
417 protected abstract UriUniqFilter createAlreadyIncluded() throws IOException;
418
419 /***
420 * Arrange for the given CandidateURI to be visited, if it is not
421 * already scheduled/completed.
422 *
423 * @see org.archive.crawler.framework.Frontier#schedule(org.archive.crawler.datamodel.CandidateURI)
424 */
425 public void schedule(CandidateURI caUri) {
426
427
428 String canon = canonicalize(caUri);
429 if (caUri.forceFetch()) {
430 alreadyIncluded.addForce(canon, caUri);
431 } else {
432 alreadyIncluded.add(canon, caUri);
433 }
434 }
435
436 /***
437 * Accept the given CandidateURI for scheduling, as it has
438 * passed the alreadyIncluded filter.
439 *
440 * Choose a per-classKey queue and enqueue it. If this
441 * item has made an unready queue ready, place that
442 * queue on the readyClassQueues queue.
443 * @param caUri CandidateURI.
444 */
445 public void receive(CandidateURI caUri) {
446 CrawlURI curi = asCrawlUri(caUri);
447 applySpecialHandling(curi);
448 sendToQueue(curi);
449
450 doJournalAdded(curi);
451 }
452
453
454
455
456 protected CrawlURI asCrawlUri(CandidateURI caUri) {
457 CrawlURI curi = super.asCrawlUri(caUri);
458
459 getCost(curi);
460 return curi;
461 }
462
463 /***
464 * Send a CrawlURI to the appropriate subqueue.
465 *
466 * @param curi
467 */
468 protected void sendToQueue(CrawlURI curi) {
469 WorkQueue wq = getQueueFor(curi);
470 synchronized (wq) {
471 wq.enqueue(this, curi);
472 if(!wq.isRetired()) {
473 incrementQueuedUriCount();
474 }
475 if(!wq.isHeld()) {
476 wq.setHeld();
477 if(holdQueues() && readyClassQueues.size()>=targetSizeForReadyQueues()) {
478 deactivateQueue(wq);
479 } else {
480 replenishSessionBalance(wq);
481 readyQueue(wq);
482 }
483 }
484 WorkQueue laq = longestActiveQueue;
485 if(!wq.isRetired()&&((laq==null) || wq.getCount() > laq.getCount())) {
486 longestActiveQueue = wq;
487 }
488 }
489 }
490
491 /***
492 * Whether queues should start inactive (only becoming active when needed
493 * to keep the crawler busy), or if queues should start out ready.
494 *
495 * @return true if new queues should held inactive
496 */
497 private boolean holdQueues() {
498 return ((Boolean) getUncheckedAttribute(null, ATTR_HOLD_QUEUES))
499 .booleanValue();
500 }
501
502 /***
503 * Put the given queue on the readyClassQueues queue
504 * @param wq
505 */
506 private void readyQueue(WorkQueue wq) {
507 try {
508 wq.setActive(this, true);
509 readyClassQueues.put(wq.getClassKey());
510 } catch (InterruptedException e) {
511 e.printStackTrace();
512 System.err.println("unable to ready queue "+wq);
513
514 throw new RuntimeException(e);
515 }
516 }
517
518 /***
519 * Put the given queue on the inactiveQueues queue
520 * @param wq
521 */
522 private void deactivateQueue(WorkQueue wq) {
523
524 wq.setSessionBalance(0);
525 inactiveQueues.add(wq.getClassKey());
526 wq.setActive(this, false);
527
528
529
530
531
532
533 }
534
535 /***
536 * Put the given queue on the retiredQueues queue
537 * @param wq
538 */
539 private void retireQueue(WorkQueue wq) {
540
541 retiredQueues.add(wq.getClassKey());
542 decrementQueuedCount(wq.getCount());
543 wq.setRetired(true);
544 wq.setActive(this, false);
545
546
547
548
549
550
551 }
552
553 /***
554 * Accomodate any changes in settings.
555 *
556 * @see org.archive.crawler.framework.Frontier#kickUpdate()
557 */
558 public void kickUpdate() {
559 super.kickUpdate();
560 int target = (Integer)getUncheckedAttribute(null,
561 ATTR_TARGET_READY_QUEUES_BACKLOG);
562 if (target < 1) {
563 target = 1;
564 }
565 this.targetSizeForReadyQueues = target;
566 try {
567 initCostPolicy();
568 } catch (FatalConfigurationException fce) {
569 throw new RuntimeException(fce);
570 }
571
572
573
574
575
576 Object key = this.retiredQueues.poll();
577 while (key != null) {
578 WorkQueue q = (WorkQueue)this.allQueues.get(key);
579 if(q != null) {
580 unretireQueue(q);
581 }
582 key = this.retiredQueues.poll();
583 }
584 }
585 /***
586 * Restore a retired queue to the 'inactive' state.
587 *
588 * @param q
589 */
590 private void unretireQueue(WorkQueue q) {
591 deactivateQueue(q);
592 q.setRetired(false);
593 incrementQueuedUriCount(q.getCount());
594 }
595
596 /***
597 * Return the work queue for the given CrawlURI's classKey. URIs
598 * are ordered and politeness-delayed within their 'class'.
599 * If the requested queue is not found, a new instance is created.
600 *
601 * @param curi CrawlURI to base queue on
602 * @return the found or created ClassKeyQueue
603 */
604 protected abstract WorkQueue getQueueFor(CrawlURI curi);
605
606 /***
607 * Return the work queue for the given classKey, or null
608 * if no such queue exists.
609 *
610 * @param classKey key to look for
611 * @return the found WorkQueue
612 */
613 protected abstract WorkQueue getQueueFor(String classKey);
614
615 /***
616 * Return the next CrawlURI to be processed (and presumably
617 * visited/fetched) by a a worker thread.
618 *
619 * Relies on the readyClassQueues having been loaded with
620 * any work queues that are eligible to provide a URI.
621 *
622 * @return next CrawlURI to be processed. Or null if none is available.
623 *
624 * @see org.archive.crawler.framework.Frontier#next()
625 */
626 public CrawlURI next()
627 throws InterruptedException, EndedException {
628 while (true) {
629 long now = System.currentTimeMillis();
630
631
632 preNext(now);
633
634
635 if(readyFiller.tryAcquire()) {
636 try {
637 int activationsNeeded = targetSizeForReadyQueues() - readyClassQueues.size();
638 while(activationsNeeded > 0 && !inactiveQueues.isEmpty()) {
639 activateInactiveQueue();
640 activationsNeeded--;
641 }
642 } finally {
643 readyFiller.release();
644 }
645 }
646
647 WorkQueue readyQ = null;
648 Object key = readyClassQueues.poll(DEFAULT_WAIT,TimeUnit.MILLISECONDS);
649 if (key != null) {
650 readyQ = (WorkQueue)this.allQueues.get(key);
651 }
652 if (readyQ != null) {
653 while(true) {
654 CrawlURI curi = null;
655 synchronized(readyQ) {
656 curi = readyQ.peek(this);
657 if (curi != null) {
658
659 String currentQueueKey = getClassKey(curi);
660 if (currentQueueKey.equals(curi.getClassKey())) {
661
662 noteAboutToEmit(curi, readyQ);
663 inProcessQueues.add(readyQ);
664 return curi;
665 }
666
667
668
669 curi.setClassKey(currentQueueKey);
670 readyQ.dequeue(this);
671 decrementQueuedCount(1);
672 curi.setHolderKey(null);
673
674
675 } else {
676
677
678
679 readyQ.clearHeld();
680 break;
681 }
682 }
683 if(curi!=null) {
684
685 sendToQueue(curi);
686 }
687 }
688 } else {
689
690 if (key != null) {
691 logger.severe("Key "+ key +
692 " in readyClassQueues but not allQueues");
693 }
694 }
695
696 if(shouldTerminate) {
697
698 throw new EndedException("shouldTerminate is true");
699 }
700
701 if(inProcessQueues.size()==0) {
702
703
704 this.alreadyIncluded.requestFlush();
705 }
706 }
707 }
708
709 private int targetSizeForReadyQueues() {
710 return targetSizeForReadyQueues;
711 }
712
713 /***
714 * Return the 'cost' of a CrawlURI (how much of its associated
715 * queue's budget it depletes upon attempted processing)
716 *
717 * @param curi
718 * @return the associated cost
719 */
720 private int getCost(CrawlURI curi) {
721 int cost = curi.getHolderCost();
722 if (cost == CrawlURI.UNCALCULATED) {
723 cost = costAssignmentPolicy.costOf(curi);
724 curi.setHolderCost(cost);
725 }
726 return cost;
727 }
728
729 /***
730 * Activate an inactive queue, if any are available.
731 */
732 private void activateInactiveQueue() {
733 Object key = this.inactiveQueues.poll();
734 if (key == null) {
735 return;
736 }
737 WorkQueue candidateQ = (WorkQueue)this.allQueues.get(key);
738 if(candidateQ != null) {
739 synchronized(candidateQ) {
740 replenishSessionBalance(candidateQ);
741 if(candidateQ.isOverBudget()){
742
743
744 retireQueue(candidateQ);
745 return;
746 }
747 long now = System.currentTimeMillis();
748 long delay_ms = candidateQ.getWakeTime() - now;
749 if(delay_ms>0) {
750
751 snoozeQueue(candidateQ,now,delay_ms);
752 return;
753 }
754 candidateQ.setWakeTime(0);
755 readyQueue(candidateQ);
756 if (logger.isLoggable(Level.FINE)) {
757 logger.fine("ACTIVATED queue: " +
758 candidateQ.getClassKey());
759
760 }
761 }
762 }
763 }
764
765 /***
766 * Replenish the budget of the given queue by the appropriate amount.
767 *
768 * @param queue queue to replenish
769 */
770 private void replenishSessionBalance(WorkQueue queue) {
771 UURI contextUri = queue.getContextUURI(this);
772
773
774 queue.setSessionBalance(((Integer) getUncheckedAttribute(contextUri,
775 ATTR_BALANCE_REPLENISH_AMOUNT)).intValue());
776
777
778 long totalBudget = ((Long)getUncheckedAttribute(contextUri,ATTR_QUEUE_TOTAL_BUDGET)).longValue();
779 queue.setTotalBudget(totalBudget);
780 }
781
782 /***
783 * Enqueue the given queue to either readyClassQueues or inactiveQueues,
784 * as appropriate.
785 *
786 * @param wq
787 */
788 private void reenqueueQueue(WorkQueue wq) {
789 if(wq.isOverBudget()) {
790
791 if (logger.isLoggable(Level.FINE)) {
792 logger.fine("DEACTIVATED queue: " +
793 wq.getClassKey());
794 }
795 deactivateQueue(wq);
796 } else {
797 readyQueue(wq);
798 }
799 }
800
801 /***
802 * Wake any queues sitting in the snoozed queue whose time has come.
803 */
804 void wakeQueues() {
805 synchronized (snoozedClassQueues) {
806 long now = System.currentTimeMillis();
807 long nextWakeDelay = 0;
808 int wokenQueuesCount = 0;
809 while (true) {
810 if (snoozedClassQueues.isEmpty()) {
811 return;
812 }
813 WorkQueue peek = (WorkQueue) snoozedClassQueues.first();
814 nextWakeDelay = peek.getWakeTime() - now;
815 if (nextWakeDelay <= 0) {
816 snoozedClassQueues.remove(peek);
817 peek.setWakeTime(0);
818 reenqueueQueue(peek);
819 wokenQueuesCount++;
820 } else {
821 break;
822 }
823 }
824 this.nextWake = new WakeTask();
825 this.wakeTimer.schedule(nextWake,nextWakeDelay);
826 }
827 }
828
829 /***
830 * Note that the previously emitted CrawlURI has completed
831 * its processing (for now).
832 *
833 * The CrawlURI may be scheduled to retry, if appropriate,
834 * and other related URIs may become eligible for release
835 * via the next next() call, as a result of finished().
836 *
837 * (non-Javadoc)
838 * @see org.archive.crawler.framework.Frontier#finished(org.archive.crawler.datamodel.CrawlURI)
839 */
840 public void finished(CrawlURI curi) {
841 long now = System.currentTimeMillis();
842
843 curi.incrementFetchAttempts();
844 logLocalizedErrors(curi);
845 WorkQueue wq = (WorkQueue) curi.getHolder();
846 assert (wq.peek(this) == curi) : "unexpected peek " + wq;
847 inProcessQueues.remove(wq, 1);
848
849 if(includesRetireDirective(curi)) {
850
851 curi.processingCleanup();
852 wq.unpeek();
853 wq.update(this, curi);
854 retireQueue(wq);
855 return;
856 }
857
858 if (needsRetrying(curi)) {
859
860 if(curi.getFetchStatus()!=S_DEFERRED) {
861 wq.expend(getCost(curi));
862 }
863 long delay_sec = retryDelayFor(curi);
864 curi.processingCleanup();
865 synchronized(wq) {
866 wq.unpeek();
867
868 wq.update(this, curi);
869 if (delay_sec > 0) {
870 long delay_ms = delay_sec * 1000;
871 snoozeQueue(wq, now, delay_ms);
872 } else {
873 reenqueueQueue(wq);
874 }
875 }
876
877 controller.fireCrawledURINeedRetryEvent(curi);
878 doJournalRescheduled(curi);
879 return;
880 }
881
882
883 wq.dequeue(this);
884 decrementQueuedCount(1);
885 log(curi);
886
887 if (curi.isSuccess()) {
888 totalProcessedBytes += curi.getRecordedSize();
889 incrementSucceededFetchCount();
890
891 controller.fireCrawledURISuccessfulEvent(curi);
892 doJournalFinishedSuccess(curi);
893 wq.expend(getCost(curi));
894 } else if (isDisregarded(curi)) {
895
896
897 incrementDisregardedUriCount();
898
899 controller.fireCrawledURIDisregardEvent(curi);
900 doJournalDisregarded(curi);
901
902 if (curi.getFetchStatus() == S_RUNTIME_EXCEPTION) {
903 Object[] array = { curi };
904 controller.runtimeErrors.log(Level.WARNING, curi.getUURI()
905 .toString(), array);
906 }
907
908 } else {
909
910
911 this.controller.fireCrawledURIFailureEvent(curi);
912
913 if (curi.getFetchStatus() == S_RUNTIME_EXCEPTION) {
914 Object[] array = { curi };
915 this.controller.runtimeErrors.log(Level.WARNING, curi.getUURI()
916 .toString(), array);
917 }
918 incrementFailedFetchCount();
919
920 wq.noteError(((Integer) getUncheckedAttribute(curi,
921 ATTR_ERROR_PENALTY_AMOUNT)).intValue());
922 doJournalFinishedFailure(curi);
923 wq.expend(getCost(curi));
924 }
925
926 long delay_ms = politenessDelayFor(curi);
927 synchronized(wq) {
928 if (delay_ms > 0) {
929 snoozeQueue(wq,now,delay_ms);
930 } else {
931 reenqueueQueue(wq);
932 }
933 }
934
935 curi.stripToMinimal();
936 curi.processingCleanup();
937
938 }
939
940 private boolean includesRetireDirective(CrawlURI curi) {
941 return curi.containsKey(A_FORCE_RETIRE) && (Boolean)curi.getObject(A_FORCE_RETIRE);
942 }
943
944 /***
945 * Place the given queue into 'snoozed' state, ineligible to
946 * supply any URIs for crawling, for the given amount of time.
947 *
948 * @param wq queue to snooze
949 * @param now time now in ms
950 * @param delay_ms time to snooze in ms
951 */
952 private void snoozeQueue(WorkQueue wq, long now, long delay_ms) {
953 long nextTime = now + delay_ms;
954 wq.setWakeTime(nextTime);
955 long snoozeToInactiveDelayMs = ((Long)getUncheckedAttribute(null,
956 ATTR_SNOOZE_DEACTIVATE_MS)).longValue();
957 if (delay_ms > snoozeToInactiveDelayMs && !inactiveQueues.isEmpty()) {
958 deactivateQueue(wq);
959 } else {
960 synchronized(snoozedClassQueues) {
961 snoozedClassQueues.add(wq);
962 if(wq == snoozedClassQueues.first()) {
963 this.nextWake = new WakeTask();
964 this.wakeTimer.schedule(nextWake, delay_ms);
965 }
966 }
967 }
968 }
969
970 /***
971 * Forget the given CrawlURI. This allows a new instance
972 * to be created in the future, if it is reencountered under
973 * different circumstances.
974 *
975 * @param curi The CrawlURI to forget
976 */
977 protected void forget(CrawlURI curi) {
978 logger.finer("Forgetting " + curi);
979 alreadyIncluded.forget(canonicalize(curi.getUURI()), curi);
980 }
981
982 /*** (non-Javadoc)
983 * @see org.archive.crawler.framework.Frontier#discoveredUriCount()
984 */
985 public long discoveredUriCount() {
986 return (this.alreadyIncluded != null)? this.alreadyIncluded.count(): 0;
987 }
988
989 /***
990 * Delete all scheduled URIs matching the given regex.
991 *
992 * @param match regex of URIs to delete
993 * @return Number of items deleted.
994 */
995 public long deleteURIs(String uriMatch) {
996 return deleteURIs(uriMatch,null);
997 }
998
999 /***
1000 * Delete all scheduled URIs matching the given regex, in queues with
1001 * names matching the second given regex.
1002 *
1003 * @param uriMatch regex of URIs to delete
1004 * @param queueMatch regex of queues to affect, or null for all
1005 * @return Number of items deleted.
1006 */
1007 public long deleteURIs(String uriMatch, String queueMatch) {
1008 long count = 0;
1009
1010 Iterator iter = allQueues.keySet().iterator();
1011 while(iter.hasNext()) {
1012 String queueKey = ((String)iter.next());
1013 if(StringUtils.isNotEmpty(queueMatch) && !queueKey.matches(queueMatch)) {
1014
1015 continue;
1016 }
1017 WorkQueue wq = getQueueFor(queueKey);
1018 wq.unpeek();
1019 count += wq.deleteMatching(this, uriMatch);
1020 }
1021 decrementQueuedCount(count);
1022 return count;
1023 }
1024
1025
1026
1027
1028
1029 public static String STANDARD_REPORT = "standard";
1030 public static String ALL_NONEMPTY = "nonempty";
1031 public static String ALL_QUEUES = "all";
1032 protected static String[] REPORTS = {STANDARD_REPORT,ALL_NONEMPTY,ALL_QUEUES};
1033
1034 public String[] getReports() {
1035 return REPORTS;
1036 }
1037
1038 /***
1039 * @param w Where to write to.
1040 */
1041 public void singleLineReportTo(PrintWriter w) {
1042 if (this.allQueues == null) {
1043 return;
1044 }
1045 int allCount = allQueues.size();
1046 int inProcessCount = inProcessQueues.uniqueSet().size();
1047 int readyCount = readyClassQueues.size();
1048 int snoozedCount = snoozedClassQueues.size();
1049 int activeCount = inProcessCount + readyCount + snoozedCount;
1050 int inactiveCount = inactiveQueues.size();
1051 int retiredCount = retiredQueues.size();
1052 int exhaustedCount =
1053 allCount - activeCount - inactiveCount - retiredCount;
1054 w.print(allCount);
1055 w.print(" queues: ");
1056 w.print(activeCount);
1057 w.print(" active (");
1058 w.print(inProcessCount);
1059 w.print(" in-process; ");
1060 w.print(readyCount);
1061 w.print(" ready; ");
1062 w.print(snoozedCount);
1063 w.print(" snoozed); ");
1064 w.print(inactiveCount);
1065 w.print(" inactive; ");
1066 w.print(retiredCount);
1067 w.print(" retired; ");
1068 w.print(exhaustedCount);
1069 w.print(" exhausted");
1070 w.flush();
1071 }
1072
1073
1074
1075
1076 public String singleLineLegend() {
1077 return "total active in-process ready snoozed inactive retired exhausted";
1078 }
1079
1080 /***
1081 * This method compiles a human readable report on the status of the frontier
1082 * at the time of the call.
1083 * @param name Name of report.
1084 * @param writer Where to write to.
1085 */
1086 public synchronized void reportTo(String name, PrintWriter writer) {
1087 if(ALL_NONEMPTY.equals(name)) {
1088 allNonemptyReportTo(writer);
1089 return;
1090 }
1091 if(ALL_QUEUES.equals(name)) {
1092 allQueuesReportTo(writer);
1093 return;
1094 }
1095 if(name!=null && !STANDARD_REPORT.equals(name)) {
1096 writer.print(name);
1097 writer.print(" unavailable; standard report:\n");
1098 }
1099 standardReportTo(writer);
1100 }
1101
1102 /*** Compact report of all nonempty queues (one queue per line)
1103 *
1104 * @param writer
1105 */
1106 private void allNonemptyReportTo(PrintWriter writer) {
1107 ArrayList<WorkQueue> inProcessQueuesCopy;
1108 synchronized(this.inProcessQueues) {
1109
1110 @SuppressWarnings("unchecked")
1111 Collection<WorkQueue> inProcess = this.inProcessQueues;
1112 inProcessQueuesCopy = new ArrayList<WorkQueue>(inProcess);
1113 }
1114 writer.print("\n -----===== IN-PROCESS QUEUES =====-----\n");
1115 queueSingleLinesTo(writer, inProcessQueuesCopy.iterator());
1116
1117 writer.print("\n -----===== READY QUEUES =====-----\n");
1118 queueSingleLinesTo(writer, this.readyClassQueues.iterator());
1119
1120 writer.print("\n -----===== SNOOZED QUEUES =====-----\n");
1121 queueSingleLinesTo(writer, this.snoozedClassQueues.iterator());
1122
1123 writer.print("\n -----===== INACTIVE QUEUES =====-----\n");
1124 queueSingleLinesTo(writer, this.inactiveQueues.iterator());
1125
1126 writer.print("\n -----===== RETIRED QUEUES =====-----\n");
1127 queueSingleLinesTo(writer, this.retiredQueues.iterator());
1128 }
1129
1130 /*** Compact report of all nonempty queues (one queue per line)
1131 *
1132 * @param writer
1133 */
1134 private void allQueuesReportTo(PrintWriter writer) {
1135 queueSingleLinesTo(writer, allQueues.keySet().iterator());
1136 }
1137
1138 /***
1139 * Writer the single-line reports of all queues in the
1140 * iterator to the writer
1141 *
1142 * @param writer to receive report
1143 * @param iterator over queues of interest.
1144 */
1145 private void queueSingleLinesTo(PrintWriter writer, Iterator iterator) {
1146 Object obj;
1147 WorkQueue q;
1148 boolean legendWritten = false;
1149 while( iterator.hasNext()) {
1150 obj = iterator.next();
1151 if (obj == null) {
1152 continue;
1153 }
1154 q = (obj instanceof WorkQueue)?
1155 (WorkQueue)obj:
1156 (WorkQueue)this.allQueues.get(obj);
1157 if(q == null) {
1158 writer.print(" ERROR: "+obj);
1159 }
1160 if(!legendWritten) {
1161 writer.println(q.singleLineLegend());
1162 legendWritten = true;
1163 }
1164 q.singleLineReportTo(writer);
1165 }
1166 }
1167
1168 /***
1169 * @param w Writer to print to.
1170 */
1171 private void standardReportTo(PrintWriter w) {
1172 int allCount = allQueues.size();
1173 int inProcessCount = inProcessQueues.uniqueSet().size();
1174 int readyCount = readyClassQueues.size();
1175 int snoozedCount = snoozedClassQueues.size();
1176 int activeCount = inProcessCount + readyCount + snoozedCount;
1177 int inactiveCount = inactiveQueues.size();
1178 int retiredCount = retiredQueues.size();
1179 int exhaustedCount =
1180 allCount - activeCount - inactiveCount - retiredCount;
1181
1182 w.print("Frontier report - ");
1183 w.print(ArchiveUtils.get12DigitDate());
1184 w.print("\n");
1185 w.print(" Job being crawled: ");
1186 w.print(controller.getOrder().getCrawlOrderName());
1187 w.print("\n");
1188 w.print("\n -----===== STATS =====-----\n");
1189 w.print(" Discovered: ");
1190 w.print(Long.toString(discoveredUriCount()));
1191 w.print("\n");
1192 w.print(" Queued: ");
1193 w.print(Long.toString(queuedUriCount()));
1194 w.print("\n");
1195 w.print(" Finished: ");
1196 w.print(Long.toString(finishedUriCount()));
1197 w.print("\n");
1198 w.print(" Successfully: ");
1199 w.print(Long.toString(succeededFetchCount()));
1200 w.print("\n");
1201 w.print(" Failed: ");
1202 w.print(Long.toString(failedFetchCount()));
1203 w.print("\n");
1204 w.print(" Disregarded: ");
1205 w.print(Long.toString(disregardedUriCount()));
1206 w.print("\n");
1207 w.print("\n -----===== QUEUES =====-----\n");
1208 w.print(" Already included size: ");
1209 w.print(Long.toString(alreadyIncluded.count()));
1210 w.print("\n");
1211 w.print(" pending: ");
1212 w.print(Long.toString(alreadyIncluded.pending()));
1213 w.print("\n");
1214 w.print("\n All class queues map size: ");
1215 w.print(Long.toString(allCount));
1216 w.print("\n");
1217 w.print( " Active queues: ");
1218 w.print(activeCount);
1219 w.print("\n");
1220 w.print(" In-process: ");
1221 w.print(inProcessCount);
1222 w.print("\n");
1223 w.print(" Ready: ");
1224 w.print(readyCount);
1225 w.print("\n");
1226 w.print(" Snoozed: ");
1227 w.print(snoozedCount);
1228 w.print("\n");
1229 w.print(" Inactive queues: ");
1230 w.print(inactiveCount);
1231 w.print("\n");
1232 w.print(" Retired queues: ");
1233 w.print(retiredCount);
1234 w.print("\n");
1235 w.print(" Exhausted queues: ");
1236 w.print(exhaustedCount);
1237 w.print("\n");
1238
1239 w.print("\n -----===== IN-PROCESS QUEUES =====-----\n");
1240 @SuppressWarnings("unchecked")
1241 Collection<WorkQueue> inProcess = inProcessQueues;
1242 ArrayList<WorkQueue> copy = extractSome(inProcess, REPORT_MAX_QUEUES);
1243 appendQueueReports(w, copy.iterator(), copy.size(), REPORT_MAX_QUEUES);
1244
1245 w.print("\n -----===== READY QUEUES =====-----\n");
1246 appendQueueReports(w, this.readyClassQueues.iterator(),
1247 this.readyClassQueues.size(), REPORT_MAX_QUEUES);
1248
1249 w.print("\n -----===== SNOOZED QUEUES =====-----\n");
1250 copy = extractSome(snoozedClassQueues, REPORT_MAX_QUEUES);
1251 appendQueueReports(w, copy.iterator(), copy.size(), REPORT_MAX_QUEUES);
1252
1253 WorkQueue longest = longestActiveQueue;
1254 if (longest != null) {
1255 w.print("\n -----===== LONGEST QUEUE =====-----\n");
1256 longest.reportTo(w);
1257 }
1258
1259 w.print("\n -----===== INACTIVE QUEUES =====-----\n");
1260 appendQueueReports(w, this.inactiveQueues.iterator(),
1261 this.inactiveQueues.size(), REPORT_MAX_QUEUES);
1262
1263 w.print("\n -----===== RETIRED QUEUES =====-----\n");
1264 appendQueueReports(w, this.retiredQueues.iterator(),
1265 this.retiredQueues.size(), REPORT_MAX_QUEUES);
1266
1267 w.flush();
1268 }
1269
1270
1271 /***
1272 * Extract some of the elements in the given collection to an
1273 * ArrayList. This method synchronizes on the given collection's
1274 * monitor. The returned list will never contain more than the
1275 * specified maximum number of elements.
1276 *
1277 * @param c the collection whose elements to extract
1278 * @param max the maximum number of elements to extract
1279 * @return the extraction
1280 */
1281 private static <T> ArrayList<T> extractSome(Collection<T> c, int max) {
1282
1283
1284
1285 int initial = Math.min(c.size() + 10, max);
1286 int count = 0;
1287 ArrayList<T> list = new ArrayList<T>(initial);
1288 synchronized (c) {
1289 Iterator<T> iter = c.iterator();
1290 while (iter.hasNext() && (count < max)) {
1291 list.add(iter.next());
1292 count++;
1293 }
1294 }
1295 return list;
1296 }
1297
1298 /***
1299 * Append queue report to general Frontier report.
1300 * @param w StringBuffer to append to.
1301 * @param iterator An iterator over
1302 * @param total
1303 * @param max
1304 */
1305 protected void appendQueueReports(PrintWriter w, Iterator iterator,
1306 int total, int max) {
1307 Object obj;
1308 WorkQueue q;
1309 for(int count = 0; iterator.hasNext() && (count < max); count++) {
1310 obj = iterator.next();
1311 if (obj == null) {
1312 continue;
1313 }
1314 q = (obj instanceof WorkQueue)?
1315 (WorkQueue)obj:
1316 (WorkQueue)this.allQueues.get(obj);
1317 if(q == null) {
1318 w.print("WARNING: No report for queue "+obj);
1319 }
1320 q.reportTo(w);
1321 }
1322 if(total > max) {
1323 w.print("...and " + (total - max) + " more.\n");
1324 }
1325 }
1326
1327 /***
1328 * Force logging, etc. of operator- deleted CrawlURIs
1329 *
1330 * @see org.archive.crawler.framework.Frontier#deleted(org.archive.crawler.datamodel.CrawlURI)
1331 */
1332 public synchronized void deleted(CrawlURI curi) {
1333
1334 controller.fireCrawledURIDisregardEvent(curi);
1335 log(curi);
1336 incrementDisregardedUriCount();
1337 curi.stripToMinimal();
1338 curi.processingCleanup();
1339 }
1340
1341 public void considerIncluded(UURI u) {
1342 this.alreadyIncluded.note(canonicalize(u));
1343 CrawlURI temp = new CrawlURI(u);
1344 temp.setClassKey(getClassKey(temp));
1345 getQueueFor(temp).expend(getCost(temp));
1346 }
1347
/**
 * Subclass hook for queue initialization.
 * NOTE(review): presumably sets up the backing store used by the work
 * queues -- confirm against implementations.
 *
 * @throws IOException if initialization fails on I/O
 */
protected abstract void initQueue() throws IOException;
/**
 * Subclass hook for queue shutdown; counterpart of {@link #initQueue()}.
 * NOTE(review): presumably releases whatever initQueue() acquired --
 * confirm against implementations.
 *
 * @throws IOException if closing fails on I/O
 */
protected abstract void closeQueue() throws IOException;
1350
/**
 * Returns <code>true</code> if the WorkQueue implementation of this
 * Frontier stores its workload on disk instead of relying
 * on serialization mechanisms.
 *
 * Implemented by subclasses; this class only declares it.
 *
 * TODO: rename! (this is a very misleading name) or kill (don't
 * see any implementations that return false)
 *
 * @return a constant boolean value for this class/instance
 */
protected abstract boolean workQueueDataOnDisk();
1362
1363
1364 public FrontierGroup getGroup(CrawlURI curi) {
1365 return getQueueFor(curi);
1366 }
1367
1368
1369 public long averageDepth() {
1370 int inProcessCount = inProcessQueues.uniqueSet().size();
1371 int readyCount = readyClassQueues.size();
1372 int snoozedCount = snoozedClassQueues.size();
1373 int activeCount = inProcessCount + readyCount + snoozedCount;
1374 int inactiveCount = inactiveQueues.size();
1375 int totalQueueCount = (activeCount+inactiveCount);
1376 return (totalQueueCount == 0) ? 0 : liveQueuedUriCount.get() / totalQueueCount;
1377 }
1378 public float congestionRatio() {
1379 int inProcessCount = inProcessQueues.uniqueSet().size();
1380 int readyCount = readyClassQueues.size();
1381 int snoozedCount = snoozedClassQueues.size();
1382 int activeCount = inProcessCount + readyCount + snoozedCount;
1383 int inactiveCount = inactiveQueues.size();
1384 return (float)(activeCount + inactiveCount) / (inProcessCount + snoozedCount);
1385 }
1386 public long deepestUri() {
1387 return longestActiveQueue==null ? -1 : longestActiveQueue.getCount();
1388 }
1389
1390
1391
1392
1393
1394 public synchronized boolean isEmpty() {
1395 return liveQueuedUriCount.get() == 0 && alreadyIncluded.pending() == 0;
1396 }
1397 }
1398