View Javadoc

1   /* $Id: WorkQueueFrontier.java 5439 2007-08-28 05:15:25Z gojomo $
2    * Created on Sep 24, 2004
3    *
4    *  Copyright (C) 2004 Internet Archive.
5    *
6    * This file is part of the Heritrix web crawler (crawler.archive.org).
7    *
8    * Heritrix is free software; you can redistribute it and/or modify
9    * it under the terms of the GNU Lesser Public License as published by
10   * the Free Software Foundation; either version 2.1 of the License, or
11   * any later version.
12   *
13   * Heritrix is distributed in the hope that it will be useful,
14   * but WITHOUT ANY WARRANTY; without even the implied warranty of
15   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16   * GNU Lesser Public License for more details.
17   *
18   * You should have received a copy of the GNU Lesser Public License
19   * along with Heritrix; if not, write to the Free Software
20   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
21   *
22    */
23  package org.archive.crawler.frontier;
24  
25  import java.io.IOException;
26  import java.io.PrintWriter;
27  import java.io.Serializable;
28  import java.util.ArrayList;
29  import java.util.Collection;
30  import java.util.Collections;
31  import java.util.HashMap;
32  import java.util.Iterator;
33  import java.util.Map;
34  import java.util.Queue;
35  import java.util.SortedSet;
36  import java.util.Timer;
37  import java.util.TimerTask;
38  import java.util.TreeSet;
39  import java.util.concurrent.BlockingQueue;
40  import java.util.concurrent.LinkedBlockingQueue;
41  import java.util.concurrent.Semaphore;
42  import java.util.concurrent.TimeUnit;
43  import java.util.logging.Level;
44  import java.util.logging.Logger;
45  
46  import org.apache.commons.collections.Bag;
47  import org.apache.commons.collections.BagUtils;
48  import org.apache.commons.collections.bag.HashBag;
49  import org.apache.commons.lang.StringUtils;
50  import org.archive.crawler.datamodel.CandidateURI;
51  import org.archive.crawler.datamodel.CoreAttributeConstants;
52  import org.archive.crawler.datamodel.CrawlURI;
53  import org.archive.crawler.datamodel.FetchStatusCodes;
54  import org.archive.crawler.datamodel.UriUniqFilter;
55  import org.archive.crawler.datamodel.UriUniqFilter.HasUriReceiver;
56  import org.archive.crawler.framework.CrawlController;
57  import org.archive.crawler.framework.Frontier;
58  import org.archive.crawler.framework.exceptions.EndedException;
59  import org.archive.crawler.framework.exceptions.FatalConfigurationException;
60  import org.archive.crawler.settings.SimpleType;
61  import org.archive.crawler.settings.Type;
62  import org.archive.net.UURI;
63  import org.archive.util.ArchiveUtils;
64  
65  import com.sleepycat.collections.StoredIterator;
66  
67  /***
68   * A common Frontier base using several queues to hold pending URIs. 
69   * 
70   * Uses in-memory map of all known 'queues' inside a single database.
71   * Round-robins between all queues.
72   *
73   * @author Gordon Mohr
74   * @author Christian Kohlschuetter
75   */
76  public abstract class WorkQueueFrontier extends AbstractFrontier
77  implements FetchStatusCodes, CoreAttributeConstants, HasUriReceiver,
78          Serializable {
79  	private static final long serialVersionUID = 570384305871965843L;
80  	
81      public class WakeTask extends TimerTask {
82          @Override
83          public void run() {
84              synchronized(snoozedClassQueues) {
85                  if(this!=nextWake) {
86                      // an intervening waketask was made
87                      return;
88                  }
89                  wakeQueues();
90              }
91          }
92      }
93  
94      /*** truncate reporting of queues at some large but not unbounded number */
95      private static final int REPORT_MAX_QUEUES = 2000;
96      
97      /***
98       * If we know that only a small amount of queues is held in memory,
99       * we can avoid using a disk-based BigMap.
100      * This only works efficiently if the WorkQueue does not hold its
101      * entries in memory as well.
102      */ 
103     private static final int MAX_QUEUES_TO_HOLD_ALLQUEUES_IN_MEMORY = 3000;
104 
105     /***
106      * When a snooze target for a queue is longer than this amount, and 
107      * there are already ready queues, deactivate rather than snooze 
108      * the current queue -- so other more responsive sites get a chance
109      * in active rotation. (As a result, queue's next try may be much
110      * further in the future than the snooze target delay.)
111      */
112     public final static String ATTR_SNOOZE_DEACTIVATE_MS =
113         "snooze-deactivate-ms";
114     public static Long DEFAULT_SNOOZE_DEACTIVATE_MS = new Long(5*60*1000); // 5 minutes
115     
116     private static final Logger logger =
117         Logger.getLogger(WorkQueueFrontier.class.getName());
118     
119     /*** whether to hold queues INACTIVE until needed for throughput */
120     public final static String ATTR_HOLD_QUEUES = "hold-queues";
121     protected final static Boolean DEFAULT_HOLD_QUEUES = new Boolean(true); 
122 
123     /*** amount to replenish budget on each activation (duty cycle) */
124     public final static String ATTR_BALANCE_REPLENISH_AMOUNT =
125         "balance-replenish-amount";
126     protected final static Integer DEFAULT_BALANCE_REPLENISH_AMOUNT =
127         new Integer(3000);
128     
129     /*** whether to hold queues INACTIVE until needed for throughput */
130     public final static String ATTR_ERROR_PENALTY_AMOUNT =
131         "error-penalty-amount";
132     protected final static Integer DEFAULT_ERROR_PENALTY_AMOUNT =
133         new Integer(100);
134 
135 
136     /*** total expenditure to allow a queue before 'retiring' it  */
137     public final static String ATTR_QUEUE_TOTAL_BUDGET = "queue-total-budget";
138     protected final static Long DEFAULT_QUEUE_TOTAL_BUDGET = new Long(-1);
139 
140     /*** cost assignment policy to use (by class name) */
141     public final static String ATTR_COST_POLICY = "cost-policy";
142     protected final static String DEFAULT_COST_POLICY =
143         UnitCostAssignmentPolicy.class.getName();
144 
145     /*** target size of ready queues backlog */
146     public final static String ATTR_TARGET_READY_QUEUES_BACKLOG =
147         "target-ready-backlog";
148     protected final static Integer DEFAULT_TARGET_READY_QUEUES_BACKLOG =
149         new Integer(50);
150     
151     /*** those UURIs which are already in-process (or processed), and
152      thus should not be rescheduled */
153     protected transient UriUniqFilter alreadyIncluded;
154 
155     /*** All known queues.
156      */
157     protected transient Map<String,WorkQueue> allQueues = null; 
158     // of classKey -> ClassKeyQueue
159 
160     /***
161      * All per-class queues whose first item may be handed out.
162      * Linked-list of keys for the queues.
163      */
164     protected BlockingQueue<String> readyClassQueues;
165     
166     /*** Target (minimum) size to keep readyClassQueues */
167     protected int targetSizeForReadyQueues;
168     
169     /*** single-thread access to ready-filling code */
170     protected transient Semaphore readyFiller = new Semaphore(1);
171     
172     /*** 
173      * All 'inactive' queues, not yet in active rotation.
174      * Linked-list of keys for the queues.
175      */
176     protected Queue<String> inactiveQueues;
177 
178     /***
179      * 'retired' queues, no longer considered for activation.
180      * Linked-list of keys for queues.
181      */
182     protected Queue<String> retiredQueues;
183     
184     /*** all per-class queues from whom a URI is outstanding */
185     protected Bag inProcessQueues = 
186         BagUtils.synchronizedBag(new HashBag()); // of ClassKeyQueue
187     
188     /***
189      * All per-class queues held in snoozed state, sorted by wake time.
190      */
191     protected SortedSet<WorkQueue> snoozedClassQueues;
192     
193     /*** Timer for tasks which wake head item of snoozedClassQueues */
194     protected transient Timer wakeTimer;
195     
196     /*** Task for next wake */ 
197     protected transient WakeTask nextWake; 
198     
199     protected WorkQueue longestActiveQueue = null;
200     
201     /*** how long to wait for a ready queue when there's nothing snoozed */
202     private static final long DEFAULT_WAIT = 1000; // 1 second
203 
204     /*** a policy for assigning 'cost' values to CrawlURIs */
205     private transient CostAssignmentPolicy costAssignmentPolicy;
206     
207     /*** all policies available to be chosen */
208     String[] AVAILABLE_COST_POLICIES = new String[] {
209             ZeroCostAssignmentPolicy.class.getName(),
210             UnitCostAssignmentPolicy.class.getName(),
211             WagCostAssignmentPolicy.class.getName(),
212             AntiCalendarCostAssignmentPolicy.class.getName()};
213 
214     /***
215      * Create the CommonFrontier
216      * 
217      * @param name
218      * @param description
219      */
220     public WorkQueueFrontier(String name, String description) {
221         // The 'name' of all frontiers should be the same (URIFrontier.ATTR_NAME)
222         // therefore we'll ignore the supplied parameter.
223         super(Frontier.ATTR_NAME, description);
224         Type t = addElementToDefinition(new SimpleType(ATTR_HOLD_QUEUES,
225             "Whether to hold newly-created per-host URI work" +
226             " queues until needed to stay busy. If false (default)," +
227             " all queues may contribute URIs for crawling at all" +
228             " times. If true, queues begin (and collect URIs) in" +
229             " an 'inactive' state, and only when the Frontier needs" +
230             " another queue to keep all ToeThreads busy will new" +
231             " queues be activated.", DEFAULT_HOLD_QUEUES));
232         t.setExpertSetting(true);
233         t.setOverrideable(false);
234         t = addElementToDefinition(new SimpleType(ATTR_BALANCE_REPLENISH_AMOUNT,
235             "Amount to replenish a queue's activity balance when it becomes " +
236             "active. Larger amounts mean more URIs will be tried from the " +
237             "queue before it is deactivated in favor of waiting queues. " +
238             "Default is 3000", DEFAULT_BALANCE_REPLENISH_AMOUNT));
239         t.setExpertSetting(true);
240         t.setOverrideable(true);
241         t = addElementToDefinition(new SimpleType(ATTR_ERROR_PENALTY_AMOUNT,
242                 "Amount to additionally penalize a queue when one of" +
243                 "its URIs fails completely. Accelerates deactivation or " +
244                 "full retirement of problem queues and unresponsive sites. " +
245                 "Default is 100", DEFAULT_ERROR_PENALTY_AMOUNT));
246         t.setExpertSetting(true);
247         t.setOverrideable(true);
248         t = addElementToDefinition(new SimpleType(ATTR_QUEUE_TOTAL_BUDGET,
249             "Total activity expenditure allowable to a single queue; queues " +
250             "over this expenditure will be 'retired' and crawled no more. " +
251             "Default of -1 means no ceiling on activity expenditures is " +
252             "enforced.", DEFAULT_QUEUE_TOTAL_BUDGET));
253         t.setExpertSetting(true);
254         t.setOverrideable(true);
255 
256         t = addElementToDefinition(new SimpleType(ATTR_COST_POLICY,
257                 "Policy for calculating the cost of each URI attempted. " +
258                 "The default UnitCostAssignmentPolicy considers the cost of " +
259                 "each URI to be '1'.", DEFAULT_COST_POLICY, AVAILABLE_COST_POLICIES));
260         t.setExpertSetting(true);
261         
262         t = addElementToDefinition(new SimpleType(ATTR_SNOOZE_DEACTIVATE_MS,
263                 "Threshold above which any 'snooze' delay will cause the " +
264                 "affected queue to go inactive, allowing other queues a " +
265                 "chance to rotate into active state. Typically set to be " +
266                 "longer than the politeness pauses between successful " +
267                 "fetches, but shorter than the connection-failed " +
268                 "'retry-delay-seconds'. (Default is 5 minutes.)", 
269                 DEFAULT_SNOOZE_DEACTIVATE_MS));
270         t.setExpertSetting(true);
271         t.setOverrideable(false);
272         t = addElementToDefinition(new SimpleType(ATTR_TARGET_READY_QUEUES_BACKLOG,
273                 "Target size for backlog of ready queues. This many queues " +
274                 "will be brought into 'ready' state even if a thread is " +
275                 "not waiting. Only has effect if 'hold-queues' is true. " +
276                 "Default is 50.", DEFAULT_TARGET_READY_QUEUES_BACKLOG));
277         t.setExpertSetting(true);
278         t.setOverrideable(false);
279     }
280 
281     /***
282      * Initializes the Frontier, given the supplied CrawlController.
283      *
284      * @see org.archive.crawler.framework.Frontier#initialize(org.archive.crawler.framework.CrawlController)
285      */
286     public void initialize(CrawlController c)
287             throws FatalConfigurationException, IOException {
288         // Call the super method. It sets up frontier journalling.
289         super.initialize(c);
290         this.controller = c;
291         
292         initQueuesOfQueues();
293         
294         this.targetSizeForReadyQueues = (Integer)getUncheckedAttribute(null,
295             ATTR_TARGET_READY_QUEUES_BACKLOG);
296         if (this.targetSizeForReadyQueues < 1) {
297             this.targetSizeForReadyQueues = 1;
298         }
299         this.wakeTimer = new Timer("waker for " + c.toString());
300         
301         try {
302             if (workQueueDataOnDisk()
303                     && getQueueAssignmentPolicy(null).maximumNumberOfKeys() >= 0
304                     && getQueueAssignmentPolicy(null).maximumNumberOfKeys() <= 
305                         MAX_QUEUES_TO_HOLD_ALLQUEUES_IN_MEMORY) {
306                 this.allQueues = Collections.synchronizedMap(
307                         new HashMap<String,WorkQueue>());
308             } else {
309                 this.allQueues = c.getBigMap("allqueues",
310                         String.class, WorkQueue.class);
311                 if (logger.isLoggable(Level.FINE)) {
312                     Iterator i = this.allQueues.keySet().iterator();
313                     try {
314                         for (; i.hasNext();) {
315                             logger.fine((String) i.next());
316                         }
317                     } finally {
318                         StoredIterator.close(i);
319                     }
320                 }
321             }
322             this.alreadyIncluded = createAlreadyIncluded();
323             initQueue();
324         } catch (IOException e) {
325             e.printStackTrace();
326             throw (FatalConfigurationException)
327                 new FatalConfigurationException(e.getMessage()).initCause(e);
328         } catch (Exception e) {
329             e.printStackTrace();
330             throw (FatalConfigurationException)
331                 new FatalConfigurationException(e.getMessage()).initCause(e);
332         }
333         
334         initCostPolicy();
335         
336         loadSeeds();
337     }
338     
339     /***
340      * Set up the various queues-of-queues used by the frontier. Override
341      * in implementing subclasses to reduce or eliminate risk of queues
342      * growing without bound. 
343      */
344     protected void initQueuesOfQueues() {
345         // small risk of OutOfMemoryError: if 'hold-queues' is false,
346         // readyClassQueues may grow in size without bound
347         readyClassQueues = new LinkedBlockingQueue<String>();
348         // risk of OutOfMemoryError: in large crawls, 
349         // inactiveQueues may grow in size without bound
350         inactiveQueues = new LinkedBlockingQueue<String>();
351         // risk of OutOfMemoryError: in large crawls with queue max-budgets, 
352         // inactiveQueues may grow in size without bound
353         retiredQueues = new LinkedBlockingQueue<String>();
354         // small risk of OutOfMemoryError: in large crawls with many 
355         // unresponsive queues, an unbounded number of snoozed queues 
356         // may exist
357         snoozedClassQueues = Collections.synchronizedSortedSet(new TreeSet<WorkQueue>());
358     }
359 
360     /***
361      * Set (or reset after configuration change) the cost policy in effect.
362      * 
363      * @throws FatalConfigurationException
364      */
365     private void initCostPolicy() throws FatalConfigurationException {
366         try {
367             costAssignmentPolicy = (CostAssignmentPolicy) Class.forName(
368                     (String) getUncheckedAttribute(null, ATTR_COST_POLICY))
369                     .newInstance();
370         } catch (Exception e) {
371             e.printStackTrace();
372             throw new FatalConfigurationException(e.getMessage());
373         }
374     }
375 
376     /* (non-Javadoc)
377      * @see org.archive.crawler.frontier.AbstractFrontier#crawlEnded(java.lang.String)
378      */
379     public void crawlEnded(String sExitMessage) {
380         // Cleanup.  CrawlJobs persist after crawl has finished so undo any
381         // references.
382         if (this.alreadyIncluded != null) {
383             this.alreadyIncluded.close();
384             this.alreadyIncluded = null;
385         }
386         
387         try {
388             closeQueue();
389         } catch (IOException e) {
390             // FIXME exception handling
391             e.printStackTrace();
392         }
393         this.wakeTimer.cancel();
394         
395         this.allQueues.clear();
396         this.allQueues = null;
397         this.inProcessQueues = null;
398         this.readyClassQueues = null;
399         this.snoozedClassQueues = null;
400         this.inactiveQueues = null;
401         this.retiredQueues = null;
402         
403         this.costAssignmentPolicy = null;
404         
405         // Clearing controller is a problem. We get NPEs in #preNext.
406         super.crawlEnded(sExitMessage);
407         this.controller = null;
408     }
409 
    /**
     * Create a UriUniqFilter that will serve as record
     * of already seen URIs. Called once from initialize(); the result is
     * stored in {@code alreadyIncluded}.
     *
     * @return a UriUniqFilter that will serve as a record of already seen URIs
     * @throws IOException if the implementation cannot set up the filter
     */
    protected abstract UriUniqFilter createAlreadyIncluded() throws IOException;
418 
419     /***
420      * Arrange for the given CandidateURI to be visited, if it is not
421      * already scheduled/completed.
422      *
423      * @see org.archive.crawler.framework.Frontier#schedule(org.archive.crawler.datamodel.CandidateURI)
424      */
425     public void schedule(CandidateURI caUri) {
426         // Canonicalization may set forceFetch flag.  See
427         // #canonicalization(CandidateURI) javadoc for circumstance.
428         String canon = canonicalize(caUri);
429         if (caUri.forceFetch()) {
430             alreadyIncluded.addForce(canon, caUri);
431         } else {
432             alreadyIncluded.add(canon, caUri);
433         }
434     }
435 
436     /***
437      * Accept the given CandidateURI for scheduling, as it has
438      * passed the alreadyIncluded filter. 
439      * 
440      * Choose a per-classKey queue and enqueue it. If this
441      * item has made an unready queue ready, place that 
442      * queue on the readyClassQueues queue. 
443      * @param caUri CandidateURI.
444      */
445     public void receive(CandidateURI caUri) {
446         CrawlURI curi = asCrawlUri(caUri);
447         applySpecialHandling(curi);
448         sendToQueue(curi);
449         // Update recovery log.
450         doJournalAdded(curi);
451     }
452 
453 	/* (non-Javadoc)
454 	 * @see org.archive.crawler.frontier.AbstractFrontier#asCrawlUri(org.archive.crawler.datamodel.CandidateURI)
455 	 */
456 	protected CrawlURI asCrawlUri(CandidateURI caUri) {
457 		CrawlURI curi = super.asCrawlUri(caUri);
458 		// force cost to be calculated, pre-insert
459 		getCost(curi);
460 		return curi;
461 	}
462 	
463     /***
464      * Send a CrawlURI to the appropriate subqueue.
465      * 
466      * @param curi
467      */
468     protected void sendToQueue(CrawlURI curi) {
469         WorkQueue wq = getQueueFor(curi);
470         synchronized (wq) {
471             wq.enqueue(this, curi);
472             if(!wq.isRetired()) {
473                 incrementQueuedUriCount();
474             }
475             if(!wq.isHeld()) {
476                 wq.setHeld();
477                 if(holdQueues() && readyClassQueues.size()>=targetSizeForReadyQueues()) {
478                     deactivateQueue(wq);
479                 } else {
480                     replenishSessionBalance(wq);
481                     readyQueue(wq);
482                 }
483             }
484             WorkQueue laq = longestActiveQueue;
485             if(!wq.isRetired()&&((laq==null) || wq.getCount() > laq.getCount())) {
486                 longestActiveQueue = wq; 
487             }
488         }
489     }
490 
491     /***
492      * Whether queues should start inactive (only becoming active when needed
493      * to keep the crawler busy), or if queues should start out ready.
494      * 
495      * @return true if new queues should held inactive
496      */
497     private boolean holdQueues() {
498         return ((Boolean) getUncheckedAttribute(null, ATTR_HOLD_QUEUES))
499                 .booleanValue();
500     }
501 
502     /***
503      * Put the given queue on the readyClassQueues queue
504      * @param wq
505      */
506     private void readyQueue(WorkQueue wq) {
507         try {
508             wq.setActive(this, true);
509             readyClassQueues.put(wq.getClassKey());
510         } catch (InterruptedException e) {
511             e.printStackTrace();
512             System.err.println("unable to ready queue "+wq);
513             // propagate interrupt up 
514             throw new RuntimeException(e);
515         }
516     }
517 
518     /***
519      * Put the given queue on the inactiveQueues queue
520      * @param wq
521      */
522     private void deactivateQueue(WorkQueue wq) {
523 //        try {
524             wq.setSessionBalance(0); // zero out session balance
525             inactiveQueues.add(wq.getClassKey());
526             wq.setActive(this, false);
527 //        } catch (InterruptedException e) {
528 //            e.printStackTrace();
529 //            System.err.println("unable to deactivate queue "+wq);
530 //            // propagate interrupt up 
531 //            throw new RuntimeException(e);
532 //        }
533     }
534     
535     /***
536      * Put the given queue on the retiredQueues queue
537      * @param wq
538      */
539     private void retireQueue(WorkQueue wq) {
540 //        try {
541             retiredQueues.add(wq.getClassKey());
542             decrementQueuedCount(wq.getCount());
543             wq.setRetired(true);
544             wq.setActive(this, false);
545 //        } catch (InterruptedException e) {
546 //            e.printStackTrace();
547 //            System.err.println("unable to retire queue "+wq);
548 //            // propagate interrupt up 
549 //            throw new RuntimeException(e);
550 //        }
551     }
552     
553     /*** 
554      * Accomodate any changes in settings.
555      * 
556      * @see org.archive.crawler.framework.Frontier#kickUpdate()
557      */
558     public void kickUpdate() {
559         super.kickUpdate();
560         int target = (Integer)getUncheckedAttribute(null,
561                 ATTR_TARGET_READY_QUEUES_BACKLOG);
562         if (target < 1) {
563             target = 1;
564         }
565         this.targetSizeForReadyQueues = target; 
566         try {
567             initCostPolicy();
568         } catch (FatalConfigurationException fce) {
569             throw new RuntimeException(fce);
570         }
571         // The rules for a 'retired' queue may have changed; so,
572         // unretire all queues to 'inactive'. If they still qualify
573         // as retired/overbudget next time they come up, they'll
574         // be re-retired; if not, they'll get a chance to become
575         // active under the new rules.
576         Object key = this.retiredQueues.poll();
577         while (key != null) {
578             WorkQueue q = (WorkQueue)this.allQueues.get(key);
579             if(q != null) {
580                 unretireQueue(q);
581             }
582             key = this.retiredQueues.poll();
583         }
584     }
585     /***
586      * Restore a retired queue to the 'inactive' state. 
587      * 
588      * @param q
589      */
590     private void unretireQueue(WorkQueue q) {
591         deactivateQueue(q);
592         q.setRetired(false); 
593         incrementQueuedUriCount(q.getCount());
594     }
595 
    /**
     * Return the work queue for the given CrawlURI's classKey. URIs
     * are ordered and politeness-delayed within their 'class'.
     * If the requested queue is not found, a new instance is created.
     *
     * @param curi CrawlURI to base queue on
     * @return the found or created WorkQueue
     */
    protected abstract WorkQueue getQueueFor(CrawlURI curi);
605 
    /**
     * Return the work queue for the given classKey, or null
     * if no such queue exists.
     *
     * @param classKey key to look for
     * @return the found WorkQueue, or null if there is none for the key
     */
    protected abstract WorkQueue getQueueFor(String classKey);
614     
615     /***
616      * Return the next CrawlURI to be processed (and presumably
617      * visited/fetched) by a a worker thread.
618      *
619      * Relies on the readyClassQueues having been loaded with
620      * any work queues that are eligible to provide a URI. 
621      *
622      * @return next CrawlURI to be processed. Or null if none is available.
623      *
624      * @see org.archive.crawler.framework.Frontier#next()
625      */
626     public CrawlURI next()
627     throws InterruptedException, EndedException {
628         while (true) { // loop left only by explicit return or exception
629             long now = System.currentTimeMillis();
630 
631             // Do common checks for pause, terminate, bandwidth-hold
632             preNext(now);
633             
634             // allow up-to-1 thread to fill readyClassQueues to target
635             if(readyFiller.tryAcquire()) {
636                 try {
637                     int activationsNeeded = targetSizeForReadyQueues() - readyClassQueues.size();
638                     while(activationsNeeded > 0 && !inactiveQueues.isEmpty()) {
639                         activateInactiveQueue();
640                         activationsNeeded--;
641                     }
642                 } finally {
643                     readyFiller.release();
644                 }
645             }
646                    
647             WorkQueue readyQ = null;
648             Object key = readyClassQueues.poll(DEFAULT_WAIT,TimeUnit.MILLISECONDS);
649             if (key != null) {
650                 readyQ = (WorkQueue)this.allQueues.get(key);
651             }
652             if (readyQ != null) {
653                 while(true) { // loop left by explicit return or break on empty
654                     CrawlURI curi = null;
655                     synchronized(readyQ) {
656                         curi = readyQ.peek(this);                     
657                         if (curi != null) {
658                             // check if curi belongs in different queue
659                             String currentQueueKey = getClassKey(curi);
660                             if (currentQueueKey.equals(curi.getClassKey())) {
661                                 // curi was in right queue, emit
662                                 noteAboutToEmit(curi, readyQ);
663                                 inProcessQueues.add(readyQ);
664                                 return curi;
665                             }
666                             // URI's assigned queue has changed since it
667                             // was queued (eg because its IP has become
668                             // known). Requeue to new queue.
669                             curi.setClassKey(currentQueueKey);
670                             readyQ.dequeue(this);
671                             decrementQueuedCount(1);
672                             curi.setHolderKey(null);
673                             // curi will be requeued to true queue after lock
674                             //  on readyQ is released, to prevent deadlock
675                         } else {
676                             // readyQ is empty and ready: it's exhausted
677                             // release held status, allowing any subsequent 
678                             // enqueues to again put queue in ready
679                             readyQ.clearHeld();
680                             break;
681                         }
682                     }
683                     if(curi!=null) {
684                         // complete the requeuing begun earlier
685                         sendToQueue(curi);
686                     }
687                 }
688             } else {
689                 // ReadyQ key wasn't in all queues: unexpected
690                 if (key != null) {
691                     logger.severe("Key "+ key +
692                         " in readyClassQueues but not allQueues");
693                 }
694             }
695 
696             if(shouldTerminate) {
697                 // skip subsequent steps if already on last legs
698                 throw new EndedException("shouldTerminate is true");
699             }
700                 
701             if(inProcessQueues.size()==0) {
702                 // Nothing was ready or in progress or imminent to wake; ensure 
703                 // any piled-up pending-scheduled URIs are considered
704                 this.alreadyIncluded.requestFlush();
705             }    
706         }
707     }
708 
709     private int targetSizeForReadyQueues() {
710         return targetSizeForReadyQueues;
711     }
712 
713     /***
714      * Return the 'cost' of a CrawlURI (how much of its associated
715      * queue's budget it depletes upon attempted processing)
716      * 
717      * @param curi
718      * @return the associated cost
719      */
720     private int getCost(CrawlURI curi) {
721         int cost = curi.getHolderCost();
722         if (cost == CrawlURI.UNCALCULATED) {
723             cost = costAssignmentPolicy.costOf(curi);
724             curi.setHolderCost(cost);
725         }
726         return cost;
727     }
728     
729     /***
730      * Activate an inactive queue, if any are available. 
731      */
732     private void activateInactiveQueue() {
733         Object key = this.inactiveQueues.poll();
734         if (key == null) {
735             return;
736         }
737         WorkQueue candidateQ = (WorkQueue)this.allQueues.get(key);
738         if(candidateQ != null) {
739             synchronized(candidateQ) {
740                 replenishSessionBalance(candidateQ);
741                 if(candidateQ.isOverBudget()){
742                     // if still over-budget after an activation & replenishing,
743                     // retire
744                     retireQueue(candidateQ);
745                     return;
746                 } 
747                 long now = System.currentTimeMillis();
748                 long delay_ms = candidateQ.getWakeTime() - now;
749                 if(delay_ms>0) {
750                     // queue still due for snoozing
751                     snoozeQueue(candidateQ,now,delay_ms);
752                     return;
753                 }
754                 candidateQ.setWakeTime(0); // clear obsolete wake time, if any
755                 readyQueue(candidateQ);
756                 if (logger.isLoggable(Level.FINE)) {
757                     logger.fine("ACTIVATED queue: " +
758                         candidateQ.getClassKey());
759                    
760                 }
761             }
762         }
763     }
764 
765     /***
766      * Replenish the budget of the given queue by the appropriate amount.
767      * 
768      * @param queue queue to replenish
769      */
770     private void replenishSessionBalance(WorkQueue queue) {
771         UURI contextUri = queue.getContextUURI(this); 
772         
773         // TODO: consider confusing cross-effects of this and IP-based politeness
774         queue.setSessionBalance(((Integer) getUncheckedAttribute(contextUri,
775                 ATTR_BALANCE_REPLENISH_AMOUNT)).intValue());
776         // reset total budget (it may have changed)
777         // TODO: is this the best way to be sensitive to potential mid-crawl changes
778         long totalBudget = ((Long)getUncheckedAttribute(contextUri,ATTR_QUEUE_TOTAL_BUDGET)).longValue();
779         queue.setTotalBudget(totalBudget);
780     }
781 
782     /***
783      * Enqueue the given queue to either readyClassQueues or inactiveQueues,
784      * as appropriate.
785      * 
786      * @param wq
787      */
788     private void reenqueueQueue(WorkQueue wq) {
789         if(wq.isOverBudget()) {
790             // if still over budget, deactivate
791             if (logger.isLoggable(Level.FINE)) {
792                 logger.fine("DEACTIVATED queue: " +
793                     wq.getClassKey());
794             }
795             deactivateQueue(wq);
796         } else {
797             readyQueue(wq);
798         }
799     }
800     
801     /***
802      * Wake any queues sitting in the snoozed queue whose time has come.
803      */
804     void wakeQueues() {
805         synchronized (snoozedClassQueues) {
806             long now = System.currentTimeMillis();
807             long nextWakeDelay = 0;
808             int wokenQueuesCount = 0;
809             while (true) {
810                 if (snoozedClassQueues.isEmpty()) {
811                     return;
812                 }
813                 WorkQueue peek = (WorkQueue) snoozedClassQueues.first();
814                 nextWakeDelay = peek.getWakeTime() - now;
815                 if (nextWakeDelay <= 0) {
816                     snoozedClassQueues.remove(peek);
817                     peek.setWakeTime(0);
818                     reenqueueQueue(peek);
819                     wokenQueuesCount++;
820                 } else {
821                     break;
822                 }
823             }
824             this.nextWake = new WakeTask();
825             this.wakeTimer.schedule(nextWake,nextWakeDelay);
826         }
827     }
828 
829     /***
830      * Note that the previously emitted CrawlURI has completed
831      * its processing (for now).
832      *
833      * The CrawlURI may be scheduled to retry, if appropriate,
834      * and other related URIs may become eligible for release
835      * via the next next() call, as a result of finished().
836      *
837      *  (non-Javadoc)
838      * @see org.archive.crawler.framework.Frontier#finished(org.archive.crawler.datamodel.CrawlURI)
839      */
840     public void finished(CrawlURI curi) {
841         long now = System.currentTimeMillis();
842 
843         curi.incrementFetchAttempts();
844         logLocalizedErrors(curi);
845         WorkQueue wq = (WorkQueue) curi.getHolder();
846         assert (wq.peek(this) == curi) : "unexpected peek " + wq;
847         inProcessQueues.remove(wq, 1);
848 
849         if(includesRetireDirective(curi)) {
850             // CrawlURI is marked to trigger retirement of its queue
851             curi.processingCleanup();
852             wq.unpeek();
853             wq.update(this, curi); // rewrite any changes
854             retireQueue(wq);
855             return;
856         }
857         
858         if (needsRetrying(curi)) {
859             // Consider errors which can be retried, leaving uri atop queue
860             if(curi.getFetchStatus()!=S_DEFERRED) {
861                 wq.expend(getCost(curi)); // all retries but DEFERRED cost
862             }
863             long delay_sec = retryDelayFor(curi);
864             curi.processingCleanup(); // lose state that shouldn't burden retry
865             synchronized(wq) {
866                 wq.unpeek();
867                 // TODO: consider if this should happen automatically inside unpeek()
868                 wq.update(this, curi); // rewrite any changes
869                 if (delay_sec > 0) {
870                     long delay_ms = delay_sec * 1000;
871                     snoozeQueue(wq, now, delay_ms);
872                 } else {
873                     reenqueueQueue(wq);
874                 }
875             }
876             // Let everyone interested know that it will be retried.
877             controller.fireCrawledURINeedRetryEvent(curi);
878             doJournalRescheduled(curi);
879             return;
880         }
881 
882         // Curi will definitely be disposed of without retry, so remove from queue
883         wq.dequeue(this);
884         decrementQueuedCount(1);
885         log(curi);
886 
887         if (curi.isSuccess()) {
888             totalProcessedBytes += curi.getRecordedSize();
889             incrementSucceededFetchCount();
890             // Let everyone know in case they want to do something before we strip the curi.
891             controller.fireCrawledURISuccessfulEvent(curi);
892             doJournalFinishedSuccess(curi);
893             wq.expend(getCost(curi)); // successes cost
894         } else if (isDisregarded(curi)) {
895             // Check for codes that mean that while we the crawler did
896             // manage to schedule it, it must be disregarded for some reason.
897             incrementDisregardedUriCount();
898             // Let interested listeners know of disregard disposition.
899             controller.fireCrawledURIDisregardEvent(curi);
900             doJournalDisregarded(curi);
901             // if exception, also send to crawlErrors
902             if (curi.getFetchStatus() == S_RUNTIME_EXCEPTION) {
903                 Object[] array = { curi };
904                 controller.runtimeErrors.log(Level.WARNING, curi.getUURI()
905                         .toString(), array);
906             }
907             // TODO: consider reinstating forget-uri
908         } else {
909             // In that case FAILURE, note & log
910             //Let interested listeners know of failed disposition.
911             this.controller.fireCrawledURIFailureEvent(curi);
912             // if exception, also send to crawlErrors
913             if (curi.getFetchStatus() == S_RUNTIME_EXCEPTION) {
914                 Object[] array = { curi };
915                 this.controller.runtimeErrors.log(Level.WARNING, curi.getUURI()
916                         .toString(), array);
917             }
918             incrementFailedFetchCount();
919             // let queue note error
920             wq.noteError(((Integer) getUncheckedAttribute(curi,
921                     ATTR_ERROR_PENALTY_AMOUNT)).intValue()); 
922             doJournalFinishedFailure(curi);
923             wq.expend(getCost(curi)); // failures cost
924         }
925 
926         long delay_ms = politenessDelayFor(curi);
927         synchronized(wq) {
928             if (delay_ms > 0) {
929                 snoozeQueue(wq,now,delay_ms);
930             } else {
931                 reenqueueQueue(wq);
932             }
933         }
934 
935         curi.stripToMinimal();
936         curi.processingCleanup();
937 
938     }
939 
940     private boolean includesRetireDirective(CrawlURI curi) {
941         return curi.containsKey(A_FORCE_RETIRE) && (Boolean)curi.getObject(A_FORCE_RETIRE);
942     }
943 
944     /***
945      * Place the given queue into 'snoozed' state, ineligible to
946      * supply any URIs for crawling, for the given amount of time. 
947      * 
948      * @param wq queue to snooze 
949      * @param now time now in ms 
950      * @param delay_ms time to snooze in ms
951      */
952     private void snoozeQueue(WorkQueue wq, long now, long delay_ms) {
953         long nextTime = now + delay_ms;
954         wq.setWakeTime(nextTime);
955         long snoozeToInactiveDelayMs = ((Long)getUncheckedAttribute(null,
956                 ATTR_SNOOZE_DEACTIVATE_MS)).longValue();
957         if (delay_ms > snoozeToInactiveDelayMs && !inactiveQueues.isEmpty()) {
958             deactivateQueue(wq);
959         } else {
960             synchronized(snoozedClassQueues) {
961                 snoozedClassQueues.add(wq);
962                 if(wq == snoozedClassQueues.first()) {
963                     this.nextWake = new WakeTask();
964                     this.wakeTimer.schedule(nextWake, delay_ms);
965                 }
966             }
967         }
968     }
969 
970     /***
971      * Forget the given CrawlURI. This allows a new instance
972      * to be created in the future, if it is reencountered under
973      * different circumstances.
974      *
975      * @param curi The CrawlURI to forget
976      */
977     protected void forget(CrawlURI curi) {
978         logger.finer("Forgetting " + curi);
979         alreadyIncluded.forget(canonicalize(curi.getUURI()), curi);
980     }
981 
982     /***  (non-Javadoc)
983      * @see org.archive.crawler.framework.Frontier#discoveredUriCount()
984      */
985     public long discoveredUriCount() {
986         return (this.alreadyIncluded != null)? this.alreadyIncluded.count(): 0;
987     }
988 
989     /***
990      * Delete all scheduled URIs matching the given regex. 
991      * 
992      * @param match regex of URIs to delete
993      * @return Number of items deleted.
994      */
995     public long deleteURIs(String uriMatch) {
996         return deleteURIs(uriMatch,null);
997     }
998 
999     /***
1000      * Delete all scheduled URIs matching the given regex, in queues with
1001      * names matching the second given regex. 
1002      * 
1003      * @param uriMatch regex of URIs to delete
1004      * @param queueMatch regex of queues to affect, or null for all
1005      * @return Number of items deleted.
1006      */
1007     public long deleteURIs(String uriMatch, String queueMatch) {
1008         long count = 0;
1009         // TODO: DANGER/ values() may not work right from CachedBdbMap
1010         Iterator iter = allQueues.keySet().iterator(); 
1011         while(iter.hasNext()) {
1012             String queueKey = ((String)iter.next());
1013             if(StringUtils.isNotEmpty(queueMatch) && !queueKey.matches(queueMatch)) {
1014                 // skip this queue
1015                 continue; 
1016             }
1017             WorkQueue wq = getQueueFor(queueKey);
1018             wq.unpeek();
1019             count += wq.deleteMatching(this, uriMatch);
1020         }
1021         decrementQueuedCount(count);
1022         return count;
1023     }
1024     
1025     //
1026     // Reporter implementation
1027     //
1028     
1029     public static String STANDARD_REPORT = "standard";
1030     public static String ALL_NONEMPTY = "nonempty";
1031     public static String ALL_QUEUES = "all";
1032     protected static String[] REPORTS = {STANDARD_REPORT,ALL_NONEMPTY,ALL_QUEUES};
1033     
1034     public String[] getReports() {
1035         return REPORTS;
1036     }
1037     
1038     /***
1039      * @param w Where to write to.
1040      */
1041     public void singleLineReportTo(PrintWriter w) {
1042         if (this.allQueues == null) {
1043             return;
1044         }
1045         int allCount = allQueues.size();
1046         int inProcessCount = inProcessQueues.uniqueSet().size();
1047         int readyCount = readyClassQueues.size();
1048         int snoozedCount = snoozedClassQueues.size();
1049         int activeCount = inProcessCount + readyCount + snoozedCount;
1050         int inactiveCount = inactiveQueues.size();
1051         int retiredCount = retiredQueues.size();
1052         int exhaustedCount = 
1053             allCount - activeCount - inactiveCount - retiredCount;
1054         w.print(allCount);
1055         w.print(" queues: ");
1056         w.print(activeCount);
1057         w.print(" active (");
1058         w.print(inProcessCount);
1059         w.print(" in-process; ");
1060         w.print(readyCount);
1061         w.print(" ready; ");
1062         w.print(snoozedCount);
1063         w.print(" snoozed); ");
1064         w.print(inactiveCount);
1065         w.print(" inactive; ");
1066         w.print(retiredCount);
1067         w.print(" retired; ");
1068         w.print(exhaustedCount);
1069         w.print(" exhausted");
1070         w.flush();
1071     }
1072     
1073     /* (non-Javadoc)
1074      * @see org.archive.util.Reporter#singleLineLegend()
1075      */
1076     public String singleLineLegend() {
1077         return "total active in-process ready snoozed inactive retired exhausted";
1078     }
1079 
1080     /***
1081      * This method compiles a human readable report on the status of the frontier
1082      * at the time of the call.
1083      * @param name Name of report.
1084      * @param writer Where to write to.
1085      */
1086     public synchronized void reportTo(String name, PrintWriter writer) {
1087         if(ALL_NONEMPTY.equals(name)) {
1088             allNonemptyReportTo(writer);
1089             return;
1090         }
1091         if(ALL_QUEUES.equals(name)) {
1092             allQueuesReportTo(writer);
1093             return;
1094         }
1095         if(name!=null && !STANDARD_REPORT.equals(name)) {
1096             writer.print(name);
1097             writer.print(" unavailable; standard report:\n");
1098         }
1099         standardReportTo(writer);
1100     }   
1101     
1102     /*** Compact report of all nonempty queues (one queue per line)
1103      * 
1104      * @param writer
1105      */
1106     private void allNonemptyReportTo(PrintWriter writer) {
1107         ArrayList<WorkQueue> inProcessQueuesCopy;
1108         synchronized(this.inProcessQueues) {
1109             // grab a copy that will be stable against mods for report duration 
1110             @SuppressWarnings("unchecked")
1111             Collection<WorkQueue> inProcess = this.inProcessQueues;
1112             inProcessQueuesCopy = new ArrayList<WorkQueue>(inProcess);
1113         }
1114         writer.print("\n -----===== IN-PROCESS QUEUES =====-----\n");
1115         queueSingleLinesTo(writer, inProcessQueuesCopy.iterator());
1116 
1117         writer.print("\n -----===== READY QUEUES =====-----\n");
1118         queueSingleLinesTo(writer, this.readyClassQueues.iterator());
1119 
1120         writer.print("\n -----===== SNOOZED QUEUES =====-----\n");
1121         queueSingleLinesTo(writer, this.snoozedClassQueues.iterator());
1122         
1123         writer.print("\n -----===== INACTIVE QUEUES =====-----\n");
1124         queueSingleLinesTo(writer, this.inactiveQueues.iterator());
1125         
1126         writer.print("\n -----===== RETIRED QUEUES =====-----\n");
1127         queueSingleLinesTo(writer, this.retiredQueues.iterator());
1128     }
1129 
1130     /*** Compact report of all nonempty queues (one queue per line)
1131      * 
1132      * @param writer
1133      */
1134     private void allQueuesReportTo(PrintWriter writer) {
1135         queueSingleLinesTo(writer, allQueues.keySet().iterator());
1136     }
1137     
1138     /***
1139      * Writer the single-line reports of all queues in the
1140      * iterator to the writer 
1141      * 
1142      * @param writer to receive report
1143      * @param iterator over queues of interest.
1144      */
1145     private void queueSingleLinesTo(PrintWriter writer, Iterator iterator) {
1146         Object obj;
1147         WorkQueue q;
1148         boolean legendWritten = false;
1149         while( iterator.hasNext()) {
1150             obj = iterator.next();
1151             if (obj ==  null) {
1152                 continue;
1153             }
1154             q = (obj instanceof WorkQueue)?
1155                 (WorkQueue)obj:
1156                 (WorkQueue)this.allQueues.get(obj);
1157             if(q == null) {
1158                 writer.print(" ERROR: "+obj);
1159             }
1160             if(!legendWritten) {
1161                 writer.println(q.singleLineLegend());
1162                 legendWritten = true;
1163             }
1164             q.singleLineReportTo(writer);
1165         }       
1166     }
1167 
1168     /***
1169      * @param w Writer to print to.
1170      */
1171     private void standardReportTo(PrintWriter w) {
1172         int allCount = allQueues.size();
1173         int inProcessCount = inProcessQueues.uniqueSet().size();
1174         int readyCount = readyClassQueues.size();
1175         int snoozedCount = snoozedClassQueues.size();
1176         int activeCount = inProcessCount + readyCount + snoozedCount;
1177         int inactiveCount = inactiveQueues.size();
1178         int retiredCount = retiredQueues.size();
1179         int exhaustedCount = 
1180             allCount - activeCount - inactiveCount - retiredCount;
1181 
1182         w.print("Frontier report - ");
1183         w.print(ArchiveUtils.get12DigitDate());
1184         w.print("\n");
1185         w.print(" Job being crawled: ");
1186         w.print(controller.getOrder().getCrawlOrderName());
1187         w.print("\n");
1188         w.print("\n -----===== STATS =====-----\n");
1189         w.print(" Discovered:    ");
1190         w.print(Long.toString(discoveredUriCount()));
1191         w.print("\n");
1192         w.print(" Queued:        ");
1193         w.print(Long.toString(queuedUriCount()));
1194         w.print("\n");
1195         w.print(" Finished:      ");
1196         w.print(Long.toString(finishedUriCount()));
1197         w.print("\n");
1198         w.print("  Successfully: ");
1199         w.print(Long.toString(succeededFetchCount()));
1200         w.print("\n");
1201         w.print("  Failed:       ");
1202         w.print(Long.toString(failedFetchCount()));
1203         w.print("\n");
1204         w.print("  Disregarded:  ");
1205         w.print(Long.toString(disregardedUriCount()));
1206         w.print("\n");
1207         w.print("\n -----===== QUEUES =====-----\n");
1208         w.print(" Already included size:     ");
1209         w.print(Long.toString(alreadyIncluded.count()));
1210         w.print("\n");
1211         w.print("               pending:     ");
1212         w.print(Long.toString(alreadyIncluded.pending()));
1213         w.print("\n");
1214         w.print("\n All class queues map size: ");
1215         w.print(Long.toString(allCount));
1216         w.print("\n");
1217         w.print( "             Active queues: ");
1218         w.print(activeCount);
1219         w.print("\n");
1220         w.print("                    In-process: ");
1221         w.print(inProcessCount);
1222         w.print("\n");
1223         w.print("                         Ready: ");
1224         w.print(readyCount);
1225         w.print("\n");
1226         w.print("                       Snoozed: ");
1227         w.print(snoozedCount);
1228         w.print("\n");
1229         w.print("           Inactive queues: ");
1230         w.print(inactiveCount);
1231         w.print("\n");
1232         w.print("            Retired queues: ");
1233         w.print(retiredCount);
1234         w.print("\n");
1235         w.print("          Exhausted queues: ");
1236         w.print(exhaustedCount);
1237         w.print("\n");
1238         
1239         w.print("\n -----===== IN-PROCESS QUEUES =====-----\n");
1240         @SuppressWarnings("unchecked")
1241         Collection<WorkQueue> inProcess = inProcessQueues;
1242         ArrayList<WorkQueue> copy = extractSome(inProcess, REPORT_MAX_QUEUES);
1243         appendQueueReports(w, copy.iterator(), copy.size(), REPORT_MAX_QUEUES);
1244         
1245         w.print("\n -----===== READY QUEUES =====-----\n");
1246         appendQueueReports(w, this.readyClassQueues.iterator(),
1247             this.readyClassQueues.size(), REPORT_MAX_QUEUES);
1248 
1249         w.print("\n -----===== SNOOZED QUEUES =====-----\n");
1250         copy = extractSome(snoozedClassQueues, REPORT_MAX_QUEUES);
1251         appendQueueReports(w, copy.iterator(), copy.size(), REPORT_MAX_QUEUES);
1252         
1253         WorkQueue longest = longestActiveQueue;
1254         if (longest != null) {
1255             w.print("\n -----===== LONGEST QUEUE =====-----\n");
1256             longest.reportTo(w);
1257         }
1258 
1259         w.print("\n -----===== INACTIVE QUEUES =====-----\n");
1260         appendQueueReports(w, this.inactiveQueues.iterator(),
1261             this.inactiveQueues.size(), REPORT_MAX_QUEUES);
1262         
1263         w.print("\n -----===== RETIRED QUEUES =====-----\n");
1264         appendQueueReports(w, this.retiredQueues.iterator(),
1265             this.retiredQueues.size(), REPORT_MAX_QUEUES);
1266 
1267         w.flush();
1268     }
1269     
1270     
1271     /***
1272      * Extract some of the elements in the given collection to an
1273      * ArrayList.  This method synchronizes on the given collection's
1274      * monitor.  The returned list will never contain more than the
1275      * specified maximum number of elements.
1276      * 
1277      * @param c    the collection whose elements to extract
1278      * @param max  the maximum number of elements to extract
1279      * @return  the extraction
1280      */
1281     private static <T> ArrayList<T> extractSome(Collection<T> c, int max) {
1282         // Try to guess a sane initial capacity for ArrayList
1283         // Hopefully given collection won't grow more than 10 items
1284         // between now and the synchronized block...
1285         int initial = Math.min(c.size() + 10, max);
1286         int count = 0;
1287         ArrayList<T> list = new ArrayList<T>(initial);
1288         synchronized (c) {
1289             Iterator<T> iter = c.iterator();
1290             while (iter.hasNext() && (count < max)) {
1291                 list.add(iter.next());
1292                 count++;
1293             }
1294         }
1295         return list;
1296     }
1297 
1298     /***
1299      * Append queue report to general Frontier report.
1300      * @param w StringBuffer to append to.
1301      * @param iterator An iterator over 
1302      * @param total
1303      * @param max
1304      */
1305     protected void appendQueueReports(PrintWriter w, Iterator iterator,
1306             int total, int max) {
1307         Object obj;
1308         WorkQueue q;
1309         for(int count = 0; iterator.hasNext() && (count < max); count++) {
1310             obj = iterator.next();
1311             if (obj ==  null) {
1312                 continue;
1313             }
1314             q = (obj instanceof WorkQueue)?
1315                 (WorkQueue)obj:
1316                 (WorkQueue)this.allQueues.get(obj);
1317             if(q == null) {
1318                 w.print("WARNING: No report for queue "+obj);
1319             }
1320             q.reportTo(w);
1321         }
1322         if(total > max) {
1323             w.print("...and " + (total - max) + " more.\n");
1324         }
1325     }
1326 
1327     /***
1328      * Force logging, etc. of operator- deleted CrawlURIs
1329      * 
1330      * @see org.archive.crawler.framework.Frontier#deleted(org.archive.crawler.datamodel.CrawlURI)
1331      */
1332     public synchronized void deleted(CrawlURI curi) {
1333         //treat as disregarded
1334         controller.fireCrawledURIDisregardEvent(curi);
1335         log(curi);
1336         incrementDisregardedUriCount();
1337         curi.stripToMinimal();
1338         curi.processingCleanup();
1339     }
1340 
1341     public void considerIncluded(UURI u) {
1342         this.alreadyIncluded.note(canonicalize(u));
1343         CrawlURI temp = new CrawlURI(u);
1344         temp.setClassKey(getClassKey(temp));
1345         getQueueFor(temp).expend(getCost(temp));
1346     }
1347     
1348     protected abstract void initQueue() throws IOException;
1349     protected abstract void closeQueue() throws IOException;
1350     
1351     /***
1352      * Returns <code>true</code> if the WorkQueue implementation of this
1353      * Frontier stores its workload on disk instead of relying
1354      * on serialization mechanisms.
1355      * 
1356      * TODO: rename! (this is a very misleading name) or kill (don't
1357      * see any implementations that return false)
1358      * 
1359      * @return a constant boolean value for this class/instance
1360      */
1361     protected abstract boolean workQueueDataOnDisk();
1362     
1363     
1364     public FrontierGroup getGroup(CrawlURI curi) {
1365         return getQueueFor(curi);
1366     }
1367     
1368     
1369     public long averageDepth() {
1370         int inProcessCount = inProcessQueues.uniqueSet().size();
1371         int readyCount = readyClassQueues.size();
1372         int snoozedCount = snoozedClassQueues.size();
1373         int activeCount = inProcessCount + readyCount + snoozedCount;
1374         int inactiveCount = inactiveQueues.size();
1375         int totalQueueCount = (activeCount+inactiveCount);
1376         return (totalQueueCount == 0) ? 0 : liveQueuedUriCount.get() / totalQueueCount;
1377     }
1378     public float congestionRatio() {
1379         int inProcessCount = inProcessQueues.uniqueSet().size();
1380         int readyCount = readyClassQueues.size();
1381         int snoozedCount = snoozedClassQueues.size();
1382         int activeCount = inProcessCount + readyCount + snoozedCount;
1383         int inactiveCount = inactiveQueues.size();
1384         return (float)(activeCount + inactiveCount) / (inProcessCount + snoozedCount);
1385     }
1386     public long deepestUri() {
1387         return longestActiveQueue==null ? -1 : longestActiveQueue.getCount();
1388     }
1389     
1390     
1391     /* (non-Javadoc)
1392      * @see org.archive.crawler.framework.Frontier#isEmpty()
1393      */
1394     public synchronized boolean isEmpty() {
1395         return liveQueuedUriCount.get() == 0 && alreadyIncluded.pending() == 0;
1396     }
1397 }
1398