1   package org.archive.crawler.frontier;
2   
3   import java.io.IOException;
4   import java.io.PrintWriter;
5   import java.io.Serializable;
6   import java.util.logging.Level;
7   import java.util.logging.Logger;
8   
9   import org.apache.commons.httpclient.URIException;
10  import org.archive.crawler.datamodel.CrawlSubstats;
11  import org.archive.crawler.datamodel.CrawlURI;
12  import org.archive.crawler.framework.Frontier;
13  import org.archive.net.UURI;
14  import org.archive.net.UURIFactory;
15  import org.archive.util.ArchiveUtils;
16  import org.archive.util.Reporter;
17  
18  /***
19   * A single queue of related URIs to visit, grouped by a classKey
20   * (typically "hostname:port" or similar) 
21   * 
22   * @author gojomo
23   * @author Christian Kohlschuetter 
24   */
25  public abstract class WorkQueue implements Frontier.FrontierGroup, Comparable,
26          Serializable, Reporter {
27      static final long serialVersionUID = -1939168792663316048L;
28      
29      private static final Logger logger =
30          Logger.getLogger(WorkQueue.class.getName());
31      
32      /*** The classKey */
33      protected final String classKey;
34  
35      private boolean active = true;
36  
37      /*** Total number of stored items */
38      private long count = 0;
39  
40      /*** Total number of items ever enqueued */
41      private long enqueueCount = 0;
42      
43      /*** Whether queue is already in lifecycle stage */
44      private boolean isHeld = false;
45  
46      /*** Time to wake, if snoozed */
47      private long wakeTime = 0;
48  
49      /*** Running 'budget' indicating whether queue should stay active */
50      private int sessionBalance = 0;
51  
52      /*** Cost of the last item to be charged against queue */
53      private int lastCost = 0;
54  
55      /*** Total number of items charged against queue; with totalExpenditure
56       * can be used to calculate 'average cost'. */
57      private long costCount = 0;
58  
59      /*** Running tally of total expenditures on this queue */
60      private long totalExpenditure = 0;
61  
62      /*** Total to spend on this queue over its lifetime */
63      private long totalBudget = 0;
64  
65      /*** The next item to be returned */
66      private CrawlURI peekItem = null;
67  
68      /*** Last URI enqueued */
69      private String lastQueued;
70  
71      /*** Last URI peeked */
72      private String lastPeeked;
73  
74      /*** time of last dequeue (disposition of some URI) **/ 
75      private long lastDequeueTime;
76      
77      /*** count of errors encountered */
78      private long errorCount = 0;
79      
80      /*** Substats for all CrawlURIs in this group */
81      protected CrawlSubstats substats = new CrawlSubstats();
82  
83      private boolean retired;
84      
85      public WorkQueue(final String pClassKey) {
86          this.classKey = pClassKey;
87      }
88  
89      /***
90       * Delete URIs matching the given pattern from this queue. 
91       * @param frontier
92       * @param match
93       * @return count of deleted URIs
94       */
95      public long deleteMatching(final WorkQueueFrontier frontier, String match) {
96          try {
97              final long deleteCount = deleteMatchingFromQueue(frontier, match);
98              this.count -= deleteCount;
99              return deleteCount;
100         } catch (IOException e) {
101             //FIXME better exception handling
102             e.printStackTrace();
103             throw new RuntimeException(e);
104         }
105     }
106 
107     /***
108      * Add the given CrawlURI, noting its addition in running count. (It
109      * should not already be present.)
110      * 
111      * @param frontier Work queues manager.
112      * @param curi CrawlURI to insert.
113      */
114     public synchronized void enqueue(final WorkQueueFrontier frontier,
115         CrawlURI curi) {
116         try {
117             insert(frontier, curi, false);
118         } catch (IOException e) {
119             //FIXME better exception handling
120             e.printStackTrace();
121             throw new RuntimeException(e);
122         }
123         count++;
124         enqueueCount++;
125     }
126 
127     /***
128      * Return the topmost queue item -- and remember it,
129      * such that even later higher-priority inserts don't
130      * change it. 
131      * 
132      * TODO: evaluate if this is really necessary
133      * @param frontier Work queues manager
134      * 
135      * @return topmost queue item, or null
136      */
137     public CrawlURI peek(final WorkQueueFrontier frontier) {
138         if(peekItem == null && count > 0) {
139             try {
140                 peekItem = peekItem(frontier);
141             } catch (IOException e) {
142                 //FIXME better exception handling
143                 logger.log(Level.SEVERE,"peek failure",e);
144                 e.printStackTrace();
145                 // throw new RuntimeException(e);
146             }
147             if(peekItem != null) {
148                 lastPeeked = peekItem.toString();
149             }
150         }
151         return peekItem;
152     }
153 
154     /***
155      * Remove the peekItem from the queue and adjusts the count.
156      * 
157      * @param frontier  Work queues manager.
158      */
159     public synchronized void dequeue(final WorkQueueFrontier frontier) {
160         try {
161             deleteItem(frontier, peekItem);
162         } catch (IOException e) {
163             //FIXME better exception handling
164             e.printStackTrace();
165             throw new RuntimeException(e);
166         }
167         unpeek();
168         count--;
169         lastDequeueTime = System.currentTimeMillis();
170     }
171 
172     /***
173      * Set the session 'activity budget balance' to the given value
174      * 
175      * @param balance to use
176      */
177     public void setSessionBalance(int balance) {
178         this.sessionBalance = balance;
179     }
180 
181     /***
182      * Return current session 'activity budget balance' 
183      * 
184      * @return session balance
185      */
186     public int getSessionBalance() {
187         return this.sessionBalance;
188     }
189 
190     /***
191      * Set the total expenditure level allowable before queue is 
192      * considered inherently 'over-budget'. 
193      * 
194      * @param budget
195      */
196     public void setTotalBudget(long budget) {
197         this.totalBudget = budget;
198     }
199 
200     /***
201      * Check whether queue has temporarily or permanently exceeded
202      * its budget. 
203      * 
204      * @return true if queue is over its set budget(s)
205      */
206     public boolean isOverBudget() {
207         // check whether running balance is depleted 
208         // or totalExpenditure exceeds totalBudget
209         return this.sessionBalance <= 0
210             || (this.totalBudget >= 0 && this.totalExpenditure >= this.totalBudget);
211     }
212 
213     /***
214      * Return the tally of all expenditures on this queue
215      * 
216      * @return total amount expended on this queue
217      */
218     public long getTotalExpenditure() {
219         return totalExpenditure;
220     }
221 
222     /***
223      * Increase the internal running budget to be used before 
224      * deactivating the queue
225      * 
226      * @param amount amount to increment
227      * @return updated budget value
228      */
229     public int incrementSessionBalance(int amount) {
230         this.sessionBalance = this.sessionBalance + amount;
231         return this.sessionBalance;
232     }
233 
234     /***
235      * Decrease the internal running budget by the given amount. 
236      * @param amount tp decrement
237      * @return updated budget value
238      */
239     public int expend(int amount) {
240         this.sessionBalance = this.sessionBalance - amount;
241         this.totalExpenditure = this.totalExpenditure + amount;
242         this.lastCost = amount;
243         this.costCount++;
244         return this.sessionBalance;
245     }
246 
247     /***
248      * A URI should not have been charged against queue (eg
249      * it was disregarded); return the amount expended 
250      * @param amount to return
251      * @return updated budget value
252      */
253     public int refund(int amount) {
254         this.sessionBalance = this.sessionBalance + amount;
255         this.totalExpenditure = this.totalExpenditure - amount;
256         this.costCount--;
257         return this.sessionBalance;
258     }
259     
260     /***
261      * Note an error and assess an extra penalty. 
262      * @param penalty additional amount to deduct
263      */
264     public void noteError(int penalty) {
265         this.sessionBalance = this.sessionBalance - penalty;
266         this.totalExpenditure = this.totalExpenditure + penalty;
267         errorCount++;
268     }
269     
270     /***
271      * @param l
272      */
273     public void setWakeTime(long l) {
274         wakeTime = l;
275     }
276 
277     /***
278      * @return wakeTime
279      */
280     public long getWakeTime() {
281         return wakeTime;
282     }
283 
284     /***
285      * @return classKey, the 'identifier', for this queue.
286      */
287     public String getClassKey() {
288         return this.classKey;
289     }
290 
291     /***
292      * Clear isHeld to false
293      */
294     public void clearHeld() {
295         isHeld = false;
296     }
297 
298     /***
299      * Whether the queue is already in a lifecycle stage --
300      * such as ready, in-progress, snoozed -- and thus should
301      * not be redundantly inserted to readyClassQueues
302      * 
303      * @return isHeld
304      */
305     public boolean isHeld() {
306         return isHeld;
307     }
308 
309     /***
310      * Set isHeld to true
311      */
312     public void setHeld() {
313         isHeld = true;
314     }
315 
316     /***
317      * Forgive the peek, allowing a subsequent peek to 
318      * return a different item. 
319      * 
320      */
321     public void unpeek() {
322         peekItem = null;
323     }
324 
325     public final int compareTo(Object obj) {
326         if(this == obj) {
327             return 0; // for exact identity only
328         }
329         WorkQueue other = (WorkQueue) obj;
330         if(getWakeTime() > other.getWakeTime()) {
331             return 1;
332         }
333         if(getWakeTime() < other.getWakeTime()) {
334             return -1;
335         }
336         // at this point, the ordering is arbitrary, but still
337         // must be consistent/stable over time
338         return this.classKey.compareTo(other.getClassKey());
339     }
340 
341     /***
342      * Update the given CrawlURI, which should already be present. (This
343      * is not checked.) Equivalent to an enqueue without affecting the count.
344      * 
345      * @param frontier Work queues manager.
346      * @param curi CrawlURI to update.
347      */
348     public void update(final WorkQueueFrontier frontier, CrawlURI curi) {
349         try {
350             insert(frontier, curi, true);
351         } catch (IOException e) {
352             //FIXME better exception handling
353             e.printStackTrace();
354             throw new RuntimeException(e);
355         }
356     }
357 
358     /***
359      * @return Returns the count.
360      */
361     public synchronized long getCount() {
362         return this.count;
363     }
364 
365     /***
366      * Insert the given curi, whether it is already present or not. 
367      * @param frontier WorkQueueFrontier.
368      * @param curi CrawlURI to insert.
369      * @throws IOException
370      */
371     private void insert(final WorkQueueFrontier frontier, CrawlURI curi, 
372             boolean overwriteIfPresent)
373         throws IOException {
374         insertItem(frontier, curi, overwriteIfPresent);
375         lastQueued = curi.toString();
376     }
377 
378     /***
379      * Insert the given curi, whether it is already present or not.
380      * Hook for subclasses. 
381      * 
382      * @param frontier WorkQueueFrontier.
383      * @param curi CrawlURI to insert.
384      * @throws IOException  if there was a problem while inserting the item
385      */
386     protected abstract void insertItem(final WorkQueueFrontier frontier,
387         CrawlURI curi, boolean expectedPresent) throws IOException;
388 
389     /***
390      * Delete URIs matching the given pattern from this queue. 
391      * @param frontier WorkQueues manager.
392      * @param match  the pattern to match
393      * @return count of deleted URIs
394      * @throws IOException  if there was a problem while deleting
395      */
396     protected abstract long deleteMatchingFromQueue(
397         final WorkQueueFrontier frontier, final String match)
398         throws IOException;
399 
400     /***
401      * Removes the given item from the queue.
402      * 
403      * This is only used to remove the first item in the queue,
404      * so it is not necessary to implement a random-access queue.
405      * 
406      * @param frontier  Work queues manager.
407      * @throws IOException  if there was a problem while deleting the item
408      */
409     protected abstract void deleteItem(final WorkQueueFrontier frontier,
410         final CrawlURI item) throws IOException;
411 
412     /***
413      * Returns first item from queue (does not delete)
414      * 
415      * @return The peeked item, or null
416      * @throws IOException  if there was a problem while peeking
417      */
418     protected abstract CrawlURI peekItem(final WorkQueueFrontier frontier)
419         throws IOException;
420 
421     /***
422      * Suspends this WorkQueue. Closes all connections to resources etc.
423      * 
424      * @param frontier
425      * @throws IOException
426      */
427     protected void suspend(final WorkQueueFrontier frontier) throws IOException {
428     }
429 
430     /***
431      * Resumes this WorkQueue. Eventually opens connections to resources etc.
432      * 
433      * @param frontier
434      * @throws IOException
435      */
436     protected void resume(final WorkQueueFrontier frontier) throws IOException {
437     }
438 
439     public void setActive(final WorkQueueFrontier frontier, final boolean b) {
440         if(active != b) {
441             active = b;
442             try {
443                 if(active) {
444                     resume(frontier);
445                 } else {
446                     suspend(frontier);
447                 }
448             } catch (IOException e) {
449                 //FIXME better exception handling
450                 e.printStackTrace();
451                 throw new RuntimeException(e);
452             }
453         }
454     }
455     
456     // 
457     // Reporter
458     //
459 
460     /* (non-Javadoc)
461      * @see org.archive.util.Reporter#getReports()
462      */
463     public String[] getReports() {
464         return new String[] {};
465     }
466 
467     /* (non-Javadoc)
468      * @see org.archive.util.Reporter#reportTo(java.io.Writer)
469      */
470     public void reportTo(PrintWriter writer) {
471         reportTo(null,writer);
472     }
473 
474     /* (non-Javadoc)
475      * @see org.archive.util.Reporter#singleLineReportTo(java.io.Writer)
476      */
477     public void singleLineReportTo(PrintWriter writer) {
478         // queue name
479         writer.print(classKey);
480         writer.print(" ");
481         // count of items
482         writer.print(Long.toString(count));
483         writer.print(" ");
484         // enqueue count
485         writer.print(Long.toString(enqueueCount));
486         writer.print(" ");
487         writer.print(sessionBalance);
488         writer.print(" ");
489         writer.print(lastCost);
490         writer.print("(");
491         writer.print(ArchiveUtils.doubleToString(
492                     ((double) totalExpenditure / costCount), 1));
493         writer.print(")");
494         writer.print(" ");
495         // last dequeue time, if any, or '-'
496         if (lastDequeueTime != 0) {
497             writer.print(ArchiveUtils.getLog17Date(lastDequeueTime));
498         } else {
499             writer.print("-");
500         }
501         writer.print(" ");
502         // wake time if snoozed, or '-'
503         if (wakeTime != 0) {
504             writer.print(ArchiveUtils.formatMillisecondsToConventional(wakeTime - System.currentTimeMillis()));
505         } else {
506             writer.print("-");
507         }
508         writer.print(" ");
509         writer.print(Long.toString(totalExpenditure));
510         writer.print("/");
511         writer.print(Long.toString(totalBudget));
512         writer.print(" ");
513         writer.print(Long.toString(errorCount));
514         writer.print(" ");
515         writer.print(lastPeeked);
516         writer.print(" ");
517         writer.print(lastQueued);
518         writer.print("\n");
519     }
520 
521     /* (non-Javadoc)
522      * @see org.archive.util.Reporter#singleLineLegend()
523      */
524     public String singleLineLegend() {
525         return "queue currentSize totalEnqueues sessionBalance lastCost " +
526                 "(averageCost) lastDequeueTime wakeTime " +
527                 "totalSpend/totalBudget errorCount lastPeekUri lastQueuedUri";
528     }
529     
530     /* (non-Javadoc)
531      * @see org.archive.util.Reporter#singleLineReport()
532      */
533     public String singleLineReport() {
534         return ArchiveUtils.singleLineReport(this);
535     }
536     
537     /***
538      * @param writer
539      * @throws IOException
540      */
541     public void reportTo(String name, PrintWriter writer) {
542         // name is ignored: only one kind of report for now
543         writer.print("Queue ");
544         writer.print(classKey);
545         writer.print("\n");
546         writer.print("  ");
547         writer.print(Long.toString(count));
548         writer.print(" items");
549         if (wakeTime != 0) {
550             writer.print("\n   wakes in: "+ArchiveUtils.formatMillisecondsToConventional(wakeTime - System.currentTimeMillis()));
551         }
552         writer.print("\n    last enqueued: ");
553         writer.print(lastQueued);
554         writer.print("\n      last peeked: ");
555         writer.print(lastPeeked);
556         writer.print("\n");
557         writer.print("   total expended: ");
558         writer.print(Long.toString(totalExpenditure));
559         writer.print(" (total budget: ");
560         writer.print(Long.toString(totalBudget));
561         writer.print(")\n");
562         writer.print("   active balance: ");
563         writer.print(sessionBalance);
564         writer.print("\n   last(avg) cost: ");
565         writer.print(lastCost);
566         writer.print("(");
567         writer.print(ArchiveUtils.doubleToString(
568                     ((double) totalExpenditure / costCount), 1));
569         writer.print(")\n\n");
570     }
571     
572     public CrawlSubstats getSubstats() {
573         return substats;
574     }
575 
576     /***
577      * Set the retired status of this queue.
578      * 
579      * @param b new value for retired status
580      */
581     public void setRetired(boolean b) {
582         this.retired = b;
583     }
584     
585     public boolean isRetired() {
586         return retired;
587     }
588 
589     public UURI getContextUURI(WorkQueueFrontier wqf) {
590         if(lastPeeked!=null) {
591             try {
592                 return UURIFactory.getInstance(lastPeeked);
593             } catch (URIException e) {
594                 // just move along to next try
595             }
596         }
597         if(lastQueued!=null) {
598             try {
599                 return UURIFactory.getInstance(lastQueued);
600             } catch (URIException e) {
601                 // just move along to next try
602             }
603         }
604         if(peekItem!=null) {
605             return peekItem.getUURI();
606         }
607         // peek a CrawlURI temporarily just for context 
608         UURI contextUri = peek(wqf).getUURI(); 
609         unpeek(); // but don't insist on that URI being next released
610         return contextUri;
611     }
612 }