1 package org.archive.crawler.frontier;
2
3 import java.io.IOException;
4 import java.io.PrintWriter;
5 import java.io.Serializable;
6 import java.util.logging.Level;
7 import java.util.logging.Logger;
8
9 import org.apache.commons.httpclient.URIException;
10 import org.archive.crawler.datamodel.CrawlSubstats;
11 import org.archive.crawler.datamodel.CrawlURI;
12 import org.archive.crawler.framework.Frontier;
13 import org.archive.net.UURI;
14 import org.archive.net.UURIFactory;
15 import org.archive.util.ArchiveUtils;
16 import org.archive.util.Reporter;
17
18 /***
19 * A single queue of related URIs to visit, grouped by a classKey
20 * (typically "hostname:port" or similar)
21 *
22 * @author gojomo
23 * @author Christian Kohlschuetter
24 */
25 public abstract class WorkQueue implements Frontier.FrontierGroup, Comparable,
26 Serializable, Reporter {
27 static final long serialVersionUID = -1939168792663316048L;
28
29 private static final Logger logger =
30 Logger.getLogger(WorkQueue.class.getName());
31
32 /*** The classKey */
33 protected final String classKey;
34
35 private boolean active = true;
36
37 /*** Total number of stored items */
38 private long count = 0;
39
40 /*** Total number of items ever enqueued */
41 private long enqueueCount = 0;
42
43 /*** Whether queue is already in lifecycle stage */
44 private boolean isHeld = false;
45
46 /*** Time to wake, if snoozed */
47 private long wakeTime = 0;
48
49 /*** Running 'budget' indicating whether queue should stay active */
50 private int sessionBalance = 0;
51
52 /*** Cost of the last item to be charged against queue */
53 private int lastCost = 0;
54
55 /*** Total number of items charged against queue; with totalExpenditure
56 * can be used to calculate 'average cost'. */
57 private long costCount = 0;
58
59 /*** Running tally of total expenditures on this queue */
60 private long totalExpenditure = 0;
61
62 /*** Total to spend on this queue over its lifetime */
63 private long totalBudget = 0;
64
65 /*** The next item to be returned */
66 private CrawlURI peekItem = null;
67
68 /*** Last URI enqueued */
69 private String lastQueued;
70
71 /*** Last URI peeked */
72 private String lastPeeked;
73
74 /*** time of last dequeue (disposition of some URI) **/
75 private long lastDequeueTime;
76
77 /*** count of errors encountered */
78 private long errorCount = 0;
79
80 /*** Substats for all CrawlURIs in this group */
81 protected CrawlSubstats substats = new CrawlSubstats();
82
83 private boolean retired;
84
85 public WorkQueue(final String pClassKey) {
86 this.classKey = pClassKey;
87 }
88
89 /***
90 * Delete URIs matching the given pattern from this queue.
91 * @param frontier
92 * @param match
93 * @return count of deleted URIs
94 */
95 public long deleteMatching(final WorkQueueFrontier frontier, String match) {
96 try {
97 final long deleteCount = deleteMatchingFromQueue(frontier, match);
98 this.count -= deleteCount;
99 return deleteCount;
100 } catch (IOException e) {
101
102 e.printStackTrace();
103 throw new RuntimeException(e);
104 }
105 }
106
107 /***
108 * Add the given CrawlURI, noting its addition in running count. (It
109 * should not already be present.)
110 *
111 * @param frontier Work queues manager.
112 * @param curi CrawlURI to insert.
113 */
114 public synchronized void enqueue(final WorkQueueFrontier frontier,
115 CrawlURI curi) {
116 try {
117 insert(frontier, curi, false);
118 } catch (IOException e) {
119
120 e.printStackTrace();
121 throw new RuntimeException(e);
122 }
123 count++;
124 enqueueCount++;
125 }
126
127 /***
128 * Return the topmost queue item -- and remember it,
129 * such that even later higher-priority inserts don't
130 * change it.
131 *
132 * TODO: evaluate if this is really necessary
133 * @param frontier Work queues manager
134 *
135 * @return topmost queue item, or null
136 */
137 public CrawlURI peek(final WorkQueueFrontier frontier) {
138 if(peekItem == null && count > 0) {
139 try {
140 peekItem = peekItem(frontier);
141 } catch (IOException e) {
142
143 logger.log(Level.SEVERE,"peek failure",e);
144 e.printStackTrace();
145
146 }
147 if(peekItem != null) {
148 lastPeeked = peekItem.toString();
149 }
150 }
151 return peekItem;
152 }
153
154 /***
155 * Remove the peekItem from the queue and adjusts the count.
156 *
157 * @param frontier Work queues manager.
158 */
159 public synchronized void dequeue(final WorkQueueFrontier frontier) {
160 try {
161 deleteItem(frontier, peekItem);
162 } catch (IOException e) {
163
164 e.printStackTrace();
165 throw new RuntimeException(e);
166 }
167 unpeek();
168 count--;
169 lastDequeueTime = System.currentTimeMillis();
170 }
171
172 /***
173 * Set the session 'activity budget balance' to the given value
174 *
175 * @param balance to use
176 */
177 public void setSessionBalance(int balance) {
178 this.sessionBalance = balance;
179 }
180
181 /***
182 * Return current session 'activity budget balance'
183 *
184 * @return session balance
185 */
186 public int getSessionBalance() {
187 return this.sessionBalance;
188 }
189
190 /***
191 * Set the total expenditure level allowable before queue is
192 * considered inherently 'over-budget'.
193 *
194 * @param budget
195 */
196 public void setTotalBudget(long budget) {
197 this.totalBudget = budget;
198 }
199
200 /***
201 * Check whether queue has temporarily or permanently exceeded
202 * its budget.
203 *
204 * @return true if queue is over its set budget(s)
205 */
206 public boolean isOverBudget() {
207
208
209 return this.sessionBalance <= 0
210 || (this.totalBudget >= 0 && this.totalExpenditure >= this.totalBudget);
211 }
212
213 /***
214 * Return the tally of all expenditures on this queue
215 *
216 * @return total amount expended on this queue
217 */
218 public long getTotalExpenditure() {
219 return totalExpenditure;
220 }
221
222 /***
223 * Increase the internal running budget to be used before
224 * deactivating the queue
225 *
226 * @param amount amount to increment
227 * @return updated budget value
228 */
229 public int incrementSessionBalance(int amount) {
230 this.sessionBalance = this.sessionBalance + amount;
231 return this.sessionBalance;
232 }
233
234 /***
235 * Decrease the internal running budget by the given amount.
236 * @param amount tp decrement
237 * @return updated budget value
238 */
239 public int expend(int amount) {
240 this.sessionBalance = this.sessionBalance - amount;
241 this.totalExpenditure = this.totalExpenditure + amount;
242 this.lastCost = amount;
243 this.costCount++;
244 return this.sessionBalance;
245 }
246
247 /***
248 * A URI should not have been charged against queue (eg
249 * it was disregarded); return the amount expended
250 * @param amount to return
251 * @return updated budget value
252 */
253 public int refund(int amount) {
254 this.sessionBalance = this.sessionBalance + amount;
255 this.totalExpenditure = this.totalExpenditure - amount;
256 this.costCount--;
257 return this.sessionBalance;
258 }
259
260 /***
261 * Note an error and assess an extra penalty.
262 * @param penalty additional amount to deduct
263 */
264 public void noteError(int penalty) {
265 this.sessionBalance = this.sessionBalance - penalty;
266 this.totalExpenditure = this.totalExpenditure + penalty;
267 errorCount++;
268 }
269
270 /***
271 * @param l
272 */
273 public void setWakeTime(long l) {
274 wakeTime = l;
275 }
276
277 /***
278 * @return wakeTime
279 */
280 public long getWakeTime() {
281 return wakeTime;
282 }
283
284 /***
285 * @return classKey, the 'identifier', for this queue.
286 */
287 public String getClassKey() {
288 return this.classKey;
289 }
290
291 /***
292 * Clear isHeld to false
293 */
294 public void clearHeld() {
295 isHeld = false;
296 }
297
298 /***
299 * Whether the queue is already in a lifecycle stage --
300 * such as ready, in-progress, snoozed -- and thus should
301 * not be redundantly inserted to readyClassQueues
302 *
303 * @return isHeld
304 */
305 public boolean isHeld() {
306 return isHeld;
307 }
308
309 /***
310 * Set isHeld to true
311 */
312 public void setHeld() {
313 isHeld = true;
314 }
315
316 /***
317 * Forgive the peek, allowing a subsequent peek to
318 * return a different item.
319 *
320 */
321 public void unpeek() {
322 peekItem = null;
323 }
324
325 public final int compareTo(Object obj) {
326 if(this == obj) {
327 return 0;
328 }
329 WorkQueue other = (WorkQueue) obj;
330 if(getWakeTime() > other.getWakeTime()) {
331 return 1;
332 }
333 if(getWakeTime() < other.getWakeTime()) {
334 return -1;
335 }
336
337
338 return this.classKey.compareTo(other.getClassKey());
339 }
340
341 /***
342 * Update the given CrawlURI, which should already be present. (This
343 * is not checked.) Equivalent to an enqueue without affecting the count.
344 *
345 * @param frontier Work queues manager.
346 * @param curi CrawlURI to update.
347 */
348 public void update(final WorkQueueFrontier frontier, CrawlURI curi) {
349 try {
350 insert(frontier, curi, true);
351 } catch (IOException e) {
352
353 e.printStackTrace();
354 throw new RuntimeException(e);
355 }
356 }
357
358 /***
359 * @return Returns the count.
360 */
361 public synchronized long getCount() {
362 return this.count;
363 }
364
365 /***
366 * Insert the given curi, whether it is already present or not.
367 * @param frontier WorkQueueFrontier.
368 * @param curi CrawlURI to insert.
369 * @throws IOException
370 */
371 private void insert(final WorkQueueFrontier frontier, CrawlURI curi,
372 boolean overwriteIfPresent)
373 throws IOException {
374 insertItem(frontier, curi, overwriteIfPresent);
375 lastQueued = curi.toString();
376 }
377
378 /***
379 * Insert the given curi, whether it is already present or not.
380 * Hook for subclasses.
381 *
382 * @param frontier WorkQueueFrontier.
383 * @param curi CrawlURI to insert.
384 * @throws IOException if there was a problem while inserting the item
385 */
386 protected abstract void insertItem(final WorkQueueFrontier frontier,
387 CrawlURI curi, boolean expectedPresent) throws IOException;
388
389 /***
390 * Delete URIs matching the given pattern from this queue.
391 * @param frontier WorkQueues manager.
392 * @param match the pattern to match
393 * @return count of deleted URIs
394 * @throws IOException if there was a problem while deleting
395 */
396 protected abstract long deleteMatchingFromQueue(
397 final WorkQueueFrontier frontier, final String match)
398 throws IOException;
399
400 /***
401 * Removes the given item from the queue.
402 *
403 * This is only used to remove the first item in the queue,
404 * so it is not necessary to implement a random-access queue.
405 *
406 * @param frontier Work queues manager.
407 * @throws IOException if there was a problem while deleting the item
408 */
409 protected abstract void deleteItem(final WorkQueueFrontier frontier,
410 final CrawlURI item) throws IOException;
411
412 /***
413 * Returns first item from queue (does not delete)
414 *
415 * @return The peeked item, or null
416 * @throws IOException if there was a problem while peeking
417 */
418 protected abstract CrawlURI peekItem(final WorkQueueFrontier frontier)
419 throws IOException;
420
421 /***
422 * Suspends this WorkQueue. Closes all connections to resources etc.
423 *
424 * @param frontier
425 * @throws IOException
426 */
427 protected void suspend(final WorkQueueFrontier frontier) throws IOException {
428 }
429
430 /***
431 * Resumes this WorkQueue. Eventually opens connections to resources etc.
432 *
433 * @param frontier
434 * @throws IOException
435 */
436 protected void resume(final WorkQueueFrontier frontier) throws IOException {
437 }
438
439 public void setActive(final WorkQueueFrontier frontier, final boolean b) {
440 if(active != b) {
441 active = b;
442 try {
443 if(active) {
444 resume(frontier);
445 } else {
446 suspend(frontier);
447 }
448 } catch (IOException e) {
449
450 e.printStackTrace();
451 throw new RuntimeException(e);
452 }
453 }
454 }
455
456
457
458
459
460
461
462
463 public String[] getReports() {
464 return new String[] {};
465 }
466
467
468
469
470 public void reportTo(PrintWriter writer) {
471 reportTo(null,writer);
472 }
473
474
475
476
477 public void singleLineReportTo(PrintWriter writer) {
478
479 writer.print(classKey);
480 writer.print(" ");
481
482 writer.print(Long.toString(count));
483 writer.print(" ");
484
485 writer.print(Long.toString(enqueueCount));
486 writer.print(" ");
487 writer.print(sessionBalance);
488 writer.print(" ");
489 writer.print(lastCost);
490 writer.print("(");
491 writer.print(ArchiveUtils.doubleToString(
492 ((double) totalExpenditure / costCount), 1));
493 writer.print(")");
494 writer.print(" ");
495
496 if (lastDequeueTime != 0) {
497 writer.print(ArchiveUtils.getLog17Date(lastDequeueTime));
498 } else {
499 writer.print("-");
500 }
501 writer.print(" ");
502
503 if (wakeTime != 0) {
504 writer.print(ArchiveUtils.formatMillisecondsToConventional(wakeTime - System.currentTimeMillis()));
505 } else {
506 writer.print("-");
507 }
508 writer.print(" ");
509 writer.print(Long.toString(totalExpenditure));
510 writer.print("/");
511 writer.print(Long.toString(totalBudget));
512 writer.print(" ");
513 writer.print(Long.toString(errorCount));
514 writer.print(" ");
515 writer.print(lastPeeked);
516 writer.print(" ");
517 writer.print(lastQueued);
518 writer.print("\n");
519 }
520
521
522
523
524 public String singleLineLegend() {
525 return "queue currentSize totalEnqueues sessionBalance lastCost " +
526 "(averageCost) lastDequeueTime wakeTime " +
527 "totalSpend/totalBudget errorCount lastPeekUri lastQueuedUri";
528 }
529
530
531
532
533 public String singleLineReport() {
534 return ArchiveUtils.singleLineReport(this);
535 }
536
537 /***
538 * @param writer
539 * @throws IOException
540 */
541 public void reportTo(String name, PrintWriter writer) {
542
543 writer.print("Queue ");
544 writer.print(classKey);
545 writer.print("\n");
546 writer.print(" ");
547 writer.print(Long.toString(count));
548 writer.print(" items");
549 if (wakeTime != 0) {
550 writer.print("\n wakes in: "+ArchiveUtils.formatMillisecondsToConventional(wakeTime - System.currentTimeMillis()));
551 }
552 writer.print("\n last enqueued: ");
553 writer.print(lastQueued);
554 writer.print("\n last peeked: ");
555 writer.print(lastPeeked);
556 writer.print("\n");
557 writer.print(" total expended: ");
558 writer.print(Long.toString(totalExpenditure));
559 writer.print(" (total budget: ");
560 writer.print(Long.toString(totalBudget));
561 writer.print(")\n");
562 writer.print(" active balance: ");
563 writer.print(sessionBalance);
564 writer.print("\n last(avg) cost: ");
565 writer.print(lastCost);
566 writer.print("(");
567 writer.print(ArchiveUtils.doubleToString(
568 ((double) totalExpenditure / costCount), 1));
569 writer.print(")\n\n");
570 }
571
572 public CrawlSubstats getSubstats() {
573 return substats;
574 }
575
576 /***
577 * Set the retired status of this queue.
578 *
579 * @param b new value for retired status
580 */
581 public void setRetired(boolean b) {
582 this.retired = b;
583 }
584
585 public boolean isRetired() {
586 return retired;
587 }
588
589 public UURI getContextUURI(WorkQueueFrontier wqf) {
590 if(lastPeeked!=null) {
591 try {
592 return UURIFactory.getInstance(lastPeeked);
593 } catch (URIException e) {
594
595 }
596 }
597 if(lastQueued!=null) {
598 try {
599 return UURIFactory.getInstance(lastQueued);
600 } catch (URIException e) {
601
602 }
603 }
604 if(peekItem!=null) {
605 return peekItem.getUURI();
606 }
607
608 UURI contextUri = peek(wqf).getUURI();
609 unpeek();
610 return contextUri;
611 }
612 }