package org.archive.crawler.admin;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Serializable;
import java.util.Comparator;
import java.util.Date;
import java.util.EventObject;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.HashMap;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.Vector;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.collections.Closure;
import org.archive.crawler.datamodel.CrawlHost;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.event.CrawlURIDispositionListener;
import org.archive.crawler.framework.AbstractTracker;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.util.CrawledBytesHistotable;
import org.archive.net.UURI;
import org.archive.util.ArchiveUtils;
import org.archive.util.Histotable;
import org.archive.util.LongWrapper;
import org.archive.util.MimetypeUtils;
import org.archive.util.PaddingStringBuffer;

/**
 * This is an implementation of the AbstractTracker. It is designed to work
 * with the WUI as well as to perform various logging activities.
 * <p>
 * At the end of each snapshot a line is written to the
 * 'progress-statistics.log' file.
 * <p>
 * The header of that file is as follows:
 * <pre> [timestamp] [discovered] [queued] [downloaded] [doc/s(avg)] [KB/s(avg)] [dl-failures] [busy-thread] [mem-use-KB]</pre>
 * First there is a <b>timestamp</b>, accurate down to 1 second.
 * <p>
 * <b>discovered</b>, <b>queued</b>, <b>downloaded</b> and <b>dl-failures</b>
 * are (respectively) the discovered URI count, pending URI count, successfully
 * fetched count and failed fetch count from the frontier at the time of the
 * snapshot.
 * <p>
 * <b>KB/s(avg)</b> is the bandwidth usage. We use the total bytes downloaded
 * to calculate average bandwidth usage (KB/sec). Since we also note the value
 * each time a snapshot is made, we can calculate the average bandwidth usage
 * during the last snapshot period to gain a "current" rate. The first number
 * is the current rate and the average is in parentheses.
 * <p>
 * <b>doc/s(avg)</b> works the same way as KB/s(avg) except that it shows the
 * number of documents (URIs) rather than KB downloaded.
 * <p>
 * <b>busy-threads</b> is the total number of ToeThreads that are not available
 * (and thus presumably busy processing a URI). This information is extracted
 * from the crawl controller.
 * <p>
 * Finally, <b>mem-use-KB</b> is extracted from the runtime environment
 * (<code>Runtime.getRuntime().totalMemory()</code>).
 * <p>
 * In addition to the data collected for the above logs, various other data
 * is gathered and stored by this tracker.
 * <ul>
 * <li> Successfully downloaded documents per fetch status code
 * <li> Successfully downloaded documents per document mime type
 * <li> Amount of data per mime type
 * <li> Successfully downloaded documents per host
 * <li> Amount of data per host
 * <li> Disposition of all seeds (this is written to 'reports.log' at end of
 *      crawl)
 * <li> Successfully downloaded documents per host per source
 * </ul>
 *
 * @author Parker Thompson
 * @author Kristinn Sigurdsson
 *
 * @see org.archive.crawler.framework.StatisticsTracking
 * @see org.archive.crawler.framework.AbstractTracker
 */
public class StatisticsTracker extends AbstractTracker
        implements CrawlURIDispositionListener, Serializable {
    private static final long serialVersionUID = 8004878315916392305L;

    /**
     * Messages from the StatisticsTracker.
     */
    private final static Logger logger =
        Logger.getLogger(StatisticsTracker.class.getName());

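    // Figures as of the previous sample, used to compute the
    // per-interval ("current") rates.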
    protected long lastPagesFetchedCount = 0;
    protected long lastProcessedBytesCount = 0;

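    // Snapshot of frontier and crawl state taken at each
    // progress-statistics sample.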
    protected long discoveredUriCount = 0;
    protected long queuedUriCount = 0;
    protected long finishedUriCount = 0;

    protected long downloadedUriCount = 0;
    protected long downloadFailures = 0;
    protected long downloadDisregards = 0;
    protected double docsPerSecond = 0;
    protected double currentDocsPerSecond = 0;
    protected int currentKBPerSec = 0;
    protected long totalKBPerSec = 0;
    protected int busyThreads = 0;
    protected long totalProcessedBytes = 0;
    protected float congestionRatio = 0;
    protected long deepestUri;
    protected long averageDepth;

    /** Tallies sizes of novel, verified (same hash) and vouched (not-modified) content. */
    protected CrawledBytesHistotable crawledBytes = new CrawledBytesHistotable();

    /** Keep track of the file types we see (mime type -> count) */
    protected Hashtable<String,LongWrapper> mimeTypeDistribution
        = new Hashtable<String,LongWrapper>();
    protected Hashtable<String,LongWrapper> mimeTypeBytes
        = new Hashtable<String,LongWrapper>();

    /** Keep track of fetch status codes */
    protected Hashtable<String,LongWrapper> statusCodeDistribution
        = new Hashtable<String,LongWrapper>();

    /** Keep track of hosts.
     *
     * Each of these Maps is individually unsynchronized, and cannot
     * be trivially synchronized with the Collections wrapper. Thus
     * their synchronized access is enforced by this class.
     *
     * <p>They're transient because they are usually bigmaps that get
     * reconstituted on recovery from a checkpoint.
     */
    protected transient Map<String,LongWrapper> hostsDistribution = null;
    protected transient Map<String,LongWrapper> hostsBytes = null;
    protected transient Map<String,Long> hostsLastFinished = null;

    /** Keep track of URL counts per host per seed */
    protected transient
        Map<String,HashMap<String,LongWrapper>> sourceHostDistribution = null;

    /**
     * Record of seeds' latest actions.
     */
    protected transient Map<String,SeedRecord> processedSeedsRecords;

    private int seedsCrawled;
    private int seedsNotCrawled;

    private String sExitMessage = "Before crawl end";

    public StatisticsTracker(String name) {
        super(name, "A statistics tracker that's integrated into " +
            "the web UI and that creates the progress-statistics log.");
    }

    public void initialize(CrawlController c)
    throws FatalConfigurationException {
        super.initialize(c);
        try {
            this.sourceHostDistribution = c.getBigMap("sourceHostDistribution",
                String.class, HashMap.class);
            this.hostsDistribution = c.getBigMap("hostsDistribution",
                String.class, LongWrapper.class);
            this.hostsBytes = c.getBigMap("hostsBytes", String.class,
                LongWrapper.class);
            this.hostsLastFinished = c.getBigMap("hostsLastFinished",
                String.class, Long.class);
            this.processedSeedsRecords = c.getBigMap("processedSeedsRecords",
                String.class, SeedRecord.class);
        } catch (Exception e) {
            throw new FatalConfigurationException("Failed setup of" +
                " StatisticsTracker: " + e);
        }
        controller.addCrawlURIDispositionListener(this);
    }

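    // Clear and release the bigmap-backed tallies so their backing
    // resources can be reclaimed once the crawl has finished.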
    protected void finalCleanup() {
        super.finalCleanup();
        if (this.hostsBytes != null) {
            this.hostsBytes.clear();
            this.hostsBytes = null;
        }
        if (this.hostsDistribution != null) {
            this.hostsDistribution.clear();
            this.hostsDistribution = null;
        }
        if (this.hostsLastFinished != null) {
            this.hostsLastFinished.clear();
            this.hostsLastFinished = null;
        }
        if (this.processedSeedsRecords != null) {
            this.processedSeedsRecords.clear();
            this.processedSeedsRecords = null;
        }
        if (this.sourceHostDistribution != null) {
            this.sourceHostDistribution.clear();
            this.sourceHostDistribution = null;
        }
    }

    protected synchronized void progressStatisticsEvent(final EventObject e) {
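        // Refresh the snapshot fields from the frontier and controller.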
        discoveredUriCount = discoveredUriCount();
        downloadedUriCount = successfullyFetchedCount();
        finishedUriCount = finishedUriCount();
        queuedUriCount = queuedUriCount();
        downloadFailures = failedFetchAttempts();
        downloadDisregards = disregardedFetchAttempts();
        totalProcessedBytes = totalBytesCrawled();
        congestionRatio = congestionRatio();
        deepestUri = deepestUri();
        averageDepth = averageDepth();

        if (finishedUriCount() == 0) {
            docsPerSecond = 0;
            totalKBPerSec = 0;
        } else if (getCrawlerTotalElapsedTime() < 1000) {
            return;
        } else {
            docsPerSecond = (double) downloadedUriCount /
                (double)(getCrawlerTotalElapsedTime() / 1000);

            totalKBPerSec = (long)(((totalProcessedBytes / 1024) /
                ((getCrawlerTotalElapsedTime()) / 1000)) + .5);
        }

        busyThreads = activeThreadCount();

        if (shouldrun ||
                (System.currentTimeMillis() - lastLogPointTime) >= 1000) {
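            // Default the per-interval rates to zero; they are recomputed
            // below once enough time has passed since the last sample.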
            currentDocsPerSecond = 0;
            currentKBPerSec = 0;

            long currentTime = System.currentTimeMillis();
            long sampleTime = currentTime - lastLogPointTime;

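            // Only resample the per-interval rates if at least a second
            // has elapsed since the last log point.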
            if (sampleTime >= 1000) {
                long currentPageCount = successfullyFetchedCount();
                long samplePageCount = currentPageCount - lastPagesFetchedCount;

                currentDocsPerSecond =
                    (double) samplePageCount / (double)(sampleTime / 1000);

                lastPagesFetchedCount = currentPageCount;

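                // Bytes downloaded since the last sample drive the
                // "current" KB/s figure.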
                long currentProcessedBytes = totalProcessedBytes;
                long sampleProcessedBytes =
                    currentProcessedBytes - lastProcessedBytesCount;

                currentKBPerSec =
                    (int)(((sampleProcessedBytes / 1024) / (sampleTime / 1000)) + .5);

                lastProcessedBytesCount = currentProcessedBytes;
            }
        }

        if (this.controller != null) {
            this.controller.logProgressStatistics(getProgressStatisticsLine());
        }
        lastLogPointTime = System.currentTimeMillis();
        super.progressStatisticsEvent(e);
    }

    /**
     * Return one line of current progress-statistics.
     *
     * @param now Timestamp to use for the line.
     * @return String of stats
     */
    public String getProgressStatisticsLine(Date now) {
        return new PaddingStringBuffer()
            .append(ArchiveUtils.getLog14Date(now))
            .raAppend(32, discoveredUriCount)
            .raAppend(44, queuedUriCount)
            .raAppend(57, downloadedUriCount)
            .raAppend(74, ArchiveUtils.
                doubleToString(currentDocsPerSecond, 2) +
                "(" + ArchiveUtils.doubleToString(docsPerSecond, 2) + ")")
            .raAppend(85, currentKBPerSec + "(" + totalKBPerSec + ")")
            .raAppend(99, downloadFailures)
            .raAppend(113, busyThreads)
            .raAppend(126, (Runtime.getRuntime().totalMemory() -
                Runtime.getRuntime().freeMemory()) / 1024)
            .raAppend(140, Runtime.getRuntime().totalMemory() / 1024)
            .raAppend(153, ArchiveUtils.doubleToString(congestionRatio, 2))
            .raAppend(165, deepestUri)
            .raAppend(177, averageDepth)
            .toString();
    }

    public Map<String,Number> getProgressStatistics() {
        Map<String,Number> stats = new HashMap<String,Number>();
        stats.put("discoveredUriCount", new Long(discoveredUriCount));
        stats.put("queuedUriCount", new Long(queuedUriCount));
        stats.put("downloadedUriCount", new Long(downloadedUriCount));
        stats.put("currentDocsPerSecond", new Double(currentDocsPerSecond));
        stats.put("docsPerSecond", new Double(docsPerSecond));
        stats.put("totalKBPerSec", new Long(totalKBPerSec));
        stats.put("totalProcessedBytes", new Long(totalProcessedBytes));
        stats.put("currentKBPerSec", new Long(currentKBPerSec));
        stats.put("downloadFailures", new Long(downloadFailures));
        stats.put("busyThreads", new Integer(busyThreads));
        stats.put("congestionRatio", new Double(congestionRatio));
        stats.put("deepestUri", new Long(deepestUri));
        stats.put("averageDepth", new Long(averageDepth));
        stats.put("totalMemory", new Long(Runtime.getRuntime().totalMemory()));
        stats.put("freeMemory", new Long(Runtime.getRuntime().freeMemory()));
        return stats;
    }

    /**
     * Return one line of current progress-statistics.
     *
     * @return String of stats
     */
    public String getProgressStatisticsLine() {
        return getProgressStatisticsLine(new Date());
    }

    public double processedDocsPerSec() {
        return docsPerSecond;
    }

    public double currentProcessedDocsPerSec() {
        return currentDocsPerSecond;
    }

    public long processedKBPerSec() {
        return totalKBPerSec;
    }

    public int currentProcessedKBPerSec() {
        return currentKBPerSec;
    }

    /** Returns a Hashtable that contains information about distributions of
     *  encountered mime types. Key/value pairs represent
     *  mime type -> count.
     * <p>
     * <b>Note:</b> All the values are wrapped with a {@link LongWrapper LongWrapper}.
     * @return mimeTypeDistribution
     */
    public Hashtable<String,LongWrapper> getFileDistribution() {
        return mimeTypeDistribution;
    }

    /**
     * Increment a counter for a key in a given Map. Used for various
     * aggregate data.
     *
     * As this is used to change Maps which depend on StatisticsTracker
     * for their synchronization, this method should only be invoked
     * from a block synchronized on 'this'.
     *
     * @param map The Map
     * @param key The key for the counter to be incremented. If the key does
     *            not exist it will be added (set to 1). If null, the counter
     *            "unknown" is incremented.
     */
    protected static void incrementMapCount(Map<String,LongWrapper> map,
            String key) {
        incrementMapCount(map, key, 1);
    }

    /**
     * Increment a counter for a key in a given Map by an arbitrary amount.
     * Used for various aggregate data. The increment amount can be negative.
     *
     * As this is used to change Maps which depend on StatisticsTracker
     * for their synchronization, this method should only be invoked
     * from a block synchronized on 'this'.
     *
     * @param map
     *            The Map
     * @param key
     *            The key for the counter to be incremented. If the key does
     *            not exist it will be added (set equal to
     *            <code>increment</code>). If null, the counter "unknown" is
     *            incremented.
     * @param increment
     *            The amount by which to increment the counter for the
     *            <code>key</code>.
     */
    protected static void incrementMapCount(Map<String,LongWrapper> map,
            String key, long increment) {
        if (key == null) {
            key = "unknown";
        }
        LongWrapper lw = (LongWrapper)map.get(key);
        if (lw == null) {
            map.put(key, new LongWrapper(increment));
        } else {
            lw.longValue += increment;
        }
    }

    /**
     * Sort the entries of the given Map in descending order by their
     * values, which must be longs wrapped with <code>LongWrapper</code>.
     * <p>
     * Elements are sorted by value from largest to smallest. Equal values are
     * sorted in an arbitrary, but consistent manner by their keys. Only items
     * with identical value and key are considered equal.
     *
     * If the passed-in map requires access to be synchronized, the caller
     * should ensure this synchronization.
     *
     * @param mapOfLongWrapperValues
     *            Assumes values are wrapped with LongWrapper.
     * @return a sorted map containing the same elements as the passed-in map.
     */
    public TreeMap<String,LongWrapper> getReverseSortedCopy(
            final Map<String,LongWrapper> mapOfLongWrapperValues) {
        TreeMap<String,LongWrapper> sortedMap =
            new TreeMap<String,LongWrapper>(new Comparator<String>() {
                public int compare(String e1, String e2) {
                    long firstVal = mapOfLongWrapperValues.get(e1).longValue;
                    long secondVal = mapOfLongWrapperValues.get(e2).longValue;
                    if (firstVal < secondVal) {
                        return 1;
                    }
                    if (secondVal < firstVal) {
                        return -1;
                    }
                    return e1.compareTo(e2);
                }
            });
        try {
            sortedMap.putAll(mapOfLongWrapperValues);
        } catch (UnsupportedOperationException e) {
            Iterator<String> i = mapOfLongWrapperValues.keySet().iterator();
            for (; i.hasNext();) {
                String key = i.next();
                sortedMap.put(key, mapOfLongWrapperValues.get(key));
            }
        }
        return sortedMap;
    }

    /**
     * Return a Hashtable representing the distribution of status codes for
     * successfully fetched curis, where key -> val represents
     * (string)code -> (long)count.
     *
     * <b>Note: </b> All the values are wrapped with a
     * {@link LongWrapper LongWrapper}.
     *
     * @return statusCodeDistribution
     */
    public Hashtable<String,LongWrapper> getStatusCodeDistribution() {
        return statusCodeDistribution;
    }

    /**
     * Returns the time (in milliseconds) when a URI belonging to a given host
     * was last finished processing.
     *
     * @param host The host to look up time of last completed URI.
     * @return Returns the time (in milliseconds) when a URI belonging to a
     * given host was last finished processing. If no URI has been completed
     * for the host, -1 will be returned.
     */
    public long getHostLastFinished(String host) {
        Long l = null;
        synchronized (hostsLastFinished) {
            l = (Long)hostsLastFinished.get(host);
        }
        return (l != null)? l.longValue(): -1;
    }

    /**
     * Returns the accumulated number of bytes downloaded from a given host.
     * @param host name of the host
     * @return the accumulated number of bytes downloaded from a given host
     */
    public long getBytesPerHost(String host) {
        synchronized (hostsBytes) {
            return ((LongWrapper)hostsBytes.get(host)).longValue;
        }
    }

    /**
     * Returns the accumulated number of bytes from files of a given file type.
     * @param filetype Filetype to check.
     * @return the accumulated number of bytes from files of a given mime type
     */
    public long getBytesPerFileType(String filetype) {
        return ((LongWrapper)mimeTypeBytes.get(filetype)).longValue;
    }

    /**
     * Get the total number of ToeThreads (sleeping and active).
     *
     * @return The total number of ToeThreads
     */
    public int threadCount() {
        return this.controller != null? controller.getToeCount(): 0;
    }

    /**
     * @return Current active thread count (or zero if it can't be determined).
     */
    public int activeThreadCount() {
        return this.controller != null? controller.getActiveToeCount(): 0;
    }

    /**
     * This returns the number of completed URIs as a percentage of the total
     * number of URIs encountered (should be inverse to the discovery curve).
     *
     * @return The number of completed URIs as a percentage of the total
     * number of URIs encountered
     */
    public int percentOfDiscoveredUrisCompleted() {
        long completed = finishedUriCount();
        long total = discoveredUriCount();

        if (total == 0) {
            return 0;
        }

        return (int) (100 * completed / total);
    }

    /**
     * Number of <i>discovered</i> URIs.
     *
     * <p>If crawl not running (paused or stopped) this will return the value
     * of the last snapshot.
     *
     * @return A count of all URIs encountered
     *
     * @see org.archive.crawler.framework.Frontier#discoveredUriCount()
     */
    public long discoveredUriCount() {
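        // While the crawl is running, read the live value from the frontier;
        // otherwise return the figure from the last snapshot.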
        return shouldrun && this.controller != null &&
                this.controller.getFrontier() != null?
            controller.getFrontier().discoveredUriCount() : discoveredUriCount;
    }

    /**
     * Number of URIs that have <i>finished</i> processing.
     *
     * @return Number of URIs that have finished processing
     *
     * @see org.archive.crawler.framework.Frontier#finishedUriCount()
     */
    public long finishedUriCount() {
        return shouldrun && this.controller != null &&
                this.controller.getFrontier() != null ?
            controller.getFrontier().finishedUriCount() : finishedUriCount;
    }

    /**
     * Get the total number of failed fetch attempts (connection failures ->
     * give up, etc).
     *
     * @return The total number of failed fetch attempts
     */
    public long failedFetchAttempts() {
        return shouldrun && this.controller != null &&
                this.controller.getFrontier() != null ?
            controller.getFrontier().failedFetchCount() : downloadFailures;
    }

    /**
     * Get the total number of fetch attempts that were disregarded.
     *
     * @return The total number of disregarded fetch attempts
     */
    public long disregardedFetchAttempts() {
        return shouldrun && this.controller != null &&
                this.controller.getFrontier() != null?
            controller.getFrontier().disregardedUriCount() : downloadDisregards;
    }

    public long successfullyFetchedCount() {
        return shouldrun && this.controller != null &&
                this.controller.getFrontier() != null?
            controller.getFrontier().succeededFetchCount() : downloadedUriCount;
    }

    public long totalCount() {
        return queuedUriCount() + activeThreadCount() +
            successfullyFetchedCount();
    }

    /**
     * Ratio of number of threads that would theoretically allow
     * maximum crawl progress (if each was as productive as current
     * threads), to current number of threads.
     *
     * @return float congestion ratio
     */
    public float congestionRatio() {
        return shouldrun && this.controller != null &&
                this.controller.getFrontier() != null ?
            controller.getFrontier().congestionRatio() : congestionRatio;
    }

    /**
     * Ordinal position of the 'deepest' URI eligible
     * for crawling. Essentially, the length of the longest
     * frontier internal queue.
     *
     * @return long URI count to deepest URI
     */
    public long deepestUri() {
        return shouldrun && this.controller != null &&
                this.controller.getFrontier() != null ?
            controller.getFrontier().deepestUri() : deepestUri;
    }

    /**
     * Average depth of the last URI in all eligible queues.
     * That is, the average length of all eligible queues.
     *
     * @return long average depth of last URIs in queues
     */
    public long averageDepth() {
        return shouldrun && this.controller != null &&
                this.controller.getFrontier() != null ?
            controller.getFrontier().averageDepth() : averageDepth;
    }

    /**
     * Number of URIs <i>queued</i> up and waiting for processing.
     *
     * <p>If crawl not running (paused or stopped) this will return the value
     * of the last snapshot.
     *
     * @return Number of URIs queued up and waiting for processing.
     *
     * @see org.archive.crawler.framework.Frontier#queuedUriCount()
     */
    public long queuedUriCount() {
        return shouldrun && this.controller != null &&
                this.controller.getFrontier() != null?
            controller.getFrontier().queuedUriCount() : queuedUriCount;
    }

    /** @deprecated use totalBytesCrawled */
    public long totalBytesWritten() {
        return shouldrun && this.controller != null &&
                this.controller.getFrontier() != null?
            controller.getFrontier().totalBytesWritten() : totalProcessedBytes;
    }

    public long totalBytesCrawled() {
        return shouldrun ?
            crawledBytes.getTotal() : totalProcessedBytes;
    }

    public String crawledBytesSummary() {
        return crawledBytes.summary();
    }

    /**
     * If the curi is a seed, we update the processedSeeds table.
     *
     * @param curi The CrawlURI that may be a seed.
     * @param disposition The disposition of the CrawlURI.
     */
    private void handleSeed(CrawlURI curi, String disposition) {
        if (curi.isSeed()) {
            SeedRecord sr = new SeedRecord(curi, disposition);
            processedSeedsRecords.put(sr.getUri(), sr);
        }
    }

    public void crawledURISuccessful(CrawlURI curi) {
        handleSeed(curi, SEED_DISPOSITION_SUCCESS);

        crawledBytes.accumulate(curi);

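        // Tally the fetch status code of the successful fetch.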
        incrementMapCount(statusCodeDistribution,
            Integer.toString(curi.getFetchStatus()));

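        // Tally successful fetches and bytes per mime type.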
        String mime = MimetypeUtils.truncate(curi.getContentType());
        incrementMapCount(mimeTypeDistribution, mime);
        incrementMapCount(mimeTypeBytes, mime, curi.getContentSize());

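        // Tally successful fetches and bytes per host; DNS fetches
        // (status 1) are tallied under the pseudo-host "dns:".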
        saveHostStats((curi.getFetchStatus() == 1)? "dns:":
            this.controller.getServerCache().
                getHostFor(curi).getHostName(),
            curi.getContentSize());

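        // If the URI carries a source (seed) tag, tally it per source per host.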
        if (curi.containsKey(CrawlURI.A_SOURCE_TAG)) {
            saveSourceStats(curi.getString(CrawlURI.A_SOURCE_TAG),
                this.controller.getServerCache().getHostFor(curi).
                    getHostName());
        }
    }

    protected void saveSourceStats(String source, String hostname) {
        synchronized (sourceHostDistribution) {
            HashMap<String,LongWrapper> hostUriCount =
                sourceHostDistribution.get(source);
            if (hostUriCount == null) {
                hostUriCount = new HashMap<String,LongWrapper>();
            }

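            // Increment the per-host count and store the map back so the
            // (possibly disk-backed) sourceHostDistribution map keeps the
            // updated value.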
            incrementMapCount(hostUriCount, hostname);
            sourceHostDistribution.put(source, hostUriCount);
        }
    }

    protected void saveHostStats(String hostname, long size) {
        synchronized (hostsDistribution) {
            incrementMapCount(hostsDistribution, hostname);
        }
        synchronized (hostsBytes) {
            incrementMapCount(hostsBytes, hostname, size);
        }
        synchronized (hostsLastFinished) {
            hostsLastFinished.put(hostname,
                new Long(System.currentTimeMillis()));
        }
    }

    public void crawledURINeedRetry(CrawlURI curi) {
        handleSeed(curi, SEED_DISPOSITION_RETRY);
    }

    public void crawledURIDisregard(CrawlURI curi) {
        handleSeed(curi, SEED_DISPOSITION_DISREGARD);
    }

    public void crawledURIFailure(CrawlURI curi) {
        handleSeed(curi, SEED_DISPOSITION_FAILURE);
    }

    /**
     * Get a seed iterator for the job being monitored.
     *
     * <b>Note:</b> This iterator will iterate over a list of <i>strings</i> not
     * UURIs like the Scope seed iterator. The strings are equal to the URIs'
     * getURIString() values.
     * @return the seed iterator
     * FIXME: Consider using TransformingIterator here
     */
    public Iterator<String> getSeeds() {
        List<String> seedsCopy = new Vector<String>();
        Iterator<UURI> i = controller.getScope().seedsIterator();
        while (i.hasNext()) {
            seedsCopy.add(i.next().toString());
        }
        return seedsCopy.iterator();
    }

    public Iterator getSeedRecordsSortedByStatusCode() {
        return getSeedRecordsSortedByStatusCode(getSeeds());
    }

    protected Iterator<SeedRecord> getSeedRecordsSortedByStatusCode(
            Iterator<String> i) {
        TreeSet<SeedRecord> sortedSet =
            new TreeSet<SeedRecord>(new Comparator<SeedRecord>() {
                public int compare(SeedRecord sr1, SeedRecord sr2) {
                    int code1 = sr1.getStatusCode();
                    int code2 = sr2.getStatusCode();
                    if (code1 == code2) {
                        return sr1.getUri().compareTo(sr2.getUri());
                    }

                    code1 = -code1 - Integer.MAX_VALUE;
                    code2 = -code2 - Integer.MAX_VALUE;

                    return new Integer(code1).compareTo(new Integer(code2));
                }
            });
        while (i.hasNext()) {
            String seed = i.next();
            SeedRecord sr = (SeedRecord) processedSeedsRecords.get(seed);
            if (sr == null) {
                sr = new SeedRecord(seed, SEED_DISPOSITION_NOT_PROCESSED);
                processedSeedsRecords.put(seed, sr);
            }
            sortedSet.add(sr);
        }
        return sortedSet.iterator();
    }

    public void crawlEnded(String message) {
        logger.info("Entered crawlEnded");
        this.sExitMessage = message;
        super.crawlEnded(message);
        logger.info("Leaving crawlEnded");
    }

    /**
     * @param writer Where to write.
     */
    protected void writeSeedsReportTo(PrintWriter writer) {
        writer.print("[code] [status] [seed] [redirect]\n");

        seedsCrawled = 0;
        seedsNotCrawled = 0;
        for (Iterator i = getSeedRecordsSortedByStatusCode(getSeeds());
                i.hasNext();) {
            SeedRecord sr = (SeedRecord)i.next();
            writer.print(sr.getStatusCode());
            writer.print(" ");
            if ((sr.getStatusCode() > 0)) {
                seedsCrawled++;
                writer.print("CRAWLED");
            } else {
                seedsNotCrawled++;
                writer.print("NOTCRAWLED");
            }
            writer.print(" ");
            writer.print(sr.getUri());
            if (sr.getRedirectUri() != null) {
                writer.print(" ");
                writer.print(sr.getRedirectUri());
            }
            writer.print("\n");
        }
    }

    protected void writeSourceReportTo(PrintWriter writer) {
        writer.print("[source] [host] [#urls]\n");

        for (Iterator i = sourceHostDistribution.keySet().iterator();
                i.hasNext();) {
            Object sourceKey = i.next();
            Map<String,LongWrapper> hostCounts =
                (Map<String,LongWrapper>)sourceHostDistribution.get(sourceKey);

            SortedMap sortedHostCounts = getReverseSortedHostCounts(hostCounts);

            for (Iterator j = sortedHostCounts.keySet().iterator();
                    j.hasNext();) {
                Object hostKey = j.next();
                LongWrapper hostCount = (LongWrapper) hostCounts.get(hostKey);
                writer.print(sourceKey.toString());
                writer.print(" ");
                writer.print(hostKey.toString());
                writer.print(" ");
                writer.print(hostCount.longValue);
                writer.print("\n");
            }
        }
    }

    /**
     * Return a copy of the given per-host counts in reverse-sorted
     * (largest first) order.
     *
     * @param hostCounts Map of host name to count.
     * @return SortedMap of the given host counts
     */
    public SortedMap getReverseSortedHostCounts(
            Map<String,LongWrapper> hostCounts) {
        synchronized (hostCounts) {
            return getReverseSortedCopy(hostCounts);
        }
    }

    protected void writeHostsReportTo(final PrintWriter writer) {
        SortedMap hd = getReverseSortedHostsDistribution();

        writer.print("[#urls] [#bytes] [host] [#robots] [#remaining]\n");
        for (Iterator i = hd.keySet().iterator(); i.hasNext();) {
            String key = (String) i.next();
            CrawlHost host = controller.getServerCache().getHostFor(key);
            LongWrapper val = (LongWrapper)hd.get(key);
            writeReportLine(writer,
                ((val == null)? "-": val.longValue),
                getBytesPerHost(key),
                key,
                host.getSubstats().getRobotsDenials(),
                host.getSubstats().getRemaining());
        }

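        // Also report hosts known to the server cache that have no
        // recorded finished URIs yet.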
        Closure logZeros = new Closure() {
            public void execute(Object obj) {
                CrawlHost host = (CrawlHost)obj;
                if (host.getSubstats().getRecordedFinishes() == 0) {
                    writeReportLine(writer,
                        host.getSubstats().getRecordedFinishes(),
                        host.getSubstats().getTotalBytes(),
                        host.getHostName(),
                        host.getSubstats().getRobotsDenials(),
                        host.getSubstats().getRemaining());
                }
            }};
        controller.getServerCache().forAllHostsDo(logZeros);
    }

    protected void writeReportLine(PrintWriter writer, Object ... fields) {
        for (Object field : fields) {
            writer.print(field);
            writer.print(" ");
        }
        writer.print("\n");
    }

    /**
     * Return a copy of the hosts distribution in reverse-sorted
     * (largest first) order.
     * @return SortedMap of hosts distribution
     */
    public SortedMap getReverseSortedHostsDistribution() {
        synchronized (hostsDistribution) {
            return getReverseSortedCopy(hostsDistribution);
        }
    }

    protected void writeMimetypesReportTo(PrintWriter writer) {
        writer.print("[#urls] [#bytes] [mime-types]\n");
        TreeMap fd = getReverseSortedCopy(getFileDistribution());
        for (Iterator i = fd.keySet().iterator(); i.hasNext();) {
            Object key = i.next();
            writer.print(Long.toString(((LongWrapper)fd.get(key)).longValue));
            writer.print(" ");
            writer.print(Long.toString(getBytesPerFileType((String)key)));
            writer.print(" ");
            writer.print((String)key);
            writer.print("\n");
        }
    }

    protected void writeResponseCodeReportTo(PrintWriter writer) {
        writer.print("[rescode] [#urls]\n");
        TreeMap scd = getReverseSortedCopy(getStatusCodeDistribution());
        for (Iterator i = scd.keySet().iterator(); i.hasNext();) {
            Object key = i.next();
            writer.print((String)key);
            writer.print(" ");
            writer.print(Long.toString(((LongWrapper)scd.get(key)).longValue));
            writer.print("\n");
        }
    }

    protected void writeCrawlReportTo(PrintWriter writer) {
        writer.print("Crawl Name: " + controller.getOrder().getCrawlOrderName());
        writer.print("\nCrawl Status: " + sExitMessage);
        writer.print("\nDuration Time: " +
            ArchiveUtils.formatMillisecondsToConventional(crawlDuration()));
        writer.print("\nTotal Seeds Crawled: " + seedsCrawled);
        writer.print("\nTotal Seeds not Crawled: " + seedsNotCrawled);
        writer.print("\nTotal Hosts Crawled: " + (hostsDistribution.size() - 1));
        writer.print("\nTotal Documents Crawled: " + finishedUriCount);
        writer.print("\nProcessed docs/sec: " +
            ArchiveUtils.doubleToString(docsPerSecond, 2));
        writer.print("\nBandwidth in Kbytes/sec: " + totalKBPerSec);
        writer.print("\nTotal Raw Data Size in Bytes: " + totalProcessedBytes +
            " (" + ArchiveUtils.formatBytesForDisplay(totalProcessedBytes) +
            ") \n");
        writer.print("Novel Bytes: "
            + crawledBytes.get(CrawledBytesHistotable.NOVEL)
            + " (" + ArchiveUtils.formatBytesForDisplay(
                crawledBytes.get(CrawledBytesHistotable.NOVEL))
            + ") \n");
        if (crawledBytes.containsKey(CrawledBytesHistotable.DUPLICATE)) {
            writer.print("Duplicate-by-hash Bytes: "
                + crawledBytes.get(CrawledBytesHistotable.DUPLICATE)
                + " (" + ArchiveUtils.formatBytesForDisplay(
                    crawledBytes.get(CrawledBytesHistotable.DUPLICATE))
                + ") \n");
        }
        if (crawledBytes.containsKey(CrawledBytesHistotable.NOTMODIFIED)) {
            writer.print("Not-modified Bytes: "
                + crawledBytes.get(CrawledBytesHistotable.NOTMODIFIED)
                + " (" + ArchiveUtils.formatBytesForDisplay(
                    crawledBytes.get(CrawledBytesHistotable.NOTMODIFIED))
                + ") \n");
        }
    }

    protected void writeProcessorsReportTo(PrintWriter writer) {
        controller.reportTo(CrawlController.PROCESSORS_REPORT, writer);
    }

    protected void writeReportFile(String reportName, String filename) {
        File f = new File(controller.getDisk().getPath(), filename);
        try {
            PrintWriter bw = new PrintWriter(new FileWriter(f));
            writeReportTo(reportName, bw);
            bw.close();
            controller.addToManifest(f.getAbsolutePath(),
                CrawlController.MANIFEST_REPORT_FILE, true);
        } catch (IOException e) {
            logger.log(Level.SEVERE, "Unable to write " + f.getAbsolutePath() +
                " at the end of crawl.", e);
        }
        logger.info("wrote report: " + f.getAbsolutePath());
    }

    /**
     * @param writer Where to write.
     */
    protected void writeManifestReportTo(PrintWriter writer) {
        controller.reportTo(CrawlController.MANIFEST_REPORT, writer);
    }

    /**
     * @param reportName Name of report.
     * @param w Where to write.
     */
    private void writeReportTo(String reportName, PrintWriter w) {
        if ("hosts".equals(reportName)) {
            writeHostsReportTo(w);
        } else if ("mime types".equals(reportName)) {
            writeMimetypesReportTo(w);
        } else if ("response codes".equals(reportName)) {
            writeResponseCodeReportTo(w);
        } else if ("seeds".equals(reportName)) {
            writeSeedsReportTo(w);
        } else if ("crawl".equals(reportName)) {
            writeCrawlReportTo(w);
        } else if ("processors".equals(reportName)) {
            writeProcessorsReportTo(w);
        } else if ("manifest".equals(reportName)) {
            writeManifestReportTo(w);
        } else if ("frontier".equals(reportName)) {
            writeFrontierReportTo(w);
        } else if ("source".equals(reportName)) {
            writeSourceReportTo(w);
        }
    }

    /**
     * Write the Frontier's 'nonempty' report (if available).
     * @param writer to report to
     */
    protected void writeFrontierReportTo(PrintWriter writer) {
        if (controller.getFrontier().isEmpty()) {
            writer.println("frontier empty");
        } else {
            controller.getFrontier().reportTo("nonempty", writer);
        }
    }

    /**
     * Run the reports.
     */
    public void dumpReports() {
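        // Note the crawl order files in the manifest and make settings
        // available to this thread before writing the end-of-crawl reports.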
        controller.addOrderToManifest();
        controller.installThreadContextSettingsHandler();
        writeReportFile("hosts", "hosts-report.txt");
        writeReportFile("mime types", "mimetype-report.txt");
        writeReportFile("response codes", "responsecode-report.txt");
        writeReportFile("seeds", "seeds-report.txt");
        writeReportFile("crawl", "crawl-report.txt");
        writeReportFile("processors", "processors-report.txt");
        writeReportFile("manifest", "crawl-manifest.txt");
        writeReportFile("frontier", "frontier-report.txt");
        if (!sourceHostDistribution.isEmpty()) {
            writeReportFile("source", "source-report.txt");
        }
    }

    public void crawlCheckpoint(File cpDir) throws Exception {
        logNote("CRAWL CHECKPOINTING TO " + cpDir.toString());
    }
}