1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler.admin;
26
27 import java.io.File;
28 import java.io.FileReader;
29 import java.io.BufferedReader;
30 import java.io.IOException;
31 import java.util.Comparator;
32 import java.util.Hashtable;
33 import java.util.Iterator;
34 import java.util.Map;
35 import java.util.SortedMap;
36 import java.util.TreeMap;
37 import java.util.TreeSet;
38 import java.util.logging.Level;
39 import java.util.logging.Logger;
40
41 import org.archive.util.LongWrapper;
42
43
44 /***
45 * This class provides descriptive statistics of a finished crawl job by
46 * using the crawl report files generated by StatisticsTracker. Any formatting
47 * changes to the way StatisticsTracker writes to the summary crawl reports will
48 * require changes to this class.
49 * <p>
50 * The following statistics are accessible from this class:
51 * <ul>
52 * <li> Successfully downloaded documents per fetch status code
53 * <li> Successfully downloaded documents per document mime type
54 * <li> Amount of data per mime type
55 * <li> Successfully downloaded documents per host
56 * <li> Amount of data per host
57 * <li> Successfully downloaded documents per top-level domain name (TLD)
58 * <li> Disposition of all seeds
59 * <li> Successfully downloaded documents per host per source
60 * </ul>
61 *
62 * <p>TODO: Make it so summarizing is not done all in RAM so we avoid
63 * OOME.
64 *
65 * @author Frank McCown
66 *
67 * @see org.archive.crawler.admin.StatisticsTracker
68 */
69 public class StatisticsSummary {
/**
 * Messages from the StatisticsSummary.
 */
private final static Logger logger =
    Logger.getLogger(StatisticsSummary.class.getName());

/**
 * Whether any report file was found. The initial value is overwritten by
 * the constructor, which recomputes it from the report readers.
 */
private boolean stats = true;

/** Crawl job whose summary we want to view */
private CrawlJob cjob;

// Totals accumulated while the report files are parsed. The "Dns" variants
// hold the entries the parsers classify as DNS and keep apart from the rest.
protected long totalDnsStatusCodeDocuments = 0;
protected long totalStatusCodeDocuments = 0;
// NOTE(review): never assigned anywhere in the code visible in this class.
protected long totalFileTypeDocuments = 0;
protected long totalMimeTypeDocuments = 0;
protected long totalDnsMimeTypeDocuments = 0;
protected long totalDnsHostDocuments = 0;
protected long totalHostDocuments = 0;
protected long totalMimeSize = 0;
protected long totalDnsMimeSize = 0;
protected long totalHostSize = 0;
protected long totalDnsHostSize = 0;
protected long totalTldDocuments = 0;
protected long totalTldSize = 0;
protected long totalHosts = 0;

// Raw text taken from crawl-report.txt (everything after the first ':' on
// the matching line); these stay null when the line or file is missing.
protected String durationTime;
protected String processedDocsPerSec;
protected String bandwidthKbytesPerSec;
protected String totalDataWritten;

/** Keep track of the file types we see (mime type -> count) */
protected Hashtable<String,LongWrapper> mimeTypeDistribution = new Hashtable<String,LongWrapper>();
// mime type -> bytes
protected Hashtable<String,LongWrapper> mimeTypeBytes = new Hashtable<String,LongWrapper>();
// Same two tables, but only for the "text/dns" mime type.
protected Hashtable<String,LongWrapper> mimeTypeDnsDistribution = new Hashtable<String,LongWrapper>();
protected Hashtable<String,LongWrapper> mimeTypeDnsBytes = new Hashtable<String,LongWrapper>();

/** Keep track of status codes */
protected Hashtable<String,LongWrapper> statusCodeDistribution = new Hashtable<String,LongWrapper>();
// Codes whose text is shorter than three characters are stored here.
protected Hashtable<String,LongWrapper> dnsStatusCodeDistribution
    = new Hashtable<String,LongWrapper>();

/** Keep track of hosts */
protected Hashtable<String,LongWrapper> hostsDistribution = new Hashtable<String,LongWrapper>();
// host -> bytes
protected Hashtable<String,LongWrapper> hostsBytes = new Hashtable<String,LongWrapper>();
// Same two tables, but only for hosts whose name starts with "dns:".
protected Hashtable<String,LongWrapper> hostsDnsDistribution = new Hashtable<String,LongWrapper>();
protected Hashtable<String,LongWrapper> hostsDnsBytes = new Hashtable<String,LongWrapper>();

/** Keep track of TLDs */
// TLD -> documents
protected Hashtable<String,LongWrapper> tldDistribution = new Hashtable<String,LongWrapper>();
// TLD -> bytes
protected Hashtable<String,LongWrapper> tldBytes = new Hashtable<String,LongWrapper>();
// TLD -> number of (non-DNS) hosts seen with that TLD
protected Hashtable<String,LongWrapper> tldHostDistribution = new Hashtable<String,LongWrapper>();

/** Keep track of processed seeds (seed -> record) */
protected transient Map<String,SeedRecord> processedSeedsRecords
    = new Hashtable<String,SeedRecord>();
126
/**
 * Builds the summary of a completed crawl job by running every report
 * reader once. Afterwards {@link #isStats()} reports whether at least one
 * of the report files was present.
 *
 * @param cjob
 *            Completed crawl job
 */
public StatisticsSummary(CrawlJob cjob) {
    this.cjob = cjob;

    // Every reader has to run regardless of the others' outcome, so the
    // results are combined with the non-short-circuit OR assignment.
    this.stats = calculateStatusCodeDistribution();
    this.stats |= calculateMimeTypeDistribution();
    this.stats |= calculateHostsDistribution();
    this.stats |= readCrawlReport();
    this.stats |= readSeedReport();
}
151
152
153 /***
154 * Increment a counter for a key in a given HashMap. Used for various
155 * aggregate data.
156 *
157 * @param map The HashMap
158 * @param key The key for the counter to be incremented, if it does not
159 * exist it will be added (set to 1). If null it will
160 * increment the counter "unknown".
161 */
162 protected static void incrementMapCount(Map<String,LongWrapper> map,
163 String key) {
164 incrementMapCount(map,key,1);
165 }
166
167 /***
168 * Increment a counter for a key in a given HashMap by an arbitrary amount.
169 * Used for various aggregate data. The increment amount can be negative.
170 *
171 * @param map
172 * The HashMap
173 * @param key
174 * The key for the counter to be incremented, if it does not
175 * exist it will be added (set to equal to
176 * <code>increment</code>).
177 * If null it will increment the counter "unknown".
178 * @param increment
179 * The amount to increment counter related to the
180 * <code>key</code>.
181 */
182 protected static void incrementMapCount(Map<String,LongWrapper> map,
183 String key, long increment) {
184 if (key == null) {
185 key = "unknown";
186 }
187 LongWrapper lw = map.get(key);
188 if(lw == null) {
189 map.put(key, new LongWrapper(increment));
190 } else {
191 lw.longValue += increment;
192 }
193 }
194
195 /*** Returns a HashMap that contains information about distributions of
196 * encountered mime types. Key/value pairs represent
197 * mime type -> count.
198 * <p>
199 * <b>Note:</b> All the values are wrapped with a
200 * {@link LongWrapper LongWrapper}
201 * @return mimeTypeDistribution
202 */
203 public Hashtable getMimeDistribution() {
204 return mimeTypeDistribution;
205 }
206
207 public long getTotalMimeTypeDocuments() {
208 return totalMimeTypeDocuments;
209 }
210
211 public long getTotalDnsMimeTypeDocuments() {
212 return totalDnsMimeTypeDocuments;
213 }
214
215 public long getTotalMimeSize() {
216 return totalMimeSize;
217 }
218
219 public long getTotalDnsMimeSize() {
220 return totalDnsMimeSize;
221 }
222
223 /***
224 * Return a HashMap representing the distribution of HTTP status codes for
225 * successfully fetched curis, as represented by a hashmap where key ->
226 * val represents (string)code -> (integer)count.
227 *
228 * <b>Note: </b> All the values are wrapped with a
229 * {@link LongWrapper LongWrapper}
230 *
231 * @return statusCodeDistribution
232 */
233 public Hashtable getStatusCodeDistribution() {
234 return statusCodeDistribution;
235 }
236
237 /***
238 * Return a HashMap representing the distribution of DNS status codes for
239 * successfully fetched curis, as represented by a hashmap where key ->
240 * val represents (string)code -> (integer)count.
241 *
242 * <b>Note: </b> All the values are wrapped with a
243 * {@link LongWrapper LongWrapper}
244 *
245 * @return dnsStatusCodeDistribution
246 */
247 public Hashtable getDnsStatusCodeDistribution() {
248 return dnsStatusCodeDistribution;
249 }
250
251 public Hashtable getDnsMimeDistribution() {
252 return mimeTypeDnsDistribution;
253 }
254
255 public long getTotalDnsStatusCodeDocuments() {
256 return totalDnsStatusCodeDocuments;
257 }
258
259 public long getTotalStatusCodeDocuments() {
260 return totalStatusCodeDocuments;
261 }
262
263 public long getTotalHostDocuments() {
264 return totalHostDocuments;
265 }
266
267 public long getTotalDnsHostDocuments() {
268 return totalDnsHostDocuments;
269 }
270
271 public Hashtable getHostsDnsDistribution() {
272 return hostsDnsDistribution;
273 }
274
275 public long getTotalHostDnsDocuments() {
276 return totalDnsHostDocuments;
277 }
278
279 public long getTotalHostSize() {
280 return totalHostSize;
281 }
282
283 public long getTotalDnsHostSize() {
284 return totalDnsHostSize;
285 }
286
287 public Hashtable getTldDistribution() {
288 return tldDistribution;
289 }
290
291 public Hashtable getTldBytes() {
292 return tldBytes;
293 }
294
295 public long getTotalTldDocuments() {
296 return totalTldDocuments;
297 }
298
299 public long getTotalTldSize() {
300 return totalTldSize;
301 }
302
303 public Hashtable getTldHostDistribution() {
304 return tldHostDistribution;
305 }
306
307 public long getTotalHosts() {
308 return totalHosts;
309 }
310
311 public String getDurationTime() {
312 return durationTime;
313 }
314
315 public String getProcessedDocsPerSec() {
316 return processedDocsPerSec;
317 }
318
319 public String getBandwidthKbytesPerSec() {
320 return bandwidthKbytesPerSec;
321 }
322
323 public String getTotalDataWritten() {
324 return totalDataWritten;
325 }
326
327 /***
328 * Sort the entries of the given HashMap in descending order by their
329 * values, which must be longs wrapped with <code>LongWrapper</code>.
330 * <p>
331 * Elements are sorted by value from largest to smallest. Equal values are
332 * sorted in an arbitrary, but consistent manner by their keys. Only items
333 * with identical value and key are considered equal.
334 *
335 * If the passed-in map requires access to be synchronized, the caller
336 * should ensure this synchronization.
337 *
338 * @param mapOfLongWrapperValues
339 * Assumes values are wrapped with LongWrapper.
340 * @return a sorted set containing the same elements as the map.
341 */
342 public TreeMap<String,LongWrapper> getReverseSortedCopy(
343 final Map<String,LongWrapper> mapOfLongWrapperValues) {
344 TreeMap<String,LongWrapper> sortedMap = new TreeMap<String,LongWrapper>(
345 new Comparator<String>() {
346 public int compare(String e1, String e2) {
347 long firstVal = mapOfLongWrapperValues.get(e1).longValue;
348 long secondVal = mapOfLongWrapperValues.get(e2).longValue;
349 if (firstVal < secondVal) {
350 return 1;
351 }
352 if (secondVal < firstVal) {
353 return -1;
354 }
355
356 return e1.compareTo(e2);
357 }
358 });
359 try {
360 sortedMap.putAll(mapOfLongWrapperValues);
361 } catch (UnsupportedOperationException e) {
362 for (String key: mapOfLongWrapperValues.keySet()) {
363 sortedMap.put(key, mapOfLongWrapperValues.get(key));
364 }
365 }
366 return sortedMap;
367 }
368
369 /***
370 * Get the number of hosts with a particular TLD.
371 * @param tld
372 * top-level domain name
373 * @return Total crawled hosts
374 */
375 public long getHostsPerTld(String tld) {
376 LongWrapper lw = (LongWrapper)tldHostDistribution.get(tld);
377 return (lw == null ? 0 : lw.longValue);
378 }
379
380 /***
381 * Read status code distribution from responsecode-report.txt.
382 * DNS and HTTP status codes are separated when read.
383 * @return True if we found some stats.
384 */
385 private boolean calculateStatusCodeDistribution() {
386
387 File f = new File(cjob.getDirectory(), "responsecode-report.txt");
388 if (!f.exists()) {
389 return false;
390 }
391 BufferedReader br = null;
392 try {
393 FileReader reader = new FileReader(f);
394 br = new BufferedReader(reader);
395 String line = br.readLine();
396 line = br.readLine();
397 while (line != null) {
398
399
400 String[] items = line.split(" ");
401 if (items.length < 2) {
402 logger.log(Level.WARNING,
403 "Unexpected formatting on line [" + line + "]");
404 }
405 else {
406
407 if (items[0].length() < 3) {
408
409 long total = Long.parseLong(items[1]);
410 dnsStatusCodeDistribution.put(items[0],
411 new LongWrapper(total));
412 totalDnsStatusCodeDocuments += total;
413 }
414 else {
415
416 long total = Long.parseLong(items[1]);
417 statusCodeDistribution.put(items[0],
418 new LongWrapper(total));
419 totalStatusCodeDocuments += total;
420 }
421 }
422 line = br.readLine();
423 }
424 } catch (IOException e) {
425 logger.log(Level.SEVERE, "Unable to read " + f.getAbsolutePath(),
426 e);
427 } finally {
428 if (br != null) {
429 try {
430 br.close();
431 } catch (IOException e) {
432 logger.log(Level.SEVERE,
433 "Closing " + f.getAbsolutePath(), e);
434 }
435 }
436 }
437 return true;
438 }
439
440 /***
441 * Read MIME type data from mimetype-report.txt.
442 * MIME type of text/dns is separated from other MIME types.
443 * @return True if we found some stats.
444 */
445 private boolean calculateMimeTypeDistribution() {
446 File f = new File(cjob.getDirectory(), "mimetype-report.txt");
447 if (!f.exists()) {
448 return false;
449 }
450 BufferedReader br = null;
451 try {
452 FileReader reader = new FileReader(f);
453 br = new BufferedReader(reader);
454 String line = br.readLine();
455 line = br.readLine();
456 while (line != null) {
457
458
459
460 String[] items = line.split(" ");
461 if (items.length < 3) {
462 logger.log(Level.WARNING,
463 "Unexpected formatting on line [" + line + "]");
464 }
465 else {
466 long total = Long.parseLong(items[0]);
467 long bytes = Long.parseLong(items[1]);
468 String mime = items[2];
469
470
471 if (mime.equalsIgnoreCase("text/dns")) {
472 mimeTypeDnsDistribution.put(mime,
473 new LongWrapper(total));
474 mimeTypeDnsBytes.put(mime, new LongWrapper(bytes));
475 totalDnsMimeTypeDocuments += total;
476 totalDnsMimeSize += bytes;
477 }
478 else {
479 mimeTypeDistribution.put(mime, new LongWrapper(total));
480 mimeTypeBytes.put(mime, new LongWrapper(bytes));
481 totalMimeTypeDocuments += total;
482 totalMimeSize += bytes;
483 }
484 }
485 line = br.readLine();
486 }
487 } catch (IOException e) {
488 logger.log(Level.SEVERE, "Reading " + f.getAbsolutePath(), e);
489 } finally {
490 if (br != null) {
491 try {
492 br.close();
493 } catch (IOException e) {
494 logger.log(Level.SEVERE,
495 "Closing " + f.getAbsolutePath(), e);
496 }
497 }
498 }
499 return true;
500 }
501
502 /***
503 * Read number of URLs and total bytes for each host name from
504 * hosts-report.txt.
505 * Host name of "dns:" is separated from others.
506 * @return true if stats found.
507 */
508 private boolean calculateHostsDistribution() {
509 File f = new File(cjob.getDirectory(), "hosts-report.txt");
510 if (!f.exists()) {
511 return false;
512 }
513 BufferedReader br = null;
514 try {
515 FileReader reader = new FileReader(f);
516 br = new BufferedReader(reader);
517 String line = br.readLine();
518 line = br.readLine();
519 while (line != null) {
520
521
522
523 String[] items = line.split(" ");
524 if (items.length < 3) {
525 logger.log(Level.WARNING,
526 "Unexpected formatting on line [" + line + "]");
527 }
528 else {
529 long total = Long.parseLong(items[0]);
530 long bytes = Long.parseLong(items[1]);
531 String host = items[2];
532
533
534 if (host.startsWith("dns:", 0)) {
535 hostsDnsDistribution.put(host, new LongWrapper(total));
536 hostsDnsBytes.put(host, new LongWrapper(bytes));
537 totalDnsHostDocuments += total;
538 totalDnsHostSize += bytes;
539 }
540 else {
541 hostsDistribution.put(host, new LongWrapper(total));
542 hostsBytes.put(host, new LongWrapper(bytes));
543 totalHostDocuments += total;
544 totalHostSize += bytes;
545
546
547 String tld = host.substring(host.lastIndexOf('.')+1);
548 incrementMapCount(tldDistribution, tld, total);
549 incrementMapCount(tldBytes, tld, bytes);
550 incrementMapCount(tldHostDistribution, tld);
551 totalTldDocuments += total;
552 totalTldSize += bytes;
553
554 totalHosts++;
555 }
556 }
557 line = br.readLine();
558 }
559 } catch (IOException e) {
560 logger.log(Level.SEVERE, "Reading " + f.getAbsolutePath(), e);
561 } finally {
562 if (br != null) {
563 try {
564 br.close();
565 } catch (IOException e) {
566 logger.log(Level.SEVERE,
567 "Closing " + f.getAbsolutePath(), e);
568 }
569 }
570 }
571 return true;
572 }
573
574 /***
575 * Returns the accumulated number of bytes downloaded from a given host.
576 * @param host name of the host
577 * @return the accumulated number of bytes downloaded from a given host
578 */
579 public long getBytesPerHost(String host) {
580 long bytes = -1;
581
582 bytes = host != null && host.startsWith("dns:", 0) ?
583 ((LongWrapper)hostsDnsBytes.get(host)).longValue :
584 ((LongWrapper)hostsBytes.get(host)).longValue;
585
586 return bytes;
587 }
588
589 /***
590 * Returns the total number of bytes downloaded for a given TLD.
591 * @param tld TLD
592 * @return the total number of bytes downloaded for a given TLD
593 */
594 public long getBytesPerTld(String tld) {
595 LongWrapper lw = (LongWrapper)tldBytes.get(tld);
596 return (lw == null ? 0 : lw.longValue);
597 }
598
599 /***
600 * Returns the accumulated number of bytes from files of a given file type.
601 * @param filetype Filetype to check.
602 * @return the accumulated number of bytes from files of a given mime type
603 */
604 public long getBytesPerMimeType(String filetype) {
605 long bytes = -1;
606
607 if (filetype != null) {
608 if (filetype.equals("text/dns")) {
609 bytes = mimeTypeDnsBytes.get(filetype) == null ? 0 :
610 ((LongWrapper)mimeTypeDnsBytes.get(filetype)).longValue;
611 }
612 else {
613 bytes = mimeTypeBytes.get(filetype) == null ? 0 :
614 ((LongWrapper)mimeTypeBytes.get(filetype)).longValue;
615 }
616 }
617 return bytes;
618 }
619
620 /***
621 * Reads duration time, processed docs/sec, bandwidth, and total size
622 * of crawl from crawl-report.txt.
623 * @return true if stats found.
624 */
625 public boolean readCrawlReport() {
626 File f = new File(cjob.getDirectory(), "crawl-report.txt");
627 if (!f.exists()) {
628 return false;
629 }
630 BufferedReader br = null;
631 try {
632 FileReader reader = new FileReader(f);
633 br = new BufferedReader(reader);
634 String line = br.readLine();
635 while (line != null) {
636 if (line.startsWith("Duration Time")) {
637 durationTime = line.substring(line.indexOf(':')+1);
638 }
639 else if (line.startsWith("Processed docs/sec")) {
640 processedDocsPerSec = line.substring(line.indexOf(':')+1);
641 }
642 else if (line.startsWith("Bandwidth in Kbytes/sec")) {
643 bandwidthKbytesPerSec = line.substring(line.indexOf(':')+1);
644 }
645 else if (line.startsWith("Total Raw Data Size in Bytes")) {
646 totalDataWritten = line.substring(line.indexOf(':')+1);
647 }
648
649 line = br.readLine();
650 }
651 }
652 catch (IOException e) {
653 logger.log(Level.SEVERE, "Reading " + f.getAbsolutePath(), e);
654 } finally {
655 if (br != null) {
656 try {
657 br.close();
658 } catch (IOException e) {
659 logger.log(Level.SEVERE,
660 "Failed close of " + f.getAbsolutePath(), e);
661 }
662 }
663 }
664 return true;
665 }
666
667 /***
668 * Returns sorted Iterator of seeds records based on status code.
669 * @return sorted Iterator of seeds records
670 */
671 public Iterator<SeedRecord> getSeedRecordsSortedByStatusCode() {
672 TreeSet<SeedRecord> sortedSet = new TreeSet<SeedRecord>(
673 new Comparator<SeedRecord>() {
674 public int compare(SeedRecord sr1, SeedRecord sr2) {
675 int code1 = sr1.getStatusCode();
676 int code2 = sr2.getStatusCode();
677 if (code1 == code2) {
678
679 return sr1.getUri().compareTo(sr2.getUri());
680 }
681
682
683
684
685 code1 = -code1 - Integer.MAX_VALUE;
686 code2 = -code2 - Integer.MAX_VALUE;
687
688 return new Integer(code1).compareTo(new Integer(code2));
689 }
690 });
691 for (SeedRecord sr: processedSeedsRecords.values()) {
692 sortedSet.add(sr);
693 }
694
695 return sortedSet.iterator();
696 }
697
698 /***
699 * Reads seed data from seeds-report.txt.
700 * @return True if stats found.
701 */
702 private boolean readSeedReport() {
703 File f = new File(cjob.getDirectory(), "seeds-report.txt");
704 if (!f.exists()) {
705 return false;
706 }
707 BufferedReader br = null;
708 try {
709 FileReader reader = new FileReader(f);
710 br = new BufferedReader(reader);
711
712
713 String line = br.readLine();
714 line = br.readLine();
715 while (line != null) {
716
717
718
719
720 String[] items = line.split(" ");
721
722 if (items.length < 3) {
723 logger.log(Level.WARNING,
724 "Unexpected formatting on line [" + line + "]");
725 }
726 else {
727 String statusCode = items[0];
728 String crawlStatus = items[1];
729 String seed = items[2];
730 String redirect = items.length > 3 ? items[3] : null;
731
732
733 if (crawlStatus.equals("CRAWLED")) {
734 crawlStatus =org.archive.crawler.framework.StatisticsTracking.SEED_DISPOSITION_SUCCESS;
735 }
736 else {
737 crawlStatus = org.archive.crawler.framework.StatisticsTracking.SEED_DISPOSITION_FAILURE;
738 }
739 SeedRecord sr = new SeedRecord(seed, crawlStatus,
740 Integer.parseInt(statusCode), redirect);
741 processedSeedsRecords.put(seed, sr);
742 }
743
744 line = br.readLine();
745 }
746 } catch (IOException e) {
747 logger.log(Level.SEVERE, "Reading " + f.getAbsolutePath(), e);
748 } finally {
749 if (br != null) {
750 try {
751 br.close();
752 } catch (IOException e) {
753 logger.log(Level.SEVERE,
754 "Closing " + f.getAbsolutePath(), e);
755 }
756 }
757 }
758 return true;
759 }
760
761 /***
762 * Return a copy of the hosts distribution in reverse-sorted
763 * (largest first) order.
764 *
765 * @return SortedMap of hosts distribution
766 */
767 public SortedMap getReverseSortedHostsDistribution() {
768 return getReverseSortedCopy(hostsDistribution);
769 }
770
771 /***
772 * @return True if we compiled stats, false if none to compile (e.g.
773 * there are no reports files on disk).
774 */
775 public boolean isStats() {
776 return this.stats;
777 }
778 }