/* StatisticsSummary
 * 
 * $Id: StatisticsSummary.java 4666 2006-09-26 17:53:28Z paul_jack $$
 * 
 * Created on July 27, 2006
 * 
 * Copyright (C) 2006 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
25  package org.archive.crawler.admin;
26  
27  import java.io.File;
28  import java.io.FileReader;
29  import java.io.BufferedReader;
30  import java.io.IOException;
31  import java.util.Comparator;
32  import java.util.Hashtable;
33  import java.util.Iterator;
34  import java.util.Map;
35  import java.util.SortedMap;
36  import java.util.TreeMap;
37  import java.util.TreeSet;
38  import java.util.logging.Level;
39  import java.util.logging.Logger;
40  
41  import org.archive.util.LongWrapper;
42  
43  
44  /***
45   * This class provides descriptive statistics of a finished crawl job by
46   * using the crawl report files generated by StatisticsTracker.  Any formatting
47   * changes to the way StatisticsTracker writes to the summary crawl reports will
48   * require changes to this class.
49   * <p>
50   * The following statistics are accessible from this class:
51   * <ul>
52   *   <li> Successfully downloaded documents per fetch status code
53   *   <li> Successfully downloaded documents per document mime type
54   *   <li> Amount of data per mime type
55   *   <li> Successfully downloaded documents per host
56   *   <li> Amount of data per host
57   *   <li> Successfully downloaded documents per top-level domain name (TLD)
58   *   <li> Disposition of all seeds 
59   *   <li> Successfully downloaded documents per host per source
60   * </ul>
61   *
62   * <p>TODO: Make it so summarizing is not done all in RAM so we avoid
63   * OOME.
64   *
65   * @author Frank McCown
66   *
67   * @see org.archive.crawler.admin.StatisticsTracker
68   */
69  public class StatisticsSummary {
70      /***
71       * Messages from the StatisticsSummary.
72       */
73      private final static Logger logger =
74          Logger.getLogger(StatisticsSummary.class.getName());
75      
76      private boolean stats = true;
77      
78      /*** Crawl job whose summary we want to view */
79      private CrawlJob cjob;
80          
81      protected long totalDnsStatusCodeDocuments = 0;
82      protected long totalStatusCodeDocuments = 0;
83      protected long totalFileTypeDocuments = 0;
84      protected long totalMimeTypeDocuments = 0;
85      protected long totalDnsMimeTypeDocuments = 0;
86      protected long totalDnsHostDocuments = 0;
87      protected long totalHostDocuments = 0;
88      protected long totalMimeSize = 0;
89      protected long totalDnsMimeSize = 0;
90      protected long totalHostSize = 0;
91      protected long totalDnsHostSize = 0;
92      protected long totalTldDocuments = 0;
93      protected long totalTldSize = 0;
94      protected long totalHosts = 0;
95      
96      protected String durationTime;
97      protected String processedDocsPerSec;
98      protected String bandwidthKbytesPerSec;
99      protected String totalDataWritten;
100     
101     /*** Keep track of the file types we see (mime type -> count) */
102     protected Hashtable<String,LongWrapper> mimeTypeDistribution = new Hashtable<String,LongWrapper>();
103     protected Hashtable<String,LongWrapper> mimeTypeBytes = new Hashtable<String,LongWrapper>();
104     protected Hashtable<String,LongWrapper> mimeTypeDnsDistribution = new Hashtable<String,LongWrapper>();
105     protected Hashtable<String,LongWrapper> mimeTypeDnsBytes = new Hashtable<String,LongWrapper>();
106     
107     /*** Keep track of status codes */
108     protected Hashtable<String,LongWrapper> statusCodeDistribution = new Hashtable<String,LongWrapper>();
109     protected Hashtable<String,LongWrapper> dnsStatusCodeDistribution
110      = new Hashtable<String,LongWrapper>();
111     
112     /*** Keep track of hosts */
113     protected Hashtable<String,LongWrapper> hostsDistribution = new Hashtable<String,LongWrapper>(); 
114     protected Hashtable<String,LongWrapper> hostsBytes = new Hashtable<String,LongWrapper>(); 
115     protected Hashtable<String,LongWrapper> hostsDnsDistribution = new Hashtable<String,LongWrapper>();
116     protected Hashtable<String,LongWrapper> hostsDnsBytes = new Hashtable<String,LongWrapper>(); 
117 
118     /*** Keep track of TLDs */
119     protected Hashtable<String,LongWrapper> tldDistribution = new Hashtable<String,LongWrapper>();
120     protected Hashtable<String,LongWrapper> tldBytes = new Hashtable<String,LongWrapper>();
121     protected Hashtable<String,LongWrapper> tldHostDistribution = new Hashtable<String,LongWrapper>();
122 
123     /*** Keep track of processed seeds */
124     protected transient Map<String,SeedRecord> processedSeedsRecords 
125      = new Hashtable<String,SeedRecord>();
126 
127     /***
128      * Constructor
129      * 
130      * @param cjob
131      * 				Completed crawl job
132      */
133     public StatisticsSummary(CrawlJob cjob) {
134     	this.cjob = cjob;
135     	
136     	// Read all stats for this crawl job
137     	this.stats = calculateStatusCodeDistribution();
138     	if (calculateMimeTypeDistribution()) {
139     		this.stats = true;
140     	}
141     	if (calculateHostsDistribution()) {
142     		this.stats = true;
143     	}
144     	if (readCrawlReport()) {
145     		this.stats = true;
146     	}
147     	if (readSeedReport()) {
148     		this.stats = true;
149     	}
150     }
151     
152     
153     /***
154      * Increment a counter for a key in a given HashMap. Used for various
155      * aggregate data.
156      *
157      * @param map The HashMap
158      * @param key The key for the counter to be incremented, if it does not
159      *               exist it will be added (set to 1).  If null it will
160      *            increment the counter "unknown".
161      */
162     protected static void incrementMapCount(Map<String,LongWrapper> map, 
163             String key) {
164     	incrementMapCount(map,key,1);
165     }
166 
167     /***
168      * Increment a counter for a key in a given HashMap by an arbitrary amount.
169      * Used for various aggregate data. The increment amount can be negative.
170      *
171      * @param map
172      *            The HashMap
173      * @param key
174      *            The key for the counter to be incremented, if it does not
175      *            exist it will be added (set to equal to
176      *            <code>increment</code>).
177      *            If null it will increment the counter "unknown".
178      * @param increment
179      *            The amount to increment counter related to the
180      *            <code>key</code>.
181      */
182     protected static void incrementMapCount(Map<String,LongWrapper> map, 
183             String key, long increment) {
184         if (key == null) {
185             key = "unknown";
186         }
187         LongWrapper lw = map.get(key);
188         if(lw == null) {
189             map.put(key, new LongWrapper(increment));
190         } else {
191             lw.longValue += increment;
192         }
193     }
194   
195     /*** Returns a HashMap that contains information about distributions of
196      *  encountered mime types.  Key/value pairs represent
197      *  mime type -> count.
198      * <p>
199      * <b>Note:</b> All the values are wrapped with a
200      * {@link LongWrapper LongWrapper}
201      * @return mimeTypeDistribution
202      */
203     public Hashtable getMimeDistribution() {
204         return mimeTypeDistribution;
205     }
206     
207     public long getTotalMimeTypeDocuments() {
208        	return totalMimeTypeDocuments;
209     }
210     
211     public long getTotalDnsMimeTypeDocuments() {
212        	return totalDnsMimeTypeDocuments;
213     }
214     
215     public long getTotalMimeSize() {
216     	return totalMimeSize;
217     }
218     
219     public long getTotalDnsMimeSize() {
220     	return totalDnsMimeSize;
221     }
222    
223     /***
224      * Return a HashMap representing the distribution of HTTP status codes for
225      * successfully fetched curis, as represented by a hashmap where key -&gt;
226      * val represents (string)code -&gt; (integer)count.
227      * 
228      * <b>Note: </b> All the values are wrapped with a
229      * {@link LongWrapper LongWrapper}
230      * 
231      * @return statusCodeDistribution
232      */
233     public Hashtable getStatusCodeDistribution() {    	
234         return statusCodeDistribution;
235     }
236    
237     /***
238      * Return a HashMap representing the distribution of DNS status codes for
239      * successfully fetched curis, as represented by a hashmap where key -&gt;
240      * val represents (string)code -&gt; (integer)count.
241      * 
242      * <b>Note: </b> All the values are wrapped with a
243      * {@link LongWrapper LongWrapper}
244      * 
245      * @return dnsStatusCodeDistribution
246      */
247     public Hashtable getDnsStatusCodeDistribution() {
248     	return dnsStatusCodeDistribution;
249     }
250     
251     public Hashtable getDnsMimeDistribution() {
252         return mimeTypeDnsDistribution;
253     }
254 
255     public long getTotalDnsStatusCodeDocuments() {
256     	return totalDnsStatusCodeDocuments;
257     }
258     
259     public long getTotalStatusCodeDocuments() {
260     	return totalStatusCodeDocuments;
261     }  
262     
263     public long getTotalHostDocuments() {
264        	return totalHostDocuments;
265     }
266     
267     public long getTotalDnsHostDocuments() {
268        	return totalDnsHostDocuments;
269     }
270     
271     public Hashtable getHostsDnsDistribution() {
272     	return hostsDnsDistribution;
273     }
274     
275     public long getTotalHostDnsDocuments() {
276     	return totalDnsHostDocuments;
277     }
278     
279     public long getTotalHostSize() {
280     	return totalHostSize;
281     }
282     
283     public long getTotalDnsHostSize() {
284     	return totalDnsHostSize;
285     }
286     
287     public Hashtable getTldDistribution() {
288     	return tldDistribution;
289     }
290     
291     public Hashtable getTldBytes() {
292     	return tldBytes;
293     }
294     
295     public long getTotalTldDocuments() {
296     	return totalTldDocuments;
297     }
298     
299     public long getTotalTldSize() {
300     	return totalTldSize;
301     }
302     
303     public Hashtable getTldHostDistribution() {
304     	return tldHostDistribution;
305     }
306     
307     public long getTotalHosts() {
308     	return totalHosts;
309     }
310     
311     public String getDurationTime() {
312     	return durationTime;
313     }
314     
315     public String getProcessedDocsPerSec() {
316     	return processedDocsPerSec;
317     }
318     
319     public String getBandwidthKbytesPerSec() {
320     	return bandwidthKbytesPerSec;
321     }
322     
323     public String getTotalDataWritten() {
324     	return totalDataWritten;
325     }
326 
327     /***
328      * Sort the entries of the given HashMap in descending order by their
329      * values, which must be longs wrapped with <code>LongWrapper</code>.
330      * <p>
331      * Elements are sorted by value from largest to smallest. Equal values are
332      * sorted in an arbitrary, but consistent manner by their keys. Only items
333      * with identical value and key are considered equal.
334      *
335      * If the passed-in map requires access to be synchronized, the caller
336      * should ensure this synchronization. 
337      * 
338      * @param mapOfLongWrapperValues
339      *            Assumes values are wrapped with LongWrapper.
340      * @return a sorted set containing the same elements as the map.
341      */
342     public TreeMap<String,LongWrapper> getReverseSortedCopy(
343             final Map<String,LongWrapper> mapOfLongWrapperValues) {
344         TreeMap<String,LongWrapper> sortedMap = new TreeMap<String,LongWrapper>(
345           new Comparator<String>() {
346             public int compare(String e1, String e2) {
347                 long firstVal = mapOfLongWrapperValues.get(e1).longValue;
348                 long secondVal = mapOfLongWrapperValues.get(e2).longValue;
349                 if (firstVal < secondVal) {
350                     return 1;
351                 }
352                 if (secondVal < firstVal) {
353                     return -1;
354                 }
355                 // If the values are the same, sort by keys.
356                 return e1.compareTo(e2);
357             }
358         });
359         try {
360             sortedMap.putAll(mapOfLongWrapperValues);
361         } catch (UnsupportedOperationException e) {
362             for (String key: mapOfLongWrapperValues.keySet()) {
363                 sortedMap.put(key, mapOfLongWrapperValues.get(key));
364             }
365         }
366         return sortedMap;
367     }
368      
369     /***
370      * Get the number of hosts with a particular TLD.
371      * @param tld
372      * 				top-level domain name
373      * @return		Total crawled hosts
374      */
375     public long getHostsPerTld(String tld) {
376     	LongWrapper lw = (LongWrapper)tldHostDistribution.get(tld);
377     	return (lw == null ? 0 : lw.longValue);
378     }
379     
380     /***
381      * Read status code distribution from responsecode-report.txt.
382      * DNS and HTTP status codes are separated when read.
383      * @return True if we found some stats.
384      */
385     private boolean calculateStatusCodeDistribution() {
386     	// Read from responsecode-report.txt
387     	File f = new File(cjob.getDirectory(), "responsecode-report.txt");
388     	if (!f.exists()) {
389     		return false;
390     	}
391     	BufferedReader br = null;
392     	try {
393 	    	FileReader reader = new FileReader(f);
394 	    	br = new BufferedReader(reader);
395 	    	String line = br.readLine();  // Ignore heading
396 	    	line = br.readLine();
397 	    	while (line != null) {  	  
398 	    	  // Get status code and # urls which are seperated by a space
399 	    	  
400 	    	  String[] items = line.split(" ");
401 	    	  if (items.length < 2) {
402 	    		  logger.log(Level.WARNING,
403                           "Unexpected formatting on line [" + line + "]");
404 	    	  }
405 	    	  else {
406 	    		  // See if DNS or HTTP status code
407 	    		  if (items[0].length() < 3) {
408 	    			  // DNS status code
409 	    			  long total = Long.parseLong(items[1]);
410 	    			  dnsStatusCodeDistribution.put(items[0], 
411 	    					  new LongWrapper(total));
412 	    			  totalDnsStatusCodeDocuments += total;
413 	    		  }
414 	    		  else {
415 	    			  // HTTP status code
416 	    			  long total = Long.parseLong(items[1]);
417 	    			  statusCodeDistribution.put(items[0], 
418 	    					  new LongWrapper(total));
419 	    			  totalStatusCodeDocuments += total;
420 	    		  }
421 	    	  }
422 	    	  line = br.readLine();
423 	    	}
424     	} catch (IOException e) {
425     		logger.log(Level.SEVERE, "Unable to read " + f.getAbsolutePath(),
426     			e);
427     	} finally {
428     		if (br != null) {
429     			try {
430 					br.close();
431 				} catch (IOException e) {
432 					logger.log(Level.SEVERE,
433 						"Closing " + f.getAbsolutePath(), e);
434 				}
435     		}
436     	}
437     	return true;
438     }
439     
440     /***
441      * Read MIME type data from mimetype-report.txt.
442      * MIME type of text/dns is separated from other MIME types.
443      * @return True if we found some stats.
444      */
445     private boolean calculateMimeTypeDistribution() {    	
446     	File f = new File(cjob.getDirectory(), "mimetype-report.txt");
447     	if (!f.exists()) {
448     		return false;
449     	}
450     	BufferedReader br = null;
451     	try {
452 	    	FileReader reader = new FileReader(f);
453 	    	br = new BufferedReader(reader);
454 	    	String line = br.readLine();  // Ignore heading
455 	    	line = br.readLine();
456 	    	while (line != null) {	    			    	  
457 	    		// Get num urls, num bytes, and MIME type (seperated by a space)
458 	    		// Example: 12 134279 text/html
459   
460 	    		String[] items = line.split(" ");
461 	    		if (items.length < 3) {
462 	    			logger.log(Level.WARNING,
463                             "Unexpected formatting on line [" + line + "]");
464 	    		}
465 	    		else {
466 	    			long total = Long.parseLong(items[0]);
467 	    			long bytes = Long.parseLong(items[1]);
468 	    			String mime = items[2];
469 
470 	    			// Seperate DNS reconrds from HTTP
471 	    			if (mime.equalsIgnoreCase("text/dns")) {
472 	    				mimeTypeDnsDistribution.put(mime,
473                                 new LongWrapper(total));
474 	    				mimeTypeDnsBytes.put(mime, new LongWrapper(bytes));
475 	    				totalDnsMimeTypeDocuments += total;
476 	    				totalDnsMimeSize += bytes;
477 	    			}
478 	    			else {
479 	    				mimeTypeDistribution.put(mime, new LongWrapper(total));
480 	    				mimeTypeBytes.put(mime, new LongWrapper(bytes));
481 	    				totalMimeTypeDocuments += total;
482 	    				totalMimeSize += bytes;
483 	    			}
484 	    		}
485 	    		line = br.readLine();
486 	    	}
487     	} catch (IOException e) {
488     		logger.log(Level.SEVERE, "Reading " + f.getAbsolutePath(), e);
489     	} finally {
490     		if (br != null) {
491     			try {
492     				br.close();
493     			} catch (IOException e) {
494     				logger.log(Level.SEVERE,
495     					"Closing " + f.getAbsolutePath(), e);
496     			}
497     		}
498     	}
499     	return true;
500     }
501     
502     /***
503      * Read number of URLs and total bytes for each host name from
504      * hosts-report.txt.
505      * Host name of "dns:" is separated from others.
506      * @return true if stats found.
507      */
508     private boolean calculateHostsDistribution() {
509     	File f = new File(cjob.getDirectory(), "hosts-report.txt");
510     	if (!f.exists()) {
511     		return false;
512     	}
513     	BufferedReader br = null;
514     	try {
515 	    	FileReader reader = new FileReader(f);
516 	    	br = new BufferedReader(reader);
517 	    	String line = br.readLine();  // Ignore heading
518 	    	line = br.readLine();
519 	    	while (line != null) {    	  
520 	    		// Get num urls, num bytes, and host name (seperated by a space)
521 	    		// Example: 9 7468 www.blogger.com
522 
523 	    		String[] items = line.split(" ");
524 	    		if (items.length < 3) {
525 	    			logger.log(Level.WARNING,
526                             "Unexpected formatting on line [" + line + "]");
527 	    		}
528 	    		else {
529 	    			long total = Long.parseLong(items[0]);
530 	    			long bytes = Long.parseLong(items[1]);
531 	    			String host = items[2];
532 
533 	    			// Seperate DNS reconrds from HTTP
534 	    			if (host.startsWith("dns:", 0)) {
535 	    				hostsDnsDistribution.put(host, new LongWrapper(total));
536 	    				hostsDnsBytes.put(host, new LongWrapper(bytes));
537 	    				totalDnsHostDocuments += total;
538 	    				totalDnsHostSize += bytes;
539 	    			}
540 	    			else {
541 	    				hostsDistribution.put(host, new LongWrapper(total));
542 	    				hostsBytes.put(host, new LongWrapper(bytes));
543 	    				totalHostDocuments += total;
544 	    				totalHostSize += bytes;
545 
546 	    				// Count top level domain (TLD)
547 	    				String tld = host.substring(host.lastIndexOf('.')+1);
548 	    				incrementMapCount(tldDistribution, tld, total);   
549 	    				incrementMapCount(tldBytes, tld, bytes);
550 	    				incrementMapCount(tldHostDistribution, tld);
551 	    				totalTldDocuments += total;
552 	    				totalTldSize += bytes;
553 
554 	    				totalHosts++;
555 	    			}
556 	    		}
557 	    		line = br.readLine();
558 	    	}
559     	} catch (IOException e) {
560     		logger.log(Level.SEVERE, "Reading " + f.getAbsolutePath(), e);
561     	} finally {
562     		if (br != null) {
563     			try {
564     				br.close();
565     			} catch (IOException e) {
566     				logger.log(Level.SEVERE,
567     					"Closing " + f.getAbsolutePath(), e);
568     			}
569     		}
570     	}
571     	return true;
572     }
573 
574     /***
575      * Returns the accumulated number of bytes downloaded from a given host.
576      * @param host name of the host
577      * @return the accumulated number of bytes downloaded from a given host
578      */
579     public long getBytesPerHost(String host) { 
580     	long bytes = -1;
581     	
582     	bytes = host != null && host.startsWith("dns:", 0) ? 
583 	    	((LongWrapper)hostsDnsBytes.get(host)).longValue :
584 	    	((LongWrapper)hostsBytes.get(host)).longValue;	    
585     	
586     	return bytes;
587     }
588     
589     /***
590      * Returns the total number of bytes downloaded for a given TLD.
591      * @param tld TLD
592      * @return the total number of bytes downloaded for a given TLD
593      */
594     public long getBytesPerTld(String tld) {
595     	LongWrapper lw = (LongWrapper)tldBytes.get(tld);
596     	return (lw == null ? 0 : lw.longValue);
597     }
598 
599     /***
600      * Returns the accumulated number of bytes from files of a given file type.
601      * @param filetype Filetype to check.
602      * @return the accumulated number of bytes from files of a given mime type
603      */
604     public long getBytesPerMimeType(String filetype) {
605     	long bytes = -1;
606     	
607     	if (filetype != null) {    	
608 	    	if (filetype.equals("text/dns")) {	    		
609 	    		bytes = mimeTypeDnsBytes.get(filetype) == null ? 0 :
610 	    			((LongWrapper)mimeTypeDnsBytes.get(filetype)).longValue;
611 	    	}
612 	    	else {
613 	    		bytes = mimeTypeBytes.get(filetype) == null ? 0 :
614 	    			((LongWrapper)mimeTypeBytes.get(filetype)).longValue;
615 	    	}
616     	}
617     	return bytes;
618     }
619     
620     /***
621      * Reads duration time, processed docs/sec, bandwidth, and total size
622      * of crawl from crawl-report.txt.
623      * @return true if stats found.
624      */
625     public boolean readCrawlReport() {
626     	File f = new File(cjob.getDirectory(), "crawl-report.txt");
627     	if (!f.exists()) {
628     		return false;
629     	}
630     	BufferedReader br = null;
631     	try {
632 	    	FileReader reader = new FileReader(f);
633 	    	br = new BufferedReader(reader);
634 	    	String line = br.readLine();  
635 	    	while (line != null) {
636 	    		if (line.startsWith("Duration Time")) {
637 	    			durationTime = line.substring(line.indexOf(':')+1);
638 	    		}
639 	    		else if (line.startsWith("Processed docs/sec")) {
640 	    			processedDocsPerSec = line.substring(line.indexOf(':')+1);
641 	    		}
642 	    		else if (line.startsWith("Bandwidth in Kbytes/sec")) {
643 	    			bandwidthKbytesPerSec = line.substring(line.indexOf(':')+1);
644 	    		}
645 	    		else if (line.startsWith("Total Raw Data Size in Bytes")) {
646 	    			totalDataWritten = line.substring(line.indexOf(':')+1);
647 	    		}
648 
649 	    		line = br.readLine();
650 	    	}
651     	}
652     	catch (IOException e) {
653     		logger.log(Level.SEVERE, "Reading " + f.getAbsolutePath(), e);		
654     	} finally {
655     		if (br != null) {
656     			try {
657 					br.close();
658 				} catch (IOException e) {
659 					logger.log(Level.SEVERE,
660 					    "Failed close of " + f.getAbsolutePath(), e);
661 				}
662     		}
663     	}
664     	return true;
665     }
666   
667     /***
668      * Returns sorted Iterator of seeds records based on status code.
669      * @return sorted Iterator of seeds records
670      */
671     public Iterator<SeedRecord> getSeedRecordsSortedByStatusCode() {
672         TreeSet<SeedRecord> sortedSet = new TreeSet<SeedRecord>(
673           new Comparator<SeedRecord>() {
674             public int compare(SeedRecord sr1, SeedRecord sr2) {
675                 int code1 = sr1.getStatusCode();
676                 int code2 = sr2.getStatusCode();
677                 if (code1 == code2) {
678                     // If the values are equal, sort by URIs.
679                     return sr1.getUri().compareTo(sr2.getUri());
680                 }
681                 // mirror and shift the nubmer line so as to
682                 // place zero at the beginning, then all negatives 
683                 // in order of ascending absolute value, then all 
684                 // positives descending
685                 code1 = -code1 - Integer.MAX_VALUE;
686                 code2 = -code2 - Integer.MAX_VALUE;
687                 
688                 return new Integer(code1).compareTo(new Integer(code2));
689             }
690         });
691         for (SeedRecord sr: processedSeedsRecords.values()) {
692             sortedSet.add(sr);
693         }
694         
695         return sortedSet.iterator();
696     }
697     
698     /***
699      * Reads seed data from seeds-report.txt.
700      * @return True if stats found.
701      */
702     private boolean readSeedReport() {
703     	File f = new File(cjob.getDirectory(), "seeds-report.txt");
704     	if (!f.exists()) {
705     		return false;
706     	}
707     	BufferedReader br = null;
708     	try {
709 	    	FileReader reader = new FileReader(f);
710 	    	br = new BufferedReader(reader);
711 	    	
712 	    	// Ignore heading: [code] [status] [seed] [redirect]
713 	    	String line = br.readLine();  
714 	    	line = br.readLine();
715 	    	while (line != null) {
716 	    		// Example lines:
717 	    		// 302 CRAWLED http://www.ashlandcitytimes.com/ http://www.ashlandcitytimes.com/apps/pbcs.dll/section?Category=MTCN01
718 	    		// 200 CRAWLED http://noleeo.com/
719 
720 	    		String[] items = line.split(" ");
721 
722 	    		if (items.length < 3) {
723 	    			logger.log(Level.WARNING,
724                             "Unexpected formatting on line [" + line + "]");
725 	    		}
726 	    		else {
727 	    			String statusCode = items[0];
728 	    			String crawlStatus = items[1];
729 	    			String seed = items[2];
730 	    			String redirect = items.length > 3 ? items[3] : null;
731 
732 	    			// All values should be CRAWLED or NOTCRAWLED
733 	    			if (crawlStatus.equals("CRAWLED")) {
734 	    				crawlStatus =org.archive.crawler.framework.StatisticsTracking.SEED_DISPOSITION_SUCCESS;	    		  
735 	    			}
736 	    			else {
737 	    				crawlStatus = org.archive.crawler.framework.StatisticsTracking.SEED_DISPOSITION_FAILURE;
738 	    			}
739 	    			SeedRecord sr = new SeedRecord(seed, crawlStatus, 
740 	    					Integer.parseInt(statusCode), redirect);
741 	    			processedSeedsRecords.put(seed, sr);
742 	    		}
743 
744 	    		line = br.readLine();
745 	    	}
746     	} catch (IOException e) {
747     		logger.log(Level.SEVERE, "Reading " + f.getAbsolutePath(), e);   		
748     	} finally {
749     		if (br != null) {
750     			try {
751 					br.close();
752 				} catch (IOException e) {
753 					logger.log(Level.SEVERE,
754 						"Closing " + f.getAbsolutePath(), e);
755 				}
756     		}
757     	}
758     	return true;
759     }
760         
761     /***
762      * Return a copy of the hosts distribution in reverse-sorted
763      * (largest first) order.
764      *  
765      * @return SortedMap of hosts distribution
766      */
767     public SortedMap getReverseSortedHostsDistribution() {
768         return getReverseSortedCopy(hostsDistribution);  
769     }    
770     
771     /***
772      * @return True if we compiled stats, false if none to compile (e.g.
773      * there are no reports files on disk).
774      */
775     public boolean isStats() {
776     	return this.stats;
777     }
778 }