/* Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * Created on Jul 16, 2003
 *
 */
package org.archive.crawler.admin;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Serializable;
import java.util.Comparator;
import java.util.Date;
import java.util.EventObject;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.Vector;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.collections.Closure;
import org.archive.crawler.datamodel.CrawlHost;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.event.CrawlURIDispositionListener;
import org.archive.crawler.framework.AbstractTracker;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.util.CrawledBytesHistotable;
import org.archive.net.UURI;
import org.archive.util.ArchiveUtils;
import org.archive.util.LongWrapper;
import org.archive.util.MimetypeUtils;
import org.archive.util.PaddingStringBuffer;

/**
 * An implementation of the AbstractTracker. It is designed to work with the
 * WUI as well as to perform various logging activities.
 * <p>
 * At the end of each snapshot a line is written to the
 * 'progress-statistics.log' file.
 * <p>
 * The header of that file is as follows:
 * <pre> [timestamp] [discovered]    [queued] [downloaded] [doc/s(avg)]  [KB/s(avg)] [dl-failures] [busy-thread] [mem-use-KB]</pre>
 * First there is a <b>timestamp</b>, accurate down to 1 second.
 * <p>
 * <b>discovered</b>, <b>queued</b>, <b>downloaded</b> and <b>dl-failures</b>
 * are (respectively) the discovered URI count, pending URI count, successfully
 * fetched count and failed fetch count from the frontier at the time of the
 * snapshot.
 * <p>
 * <b>KB/s(avg)</b> is the bandwidth usage. We use the total bytes downloaded
 * to calculate average bandwidth usage (KB/sec). Since we also note the value
 * each time a snapshot is made, we can calculate the average bandwidth usage
 * during the last snapshot period to gain a "current" rate. The first number
 * is the current rate and the average is in parentheses.
 * <p>
 * <b>doc/s(avg)</b> works the same way as <b>KB/s(avg)</b> except it shows
 * the number of documents (URIs) rather than KB downloaded.
 * <p>
 * <b>busy-thread</b> is the total number of ToeThreads that are not available
 * (and thus presumably busy processing a URI). This information is extracted
 * from the crawl controller.
 * <p>
 * Finally, <b>mem-use-KB</b> is extracted from the runtime environment
 * (<code>Runtime.getRuntime().totalMemory()</code>).
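 * <p>
 * For illustration only, a line in this format might look like the
 * following (all values are invented for the example):
 * <pre> 20050601123456         1532         1245          287 1.50(1.20)      85(70)            0           25        16384</pre>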
 * <p>
 * In addition to the data collected for the above logs, various other data
 * is gathered and stored by this tracker.
 * <ul>
 *   <li> Successfully downloaded documents per fetch status code
 *   <li> Successfully downloaded documents per document mime type
 *   <li> Amount of data per mime type
 *   <li> Successfully downloaded documents per host
 *   <li> Amount of data per host
 *   <li> Disposition of all seeds (this is written to 'reports.log' at end of
 *        crawl)
 *   <li> Successfully downloaded documents per host per source
 * </ul>
 *
 * @author Parker Thompson
 * @author Kristinn Sigurdsson
 *
 * @see org.archive.crawler.framework.StatisticsTracking
 * @see org.archive.crawler.framework.AbstractTracker
 */
public class StatisticsTracker extends AbstractTracker
implements CrawlURIDispositionListener, Serializable {
    private static final long serialVersionUID = 8004878315916392305L;

    /**
     * Logger for messages from the StatisticsTracker.
     */
    private final static Logger logger =
        Logger.getLogger(StatisticsTracker.class.getName());

    // TODO: Need to be able to specify the file where the object will be
    // written once the CrawlEnded event occurs.

    protected long lastPagesFetchedCount = 0;
    protected long lastProcessedBytesCount = 0;

    /*
     * Snapshot data.
     */
    protected long discoveredUriCount = 0;
    protected long queuedUriCount = 0;
    protected long finishedUriCount = 0;

    protected long downloadedUriCount = 0;
    protected long downloadFailures = 0;
    protected long downloadDisregards = 0;
    protected double docsPerSecond = 0;
    protected double currentDocsPerSecond = 0;
    protected int currentKBPerSec = 0;
    protected long totalKBPerSec = 0;
    protected int busyThreads = 0;
    protected long totalProcessedBytes = 0;
    protected float congestionRatio = 0;
    protected long deepestUri;
    protected long averageDepth;

    /*
     * Cumulative data
     */
    /** Tally of content bytes: novel, duplicate (same hash), and
     * not-modified. */
    protected CrawledBytesHistotable crawledBytes = new CrawledBytesHistotable();

    /** Keep track of the file types we see (mime type -> count) */
    protected Hashtable<String,LongWrapper> mimeTypeDistribution
     = new Hashtable<String,LongWrapper>();
    protected Hashtable<String,LongWrapper> mimeTypeBytes
     = new Hashtable<String,LongWrapper>();

    /** Keep track of fetch status codes */
    protected Hashtable<String,LongWrapper> statusCodeDistribution
     = new Hashtable<String,LongWrapper>();

    /** Keep track of hosts.
     *
     * Each of these Maps is individually unsynchronized, and cannot
     * be trivially synchronized with the Collections wrapper. Thus
     * their synchronized access is enforced by this class.
     *
     * <p>They're transient because they are usually bigmaps that get
     * reconstituted on recovery from a checkpoint.
     */
    protected transient Map<String,LongWrapper> hostsDistribution = null;
    protected transient Map<String,LongWrapper> hostsBytes = null;
    protected transient Map<String,Long> hostsLastFinished = null;

    /** Keep track of URL counts per host per seed */
    protected transient
    Map<String,HashMap<String,LongWrapper>> sourceHostDistribution = null;

    /**
     * Record of seeds' latest actions.
     */
    protected transient Map<String,SeedRecord> processedSeedsRecords;

    // Seed tallies: ONLY UPDATED WHEN THE SEED REPORT IS WRITTEN.
    private int seedsCrawled;
    private int seedsNotCrawled;
    // sExitMessage: only set at crawl end.
    private String sExitMessage = "Before crawl end";

    public StatisticsTracker(String name) {
        super(name, "A statistics tracker that is integrated into " +
            "the web UI and that creates the progress-statistics log.");
    }

    public void initialize(CrawlController c)
    throws FatalConfigurationException {
        super.initialize(c);
        try {
            this.sourceHostDistribution = c.getBigMap("sourceHostDistribution",
                String.class, HashMap.class);
            this.hostsDistribution = c.getBigMap("hostsDistribution",
                String.class, LongWrapper.class);
            this.hostsBytes = c.getBigMap("hostsBytes", String.class,
                LongWrapper.class);
            this.hostsLastFinished = c.getBigMap("hostsLastFinished",
                String.class, Long.class);
            this.processedSeedsRecords = c.getBigMap("processedSeedsRecords",
                String.class, SeedRecord.class);
        } catch (Exception e) {
            throw new FatalConfigurationException("Failed setup of" +
                " StatisticsTracker: " + e);
        }
        controller.addCrawlURIDispositionListener(this);
    }

    protected void finalCleanup() {
        super.finalCleanup();
        if (this.hostsBytes != null) {
            this.hostsBytes.clear();
            this.hostsBytes = null;
        }
        if (this.hostsDistribution != null) {
            this.hostsDistribution.clear();
            this.hostsDistribution = null;
        }
        if (this.hostsLastFinished != null) {
            this.hostsLastFinished.clear();
            this.hostsLastFinished = null;
        }
        if (this.processedSeedsRecords != null) {
            this.processedSeedsRecords.clear();
            this.processedSeedsRecords = null;
        }
        if (this.sourceHostDistribution != null) {
            this.sourceHostDistribution.clear();
            this.sourceHostDistribution = null;
        }
    }

    protected synchronized void progressStatisticsEvent(final EventObject e) {
        // This method loads "snapshot" data.
        discoveredUriCount = discoveredUriCount();
        downloadedUriCount = successfullyFetchedCount();
        finishedUriCount = finishedUriCount();
        queuedUriCount = queuedUriCount();
        downloadFailures = failedFetchAttempts();
        downloadDisregards = disregardedFetchAttempts();
        totalProcessedBytes = totalBytesCrawled();
        congestionRatio = congestionRatio();
        deepestUri = deepestUri();
        averageDepth = averageDepth();

        if (finishedUriCount() == 0) {
            docsPerSecond = 0;
            totalKBPerSec = 0;
        } else if (getCrawlerTotalElapsedTime() < 1000) {
            return; // Not enough time has passed for a decent snapshot.
        } else {
            docsPerSecond = (double) downloadedUriCount /
                (double)(getCrawlerTotalElapsedTime() / 1000);
            // Note: the operands are longs, so these divisions truncate to
            // whole KB/sec before the + .5 is applied.
            totalKBPerSec = (long)(((totalProcessedBytes / 1024) /
                 ((getCrawlerTotalElapsedTime()) / 1000)) + .5 );
        }

        busyThreads = activeThreadCount();

        if (shouldrun ||
            (System.currentTimeMillis() - lastLogPointTime) >= 1000) {
            // If shouldrun is false there is a chance that the time interval
            // since last time is too small for a good sample.  We only want
            // to update "current" data when the interval is long enough or
            // shouldrun is true.
            currentDocsPerSecond = 0;
            currentKBPerSec = 0;

            // Note time.
            long currentTime = System.currentTimeMillis();
            long sampleTime = currentTime - lastLogPointTime;

            // If we haven't done anything, or there isn't a reasonable
            // sample size, give up.
            if (sampleTime >= 1000) {
                // Update docs/sec snapshot.
                long currentPageCount = successfullyFetchedCount();
                long samplePageCount = currentPageCount - lastPagesFetchedCount;

                currentDocsPerSecond =
                    (double) samplePageCount / (double)(sampleTime / 1000);

                lastPagesFetchedCount = currentPageCount;

                // Update kbytes/sec snapshot.
                long currentProcessedBytes = totalProcessedBytes;
                long sampleProcessedBytes =
                    currentProcessedBytes - lastProcessedBytesCount;

                currentKBPerSec =
                    (int)(((sampleProcessedBytes/1024)/(sampleTime/1000)) + .5);

                lastProcessedBytesCount = currentProcessedBytes;
            }
        }

        if (this.controller != null) {
            this.controller.logProgressStatistics(getProgressStatisticsLine());
        }
        lastLogPointTime = System.currentTimeMillis();
        super.progressStatisticsEvent(e);
    }

    /**
     * Return one line of current progress-statistics.
     *
     * @param now Time to use for the line's timestamp.
     * @return String of stats
     */
    public String getProgressStatisticsLine(Date now) {
        return new PaddingStringBuffer()
            .append(ArchiveUtils.getLog14Date(now))
            .raAppend(32, discoveredUriCount)
            .raAppend(44, queuedUriCount)
            .raAppend(57, downloadedUriCount)
            .raAppend(74, ArchiveUtils.
                doubleToString(currentDocsPerSecond, 2) +
                "(" + ArchiveUtils.doubleToString(docsPerSecond, 2) + ")")
            .raAppend(85, currentKBPerSec + "(" + totalKBPerSec + ")")
            .raAppend(99, downloadFailures)
            .raAppend(113, busyThreads)
            .raAppend(126, (Runtime.getRuntime().totalMemory() -
                Runtime.getRuntime().freeMemory()) / 1024)
            .raAppend(140, Runtime.getRuntime().totalMemory() / 1024)
            .raAppend(153, ArchiveUtils.doubleToString(congestionRatio, 2))
            .raAppend(165, deepestUri)
            .raAppend(177, averageDepth)
            .toString();
    }

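    /**
     * Snapshot of the most recent progress figures as a map of
     * statistic name to its current (boxed) numeric value.
     *
     * @return map of statistic name to current value
     */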
    public Map<String,Number> getProgressStatistics() {
        Map<String,Number> stats = new HashMap<String,Number>();
        stats.put("discoveredUriCount", new Long(discoveredUriCount));
        stats.put("queuedUriCount", new Long(queuedUriCount));
        stats.put("downloadedUriCount", new Long(downloadedUriCount));
        stats.put("currentDocsPerSecond", new Double(currentDocsPerSecond));
        stats.put("docsPerSecond", new Double(docsPerSecond));
        stats.put("totalKBPerSec", new Long(totalKBPerSec));
        stats.put("totalProcessedBytes", new Long(totalProcessedBytes));
        stats.put("currentKBPerSec", new Long(currentKBPerSec));
        stats.put("downloadFailures", new Long(downloadFailures));
        stats.put("busyThreads", new Integer(busyThreads));
        stats.put("congestionRatio", new Double(congestionRatio));
        stats.put("deepestUri", new Long(deepestUri));
        stats.put("averageDepth", new Long(averageDepth));
        stats.put("totalMemory", new Long(Runtime.getRuntime().totalMemory()));
        stats.put("freeMemory", new Long(Runtime.getRuntime().freeMemory()));
        return stats;
    }

    /**
     * Return one line of current progress-statistics.
     *
     * @return String of stats
     */
    public String getProgressStatisticsLine() {
        return getProgressStatisticsLine(new Date());
    }

    public double processedDocsPerSec(){
        return docsPerSecond;
    }

    public double currentProcessedDocsPerSec(){
        return currentDocsPerSecond;
    }

    public long processedKBPerSec(){
        return totalKBPerSec;
    }

    public int currentProcessedKBPerSec(){
        return currentKBPerSec;
    }

    /** Returns a Hashtable that contains information about distributions of
     *  encountered mime types.  Key/value pairs represent
     *  mime type -> count.
     * <p>
     * <b>Note:</b> All the values are wrapped with a {@link LongWrapper LongWrapper}.
     * @return mimeTypeDistribution
     */
    public Hashtable<String,LongWrapper> getFileDistribution() {
        return mimeTypeDistribution;
    }

    /**
     * Increment a counter for a key in a given Map. Used for various
     * aggregate data.
     *
     * As this is used to change Maps which depend on StatisticsTracker
     * for their synchronization, this method should only be invoked
     * from a block synchronized on 'this'.
     *
     * @param map The Map
     * @param key The key for the counter to be incremented; if it does not
     *            exist it will be added (set to 1).  If null it will
     *            increment the counter "unknown".
     */
    protected static void incrementMapCount(Map<String,LongWrapper> map,
            String key) {
        incrementMapCount(map, key, 1);
    }

    /**
     * Increment a counter for a key in a given Map by an arbitrary amount.
     * Used for various aggregate data. The increment amount can be negative.
     *
     * As this is used to change Maps which depend on StatisticsTracker
     * for their synchronization, this method should only be invoked
     * from a block synchronized on 'this'.
     *
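     * <p>For illustration only (the map contents here are hypothetical):
     * <pre>
     * incrementMapCount(map, "text/html");    // "text/html" count += 1
     * incrementMapCount(map, null, 5);        // "unknown" count += 5
     * </pre>
     *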
     * @param map
     *            The Map
     * @param key
     *            The key for the counter to be incremented; if it does not
     *            exist it will be added (set equal to <code>increment</code>).
     *            If null it will increment the counter "unknown".
     * @param increment
     *            The amount to increment the counter related to the
     *            <code>key</code>.
     */
    protected static void incrementMapCount(Map<String,LongWrapper> map,
            String key, long increment) {
        if (key == null) {
            key = "unknown";
        }
        LongWrapper lw = (LongWrapper)map.get(key);
        if (lw == null) {
            map.put(key, new LongWrapper(increment));
        } else {
            lw.longValue += increment;
        }
    }

    /**
     * Sort the entries of the given Map in descending order by their
     * values, which must be longs wrapped with <code>LongWrapper</code>.
     * <p>
     * Elements are sorted by value from largest to smallest. Equal values are
     * sorted in an arbitrary, but consistent, manner by their keys. Only items
     * with identical value and key are considered equal.
     *
     * If the passed-in map requires access to be synchronized, the caller
     * should ensure this synchronization.
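     *
     * <p>Illustrative use (the map contents are whatever has been tallied):
     * <pre>
     * TreeMap&lt;String,LongWrapper&gt; sorted = getReverseSortedCopy(map);
     * String top = sorted.firstKey(); // key with the largest value
     * </pre>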
     *
     * @param mapOfLongWrapperValues
     *            Assumes values are wrapped with LongWrapper.
     * @return a sorted map containing the same elements as the passed-in map.
     */
    public TreeMap<String,LongWrapper> getReverseSortedCopy(
            final Map<String,LongWrapper> mapOfLongWrapperValues) {
        TreeMap<String,LongWrapper> sortedMap =
          new TreeMap<String,LongWrapper>(new Comparator<String>() {
            public int compare(String e1, String e2) {
                long firstVal = mapOfLongWrapperValues.get(e1).
                    longValue;
                long secondVal = mapOfLongWrapperValues.get(e2).
                    longValue;
                if (firstVal < secondVal) {
                    return 1;
                }
                if (secondVal < firstVal) {
                    return -1;
                }
                // If the values are the same, sort by keys.
                return e1.compareTo(e2);
            }
        });
        try {
            sortedMap.putAll(mapOfLongWrapperValues);
        } catch (UnsupportedOperationException e) {
            // OK. Try doing it the slow way then.
            Iterator<String> i = mapOfLongWrapperValues.keySet().iterator();
            while (i.hasNext()) {
                String key = i.next();
                sortedMap.put(key, mapOfLongWrapperValues.get(key));
            }
        }
        return sortedMap;
    }

    /**
     * Return a Hashtable representing the distribution of status codes for
     * successfully fetched curis, as represented by a hashtable where key -&gt;
     * val represents (string)code -&gt; (long)count.
     *
     * <b>Note: </b> All the values are wrapped with a
     * {@link LongWrapper LongWrapper}.
     *
     * @return statusCodeDistribution
     */
    public Hashtable<String,LongWrapper> getStatusCodeDistribution() {
        return statusCodeDistribution;
    }

    /**
     * Returns the time (in millisec) when a URI belonging to a given host was
     * last finished processing.
     *
     * @param host The host to look up time of last completed URI.
     * @return Returns the time (in millisec) when a URI belonging to a given
     * host was last finished processing. If no URI has been completed for the
     * host, -1 will be returned.
     */
    public long getHostLastFinished(String host){
        Long l = null;
        synchronized(hostsLastFinished){
            l = (Long)hostsLastFinished.get(host);
        }
        return (l != null)? l.longValue(): -1;
    }

    /**
     * Returns the accumulated number of bytes downloaded from a given host.
     * @param host name of the host
     * @return the accumulated number of bytes downloaded from a given host
     */
    public long getBytesPerHost(String host){
        synchronized(hostsBytes){
            return ((LongWrapper)hostsBytes.get(host)).longValue;
        }
    }

    /**
     * Returns the accumulated number of bytes from files of a given file type.
     * @param filetype Filetype to check.
     * @return the accumulated number of bytes from files of a given mime type
     */
    public long getBytesPerFileType(String filetype){
        return ((LongWrapper)mimeTypeBytes.get(filetype)).longValue;
    }

    /**
     * Get the total number of ToeThreads (sleeping and active).
     *
     * @return The total number of ToeThreads
     */
    public int threadCount() {
        return this.controller != null? controller.getToeCount(): 0;
    }

    /**
     * @return Current thread count (or zero if can't figure it out).
     */
    public int activeThreadCount() {
        return this.controller != null? controller.getActiveToeCount(): 0;
        // Note: reuse of the old busy value seemed misleading: anyone asking
        // for the thread count when paused or stopped still wants an
        // accurate reading.
    }

    /**
     * This returns the number of completed URIs as a percentage of the total
     * number of URIs encountered (should be inverse to the discovery curve).
     *
     * @return The number of completed URIs as a percentage of the total
     * number of URIs encountered
     */
    public int percentOfDiscoveredUrisCompleted() {
        long completed = finishedUriCount();
        long total = discoveredUriCount();

        if (total == 0) {
            return 0;
        }

        return (int) (100 * completed / total);
    }

    /**
     * Number of <i>discovered</i> URIs.
     *
     * <p>If the crawl is not running (paused or stopped) this will return the
     * value of the last snapshot.
     *
     * @return A count of all URIs encountered
     *
     * @see org.archive.crawler.framework.Frontier#discoveredUriCount()
     */
    public long discoveredUriCount() {
        // While shouldrun is true we can use info direct from the crawler.
        // After that our last snapshot will have to do.
        return shouldrun && this.controller != null &&
                this.controller.getFrontier() != null?
            controller.getFrontier().discoveredUriCount() : discoveredUriCount;
    }

    /**
     * Number of URIs that have <i>finished</i> processing.
     *
     * @return Number of URIs that have finished processing
     *
     * @see org.archive.crawler.framework.Frontier#finishedUriCount()
     */
    public long finishedUriCount() {
        return shouldrun && this.controller != null &&
                this.controller.getFrontier() != null ?
            controller.getFrontier().finishedUriCount() : finishedUriCount;
    }

    /**
     * Get the total number of failed fetch attempts (connection failures ->
     * give up, etc.).
     *
     * @return The total number of failed fetch attempts
     */
    public long failedFetchAttempts() {
        // While shouldrun is true we can use info direct from the crawler.
        // After that our last snapshot will have to do.
        return shouldrun && this.controller != null &&
                this.controller.getFrontier() != null ?
            controller.getFrontier().failedFetchCount() : downloadFailures;
    }

    /**
     * Get the total number of disregarded fetch attempts (robots exclusions,
     * etc.).
     *
     * @return The total number of disregarded fetch attempts
     */
    public long disregardedFetchAttempts() {
        // While shouldrun is true we can use info direct from the crawler.
        // After that our last snapshot will have to do.
        return shouldrun && this.controller != null &&
                this.controller.getFrontier() != null?
            controller.getFrontier().disregardedUriCount() : downloadDisregards;
    }

    public long successfullyFetchedCount() {
        // While shouldrun is true we can use info direct from the crawler.
        // After that our last snapshot will have to do.
        return shouldrun && this.controller != null &&
                this.controller.getFrontier() != null?
            controller.getFrontier().succeededFetchCount() : downloadedUriCount;
    }

    public long totalCount() {
        return queuedUriCount() + activeThreadCount() +
            successfullyFetchedCount();
    }

    /**
     * Ratio of the number of threads that would theoretically allow
     * maximum crawl progress (if each was as productive as current
     * threads), to the current number of threads.
     *
     * @return float congestion ratio
     */
    public float congestionRatio() {
        // While shouldrun is true we can use info direct from the crawler.
        // After that our last snapshot will have to do.
        return shouldrun && this.controller != null &&
                this.controller.getFrontier() != null ?
            controller.getFrontier().congestionRatio() : congestionRatio;
    }

    /**
     * Ordinal position of the 'deepest' URI eligible
     * for crawling. Essentially, the length of the longest
     * frontier internal queue.
     *
     * @return long URI count to deepest URI
     */
    public long deepestUri() {
        // While shouldrun is true we can use info direct from the crawler.
        // After that our last snapshot will have to do.
        return shouldrun && this.controller != null &&
                this.controller.getFrontier() != null ?
            controller.getFrontier().deepestUri() : deepestUri;
    }

    /**
     * Average depth of the last URI in all eligible queues.
     * That is, the average length of all eligible queues.
     *
     * @return long average depth of last URIs in queues
     */
    public long averageDepth() {
        // While shouldrun is true we can use info direct from the crawler.
        // After that our last snapshot will have to do.
        return shouldrun && this.controller != null &&
                this.controller.getFrontier() != null ?
            controller.getFrontier().averageDepth() : averageDepth;
    }

    /**
     * Number of URIs <i>queued</i> up and waiting for processing.
     *
     * <p>If the crawl is not running (paused or stopped) this will return
     * the value of the last snapshot.
     *
     * @return Number of URIs queued up and waiting for processing.
     *
     * @see org.archive.crawler.framework.Frontier#queuedUriCount()
     */
    public long queuedUriCount() {
        // While shouldrun is true we can use info direct from the crawler.
        // After that our last snapshot will have to do.
        return shouldrun && this.controller != null &&
                this.controller.getFrontier() != null?
            controller.getFrontier().queuedUriCount() : queuedUriCount;
    }

    /** @deprecated use totalBytesCrawled */
    public long totalBytesWritten() {
        // return totalBytesCrawled();
        return shouldrun && this.controller != null &&
                this.controller.getFrontier() != null?
            controller.getFrontier().totalBytesWritten() : totalProcessedBytes;
    }

    public long totalBytesCrawled() {
        return shouldrun ?
            crawledBytes.getTotal() : totalProcessedBytes;
    }

    public String crawledBytesSummary() {
        return crawledBytes.summary();
    }

    /**
     * If the curi is a seed, we update the processedSeedsRecords table.
     *
     * @param curi The CrawlURI that may be a seed.
     * @param disposition The disposition of the CrawlURI.
     */
    private void handleSeed(CrawlURI curi, String disposition) {
        if (curi.isSeed()) {
            SeedRecord sr = new SeedRecord(curi, disposition);
            processedSeedsRecords.put(sr.getUri(), sr);
        }
    }

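    /**
     * Tally statistics for a successfully crawled URI: crawled-bytes
     * accounting, status code, mime type, per-host counts, and (when a
     * source tag is present) per-source-per-host counts.
     *
     * @param curi The successfully crawled URI.
     */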
    public void crawledURISuccessful(CrawlURI curi) {
        handleSeed(curi, SEED_DISPOSITION_SUCCESS);
        // Save crawled bytes tally.
        crawledBytes.accumulate(curi);

        // Save status codes.
        incrementMapCount(statusCodeDistribution,
            Integer.toString(curi.getFetchStatus()));

        // Save mime types.
        String mime = MimetypeUtils.truncate(curi.getContentType());
        incrementMapCount(mimeTypeDistribution, mime);
        incrementMapCount(mimeTypeBytes, mime, curi.getContentSize());

        // Save hosts stats.
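        // A fetch status of 1 denotes a successful DNS lookup
        // (S_DNS_SUCCESS in FetchStatusCodes), so DNS traffic is tallied
        // under the pseudo-host "dns:".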
        saveHostStats((curi.getFetchStatus() == 1)? "dns:":
                this.controller.getServerCache().
                getHostFor(curi).getHostName(),
                curi.getContentSize());

        if (curi.containsKey(CrawlURI.A_SOURCE_TAG)){
            saveSourceStats(curi.getString(CrawlURI.A_SOURCE_TAG),
                    this.controller.getServerCache().getHostFor(curi).
                    getHostName());
        }
    }

    protected void saveSourceStats(String source, String hostname) {
        synchronized(sourceHostDistribution) {
            HashMap<String,LongWrapper> hostUriCount =
                sourceHostDistribution.get(source);
            if (hostUriCount == null) {
                hostUriCount = new HashMap<String,LongWrapper>();
            }
            // TODO: Dan suggests we don't need a hashtable value.  Might
            // be faster if we went without. Could just have keys of:
            //  seed | host (concatenated as string)
            // and values of:
            //  #urls
            incrementMapCount(hostUriCount, hostname);
            sourceHostDistribution.put(source, hostUriCount);
        }
    }

    protected void saveHostStats(String hostname, long size) {
        synchronized(hostsDistribution){
            incrementMapCount(hostsDistribution, hostname);
        }
        synchronized(hostsBytes){
            incrementMapCount(hostsBytes, hostname, size);
        }
        synchronized(hostsLastFinished){
            hostsLastFinished.put(hostname,
                new Long(System.currentTimeMillis()));
        }
    }

    public void crawledURINeedRetry(CrawlURI curi) {
        handleSeed(curi, SEED_DISPOSITION_RETRY);
    }

    public void crawledURIDisregard(CrawlURI curi) {
        handleSeed(curi, SEED_DISPOSITION_DISREGARD);
    }

    public void crawledURIFailure(CrawlURI curi) {
        handleSeed(curi, SEED_DISPOSITION_FAILURE);
    }

    /**
     * Get a seed iterator for the job being monitored.
     *
     * <b>Note:</b> This iterator will iterate over a list of <i>strings</i>,
     * not UURIs like the Scope seed iterator. The strings are equal to the
     * URIs' getURIString() values.
     * @return the seed iterator
     * FIXME: Consider using TransformingIterator here
     */
    public Iterator<String> getSeeds() {
        List<String> seedsCopy = new Vector<String>();
        Iterator<UURI> i = controller.getScope().seedsIterator();
        while (i.hasNext()) {
            seedsCopy.add(i.next().toString());
        }
        return seedsCopy.iterator();
    }

    public Iterator getSeedRecordsSortedByStatusCode() {
        return getSeedRecordsSortedByStatusCode(getSeeds());
    }

    protected Iterator<SeedRecord> getSeedRecordsSortedByStatusCode(
            Iterator<String> i) {
        TreeSet<SeedRecord> sortedSet =
          new TreeSet<SeedRecord>(new Comparator<SeedRecord>() {
            public int compare(SeedRecord sr1, SeedRecord sr2) {
                int code1 = sr1.getStatusCode();
                int code2 = sr2.getStatusCode();
                if (code1 == code2) {
                    // If the values are equal, sort by URIs.
                    return sr1.getUri().compareTo(sr2.getUri());
                }
                // Mirror and shift the number line so as to
                // place zero at the beginning, then all negatives
                // in order of ascending absolute value, then all
                // positives descending.
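                // For example, the (hypothetical) status codes
                // {200, 404, -6, 0} would order as 0, -6, 404, 200.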
                code1 = -code1 - Integer.MAX_VALUE;
                code2 = -code2 - Integer.MAX_VALUE;

                return new Integer(code1).compareTo(new Integer(code2));
            }
        });
        while (i.hasNext()) {
            String seed = i.next();
            SeedRecord sr = (SeedRecord) processedSeedsRecords.get(seed);
            if (sr == null) {
                sr = new SeedRecord(seed, SEED_DISPOSITION_NOT_PROCESSED);
                processedSeedsRecords.put(seed, sr);
            }
            sortedSet.add(sr);
        }
        return sortedSet.iterator();
    }

    public void crawlEnded(String message) {
        logger.info("Entered crawlEnded");
        this.sExitMessage = message; // Held for reference by reports.
        super.crawlEnded(message);
        logger.info("Leaving crawlEnded");
    }

    /**
     * Write the seeds report to the given writer.
     *
     * @param writer Where to write.
     */
    protected void writeSeedsReportTo(PrintWriter writer) {
        // Build header.
        writer.print("[code] [status] [seed] [redirect]\n");

        seedsCrawled = 0;
        seedsNotCrawled = 0;
        for (Iterator i = getSeedRecordsSortedByStatusCode(getSeeds());
                i.hasNext();) {
            SeedRecord sr = (SeedRecord)i.next();
            writer.print(sr.getStatusCode());
            writer.print(" ");
            if (sr.getStatusCode() > 0) {
                seedsCrawled++;
                writer.print("CRAWLED");
            } else {
                seedsNotCrawled++;
                writer.print("NOTCRAWLED");
            }
            writer.print(" ");
            writer.print(sr.getUri());
            if (sr.getRedirectUri() != null) {
                writer.print(" ");
                writer.print(sr.getRedirectUri());
            }
            writer.print("\n");
        }
    }

    protected void writeSourceReportTo(PrintWriter writer) {
        writer.print("[source] [host] [#urls]\n");
        // For each source.
        for (Iterator i = sourceHostDistribution.keySet().iterator();
                i.hasNext();) {
            Object sourceKey = i.next();
            Map<String,LongWrapper> hostCounts
             = (Map<String,LongWrapper>)sourceHostDistribution.get(sourceKey);
            // Sort hosts by #urls.
            SortedMap sortedHostCounts = getReverseSortedHostCounts(hostCounts);
            // For each host.
            for (Iterator j = sortedHostCounts.keySet().iterator();
                    j.hasNext();) {
                Object hostKey = j.next();
                LongWrapper hostCount = (LongWrapper) hostCounts.get(hostKey);
                writer.print(sourceKey.toString());
                writer.print(" ");
                writer.print(hostKey.toString());
                writer.print(" ");
                writer.print(hostCount.longValue);
                writer.print("\n");
            }
        }
    }

    /**
     * Return a copy of the given per-host count map in reverse-sorted
     * (largest first) order.
     *
     * @param hostCounts Map of host name to URL count.
     * @return SortedMap of the given host counts
     */
    public SortedMap getReverseSortedHostCounts(
            Map<String,LongWrapper> hostCounts) {
        synchronized(hostCounts){
            return getReverseSortedCopy(hostCounts);
        }
    }

    protected void writeHostsReportTo(final PrintWriter writer) {
        // TODO: Use CrawlHosts for all stats; only perform sorting on a
        // manageable number of hosts.
        SortedMap hd = getReverseSortedHostsDistribution();
        // Header.
        writer.print("[#urls] [#bytes] [host] [#robots] [#remaining]\n");
        for (Iterator i = hd.keySet().iterator(); i.hasNext();) {
            // Key is 'host'.
            String key = (String) i.next();
            CrawlHost host = controller.getServerCache().getHostFor(key);
            LongWrapper val = (LongWrapper)hd.get(key);
            writeReportLine(writer,
                    ((val == null)? "-": val.longValue),
                    getBytesPerHost(key),
                    key,
                    host.getSubstats().getRobotsDenials(),
                    host.getSubstats().getRemaining());
        }
        // StatisticsTracker doesn't know of zero-completion hosts,
        // so supplement the report with those entries from the host cache.
        Closure logZeros = new Closure() {
            public void execute(Object obj) {
                CrawlHost host = (CrawlHost)obj;
                if (host.getSubstats().getRecordedFinishes() == 0) {
                    writeReportLine(writer,
                            host.getSubstats().getRecordedFinishes(),
                            host.getSubstats().getTotalBytes(),
                            host.getHostName(),
                            host.getSubstats().getRobotsDenials(),
                            host.getSubstats().getRemaining());
                }
            }};
        controller.getServerCache().forAllHostsDo(logZeros);
    }

    protected void writeReportLine(PrintWriter writer, Object... fields) {
        for (Object field : fields) {
            writer.print(field);
            writer.print(" ");
        }
        writer.print("\n");
    }

    /**
     * Return a copy of the hosts distribution in reverse-sorted
     * (largest first) order.
     * @return SortedMap of hosts distribution
     */
    public SortedMap getReverseSortedHostsDistribution() {
        synchronized(hostsDistribution){
            return getReverseSortedCopy(hostsDistribution);
        }
    }

    protected void writeMimetypesReportTo(PrintWriter writer) {
        // Header.
        writer.print("[#urls] [#bytes] [mime-types]\n");
        TreeMap fd = getReverseSortedCopy(getFileDistribution());
        for (Iterator i = fd.keySet().iterator(); i.hasNext();) {
            Object key = i.next();
            // Key is mime type.
            writer.print(Long.toString(((LongWrapper)fd.get(key)).longValue));
            writer.print(" ");
            writer.print(Long.toString(getBytesPerFileType((String)key)));
            writer.print(" ");
            writer.print((String)key);
            writer.print("\n");
        }
    }

    protected void writeResponseCodeReportTo(PrintWriter writer) {
        // Build header.
        writer.print("[rescode] [#urls]\n");
        TreeMap scd = getReverseSortedCopy(getStatusCodeDistribution());
        for (Iterator i = scd.keySet().iterator(); i.hasNext();) {
            Object key = i.next();
            writer.print((String)key);
            writer.print(" ");
            writer.print(Long.toString(((LongWrapper)scd.get(key)).longValue));
            writer.print("\n");
        }
    }

    protected void writeCrawlReportTo(PrintWriter writer) {
        writer.print("Crawl Name: " + controller.getOrder().getCrawlOrderName());
        writer.print("\nCrawl Status: " + sExitMessage);
        writer.print("\nDuration Time: " +
                ArchiveUtils.formatMillisecondsToConventional(crawlDuration()));
        writer.print("\nTotal Seeds Crawled: " + seedsCrawled);
        writer.print("\nTotal Seeds not Crawled: " + seedsNotCrawled);
        // hostsDistribution contains all hosts crawled plus an entry for dns.
        writer.print("\nTotal Hosts Crawled: " + (hostsDistribution.size() - 1));
        writer.print("\nTotal Documents Crawled: " + finishedUriCount);
        writer.print("\nProcessed docs/sec: " +
                ArchiveUtils.doubleToString(docsPerSecond, 2));
        writer.print("\nBandwidth in Kbytes/sec: " + totalKBPerSec);
        writer.print("\nTotal Raw Data Size in Bytes: " + totalProcessedBytes +
                " (" + ArchiveUtils.formatBytesForDisplay(totalProcessedBytes) +
                ") \n");
        writer.print("Novel Bytes: "
                + crawledBytes.get(CrawledBytesHistotable.NOVEL)
                + " (" + ArchiveUtils.formatBytesForDisplay(
                        crawledBytes.get(CrawledBytesHistotable.NOVEL))
                + ") \n");
        if (crawledBytes.containsKey(CrawledBytesHistotable.DUPLICATE)) {
            writer.print("Duplicate-by-hash Bytes: "
                    + crawledBytes.get(CrawledBytesHistotable.DUPLICATE)
                    + " (" + ArchiveUtils.formatBytesForDisplay(
                            crawledBytes.get(CrawledBytesHistotable.DUPLICATE))
                    + ") \n");
        }
        if (crawledBytes.containsKey(CrawledBytesHistotable.NOTMODIFIED)) {
            writer.print("Not-modified Bytes: "
                    + crawledBytes.get(CrawledBytesHistotable.NOTMODIFIED)
                    + " (" + ArchiveUtils.formatBytesForDisplay(
                            crawledBytes.get(CrawledBytesHistotable.NOTMODIFIED))
                    + ") \n");
        }
    }

    protected void writeProcessorsReportTo(PrintWriter writer) {
        controller.reportTo(CrawlController.PROCESSORS_REPORT, writer);
    }

    protected void writeReportFile(String reportName, String filename) {
        File f = new File(controller.getDisk().getPath(), filename);
        try {
            PrintWriter bw = new PrintWriter(new FileWriter(f));
            writeReportTo(reportName, bw);
            bw.close();
            controller.addToManifest(f.getAbsolutePath(),
                CrawlController.MANIFEST_REPORT_FILE, true);
        } catch (IOException e) {
            logger.log(Level.SEVERE, "Unable to write " + f.getAbsolutePath() +
                " at the end of crawl.", e);
        }
        logger.info("wrote report: " + f.getAbsolutePath());
    }

    /**
     * @param writer Where to write.
     */
    protected void writeManifestReportTo(PrintWriter writer) {
        controller.reportTo(CrawlController.MANIFEST_REPORT, writer);
    }

    /**
     * @param reportName Name of report.
     * @param w Where to write.
     */
    private void writeReportTo(String reportName, PrintWriter w) {
        if ("hosts".equals(reportName)) {
            writeHostsReportTo(w);
        } else if ("mime types".equals(reportName)) {
            writeMimetypesReportTo(w);
        } else if ("response codes".equals(reportName)) {
            writeResponseCodeReportTo(w);
        } else if ("seeds".equals(reportName)) {
            writeSeedsReportTo(w);
        } else if ("crawl".equals(reportName)) {
            writeCrawlReportTo(w);
        } else if ("processors".equals(reportName)) {
            writeProcessorsReportTo(w);
        } else if ("manifest".equals(reportName)) {
            writeManifestReportTo(w);
        } else if ("frontier".equals(reportName)) {
            writeFrontierReportTo(w);
        } else if ("source".equals(reportName)) {
            writeSourceReportTo(w);
        } // TODO: else default/error.
    }

    /**
     * Write the Frontier's 'nonempty' report (if available).
     * @param writer to report to
     */
    protected void writeFrontierReportTo(PrintWriter writer) {
        if (controller.getFrontier().isEmpty()) {
            writer.println("frontier empty");
        } else {
            controller.getFrontier().reportTo("nonempty", writer);
        }
    }

    /**
     * Run the reports.
     */
    public void dumpReports() {
        // Add all files mentioned in the crawl order to the
        // manifest set.
        controller.addOrderToManifest();
        controller.installThreadContextSettingsHandler();
        writeReportFile("hosts", "hosts-report.txt");
        writeReportFile("mime types", "mimetype-report.txt");
        writeReportFile("response codes", "responsecode-report.txt");
        writeReportFile("seeds", "seeds-report.txt");
        writeReportFile("crawl", "crawl-report.txt");
        writeReportFile("processors", "processors-report.txt");
        writeReportFile("manifest", "crawl-manifest.txt");
        writeReportFile("frontier", "frontier-report.txt");
        if (!sourceHostDistribution.isEmpty()) {
            writeReportFile("source", "source-report.txt");
        }
        // TODO: Save object to disk?
    }

    public void crawlCheckpoint(File cpDir) throws Exception {
        // CrawlController is managing the checkpointing of this object.
        logNote("CRAWL CHECKPOINTING TO " + cpDir.toString());
    }
}