/* AbstractFrontier
 *
 * $Id: AbstractFrontier.java 5439 2007-08-28 05:15:25Z gojomo $
 *
 * Created on Aug 17, 2004
 *
 * Copyright (C) 2004 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package org.archive.crawler.frontier;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Serializable;
import java.io.StringWriter;
import java.io.Writer;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.atomic.AtomicLong;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;

import javax.management.AttributeNotFoundException;

import org.apache.commons.httpclient.HttpStatus;
import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlHost;
import org.archive.crawler.datamodel.CrawlOrder;
import org.archive.crawler.datamodel.CrawlServer;
import org.archive.crawler.datamodel.CrawlSubstats;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.datamodel.CrawlSubstats.Stage;
import org.archive.crawler.event.CrawlStatusListener;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.Frontier;
import org.archive.crawler.framework.ToeThread;
import org.archive.crawler.framework.Frontier.FrontierGroup;
import org.archive.crawler.framework.exceptions.EndedException;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.settings.ModuleType;
import org.archive.crawler.settings.RegularExpressionConstraint;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
import org.archive.crawler.url.Canonicalizer;
import org.archive.net.UURI;
import org.archive.util.ArchiveUtils;

/**
 * Shared facilities for Frontier implementations.
 * 
 * @author gojomo
 */
public abstract class AbstractFrontier extends ModuleType
implements CrawlStatusListener, Frontier, FetchStatusCodes,
        CoreAttributeConstants, Serializable {
    private static final long serialVersionUID = -4766504935003203930L;

    private static final Logger logger = Logger
            .getLogger(AbstractFrontier.class.getName());

    protected transient CrawlController controller;

    /** ordinal numbers to assign to created CrawlURIs */
    protected AtomicLong nextOrdinal = new AtomicLong(1);

    /** should the frontier hold any threads asking for URIs? */
    protected boolean shouldPause = false;

    /**
     * should the frontier send an EndedException to any threads asking for
     * URIs?
     */
    protected transient boolean shouldTerminate = false;

    /**
     * how many multiples of last fetch elapsed time to wait before recontacting
     * same server
     */
    public final static String ATTR_DELAY_FACTOR = "delay-factor";

    protected final static Float DEFAULT_DELAY_FACTOR = new Float(5);

    /**
     * always wait this long after one completion before recontacting same
     * server, regardless of multiple
     */
    public final static String ATTR_MIN_DELAY = "min-delay-ms";

    // 3 secs.
    protected final static Integer DEFAULT_MIN_DELAY = new Integer(3000);

    /** never wait more than this long, regardless of multiple */
    public final static String ATTR_MAX_DELAY = "max-delay-ms";

    // 30 secs
    protected final static Integer DEFAULT_MAX_DELAY = new Integer(30000);

    /** number of hops of embeds (ERX) to bump to front of host queue */
    public final static String ATTR_PREFERENCE_EMBED_HOPS =
        "preference-embed-hops";

    protected final static Integer DEFAULT_PREFERENCE_EMBED_HOPS =
        new Integer(1);

    /** maximum per-host bandwidth usage */
    public final static String ATTR_MAX_HOST_BANDWIDTH_USAGE =
        "max-per-host-bandwidth-usage-KB-sec";

    protected final static Integer DEFAULT_MAX_HOST_BANDWIDTH_USAGE =
        new Integer(0);

    /** maximum overall bandwidth usage */
    public final static String ATTR_MAX_OVERALL_BANDWIDTH_USAGE =
        "total-bandwidth-usage-KB-sec";

    protected final static Integer DEFAULT_MAX_OVERALL_BANDWIDTH_USAGE =
        new Integer(0);

    /** for retryable problems, seconds to wait before a retry */
    public final static String ATTR_RETRY_DELAY = "retry-delay-seconds";

    // 15 mins
    protected final static Long DEFAULT_RETRY_DELAY = new Long(900);

    /** maximum times to emit a CrawlURI without final disposition */
    public final static String ATTR_MAX_RETRIES = "max-retries";

    protected final static Integer DEFAULT_MAX_RETRIES = new Integer(30);

    public final static String ATTR_QUEUE_ASSIGNMENT_POLICY =
        "queue-assignment-policy";

    /** queue assignment to force onto CrawlURIs; intended to be overridden */
    public final static String ATTR_FORCE_QUEUE = "force-queue-assignment";

    protected final static String DEFAULT_FORCE_QUEUE = "";

    // word chars, dash, period, comma, colon
    protected final static String ACCEPTABLE_FORCE_QUEUE = "[-\\w\\.,:]*";

    /** whether to pause, rather than finish, when crawl appears done */
    public final static String ATTR_PAUSE_AT_FINISH = "pause-at-finish";
    // TODO: change default to true once well-tested
    protected final static Boolean DEFAULT_PAUSE_AT_FINISH = Boolean.FALSE;

    /** whether to pause at crawl start */
    public final static String ATTR_PAUSE_AT_START = "pause-at-start";
    protected final static Boolean DEFAULT_PAUSE_AT_START = Boolean.FALSE;

    /** whether to tag seeds with their own URI as a heritable 'source' String */
    public final static String ATTR_SOURCE_TAG_SEEDS = "source-tag-seeds";
    protected final static Boolean DEFAULT_SOURCE_TAG_SEEDS = Boolean.FALSE;

    /**
     * Recover log on or off attribute.
     */
    protected final static String ATTR_RECOVERY_ENABLED =
        "recovery-log-enabled";
    protected final static Boolean DEFAULT_ATTR_RECOVERY_ENABLED =
        Boolean.TRUE;

    // to maintain serialization compatibility, stored under old names
    protected long queuedUriCount;
    protected long succeededFetchCount;
    protected long failedFetchCount;
    protected long disregardedUriCount;

    // top-level stats
    /** total URIs queued to be visited */
    transient protected AtomicLong liveQueuedUriCount = new AtomicLong(0);

    transient protected AtomicLong liveSucceededFetchCount = new AtomicLong(0);

    transient protected AtomicLong liveFailedFetchCount = new AtomicLong(0);

    /** URIs that are disregarded (for example, because of robots.txt rules) */
    transient protected AtomicLong liveDisregardedUriCount = new AtomicLong(0);

    /**
     * Used when bandwidth constraints are in use.
     */
    protected long totalProcessedBytes = 0;

    private transient long nextURIEmitTime = 0;

    protected long processedBytesAfterLastEmittedURI = 0;

    protected int lastMaxBandwidthKB = 0;

    /**
     * Crawl replay logger.
     * 
     * Currently captures Frontier/URI transitions.
     * Can be null if user chose not to run a recovery.log.
     */
    private transient FrontierJournal recover = null;

    /** file collecting report of ignored seed-file entries (if any) */
    public static final String IGNORED_SEEDS_FILENAME = "seeds.ignored";

    /**
     * @param name Name of this frontier.
     * @param description Description for this frontier.
     */
    public AbstractFrontier(String name, String description) {
        super(name, description);
        addElementToDefinition(new SimpleType(ATTR_DELAY_FACTOR,
                "How many multiples of last fetch elapsed time to wait before "
                        + "recontacting same server", DEFAULT_DELAY_FACTOR));
        addElementToDefinition(new SimpleType(ATTR_MAX_DELAY,
                "Never wait more than this long.", DEFAULT_MAX_DELAY));
        addElementToDefinition(new SimpleType(ATTR_MIN_DELAY,
                "Always wait this long after one completion before recontacting "
                        + "same server.", DEFAULT_MIN_DELAY));
        addElementToDefinition(new SimpleType(ATTR_MAX_RETRIES,
                "How often to retry fetching a URI that failed to be retrieved. "
                        + "If zero, the crawler will get the robots.txt only.",
                DEFAULT_MAX_RETRIES));
        addElementToDefinition(new SimpleType(ATTR_RETRY_DELAY,
                "How long to wait by default until we retry fetching a"
                        + " URI that failed to be retrieved (seconds). ",
                DEFAULT_RETRY_DELAY));
        addElementToDefinition(new SimpleType(
                ATTR_PREFERENCE_EMBED_HOPS,
                "Number of embedded (or redirected) hops up to which "
                + "a URI has higher priority scheduling. For example, if set "
                + "to 1 (the default), items such as inline images (1-hop "
                + "embedded resources) will be scheduled ahead of all regular "
                + "links (or many-hop resources, like nested frames). If set to "
                + "zero, no preferencing will occur, and embeds/redirects are "
                + "scheduled the same as regular links.",
                DEFAULT_PREFERENCE_EMBED_HOPS));
        Type t;
        t = addElementToDefinition(new SimpleType(
                ATTR_MAX_OVERALL_BANDWIDTH_USAGE,
                "The maximum average bandwidth the crawler is allowed to use. "
                + "The actual read speed is not affected by this setting; it "
                + "only holds back new URIs from being processed when the "
                + "bandwidth usage has been too high. 0 means no bandwidth "
                + "limitation.", DEFAULT_MAX_OVERALL_BANDWIDTH_USAGE));
        t.setOverrideable(false);
        t = addElementToDefinition(new SimpleType(
                ATTR_MAX_HOST_BANDWIDTH_USAGE,
                "The maximum average bandwidth the crawler is allowed to use "
                + "per host. The actual read speed is not affected by this "
                + "setting; it only holds back new URIs from being processed "
                + "when the bandwidth usage has been too high. 0 means no "
                + "bandwidth limitation.", DEFAULT_MAX_HOST_BANDWIDTH_USAGE));
        t.setExpertSetting(true);

        // Read the list of permissible choices from heritrix.properties.
        // It's a list of space- or comma-separated values.
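        // For example (illustrative only; any such setting is
        // deployment-specific), the set of selectable policies -- whose
        // first entry becomes the default -- could be overridden at JVM
        // launch with:
        //   -Dorg.archive.crawler.frontier.AbstractFrontier.queue-assignment-policy=org.archive.crawler.frontier.HostnameQueueAssignmentPolicy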
        String queueStr = System.getProperty(AbstractFrontier.class.getName() +
                "." + ATTR_QUEUE_ASSIGNMENT_POLICY,
                HostnameQueueAssignmentPolicy.class.getName() + " " +
                IPQueueAssignmentPolicy.class.getName() + " " +
                BucketQueueAssignmentPolicy.class.getName() + " " +
                SurtAuthorityQueueAssignmentPolicy.class.getName() + " " +
                TopmostAssignedSurtQueueAssignmentPolicy.class.getName());
        Pattern p = Pattern.compile("\\s*,\\s*|\\s+");
        String [] queues = p.split(queueStr);
        if (queues.length <= 0) {
            throw new RuntimeException("Failed parse of queue assignment " +
                    "policy string: " + queueStr);
        }
        t = addElementToDefinition(new SimpleType(ATTR_QUEUE_ASSIGNMENT_POLICY,
                "Defines how to assign URIs to queues. Can assign by host, " +
                "by ip, and into one of a fixed set of buckets (1k).",
                queues[0], queues));
        t.setExpertSetting(true);
        t.setOverrideable(true);

        t = addElementToDefinition(new SimpleType(
                ATTR_FORCE_QUEUE,
                "The queue name into which to force URIs. Should "
                + "be left blank at global level.  Specify a "
                + "per-domain/per-host override to force URIs into "
                + "a particular named queue, regardless of the assignment "
                + "policy in effect (domain or ip-based politeness). "
                + "This could be used on domains known to all be from "
                + "the same small set of IPs (eg blogspot, dailykos, etc.) "
                + "to simulate IP-based politeness, or it could be used if "
                + "you wanted to enforce politeness over a whole domain, even "
                + "though the subdomains are split across many IPs.",
                DEFAULT_FORCE_QUEUE));
        t.setOverrideable(true);
        t.setExpertSetting(true);
        t.addConstraint(new RegularExpressionConstraint(ACCEPTABLE_FORCE_QUEUE,
                Level.WARNING, "This field must contain only alphanumeric "
                + "characters plus period, dash, comma, colon, or underscore."));
        t = addElementToDefinition(new SimpleType(
                ATTR_PAUSE_AT_START,
                "Whether to pause when the crawl begins, before any URIs " +
                "are tried. This gives the operator a chance to verify or " +
                "adjust the crawl before actual work begins. " +
                "Default is false.", DEFAULT_PAUSE_AT_START));
        t = addElementToDefinition(new SimpleType(
                ATTR_PAUSE_AT_FINISH,
                "Whether to pause when the crawl appears finished, rather "
                + "than immediately end the crawl. This gives the operator an "
                + "opportunity to view crawl results, and possibly add URIs or "
                + "adjust settings, while the crawl state is still available. "
                + "Default is false.", DEFAULT_PAUSE_AT_FINISH));
        t.setOverrideable(false);

        t = addElementToDefinition(new SimpleType(
                ATTR_SOURCE_TAG_SEEDS,
                "Whether to tag seeds with their own URI as a heritable " +
                "'source' String, which will be carried-forward to all URIs " +
                "discovered on paths originating from that seed. When " +
                "present, such source tags appear in the second-to-last " +
                "crawl.log field.", DEFAULT_SOURCE_TAG_SEEDS));
        t.setOverrideable(false);

        t = addElementToDefinition(new SimpleType(ATTR_RECOVERY_ENABLED,
                "Set to false to disable recovery log writing.  Do this if " +
                "you are using the checkpoint feature for recovering " +
                "crashed crawls.", DEFAULT_ATTR_RECOVERY_ENABLED));
        t.setExpertSetting(true);
        // No sense in it being overrideable.
        t.setOverrideable(false);
    }

    public void start() {
        if (((Boolean)getUncheckedAttribute(null, ATTR_PAUSE_AT_START))
                .booleanValue()) {
            // trigger crawl-wide pause
            controller.requestCrawlPause();
        } else {
            // simply begin
            unpause();
        }
    }

    synchronized public void pause() {
        shouldPause = true;
    }

    synchronized public void unpause() {
        shouldPause = false;
        notifyAll();
    }

    public void initialize(CrawlController c)
            throws FatalConfigurationException, IOException {
        c.addCrawlStatusListener(this);
        File logsDisk = null;
        try {
            logsDisk = c.getSettingsDir(CrawlOrder.ATTR_LOGS_PATH);
        } catch (AttributeNotFoundException e) {
            logger.log(Level.SEVERE, "Failed to get logs directory", e);
        }
        if (logsDisk != null) {
            String logsPath = logsDisk.getAbsolutePath() + File.separatorChar;
            if (((Boolean)getUncheckedAttribute(null, ATTR_RECOVERY_ENABLED))
                    .booleanValue()) {
                this.recover = new RecoveryJournal(logsPath,
                    FrontierJournal.LOGNAME_RECOVER);
            }
        }
//        try {
//            final Class qapClass = Class.forName((String)getUncheckedAttribute(
//                    null, ATTR_QUEUE_ASSIGNMENT_POLICY));
//
//            queueAssignmentPolicy =
//                (QueueAssignmentPolicy)qapClass.newInstance();
//        } catch (Exception e) {
//            logger.log(Level.SEVERE, "Bad queue assignment policy class", e);
//            throw new FatalConfigurationException(e.getMessage());
//        }
    }

    synchronized public void terminate() {
        shouldTerminate = true;
        if (this.recover != null) {
            this.recover.close();
            this.recover = null;
        }
        unpause();
    }

    /**
     * Report CrawlURI to each of the three 'substats' accumulators
     * (group/queue, server, host) for a given stage.
     * 
     * @param curi
     * @param stage
     */
    protected void tally(CrawlURI curi, Stage stage) {
        // Tally per-server, per-host, per-frontier-class running totals
        CrawlServer server =
            controller.getServerCache().getServerFor(curi);
        if (server != null) {
            server.getSubstats().tally(curi, stage);
        }
        CrawlHost host = 
            controller.getServerCache().getHostFor(curi);
        if (host != null) {
            host.getSubstats().tally(curi, stage);
        }
        FrontierGroup group = 
            controller.getFrontier().getGroup(curi);
        group.getSubstats().tally(curi, stage);
    }

    protected void doJournalFinishedSuccess(CrawlURI c) {
        tally(c, CrawlSubstats.Stage.SUCCEEDED);
        if (this.recover != null) {
            this.recover.finishedSuccess(c);
        }
    }

    protected void doJournalAdded(CrawlURI c) {
        tally(c, CrawlSubstats.Stage.SCHEDULED);
        if (this.recover != null) {
            this.recover.added(c);
        }
    }

    protected void doJournalRescheduled(CrawlURI c) {
        tally(c, CrawlSubstats.Stage.RETRIED);
        if (this.recover != null) {
            this.recover.rescheduled(c);
        }
    }

    protected void doJournalFinishedFailure(CrawlURI c) {
        tally(c, CrawlSubstats.Stage.FAILED);
        if (this.recover != null) {
            this.recover.finishedFailure(c);
        }
    }

    protected void doJournalDisregarded(CrawlURI c) {
        tally(c, CrawlSubstats.Stage.DISREGARDED);
        if (this.recover != null) {
            this.recover.finishedDisregard(c);
        }
    }

    protected void doJournalEmitted(CrawlURI c) {
        if (this.recover != null) {
            this.recover.emitted(c);
        }
    }

    /**
     * Frontier is empty only if all queues are empty and no URIs are in-process
     * 
     * @return True if queues are empty.
     */
    public boolean isEmpty() {
        return liveQueuedUriCount.get() == 0;
    }

    /**
     * Increment the running count of queued URIs. 
     */
    protected void incrementQueuedUriCount() {
        liveQueuedUriCount.incrementAndGet();
    }

    /**
     * Increment the running count of queued URIs by the given amount.
     * 
     * @param increment
     *            amount to increment the queued count
     */
    protected void incrementQueuedUriCount(long increment) {
        liveQueuedUriCount.addAndGet(increment);
    }

    /**
     * Note that a number of queued URIs have been deleted.
     * 
     * @param numberOfDeletes
     */
    protected void decrementQueuedCount(long numberOfDeletes) {
        liveQueuedUriCount.addAndGet(-numberOfDeletes);
    }

    /* (non-Javadoc)
     * 
     * @see org.archive.crawler.framework.Frontier#queuedUriCount()
     */
    public long queuedUriCount() {
        return liveQueuedUriCount.get();
    }

    /* (non-Javadoc)
     * 
     * @see org.archive.crawler.framework.Frontier#finishedUriCount()
     */
    public long finishedUriCount() {
        return liveSucceededFetchCount.get() + liveFailedFetchCount.get()
                + liveDisregardedUriCount.get();
    }

    /**
     * Increment the running count of successfully fetched URIs. 
     */
    protected void incrementSucceededFetchCount() {
        liveSucceededFetchCount.incrementAndGet();
    }

    /* (non-Javadoc)
     * 
     * @see org.archive.crawler.framework.Frontier#succeededFetchCount()
     */
    public long succeededFetchCount() {
        return liveSucceededFetchCount.get();
    }

    /**
     * Increment the running count of failed URIs. 
     */
    protected void incrementFailedFetchCount() {
        liveFailedFetchCount.incrementAndGet();
    }

    /* (non-Javadoc)
     * 
     * @see org.archive.crawler.framework.Frontier#failedFetchCount()
     */
    public long failedFetchCount() {
        return liveFailedFetchCount.get();
    }

    /**
     * Increment the running count of disregarded URIs.
     */
    protected void incrementDisregardedUriCount() {
        liveDisregardedUriCount.incrementAndGet();
    }

    public long disregardedUriCount() {
        return liveDisregardedUriCount.get();
    }

    /** @deprecated misnomer; use StatisticsTracking figures instead */
    public long totalBytesWritten() {
        return totalProcessedBytes;
    }

    /**
     * Load up the seeds.
     * 
     * This method is called at initialization and by the CrawlController
     * when it wants to force a reload of configuration.
     * 
     * @see org.archive.crawler.framework.CrawlController#kickUpdate()
     */
    public void loadSeeds() {
        Writer ignoredWriter = new StringWriter();
        logger.info("beginning");
        // Get the seeds to refresh.
        Iterator iter = this.controller.getScope().seedsIterator(ignoredWriter);
        int count = 0;
        while (iter.hasNext()) {
            UURI u = (UURI)iter.next();
            CandidateURI caUri = CandidateURI.createSeedCandidateURI(u);
            caUri.setSchedulingDirective(CandidateURI.MEDIUM);
            if (((Boolean)getUncheckedAttribute(null, ATTR_SOURCE_TAG_SEEDS))
                    .booleanValue()) {
                caUri.putString(CoreAttributeConstants.A_SOURCE_TAG,
                        caUri.toString());
                caUri.makeHeritable(CoreAttributeConstants.A_SOURCE_TAG);
            }
            schedule(caUri);
            count++;
            if (count % 1000 == 0) {
                logger.info(count + " seeds");
            }
        }
        // save ignored items (if any) where they can be consulted later
        saveIgnoredItems(ignoredWriter.toString(), controller.getDisk());
        logger.info("finished");
    }

    /**
     * Dump ignored seed items (if any) to disk; delete file otherwise.
     * Static to allow non-derived sibling classes (frontiers not yet 
     * subclassed here) to reuse.
     * 
     * @param ignoredItems
     * @param dir 
     */
    public static void saveIgnoredItems(String ignoredItems, File dir) {
        File ignoredFile = new File(dir, IGNORED_SEEDS_FILENAME);
        if (ignoredItems != null && ignoredItems.length() > 0) {
            try {
                BufferedWriter bw =
                    new BufferedWriter(new FileWriter(ignoredFile));
                bw.write(ignoredItems);
                bw.close();
            } catch (IOException e) {
                // TODO make an alert?
                e.printStackTrace();
            }
        } else {
            // delete any older file (if any)
            ignoredFile.delete();
        }
    }

    protected CrawlURI asCrawlUri(CandidateURI caUri) {
        CrawlURI curi;
        if (caUri instanceof CrawlURI) {
            curi = (CrawlURI)caUri;
        } else {
            curi = CrawlURI.from(caUri, nextOrdinal.getAndIncrement());
        }
        curi.setClassKey(getClassKey(curi));
        return curi;
    }

    /**
     * @param now
     * @throws InterruptedException
     * @throws EndedException
     */
    protected synchronized void preNext(long now) throws InterruptedException,
            EndedException {
        if (this.controller == null) {
            return;
        }

        // Check completion conditions
        if (this.controller.atFinish()) {
            if (((Boolean)getUncheckedAttribute(null, ATTR_PAUSE_AT_FINISH))
                    .booleanValue()) {
                this.controller.requestCrawlPause();
            } else {
                this.controller.beginCrawlStop();
            }
        }

        // enforce operator pause
        if (shouldPause) {
            while (shouldPause) {
                this.controller.toePaused();
                wait();
            }
            // exited pause; possibly finish regardless of pause-at-finish
            if (controller != null && controller.atFinish()) {
                this.controller.beginCrawlStop();
            }
        }

        // enforce operator terminate or thread retirement
        if (shouldTerminate
                || ((ToeThread)Thread.currentThread()).shouldRetire()) {
            throw new EndedException("terminated");
        }

        enforceBandwidthThrottle(now);
    }

    /**
     * Perform any special handling of the CrawlURI, such as promoting its URI
     * to seed-status, or preferencing it because it is an embed.
     * 
     * @param curi
     */
    protected void applySpecialHandling(CrawlURI curi) {
        if (curi.isSeed() && curi.getVia() != null
                && curi.flattenVia().length() > 0) {
            // The only way a seed can have a non-empty via is if it is the
            // result of a seed redirect. Add it to the seeds list.
            //
            // This is a feature. This is handling for case where a seed
            // gets immediately redirected to another page. What we're doing is
            // treating the immediate redirect target as a seed.
            this.controller.getScope().addSeed(curi);
            // And it needs rapid scheduling.
            if (curi.getSchedulingDirective() == CandidateURI.NORMAL) {
                curi.setSchedulingDirective(CandidateURI.MEDIUM);
            }
        }

        // optionally preferencing embeds up to MEDIUM
        int prefHops = ((Integer)getUncheckedAttribute(curi,
                ATTR_PREFERENCE_EMBED_HOPS)).intValue();
        if (prefHops > 0) {
            int embedHops = curi.getTransHops();
            if (embedHops > 0 && embedHops <= prefHops
                    && curi.getSchedulingDirective() == CandidateURI.NORMAL) {
                // number of embed hops falls within the preferenced range, and
                // uri is not already MEDIUM -- so promote it
                curi.setSchedulingDirective(CandidateURI.MEDIUM);
            }
        }
    }

    /**
     * Perform fixups on a CrawlURI about to be returned via next().
     * 
     * @param curi
     *            CrawlURI about to be returned by next()
     * @param q
     *            the queue from which the CrawlURI came
     */
    protected void noteAboutToEmit(CrawlURI curi, WorkQueue q) {
        curi.setHolder(q);
        // if (curi.getServer() == null) {
        //    // TODO: perhaps short-circuit the emit here,
        //    // because URI will be rejected as unfetchable
        // }
        doJournalEmitted(curi);
    }

    /**
     * @param curi
     * @return the CrawlServer to be associated with this CrawlURI
     */
    protected CrawlServer getServer(CrawlURI curi) {
        return this.controller.getServerCache().getServerFor(curi);
    }

    /**
     * Return a suitable value to wait before retrying the given URI.
     * 
     * @param curi
     *            CrawlURI to be retried
     * @return delay in seconds before retry
     */
    protected long retryDelayFor(CrawlURI curi) {
        int status = curi.getFetchStatus();
        return (status == S_CONNECT_FAILED || status == S_CONNECT_LOST ||
                status == S_DOMAIN_UNRESOLVABLE)?
            ((Long)getUncheckedAttribute(curi, ATTR_RETRY_DELAY)).longValue():
            0; // no delay for most
    }

    /**
     * Update any scheduling structures with the new information in this
     * CrawlURI. Chiefly means make necessary arrangements for no other URIs at
     * the same host to be visited within the appropriate politeness window.
     * 
     * @param curi
     *            The CrawlURI
     * @return millisecond politeness delay
     */
    protected long politenessDelayFor(CrawlURI curi) {
        long durationToWait = 0;
        if (curi.containsKey(A_FETCH_BEGAN_TIME)
                && curi.containsKey(A_FETCH_COMPLETED_TIME)) {

            long completeTime = curi.getLong(A_FETCH_COMPLETED_TIME);
            long durationTaken = (completeTime - curi
                    .getLong(A_FETCH_BEGAN_TIME));
            durationToWait = (long)(((Float)getUncheckedAttribute(curi,
                    ATTR_DELAY_FACTOR)).floatValue() * durationTaken);

            long minDelay = ((Integer)getUncheckedAttribute(curi,
                    ATTR_MIN_DELAY)).longValue();
            if (minDelay > durationToWait) {
                // wait at least the minimum
                durationToWait = minDelay;
            }

            long maxDelay = ((Integer)getUncheckedAttribute(curi,
                    ATTR_MAX_DELAY)).longValue();
            if (durationToWait > maxDelay) {
                // wait no more than the maximum
                durationToWait = maxDelay;
            }
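            // Worked example using the defaults declared above (a sketch,
            // not normative): a fetch that took 400ms gives 5 * 400ms =
            // 2000ms, raised to 3000ms by min-delay-ms; a fetch that took
            // 10s gives 50s, capped at 30000ms by max-delay-ms.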

            long now = System.currentTimeMillis();
            int maxBandwidthKB = ((Integer)getUncheckedAttribute(curi,
                    ATTR_MAX_HOST_BANDWIDTH_USAGE)).intValue();
            if (maxBandwidthKB > 0) {
                // Enforce bandwidth limit
                CrawlHost host = controller.getServerCache().getHostFor(curi);
                long minDurationToWait = host.getEarliestNextURIEmitTime()
                        - now;
                float maxBandwidth = maxBandwidthKB * 1.024F; // kilo factor
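                // (The 1.024 factor converts KB/sec into bytes per
                // millisecond -- 1024 bytes / 1000 ms -- so dividing bytes
                // by maxBandwidth below yields a duration in ms.)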
                long processedBytes = curi.getContentSize();
                host.setEarliestNextURIEmitTime(
                        (long)(processedBytes / maxBandwidth) + now);

                if (minDurationToWait > durationToWait) {
                    durationToWait = minDurationToWait;
                }
            }
        }
        return durationToWait;
    }

    /**
     * Ensure that any overall-bandwidth-usage limit is respected, by pausing as
     * long as necessary.
     * 
     * @param now
     * @throws InterruptedException
     */
    private void enforceBandwidthThrottle(long now) throws InterruptedException {
        int maxBandwidthKB = ((Integer)getUncheckedAttribute(null,
                ATTR_MAX_OVERALL_BANDWIDTH_USAGE)).intValue();
        if (maxBandwidthKB > 0) {
            // Make sure that new bandwidth setting doesn't affect total crawl
            if (maxBandwidthKB != lastMaxBandwidthKB) {
                lastMaxBandwidthKB = maxBandwidthKB;
                processedBytesAfterLastEmittedURI = totalProcessedBytes;
            }

            // Enforce bandwidth limit
            long sleepTime = nextURIEmitTime - now;
            float maxBandwidth = maxBandwidthKB * 1.024F; // Kilo_factor
            long processedBytes = totalProcessedBytes
                    - processedBytesAfterLastEmittedURI;
            long shouldHaveEmittedDiff = nextURIEmitTime == 0? 0
                    : nextURIEmitTime - now;
            nextURIEmitTime = (long)(processedBytes / maxBandwidth) + now
                    + shouldHaveEmittedDiff;
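            // Worked example (a sketch, ignoring any carried-over
            // shouldHaveEmittedDiff): at total-bandwidth-usage-KB-sec=100,
            // maxBandwidth is 102.4 bytes/ms, so 51200 bytes processed since
            // the last emit push nextURIEmitTime 500ms past 'now'.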
            processedBytesAfterLastEmittedURI = totalProcessedBytes;
            if (sleepTime > 0) {
                long targetTime = now + sleepTime;
                now = System.currentTimeMillis();
                while (now < targetTime) {
                    synchronized (this) {
                        if (logger.isLoggable(Level.FINE)) {
                            logger.fine("Frontier waits for: " + sleepTime
                                    + "ms to respect bandwidth limit.");
                        }
                        // TODO: now that this is a wait(), frontier can
                        // still schedule and finish items while waiting,
                        // which is good, but multiple threads could all
                        // wait for the same wakeTime, which somewhat
                        // spoils the throttle... should be fixed.
                        wait(targetTime - now);
                    }
                    now = System.currentTimeMillis();
                }
            }
        }
    }

    /**
     * Take note of any processor-local errors that have been entered into the
     * CrawlURI.
     * 
     * @param curi
     *  
     */
    protected void logLocalizedErrors(CrawlURI curi) {
        if (curi.containsKey(A_LOCALIZED_ERRORS)) {
            List localErrors = (List)curi.getObject(A_LOCALIZED_ERRORS);
            Iterator iter = localErrors.iterator();
            while (iter.hasNext()) {
                Object array[] = {curi, iter.next()};
                controller.localErrors.log(Level.WARNING, curi.getUURI()
                        .toString(), array);
            }
            // once logged, discard
            curi.remove(A_LOCALIZED_ERRORS);
        }
    }

    /**
     * Utility method to return a scratch dir for the given key's temp files.
     * Every key gets its own subdir. To avoid having any one directory with
     * thousands of files, there are also two levels of enclosing directory
     * named by the least-significant hex digits of the key string's java
     * hashcode.
     * 
     * @param key
     * @return File representing scratch directory
     */
    protected File scratchDirFor(String key) {
        String hex = Integer.toHexString(key.hashCode());
        while (hex.length() < 4) {
            hex = "0" + hex;
        }
        int len = hex.length();
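        // E.g. (illustrative values only): a key whose zero-padded hashCode
        // hex ends in "...cd12" lands under <state-dir>/12/cd/<key>.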
        return new File(this.controller.getStateDisk(),
                hex.substring(len - 2, len)
                + File.separator
                + hex.substring(len - 4, len - 2)
                + File.separator + key);
    }

    protected boolean overMaxRetries(CrawlURI curi) {
        // never retry more than the max number of times
        if (curi.getFetchAttempts() >= ((Integer)getUncheckedAttribute(curi,
                ATTR_MAX_RETRIES)).intValue()) {
            return true;
        }
        return false;
    }

    public void importRecoverLog(String pathToLog, boolean retainFailures)
            throws IOException {
        File source = new File(pathToLog);
        if (!source.isAbsolute()) {
            source = new File(getSettingsHandler().getOrder().getController()
                    .getDisk(), pathToLog);
        }
        RecoveryJournal.importRecoverLog(source, controller, retainFailures);
    }

    /* (non-Javadoc)
     * 
     * @see org.archive.crawler.framework.URIFrontier#kickUpdate()
     */
    public void kickUpdate() {
        // by default, do nothing
        // (scope will loadSeeds, if appropriate)
    }

    /**
     * Log to the main crawl.log
     * 
     * @param curi
     */
    protected void log(CrawlURI curi) {
        curi.aboutToLog();
        Object array[] = {curi};
        this.controller.uriProcessing.log(Level.INFO,
                curi.getUURI().toString(), array);
    }

    protected boolean isDisregarded(CrawlURI curi) {
        switch (curi.getFetchStatus()) {
        case S_ROBOTS_PRECLUDED: // they don't want us to have it
        case S_BLOCKED_BY_CUSTOM_PROCESSOR:
        case S_OUT_OF_SCOPE: // filtered out by scope
        case S_BLOCKED_BY_USER: // filtered out by user
        case S_TOO_MANY_EMBED_HOPS: // too far from last true link
        case S_TOO_MANY_LINK_HOPS: // too far from seeds
        case S_DELETED_BY_USER: // user deleted
            return true;
        default:
            return false;
        }
    }

    /**
     * Checks if a recently completed CrawlURI that did not finish successfully
     * needs to be retried (processed again after some time elapses).
     * 
     * @param curi
     *            The CrawlURI to check
     * @return True if we need to retry.
     */
    protected boolean needsRetrying(CrawlURI curi) {
        if (overMaxRetries(curi)) {
            return false;
        }

        switch (curi.getFetchStatus()) {
        case HttpStatus.SC_UNAUTHORIZED:
            // We can get here even though a positive status code usually
            // means success: if rfc2617 credential data was loaded, we are
            // supposed to go around again. See if any rfc2617 credential is
            // present and, if so, assume it was loaded in FetchHTTP on the
            // expectation that we go around again. If no rfc2617 credential
            // was loaded, we should not be here.
            boolean loaded = curi.hasRfc2617CredentialAvatar();
            if (!loaded && logger.isLoggable(Level.INFO)) {
                logger.info("Have 401 but no creds loaded " + curi);
            }
            return loaded;
        case S_DEFERRED:
        case S_CONNECT_FAILED:
        case S_CONNECT_LOST:
        case S_DOMAIN_UNRESOLVABLE:
            // these are all worth a retry
            // TODO: consider if any others (S_TIMEOUT in some cases?) deserve
            // retry
            return true;
        default:
            return false;
        }
    }

    /**
     * Canonicalize passed uuri. It would be sweeter if this canonicalize
     * function were encapsulated by that which it canonicalizes, but because
     * settings change with context -- i.e. there may be overrides in operation
     * for a particular URI -- it's not so easy; each CandidateURI would need a
     * reference to the settings system. That's awkward to pass in.
     * 
     * @param uuri Candidate URI to canonicalize.
     * @return Canonicalized version of passed <code>uuri</code>.
     */
    protected String canonicalize(UURI uuri) {
        return Canonicalizer.canonicalize(uuri, this.controller.getOrder());
    }

    /**
     * Canonicalize passed CandidateURI. This method differs from
     * {@link #canonicalize(UURI)} in that it takes a look at
     * the CandidateURI context, possibly overriding any canonicalization effect
     * if it could make us miss content. If canonicalization produces a URL that
     * was 'alreadyseen', but the entry in the 'alreadyseen' database did
     * nothing but redirect to the current URL, we won't get the current URL;
     * we'll think we've already seen it. Examples would be archive.org
     * redirecting to www.archive.org or the inverse, www.netarkivet.net
     * redirecting to netarkivet.net (assuming stripWWW rule enabled).
     * <p>Note, this method under some circumstances sets the forceFetch flag.
     * 
     * @param cauri CandidateURI to examine.
     * @return Canonicalized <code>cauri</code>.
     */
    protected String canonicalize(CandidateURI cauri) {
        String canon = canonicalize(cauri.getUURI());
        if (cauri.isLocation()) {
            // If the via is not the same as where we're being redirected (i.e.
            // we're not being redirected back to the same page), AND the
            // canonicalization of the via is equal to that of the current
            // cauri, THEN forcefetch (so there is no chance of our not
            // crawling content because the alreadyseen check thinks it has
            // seen the url before).
            // An example of an URL that redirects to itself is:
            // http://bridalelegance.com/images/buttons3/tuxedos-off.gif.
            // An example of an URL whose canonicalization equals its via's
            // canonicalization, and we want to fetch content at the
            // redirection (i.e. need to set forcefetch), is netarkivet.dk.
            if (!cauri.toString().equals(cauri.getVia().toString()) &&
                    canonicalize(cauri.getVia()).equals(canon)) {
                cauri.setForceFetch(true);
            }
        }
        return canon;
    }

    /**
     * @param cauri CrawlURI we're to get a key for.
     * @return a String token representing a queue
     */
    public String getClassKey(CandidateURI cauri) {
        String queueKey = (String)getUncheckedAttribute(cauri,
            ATTR_FORCE_QUEUE);
        if ("".equals(queueKey)) {
            // no forced override
            QueueAssignmentPolicy queueAssignmentPolicy = 
                getQueueAssignmentPolicy(cauri);
            queueKey =
                queueAssignmentPolicy.getClassKey(this.controller, cauri);
        }
        return queueKey;
    }

    protected QueueAssignmentPolicy getQueueAssignmentPolicy(CandidateURI cauri) {
        String clsName = (String)getUncheckedAttribute(cauri,
                ATTR_QUEUE_ASSIGNMENT_POLICY);
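        // Instantiate the configured policy reflectively; the class name can
        // differ per-URI because the setting is declared overrideable above.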
        try {
            return (QueueAssignmentPolicy) Class.forName(clsName).newInstance();
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * @return RecoveryJournal instance.  May be null.
     */
    public FrontierJournal getFrontierJournal() {
        return this.recover;
    }

    public void crawlEnding(String sExitMessage) {
        // TODO Auto-generated method stub
    }

    public void crawlEnded(String sExitMessage) {
        if (logger.isLoggable(Level.INFO)) {
            logger.info("Closing with " + Long.toString(queuedUriCount()) +
                " urls still in queue.");
        }
    }

    public void crawlStarted(String message) {
        // TODO Auto-generated method stub
    }

    public void crawlPausing(String statusMessage) {
        // TODO Auto-generated method stub
    }

    public void crawlPaused(String statusMessage) {
        // TODO Auto-generated method stub
    }

    public void crawlResuming(String statusMessage) {
        // TODO Auto-generated method stub
    }

    public void crawlCheckpoint(File checkpointDir)
    throws Exception {
        if (this.recover == null) {
            return;
        }
        this.recover.checkpoint(checkpointDir);
    }

    //
    // Reporter implementation
    // 
    public String singleLineReport() {
        return ArchiveUtils.singleLineReport(this);
    }

    public void reportTo(PrintWriter writer) {
        reportTo(null, writer);
    }

    //
    // maintain serialization compatibility to pre-AtomicLong impl
    private void writeObject(java.io.ObjectOutputStream out)
    throws IOException {
        queuedUriCount = liveQueuedUriCount.get();
        succeededFetchCount = liveSucceededFetchCount.get();
        failedFetchCount = liveFailedFetchCount.get();
        disregardedUriCount = liveDisregardedUriCount.get();
        out.defaultWriteObject();
    }

    private void readObject(java.io.ObjectInputStream in)
    throws IOException, ClassNotFoundException {
        in.defaultReadObject();
        liveQueuedUriCount = new AtomicLong(queuedUriCount);
        liveSucceededFetchCount = new AtomicLong(succeededFetchCount);
        liveFailedFetchCount = new AtomicLong(failedFetchCount);
        liveDisregardedUriCount = new AtomicLong(disregardedUriCount);
    }
}