View Javadoc

1   /* FrontierJournal
2    * 
3    * Created on Oct 26, 2004
4    *
5    * Copyright (C) 2004 Internet Archive.
6    * 
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    * 
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   * 
14   * Heritrix is distributed in the hope that it will be useful, 
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   * 
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.crawler.frontier;
24  
25  import java.io.File;
26  import java.io.IOException;
27  
28  import org.archive.crawler.datamodel.CrawlURI;
29  import org.archive.net.UURI;
30  
31  /***
32   * Record of key Frontier happenings.
33   * @author stack
34   * @version $Date: 2007-08-28 05:15:25 +0000 (Tue, 28 Aug 2007) $, $Revision: 5439 $
35   */
36  public interface FrontierJournal {
37      public static final String LOGNAME_RECOVER = "recover.gz";
38  
39      /***
40       * @param curi CrawlURI that has been scheduled to be added to the
41       * Frontier.
42       */
43      public abstract void added(CrawlURI curi);
44  
45      /***
46       * @param curi CrawlURI that finished successfully.
47       */
48      public abstract void finishedSuccess(CrawlURI curi);
49  
50      /***
51       * @param uuri UURI that finished successfully.
52       */
53      public abstract void finishedSuccess(UURI uuri);
54  
55      /***
56       * Note that a CrawlURI was emitted for processing.
57       * If not followed by a finished or rescheduled notation in
58       * the journal, the CrawlURI was still in-process when the journal ended.
59       * 
60       * @param curi CrawlURI emitted.
61       */
62      public abstract void emitted(CrawlURI curi);
63  
64      /***
65       * @param u UURI that finished unsuccessfully 
66       */
67      public abstract void finishedFailure(UURI u);
68      
69      /***
70       * @param curi CrawlURI finished unsuccessfully.
71       */
72      public abstract void finishedFailure(CrawlURI curi);
73  
74      /***
75       * @param curi CrawlURI finished disregarded (uncounted failure).
76       */
77      public abstract void finishedDisregard(CrawlURI curi);
78      
79      /***
80       * @param curi CrawlURI that was returned to the Frontier for 
81       * another try.
82       */
83      public abstract void rescheduled(CrawlURI curi);
84  
85      /***
86       *  Flush and close any held objects.
87       */
88      public abstract void close();
89      
90      /***
91       * Checkpoint.
92       * @param checkpointDir Directory we're checkpointing into.
93       * @throws IOException
94       */
95      public abstract void checkpoint(final File checkpointDir)
96      throws IOException;
97  
98      /***
99       * Add a line noting a serious crawl error. 
100      * 
101      * @param string
102      */
103     public abstract void seriousError(String string);
104 }