/* RecoveryJournal
 *
 * $Id: RecoveryJournal.java 5507 2007-10-05 21:31:54Z gojomo $
 *
 * Created on Jul 20, 2004
 *
 * Copyright (C) 2004 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package org.archive.crawler.frontier;

import it.unimi.dsi.mg4j.util.MutableString;

import java.io.BufferedInputStream;
import java.io.EOFException;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.concurrent.CountDownLatch;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.datamodel.CrawlOrder;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.CrawlScope;
import org.archive.crawler.framework.Frontier;
import org.archive.crawler.io.CrawlerJournal;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;

/**
 * Helper class for managing a simple Frontier change-events journal which is
 * useful for recovering from crawl problems.
 * 
 * By replaying the journal into a new Frontier, its state (at least with
 * respect to URIs alreadyIncluded and in pending queues) will match that of the
 * original Frontier, allowing a pseudo-resume of a previous crawl, at least as
 * far as URI visitation/coverage is concerned.
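 * 
 * <p>Each journal line begins with a three-character tag: {@code F+ }
 * (added), {@code Fe } (emitted), {@code Fd } (disregarded), {@code Fr }
 * (rescheduled), {@code Fs } (succeeded), or {@code Ff } (failed),
 * followed by the URI; {@code F+ } lines also carry the URI's
 * path-from-seed and via. A hypothetical fragment:
 * <pre>
 * F+ http://example.com/page.html LE http://example.com/
 * Fe http://example.com/page.html
 * Fs http://example.com/page.html
 * </pre>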
 * 
 * @author gojomo
 */
public class RecoveryJournal extends CrawlerJournal
implements FrontierJournal {
    private static final Logger LOGGER = Logger.getLogger(
            RecoveryJournal.class.getName());
    
    public final static String F_ADD = "F+ ";        // URI added to queues
    public final static String F_EMIT = "Fe ";       // URI emitted for processing
    public final static String F_DISREGARD = "Fd ";  // URI finished as disregarded
    public final static String F_RESCHEDULE = "Fr "; // URI rescheduled for retry
    public final static String F_SUCCESS = "Fs ";    // URI finished successfully
    public final static String F_FAILURE = "Ff ";    // URI finished as failure
    
    // show recovery progress every this many lines
    private static final int PROGRESS_INTERVAL = 1000000;

    // once this many URIs are queued during recovery, allow
    // crawl to begin, while enqueuing of other URIs from log
    // continues in background
    private static final long ENOUGH_TO_START_CRAWLING = 100000;
    
    /**
     * Create a new recovery journal at the given location.
     * 
     * @param path Directory in which to create the recovery journal.
     * @param filename Name to use for the recovery journal file.
     * @throws IOException if the journal file cannot be created
     */
    public RecoveryJournal(String path, String filename)
    throws IOException {
        super(path, filename);
        timestamp_interval = 10000; // write timestamp lines occasionally
    }
    
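    /**
     * Record an 'F+' (add) event: the URI, its path-from-seed, and its
     * flattened via, written space-separated on a single journal line.
     */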
    public synchronized void added(CrawlURI curi) {
        accumulatingBuffer.length(0);
        accumulatingBuffer.append(F_ADD).
            append(curi.toString()).
            append(" ").
            append(curi.getPathFromSeed()).
            append(" ").
            append(curi.flattenVia());
        writeLine(accumulatingBuffer);
    }

    public void finishedSuccess(CrawlURI curi) {
        finishedSuccess(curi.toString());
    }
    
    public void finishedSuccess(UURI uuri) {
        finishedSuccess(uuri.toString());
    }
    
    protected void finishedSuccess(String uuri) {
        writeLine(F_SUCCESS, uuri);
    }

    public void emitted(CrawlURI curi) {
        writeLine(F_EMIT, curi.toString());
    }

    public void finishedDisregard(CrawlURI curi) {
        writeLine(F_DISREGARD, curi.toString());
    }
    
    public void finishedFailure(CrawlURI curi) {
        finishedFailure(curi.toString());
    }
    
    public void finishedFailure(UURI uuri) {
        finishedFailure(uuri.toString());
    }
    
    public void finishedFailure(String u) {
        writeLine(F_FAILURE, u);
    }

    public void rescheduled(CrawlURI curi) {
        writeLine(F_RESCHEDULE, curi.toString());
    }

    /**
     * Utility method for scanning a recovery journal and applying it to
     * a Frontier.
     * 
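     * <p>A minimal usage sketch (names hypothetical; the journal is
     * typically found among a prior crawl's logs):
     * <pre>
     * File recoverLog = new File(oldCrawlLogsDir, "recover.gz");
     * RecoveryJournal.importRecoverLog(recoverLog, controller, false);
     * </pre>
     * 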
     * @param source Recover log path.
     * @param controller CrawlController providing the Frontier to update.
     * @param retainFailures whether failed ('Ff') URIs should also be
     * considered already-included
     * @throws IOException
     * 
     * @see org.archive.crawler.framework.Frontier#importRecoverLog(String, boolean)
     */
    public static void importRecoverLog(final File source,
        final CrawlController controller, final boolean retainFailures)
    throws IOException {
        if (source == null) {
            throw new IllegalArgumentException("Passed source file is null.");
        }
        LOGGER.info("recovering frontier completion state from " + source);
        
        // first, fill alreadyIncluded with successes (and possibly failures),
        // and count the total lines
        final int lines =
            importCompletionInfoFromLog(source, controller, retainFailures);
        
        LOGGER.info("finished completion state; recovering queues from " +
            source);

        // now, re-add anything that was in old frontier and not already
        // registered as finished. Do this in a separate thread that signals
        // this thread once ENOUGH_TO_START_CRAWLING URIs have been queued.
        final CountDownLatch recoveredEnough = new CountDownLatch(1);
        new Thread(new Runnable() {
            public void run() {
                importQueuesFromLog(source, controller, lines, recoveredEnough);
            }
        }, "queuesRecoveryThread").start();
        
        try {
            // wait until at least ENOUGH_TO_START_CRAWLING URIs queued
            recoveredEnough.await();
        } catch (InterruptedException e) {
            // restore interrupt status rather than swallow it; queue
            // recovery continues in the background thread regardless
            LOGGER.log(Level.WARNING,
                "interrupted awaiting enough queued URIs", e);
            Thread.currentThread().interrupt();
        }
    }
    
    /**
     * Import just the SUCCESS (and possibly FAILURE) URIs from the given
     * recovery log into the frontier as considered included.
     * 
     * @param source recovery log file to use
     * @param controller CrawlController providing the frontier to update
     * @param retainFailures whether failure ('Ff') URIs should count as done
     * @return number of lines in recovery log (for reference)
     * @throws IOException
     */
    private static int importCompletionInfoFromLog(File source,
            CrawlController controller, boolean retainFailures) throws IOException {
        Frontier frontier = controller.getFrontier();
        boolean checkScope = (Boolean) controller.getOrder()
                .getUncheckedAttribute(null,
                        CrawlOrder.ATTR_RECOVER_SCOPE_INCLUDES);
        CrawlScope scope = checkScope ? controller.getScope() : null;
        // Scan log for all 'Fs' lines: add as 'alreadyIncluded'
        BufferedInputStream is = getBufferedInput(source);
        // create MutableString of good starting size (will grow if necessary)
        MutableString read = new MutableString(UURI.MAX_URL_LENGTH);
        int lines = 0;
        try {
            while (readLine(is,read)) {
                lines++;
                boolean wasSuccess = read.startsWith(F_SUCCESS);
                if (wasSuccess
                        || (retainFailures && read.startsWith(F_FAILURE))) {
                    // retrieve first (only) URL on line
                    String s = read.subSequence(3,read.length()).toString();
                    try {
                        UURI u = UURIFactory.getInstance(s);
                        if(checkScope) {
                            if(!scope.accepts(u)) {
                                // skip out-of-scope URIs.
                                continue;
                            }
                        }
                        frontier.considerIncluded(u);
                        if(wasSuccess) {
                            if (frontier.getFrontierJournal() != null) {
                                frontier.getFrontierJournal().
                                    finishedSuccess(u);
                            }
                        } else {
                            // carry forward the failure, in case a future
                            // recovery chooses not to retain failures as
                            // finished
                            if (frontier.getFrontierJournal() != null) {
                                frontier.getFrontierJournal().
                                    finishedFailure(u);
                            }
                        }
                    } catch (URIException e) {
                        LOGGER.log(Level.WARNING, "bad URI during " +
                            "log-recovery of completion state", e);
                    }
                }
                if((lines%PROGRESS_INTERVAL)==0) {
                    // every 1 million lines, print progress
                    LOGGER.info(
                            "at line " + lines
                            + " alreadyIncluded count = " +
                            frontier.discoveredUriCount());
                }
            }
        } catch (EOFException e) {
            // expected in some uncleanly-closed recovery logs; ignore
        } finally {
            is.close();
        }
        return lines;
    }

    /**
     * Read a line from the given BufferedInputStream into the MutableString.
     * Returns true if a line was read; false if EOF was reached with nothing
     * read.
     * 
     * @param is stream to read from
     * @param read MutableString to fill (cleared first)
     * @return True if we read a line.
     * @throws IOException
     */
    private static boolean readLine(BufferedInputStream is, MutableString read)
    throws IOException {
        read.length(0);
        int c = is.read();
        while((c!=-1)&&c!='\n'&&c!='\r') {
            read.append((char)c);
            c = is.read();
        }
        if(c==-1 && read.length()==0) {
            // EOF and none read; return false
            return false;
        }
        if(c=='\r') {
            // consume LF following CR (a CRLF line-end), if present;
            // otherwise un-read the peeked character
            is.mark(1);
            if(is.read()!='\n') {
                is.reset();
            }
        }
        // a line (possibly blank) was read
        return true;
    }

    /**
     * Import all ADDs from the given recovery log into the frontier's queues
     * (excepting those the frontier drops as already having been included).
     * 
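     * <p>Each 'F+' line is parsed as space-separated fields: URI,
     * path-from-seed, via, and (when present) via-context. A hypothetical
     * line:
     * <pre>
     * F+ http://example.com/a.html LE http://example.com/
     * </pre>
     * 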
     * @param source recovery log file to use
     * @param controller CrawlController providing the frontier to update
     * @param lines total lines noted in recovery log earlier
     * @param enough latch signalling 'enough' URIs queued to begin crawling
     */
    private static void importQueuesFromLog(File source, CrawlController controller,
            int lines, CountDownLatch enough) {
        BufferedInputStream is;
        // create MutableString of good starting size (will grow if necessary)
        MutableString read = new MutableString(UURI.MAX_URL_LENGTH);
        controller.installThreadContextSettingsHandler();
        Frontier frontier = controller.getFrontier();
        boolean checkScope = (Boolean) controller.getOrder()
                .getUncheckedAttribute(null,
                        CrawlOrder.ATTR_RECOVER_SCOPE_ENQUEUES);
        CrawlScope scope = checkScope ? controller.getScope() : null;
        long queuedAtStart = frontier.queuedUriCount();
        long queuedDuringRecovery = 0;
        int qLines = 0;
        
        try {
            // Scan log for all 'F+' lines: if not alreadyIncluded, schedule for
            // visitation
            is = getBufferedInput(source);
            try {
                while (readLine(is,read)) {
                    qLines++;
                    if (read.startsWith(F_ADD)) {
                        UURI u;
                        CharSequence[] args = splitOnSpaceRuns(read);
                        try {
                            u = UURIFactory.getInstance(args[1].toString());
                            String pathFromSeed = (args.length > 2)?
                                args[2].toString() : "";
                            UURI via = (args.length > 3)?
                                UURIFactory.getInstance(args[3].toString()):
                                null;
                            String viaContext = (args.length > 4)?
                                    args[4].toString(): "";
                            CandidateURI caUri = new CandidateURI(u,
                                    pathFromSeed, via, viaContext);
                            if(checkScope) {
                                if(!scope.accepts(caUri)) {
                                    // skip out-of-scope URIs.
                                    continue;
                                }
                            }
                            frontier.schedule(caUri);
                            
                            queuedDuringRecovery =
                                frontier.queuedUriCount() - queuedAtStart;
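                            // open the startup latch once roughly
                            // ENOUGH_TO_START_CRAWLING URIs have been queued;
                            // extra countDown() calls on this one-count
                            // latch are harmless no-ops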
                            if(((queuedDuringRecovery + 1) %
                                    ENOUGH_TO_START_CRAWLING) == 0) {
                                enough.countDown();
                            }
                        } catch (URIException e) {
                            LOGGER.log(Level.WARNING, "bad URI during " +
                                "log-recovery of queue contents", e);
                            // and continue...
                        } catch (RuntimeException e) {
                            LOGGER.log(Level.SEVERE, "exception during " +
                                    "log-recovery of queue contents", e);
                            // and continue, though this may be risky
                            // if the exception wasn't a trivial NPE
                            // or wrapped interrupted-exception.
                        }
                    }
                    if((qLines%PROGRESS_INTERVAL)==0) {
                        // every 1 million lines, print progress
                        LOGGER.info(
                                "through line "
                                + qLines + "/" + lines
                                + " queued count = " +
                                frontier.queuedUriCount());
                    }
                }
            } catch (EOFException e) {
                // no problem: untidy end of recovery journal
            } finally {
                is.close();
            }
        } catch (IOException e) {
            LOGGER.log(Level.SEVERE,
                "IO problem during log-recovery of queue contents", e);
        }
        LOGGER.info("finished recovering frontier from "+source+" "
                +qLines+" lines processed");
        enough.countDown();
    }

    /**
     * Return an array of the subsequences of the passed-in sequence,
     * split on space runs.
     * 
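     * <p>For example, the (hypothetical) journal line
     * {@code "F+ http://example.com/ LE"} splits into
     * {@code ["F+", "http://example.com/", "LE"]}.
     * 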
     * @param read sequence to split
     * @return Array of the space-run-delimited subsequences.
     */
    private static CharSequence[] splitOnSpaceRuns(CharSequence read) {
        int lastStart = 0;
        ArrayList<CharSequence> segs = new ArrayList<CharSequence>(5);
        int i;
        for(i=0;i<read.length();i++) {
            if (read.charAt(i)==' ') {
                segs.add(read.subSequence(lastStart,i));
                i++;
                while(i < read.length() && read.charAt(i)==' ') {
                    // skip any space runs
                    i++;
                }
                lastStart = i;
            }
        }
        if(lastStart<read.length()) {
            segs.add(read.subSequence(lastStart,i));
        }
        return segs.toArray(new CharSequence[segs.size()]);
    }
}