package org.archive.crawler.frontier;

import it.unimi.dsi.mg4j.util.MutableString;

import java.io.BufferedInputStream;
import java.io.EOFException;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.concurrent.CountDownLatch;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.datamodel.CrawlOrder;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.CrawlScope;
import org.archive.crawler.framework.Frontier;
import org.archive.crawler.io.CrawlerJournal;
import org.archive.crawler.settings.SettingsHandler;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;

/**
 * Helper class for managing a simple Frontier change-events journal which is
 * useful for recovering from crawl problems.
 *
 * By replaying the journal into a new Frontier, its state (at least with
 * respect to URIs alreadyIncluded and in pending queues) will match that of the
 * original Frontier, allowing a pseudo-resume of a previous crawl, at least as
 * far as URI visitation/coverage is concerned.
 *
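 * <p>A minimal replay-side sketch (assumes an already-configured
 * {@link CrawlController}; the journal location shown is hypothetical):
 * <pre>
 *   File recoverLog = new File("/path/to/old-crawl/logs/recover.gz");
 *   // restores completion state, then re-queues the remaining F+ entries
 *   RecoveryJournal.importRecoverLog(recoverLog, controller, false);
 * </pre>
 *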
 * @author gojomo
 */
public class RecoveryJournal extends CrawlerJournal
        implements FrontierJournal {
    private static final Logger LOGGER = Logger.getLogger(
            RecoveryJournal.class.getName());

    public final static String F_ADD = "F+ ";
    public final static String F_EMIT = "Fe ";
    public final static String F_DISREGARD = "Fd ";
    public final static String F_RESCHEDULE = "Fr ";
    public final static String F_SUCCESS = "Fs ";
    public final static String F_FAILURE = "Ff ";
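
    // Each journal line consists of one of the above prefixes followed by the
    // URI, e.g. (a hypothetical line): "F+ http://example.com/page L http://example.com/"
    // -- an F+ line, as written by added(), also carries the path-from-seed
    // and the via URI after the target URI.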

    /** Log a progress message every this many journal lines while importing. */
    private static final int PROGRESS_INTERVAL = 1000000;

    /**
     * Number of URIs to re-queue during recovery before the crawl being
     * recovered into may begin, while the rest of the journal is replayed
     * in a background thread.
     */
    private static final long ENOUGH_TO_START_CRAWLING = 100000;

    /**
     * Create a new recovery journal at the given location.
     *
     * @param path Directory to make the recovery journal in.
     * @param filename Name to use for recovery journal file.
     * @throws IOException
     */
    public RecoveryJournal(String path, String filename)
    throws IOException {
        super(path, filename);
        timestamp_interval = 10000; // write periodic timestamp lines into the journal
    }

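    // These methods are invoked by the owning Frontier as URIs move through
    // the crawl: added() when a URI is scheduled, emitted() when it is handed
    // out for fetching, one of finishedSuccess()/finishedFailure()/
    // finishedDisregard() when its disposition is known, and rescheduled()
    // when it is put back for a retry.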
    public synchronized void added(CrawlURI curi) {
        accumulatingBuffer.length(0);
        this.accumulatingBuffer.append(F_ADD).
            append(curi.toString()).
            append(" ").
            append(curi.getPathFromSeed()).
            append(" ").
            append(curi.flattenVia());
        writeLine(accumulatingBuffer);
    }

    public void finishedSuccess(CrawlURI curi) {
        finishedSuccess(curi.toString());
    }

    public void finishedSuccess(UURI uuri) {
        finishedSuccess(uuri.toString());
    }

    protected void finishedSuccess(String uuri) {
        writeLine(F_SUCCESS, uuri);
    }

    public void emitted(CrawlURI curi) {
        writeLine(F_EMIT, curi.toString());
    }

    public void finishedDisregard(CrawlURI curi) {
        writeLine(F_DISREGARD, curi.toString());
    }

    public void finishedFailure(CrawlURI curi) {
        finishedFailure(curi.toString());
    }

    public void finishedFailure(UURI uuri) {
        finishedFailure(uuri.toString());
    }

    public void finishedFailure(String u) {
        writeLine(F_FAILURE, u);
    }

    public void rescheduled(CrawlURI curi) {
        writeLine(F_RESCHEDULE, curi.toString());
    }

    /**
     * Utility method for scanning a recovery journal and applying it to
     * a Frontier.
     *
     * @param source Recover log file.
     * @param controller CrawlController of the crawl whose Frontier is to
     * be updated.
     * @param retainFailures whether failed ('Ff') URIs should also be
     * counted as already included.
     * @throws IOException
     *
     * @see org.archive.crawler.framework.Frontier#importRecoverLog(String, boolean)
     */
    public static void importRecoverLog(final File source,
            final CrawlController controller, final boolean retainFailures)
    throws IOException {
        if (source == null) {
            throw new IllegalArgumentException("Passed source file is null.");
        }
        LOGGER.info("recovering frontier completion state from " + source);

        // First pass: fill alreadyIncluded with successes (and, if requested,
        // failures), counting the total lines in the journal.
        final int lines =
            importCompletionInfoFromLog(source, controller, retainFailures);

        LOGGER.info("finished completion state; recovering queues from " +
            source);

        // Second pass: re-queue anything added in the old crawl but not
        // registered as finished. Run it in a background thread that releases
        // the latch once enough URIs are queued for crawling to begin.
        final CountDownLatch recoveredEnough = new CountDownLatch(1);
        new Thread(new Runnable() {
            public void run() {
                importQueuesFromLog(source, controller, lines, recoveredEnough);
            }
        }, "queuesRecoveryThread").start();

        try {
            // wait until the background thread reports enough URIs queued
            recoveredEnough.await();
        } catch (InterruptedException e) {
            // proceed with whatever has been recovered so far
            e.printStackTrace();
        }
    }

    /**
     * Import just the SUCCESS (and possibly FAILURE) URIs from the given
     * recovery log into the frontier as considered included.
     *
     * @param source recovery log file to use
     * @param controller CrawlController whose frontier is updated
     * @param retainFailures whether failure ('Ff') URIs should count as done
     * @return number of lines in recovery log (for reference)
     * @throws IOException
     */
    private static int importCompletionInfoFromLog(File source,
            CrawlController controller, boolean retainFailures) throws IOException {
        Frontier frontier = controller.getFrontier();
        boolean checkScope = (Boolean) controller.getOrder()
            .getUncheckedAttribute(null,
                CrawlOrder.ATTR_RECOVER_SCOPE_INCLUDES);
        CrawlScope scope = checkScope ? controller.getScope() : null;

        BufferedInputStream is = getBufferedInput(source);

        MutableString read = new MutableString(UURI.MAX_URL_LENGTH);
        int lines = 0;
        try {
            while (readLine(is, read)) {
                lines++;
                boolean wasSuccess = read.startsWith(F_SUCCESS);
                if (wasSuccess
                        || (retainFailures && read.startsWith(F_FAILURE))) {
                    // strip the 3-character prefix to get the URI
                    String s = read.subSequence(3, read.length()).toString();
                    try {
                        UURI u = UURIFactory.getInstance(s);
                        if (checkScope) {
                            if (!scope.accepts(u)) {
                                // out of scope: do not count as already included
                                continue;
                            }
                        }
                        frontier.considerIncluded(u);
                        if (wasSuccess) {
                            if (frontier.getFrontierJournal() != null) {
                                frontier.getFrontierJournal().
                                    finishedSuccess(u);
                            }
                        } else {
                            // retained failure: carry it forward into the
                            // new crawl's own journal
                            if (frontier.getFrontierJournal() != null) {
                                frontier.getFrontierJournal().
                                    finishedFailure(u);
                            }
                        }
                    } catch (URIException e) {
                        LOGGER.log(Level.WARNING,
                            "bad URI during completion-state recovery", e);
                    }
                }
                if ((lines % PROGRESS_INTERVAL) == 0) {
                    // periodic progress report
                    LOGGER.info(
                        "at line " + lines
                        + " alreadyIncluded count = " +
                        frontier.discoveredUriCount());
                }
            }
        } catch (EOFException e) {
            // expected: end of journal reached
        } finally {
            is.close();
        }
        return lines;
    }

    /**
     * Read a line from the given BufferedInputStream into the MutableString.
     * Return true if a line was read; false if EOF.
     *
     * @param is stream to read from
     * @param read MutableString to fill with the line's characters
     * @return True if we read a line.
     * @throws IOException
     */
    private static boolean readLine(BufferedInputStream is, MutableString read)
    throws IOException {
        read.length(0);
        int c = is.read();
        while ((c != -1) && c != '\n' && c != '\r') {
            read.append((char) c);
            c = is.read();
        }
        if (c == -1 && read.length() == 0) {
            // EOF and nothing read: no more lines
            return false;
        }
        if (c == '\n') {
            // consume a paired '\r', if present
            is.mark(1);
            if (is.read() != '\r') {
                is.reset();
            }
        }
        // a line (possibly empty) was read
        return true;
    }

    /**
     * Import all ADDs from the given recovery log into the frontier's queues
     * (except those the frontier drops as already having been included).
     *
     * @param source recovery log file to use
     * @param controller CrawlController whose frontier is updated
     * @param lines total lines noted in the recovery log earlier
     * @param enough latch signalling 'enough' URIs queued to begin crawling
     */
    private static void importQueuesFromLog(File source, CrawlController controller,
            int lines, CountDownLatch enough) {
        BufferedInputStream is;
        MutableString read = new MutableString(UURI.MAX_URL_LENGTH);
        // make the crawl's settings available on this background thread
        controller.installThreadContextSettingsHandler();
        Frontier frontier = controller.getFrontier();
        boolean checkScope = (Boolean) controller.getOrder()
            .getUncheckedAttribute(null,
                CrawlOrder.ATTR_RECOVER_SCOPE_ENQUEUES);
        CrawlScope scope = checkScope ? controller.getScope() : null;
        long queuedAtStart = frontier.queuedUriCount();
        long queuedDuringRecovery = 0;
        int qLines = 0;

        try {
            // second pass over the journal, this time over the F+ (add) lines
            is = getBufferedInput(source);
            try {
                while (readLine(is, read)) {
                    qLines++;
                    if (read.startsWith(F_ADD)) {
                        UURI u;
                        CharSequence[] args = splitOnSpaceRuns(read);
                        try {
                            u = UURIFactory.getInstance(args[1].toString());
                            String pathFromSeed = (args.length > 2) ?
                                args[2].toString() : "";
                            UURI via = (args.length > 3) ?
                                UURIFactory.getInstance(args[3].toString()) :
                                null;
                            String viaContext = (args.length > 4) ?
                                args[4].toString() : "";
                            CandidateURI caUri = new CandidateURI(u,
                                pathFromSeed, via, viaContext);
                            if (checkScope) {
                                if (!scope.accepts(caUri)) {
                                    // out of scope: do not re-queue
                                    continue;
                                }
                            }
                            frontier.schedule(caUri);

                            // release the latch once enough URIs have been
                            // queued for crawling to begin
                            queuedDuringRecovery =
                                frontier.queuedUriCount() - queuedAtStart;
                            if (((queuedDuringRecovery + 1) %
                                    ENOUGH_TO_START_CRAWLING) == 0) {
                                enough.countDown();
                            }
                        } catch (URIException e) {
                            LOGGER.log(Level.WARNING, "bad URI during " +
                                "log-recovery of queue contents ", e);
                            // skip this line and continue
                        } catch (RuntimeException e) {
                            LOGGER.log(Level.SEVERE, "exception during " +
                                "log-recovery of queue contents ", e);
                            // skip this line and continue
                        }
                    }
                    if ((qLines % PROGRESS_INTERVAL) == 0) {
                        // periodic progress report
                        LOGGER.info(
                            "through line "
                            + qLines + "/" + lines
                            + " queued count = " +
                            frontier.queuedUriCount());
                    }
                }
            } catch (EOFException e) {
                // expected: end of journal reached
            } finally {
                is.close();
            }
        } catch (IOException e) {
            LOGGER.log(Level.SEVERE,
                "problem reading the recovery journal", e);
        }
        LOGGER.info("finished recovering frontier from " + source + " "
            + qLines + " lines processed");
        // ensure any waiting caller is released, even if the 'enough'
        // threshold was never reached
        enough.countDown();
    }

    /**
     * Return an array of the subsequences of the passed-in sequence,
     * split on space runs.
     *
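     * For example, a (hypothetical) journal line
     * {@code "F+ http://example.com/page L http://example.com/"} yields
     * {@code ["F+", "http://example.com/page", "L", "http://example.com/"]}.
     *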
     * @param read sequence to split
     * @return array of CharSequence segments
     */
    private static CharSequence[] splitOnSpaceRuns(CharSequence read) {
        int lastStart = 0;
        ArrayList<CharSequence> segs = new ArrayList<CharSequence>(5);
        int i;
        for (i = 0; i < read.length(); i++) {
            if (read.charAt(i) == ' ') {
                segs.add(read.subSequence(lastStart, i));
                i++;
                // skip the rest of the space run
                while (i < read.length() && read.charAt(i) == ' ') {
                    i++;
                }
                lastStart = i;
            }
        }
        if (lastStart < read.length()) {
            segs.add(read.subSequence(lastStart, i));
        }
        return segs.toArray(new CharSequence[segs.size()]);
    }
}