1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 package org.archive.crawler.framework;
25
26 import java.io.PrintWriter;
27 import java.util.HashMap;
28 import java.util.logging.Level;
29 import java.util.logging.Logger;
30
31 import org.archive.crawler.datamodel.CoreAttributeConstants;
32 import org.archive.crawler.datamodel.CrawlOrder;
33 import org.archive.crawler.datamodel.CrawlURI;
34 import org.archive.crawler.datamodel.FetchStatusCodes;
35 import org.archive.crawler.datamodel.InstancePerThread;
36 import org.archive.crawler.framework.exceptions.EndedException;
37 import org.archive.util.ArchiveUtils;
38 import org.archive.util.DevUtils;
39 import org.archive.util.HttpRecorder;
40 import org.archive.util.HttpRecorderMarker;
41 import org.archive.util.ProgressStatisticsReporter;
42 import org.archive.util.Reporter;
43
44 import com.sleepycat.util.RuntimeExceptionWrapper;
45
46 /***
47 * One "worker thread"; asks for CrawlURIs, processes them,
48 * repeats unless told otherwise.
49 *
50 * @author Gordon Mohr
51 */
52 public class ToeThread extends Thread
53 implements CoreAttributeConstants, FetchStatusCodes, HttpRecorderMarker,
54 Reporter, ProgressStatisticsReporter {
55 private static final String STEP_NASCENT = "NASCENT";
56 private static final String STEP_ABOUT_TO_GET_URI = "ABOUT_TO_GET_URI";
57 private static final String STEP_FINISHED = "FINISHED";
58 private static final String STEP_ABOUT_TO_BEGIN_CHAIN =
59 "ABOUT_TO_BEGIN_CHAIN";
60 private static final String STEP_ABOUT_TO_BEGIN_PROCESSOR =
61 "ABOUT_TO_BEGIN_PROCESSOR";
62 private static final String STEP_DONE_WITH_PROCESSORS =
63 "DONE_WITH_PROCESSORS";
64 private static final String STEP_HANDLING_RUNTIME_EXCEPTION =
65 "HANDLING_RUNTIME_EXCEPTION";
66 private static final String STEP_ABOUT_TO_RETURN_URI =
67 "ABOUT_TO_RETURN_URI";
68 private static final String STEP_FINISHING_PROCESS = "FINISHING_PROCESS";
69
70 private static Logger logger =
71 Logger.getLogger("org.archive.crawler.framework.ToeThread");
72
73 private CrawlController controller;
74 private int serialNumber;
75
76 /***
77 * Each ToeThead has an instance of HttpRecord that gets used
78 * over and over by each request.
79 *
80 * @see org.archive.util.HttpRecorderMarker
81 */
82 private HttpRecorder httpRecorder = null;
83
84 private HashMap<String,Processor> localProcessors
85 = new HashMap<String,Processor>();
86 private String currentProcessorName = "";
87
88 private String coreName;
89 private CrawlURI currentCuri;
90 private long lastStartTime;
91 private long lastFinishTime;
92
93
94 private String step = STEP_NASCENT;
95 private long atStepSince;
96
97
98 private static final int DEFAULT_PRIORITY = Thread.NORM_PRIORITY-2;
99
100
101
102 private volatile boolean shouldRetire = false;
103
104 /***
105 * Create a ToeThread
106 *
107 * @param g ToeThreadGroup
108 * @param sn serial number
109 */
110 public ToeThread(ToePool g, int sn) {
111
112 super(g,"ToeThread #" + sn);
113 coreName="ToeThread #" + sn + ": ";
114 controller = g.getController();
115 serialNumber = sn;
116 setPriority(DEFAULT_PRIORITY);
117 int outBufferSize = ((Integer) controller
118 .getOrder()
119 .getUncheckedAttribute(null,CrawlOrder.ATTR_RECORDER_OUT_BUFFER))
120 .intValue();
121 int inBufferSize = ((Integer) controller
122 .getOrder()
123 .getUncheckedAttribute(null, CrawlOrder.ATTR_RECORDER_IN_BUFFER))
124 .intValue();
125 httpRecorder = new HttpRecorder(controller.getScratchDisk(),
126 "tt" + sn + "http", outBufferSize, inBufferSize);
127 lastFinishTime = System.currentTimeMillis();
128 }
129
130 /*** (non-Javadoc)
131 * @see java.lang.Thread#run()
132 */
133 public void run() {
134 String name = controller.getOrder().getCrawlOrderName();
135 logger.fine(getName()+" started for order '"+name+"'");
136
137 try {
138 while ( true ) {
139
140 continueCheck();
141
142 setStep(STEP_ABOUT_TO_GET_URI);
143
144 CrawlURI curi = controller.getFrontier().next();
145
146 synchronized(this) {
147 continueCheck();
148 setCurrentCuri(curi);
149 }
150
151 processCrawlUri();
152
153 setStep(STEP_ABOUT_TO_RETURN_URI);
154 continueCheck();
155
156 synchronized(this) {
157 controller.getFrontier().finished(currentCuri);
158 setCurrentCuri(null);
159 }
160
161 setStep(STEP_FINISHING_PROCESS);
162 lastFinishTime = System.currentTimeMillis();
163 controller.releaseContinuePermission();
164 if(shouldRetire) {
165 break;
166 }
167 }
168 } catch (EndedException e) {
169
170 } catch (Exception e) {
171
172 logger.log(Level.SEVERE,"Fatal exception in "+getName(),e);
173 } catch (OutOfMemoryError err) {
174 seriousError(err);
175 } finally {
176 controller.releaseContinuePermission();
177 }
178 setCurrentCuri(null);
179
180 this.httpRecorder.closeRecorders();
181 this.httpRecorder = null;
182 localProcessors = null;
183
184 logger.fine(getName()+" finished for order '"+name+"'");
185 setStep(STEP_FINISHED);
186 controller.toeEnded();
187 controller = null;
188 }
189
190 /***
191 * Set currentCuri, updating thread name as appropriate
192 * @param curi
193 */
194 private void setCurrentCuri(CrawlURI curi) {
195 if(curi==null) {
196 setName(coreName);
197 } else {
198 setName(coreName+curi);
199 }
200 currentCuri = curi;
201 }
202
203 /***
204 * @param s
205 */
206 private void setStep(String s) {
207 step=s;
208 atStepSince = System.currentTimeMillis();
209 }
210
211 private void seriousError(Error err) {
212
213
214
215 setPriority(DEFAULT_PRIORITY+1);
216 if (controller!=null) {
217
218 controller.singleThreadMode();
219
220
221 controller.freeReserveMemory();
222 controller.requestCrawlPause();
223 if (controller.getFrontier().getFrontierJournal() != null) {
224 controller.getFrontier().getFrontierJournal().seriousError(
225 getName() + err.getMessage());
226 }
227 }
228
229
230 String extraInfo = DevUtils.extraInfo();
231 System.err.println("<<<");
232 System.err.println(ArchiveUtils.getLog17Date());
233 System.err.println(err);
234 System.err.println(extraInfo);
235 err.printStackTrace(System.err);
236
237 if (controller!=null) {
238 PrintWriter pw = new PrintWriter(System.err);
239 controller.getToePool().compactReportTo(pw);
240 pw.flush();
241 }
242 System.err.println(">>>");
243
244
245 String context = "unknown";
246 if(currentCuri!=null) {
247
248 currentCuri.addAnnotation("err="+err.getClass().getName());
249 currentCuri.addAnnotation("os"+currentCuri.getFetchStatus());
250 currentCuri.setFetchStatus(S_SERIOUS_ERROR);
251 context = currentCuri.singleLineReport() + " in " + currentProcessorName;
252 }
253 String message = "Serious error occured trying " +
254 "to process '" + context + "'\n" + extraInfo;
255 logger.log(Level.SEVERE, message.toString(), err);
256 setPriority(DEFAULT_PRIORITY);
257 }
258
259 /***
260 * Perform checks as to whether normal execution should proceed.
261 *
262 * If an external interrupt is detected, throw an interrupted exception.
263 * Used before anything that should not be attempted by a 'zombie' thread
264 * that the Frontier/Crawl has given up on.
265 *
266 * Otherwise, if the controller's memoryGate has been closed,
267 * hold until it is opened. (Provides a better chance of
268 * being able to complete some tasks after an OutOfMemoryError.)
269 *
270 * @throws InterruptedException
271 */
272 private void continueCheck() throws InterruptedException {
273 if(Thread.interrupted()) {
274 throw new InterruptedException("die request detected");
275 }
276 controller.acquireContinuePermission();
277 }
278
279 /***
280 * Pass the CrawlURI to all appropriate processors
281 *
282 * @throws InterruptedException
283 */
284 private void processCrawlUri() throws InterruptedException {
285 currentCuri.setThreadNumber(this.serialNumber);
286 currentCuri.setNextProcessorChain(controller.getFirstProcessorChain());
287 lastStartTime = System.currentTimeMillis();
288
289 try {
290 while (currentCuri.nextProcessorChain() != null) {
291 setStep(STEP_ABOUT_TO_BEGIN_CHAIN);
292
293 currentCuri.setNextProcessor(currentCuri.nextProcessorChain().getFirstProcessor());
294 currentCuri.setNextProcessorChain(currentCuri.nextProcessorChain().getNextProcessorChain());
295
296 while (currentCuri.nextProcessor() != null) {
297 setStep(STEP_ABOUT_TO_BEGIN_PROCESSOR);
298 Processor currentProcessor = getProcessor(currentCuri.nextProcessor());
299 currentProcessorName = currentProcessor.getName();
300 continueCheck();
301
302 currentProcessor.process(currentCuri);
303
304
305 }
306 }
307 setStep(STEP_DONE_WITH_PROCESSORS);
308 currentProcessorName = "";
309 } catch (RuntimeExceptionWrapper e) {
310
311 if(e.getCause() == null) {
312 e.initCause(e.getCause());
313 }
314 recoverableProblem(e);
315 } catch (AssertionError ae) {
316
317
318 recoverableProblem(ae);
319 } catch (RuntimeException e) {
320 recoverableProblem(e);
321 } catch (StackOverflowError err) {
322 recoverableProblem(err);
323 } catch (Error err) {
324
325 seriousError(err);
326 }
327 }
328
329
330 /***
331 * Handling for exceptions and errors that are possibly recoverable.
332 *
333 * @param e
334 */
335 private void recoverableProblem(Throwable e) {
336 Object previousStep = step;
337 setStep(STEP_HANDLING_RUNTIME_EXCEPTION);
338 e.printStackTrace(System.err);
339 currentCuri.setFetchStatus(S_RUNTIME_EXCEPTION);
340
341 currentCuri.addAnnotation("err="+e.getClass().getName());
342 currentCuri.putObject(A_RUNTIME_EXCEPTION, e);
343 String message = "Problem " + e +
344 " occured when trying to process '"
345 + currentCuri.toString()
346 + "' at step " + previousStep
347 + " in " + currentProcessorName +"\n";
348 logger.log(Level.SEVERE, message.toString(), e);
349 }
350
351 private Processor getProcessor(Processor processor) {
352 if(!(processor instanceof InstancePerThread)) {
353
354 return processor;
355 }
356
357 Processor localProcessor = (Processor) localProcessors.get(
358 processor.getClass().getName());
359 if (localProcessor == null) {
360 localProcessor = processor.spawn(this.getSerialNumber());
361 localProcessors.put(processor.getClass().getName(),localProcessor);
362 }
363 return localProcessor;
364 }
365
366 /***
367 * @return Return toe thread serial number.
368 */
369 public int getSerialNumber() {
370 return this.serialNumber;
371 }
372
373 /***
374 * Used to get current threads HttpRecorder instance.
375 * Implementation of the HttpRecorderMarker interface.
376 * @return Returns instance of HttpRecorder carried by this thread.
377 * @see org.archive.util.HttpRecorderMarker#getHttpRecorder()
378 */
379 public HttpRecorder getHttpRecorder() {
380 return this.httpRecorder;
381 }
382
383 /*** Get the CrawlController acossiated with this thread.
384 *
385 * @return Returns the CrawlController.
386 */
387 public CrawlController getController() {
388 return controller;
389 }
390
391 /***
392 * Terminates a thread.
393 *
394 * <p> Calling this method will ensure that the current thread will stop
395 * processing as soon as possible (note: this may be never). Meant to
396 * 'short circuit' hung threads.
397 *
398 * <p> Current crawl uri will have its fetch status set accordingly and
399 * will be immediately returned to the frontier.
400 *
401 * <p> As noted before, this does not ensure that the thread will stop
402 * running (ever). But once evoked it will not try and communicate with
403 * other parts of crawler and will terminate as soon as control is
404 * established.
405 */
406 protected void kill(){
407 this.interrupt();
408 synchronized(this) {
409 if (currentCuri!=null) {
410 currentCuri.setFetchStatus(S_PROCESSING_THREAD_KILLED);
411 controller.getFrontier().finished(currentCuri);
412 }
413 }
414 }
415
416 /***
417 * @return Current step (For debugging/reporting, give abstract step
418 * where this thread is).
419 */
420 public Object getStep() {
421 return step;
422 }
423
424 /***
425 * Is this thread validly processing a URI, not paused, waiting for
426 * a URI, or interrupted?
427 * @return whether thread is actively processing a URI
428 */
429 public boolean isActive() {
430
431 return this.isAlive() && (currentCuri != null) && !isInterrupted();
432 }
433
434 /***
435 * Request that this thread retire (exit cleanly) at the earliest
436 * opportunity.
437 */
438 public void retire() {
439 shouldRetire = true;
440 }
441
442 /***
443 * Whether this thread should cleanly retire at the earliest
444 * opportunity.
445 *
446 * @return True if should retire.
447 */
448 public boolean shouldRetire() {
449 return shouldRetire;
450 }
451
452
453
454
455
456 /***
457 * Compiles and returns a report on its status.
458 * @param name Report name.
459 * @param pw Where to print.
460 */
461 public void reportTo(String name, PrintWriter pw) {
462
463
464 pw.print("[");
465 pw.println(getName());
466
467
468
469
470
471
472
473
474 CrawlURI c = currentCuri;
475 if(c != null) {
476 pw.print(" ");
477 c.singleLineReportTo(pw);
478 pw.print(" ");
479 pw.print(c.getFetchAttempts());
480 pw.print(" attempts");
481 pw.println();
482 pw.print(" ");
483 pw.print("in processor: ");
484 pw.print(currentProcessorName);
485 } else {
486 pw.print(" -no CrawlURI- ");
487 }
488 pw.println();
489
490 long now = System.currentTimeMillis();
491 long time = 0;
492
493 pw.print(" ");
494 if(lastFinishTime > lastStartTime) {
495
496
497 pw.print("WAITING for ");
498 time = now - lastFinishTime;
499 } else if(lastStartTime > 0) {
500
501 pw.print("ACTIVE for ");
502 time = now-lastStartTime;
503 }
504 pw.print(ArchiveUtils.formatMillisecondsToConventional(time));
505 pw.println();
506
507 pw.print(" ");
508 pw.print("step: ");
509 pw.print(step);
510 pw.print(" for ");
511 pw.print(ArchiveUtils.formatMillisecondsToConventional(System.currentTimeMillis()-atStepSince));
512 pw.println();
513
514 StackTraceElement[] ste = this.getStackTrace();
515 for(int i=0;i<ste.length;i++) {
516 pw.print(" ");
517 pw.print(ste[i].toString());
518 pw.println();
519 }
520 pw.print("]");
521 pw.println();
522
523 pw.flush();
524 }
525
526 /***
527 * @param w PrintWriter to write to.
528 */
529 public void singleLineReportTo(PrintWriter w)
530 {
531 w.print("#");
532 w.print(this.serialNumber);
533
534
535
536
537
538
539
540
541 CrawlURI c = currentCuri;
542 if(c != null) {
543 w.print(" ");
544 w.print(currentProcessorName);
545 w.print(" ");
546 w.print(c.toString());
547 w.print(" (");
548 w.print(c.getFetchAttempts());
549 w.print(") ");
550 } else {
551 w.print(" [no CrawlURI] ");
552 }
553
554 long now = System.currentTimeMillis();
555 long time = 0;
556
557 if(lastFinishTime > lastStartTime) {
558
559
560 w.print("WAITING for ");
561 time = now - lastFinishTime;
562 } else if(lastStartTime > 0) {
563
564 w.print("ACTIVE for ");
565 time = now-lastStartTime;
566 }
567 w.print(ArchiveUtils.formatMillisecondsToConventional(time));
568 w.print(" at ");
569 w.print(step);
570 w.print(" for ");
571 w.print(ArchiveUtils.formatMillisecondsToConventional(now-atStepSince));
572 w.print("\n");
573 w.flush();
574 }
575
576
577
578
579 public String singleLineLegend() {
580 return "#serialNumber processorName currentUri (fetchAttempts) threadState threadStep";
581 }
582
583
584
585
586 public String[] getReports() {
587
588 return new String[] {};
589 }
590
591 public void reportTo(PrintWriter writer) {
592 reportTo(null, writer);
593 }
594
595
596
597
598 public String singleLineReport() {
599 return ArchiveUtils.singleLineReport(this);
600 }
601
602 public void progressStatisticsLine(PrintWriter writer) {
603 writer.print(getController().getStatistics()
604 .getProgressStatisticsLine());
605 writer.print("\n");
606 }
607
608 public void progressStatisticsLegend(PrintWriter writer) {
609 writer.print(getController().getStatistics()
610 .progressStatisticsLegend());
611 writer.print("\n");
612 }
613
614 public String getCurrentProcessorName() {
615 return currentProcessorName;
616 }
617 }