package org.archive.crawler.processor.recrawl;

import java.io.File;
import java.io.IOException;

import org.apache.commons.codec.binary.Base64;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.event.CrawlStatusListener;
import org.archive.crawler.io.CrawlerJournal;
import org.archive.crawler.settings.SimpleType;
import org.archive.util.FileUtils;
import org.archive.util.IoUtils;

/**
 * Log CrawlURI attributes from the latest fetch for consultation by a later
 * recrawl. The log must be imported into an alternate data structure in
 * order to be consulted.
 * 
 * @author gojomo
 * @version $Date: 2006-09-25 20:19:54 +0000 (Mon, 25 Sep 2006) $, $Revision: 4654 $
 */
public class PersistLogProcessor extends PersistProcessor implements CrawlStatusListener {
    private static final long serialVersionUID = 1678691994065439346L;

    /** Journal to which one line per stored URI is written. */
    protected CrawlerJournal log;

    /** Setting for the log filename. */
    public static final String ATTR_LOG_FILENAME = "log-filename";
    /** Default log filename. */
    public static final String DEFAULT_LOG_FILENAME = "persistlog.txtser.gz";
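
    /*
     * Naming note (an inference, not stated in this file): "txtser.gz"
     * presumably reflects the on-disk format -- text lines carrying
     * serialized objects, gzip-compressed by the underlying CrawlerJournal.
     */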

    /**
     * Usual constructor.
     * 
     * @param name Name for this processor
     */
    public PersistLogProcessor(String name) {
        super(name, "PersistLogProcessor. Logs CrawlURI attributes " +
                "from latest fetch for consultation by a later recrawl.");

        addElementToDefinition(new SimpleType(ATTR_LOG_FILENAME,
                "Filename to which to log URI persistence information. " +
                "Interpreted relative to job logs directory. " +
                "Default is 'persistlog.txtser.gz'. ",
                DEFAULT_LOG_FILENAME));
    }
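
    /*
     * Resolution example (hypothetical paths): with the default setting, a
     * job whose logs directory is /jobs/job-1/logs writes to
     * /jobs/job-1/logs/persistlog.txtser.gz, while an absolute setting such
     * as /tmp/persistlog.txtser.gz is used as-is; see the
     * FileUtils.maybeRelative() call in initialTasks() below.
     */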

    @Override
    protected void initialTasks() {
        // Register for crawl-status events so the journal can participate
        // in checkpoints.
        getController().addCrawlStatusListener(this);
        try {
            File logFile = FileUtils.maybeRelative(getController().getLogsDir(),
                    (String) getUncheckedAttribute(null, ATTR_LOG_FILENAME));
            log = new CrawlerJournal(logFile);
        } catch (IOException e) {
            // Inability to create the journal is fatal to this processor.
            throw new RuntimeException(e);
        }
    }

    @Override
    protected void finalTasks() {
        log.close();
    }

    @Override
    protected void innerProcess(CrawlURI curi) {
        if (shouldStore(curi)) {
            // One line per URI: the persist key, a space, then the URI's
            // persistent AList serialized and Base64-encoded.
            log.writeLine(persistKeyFor(curi), " ", new String(
                    Base64.encodeBase64(
                            IoUtils.serializeToByteArray(curi.getPersistentAList()))));
        }
    }
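
    /*
     * For orientation, a minimal sketch of how one logged line could be
     * decoded again at import time. The reader-side utility is an
     * assumption here (a deserialize counterpart to
     * IoUtils.serializeToByteArray), not something this class provides:
     *
     *     String[] parts = line.split(" ", 2);
     *     byte[] serialized = Base64.decodeBase64(parts[1].getBytes());
     *     Object alist = IoUtils.deserializeFromByteArray(serialized);
     */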

    public void crawlCheckpoint(File checkpointDir) throws Exception {
        // Delegate to the journal, which snapshots itself into the
        // checkpoint directory.
        log.checkpoint(checkpointDir);
    }

    public void crawlEnded(String sExitMessage) {
        // Not needed.
    }

    public void crawlEnding(String sExitMessage) {
        // Not needed.
    }

    public void crawlPaused(String statusMessage) {
        // Not needed.
    }

    public void crawlPausing(String statusMessage) {
        // Not needed.
    }

    public void crawlResuming(String statusMessage) {
        // Not needed.
    }

    public void crawlStarted(String message) {
        // Not needed.
    }
}