1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.crawler.processor;
24
25 import java.io.BufferedOutputStream;
26 import java.io.File;
27 import java.io.FileNotFoundException;
28 import java.io.FileOutputStream;
29 import java.io.PrintWriter;
30 import java.util.HashMap;
31 import java.util.Iterator;
32
33 import javax.management.AttributeNotFoundException;
34
35 import org.archive.crawler.datamodel.CandidateURI;
36 import org.archive.crawler.datamodel.CrawlURI;
37 import org.archive.crawler.datamodel.FetchStatusCodes;
38 import org.archive.crawler.deciderules.DecideRule;
39 import org.archive.crawler.deciderules.DecideRuleSequence;
40 import org.archive.crawler.framework.Processor;
41 import org.archive.crawler.settings.SimpleType;
42 import org.archive.util.ArchiveUtils;
43 import org.archive.util.fingerprint.ArrayLongFPCache;
44
45 import st.ata.util.FPGenerator;
46
47 /***
48 * A simple crawl splitter/mapper, dividing up CandidateURIs/CrawlURIs
49 * between crawlers by diverting some range of URIs to local log files
50 * (which can then be imported to other crawlers).
51 *
52 * May operate on a CrawlURI (typically early in the processing chain) or
53 * its CandidateURI outlinks (late in the processing chain, after
54 * LinksScoper), or both (if inserted and configured in both places).
55 *
56 * <p>Applies a map() method, supplied by a concrete subclass, to
57 * classKeys to map URIs to crawlers by name.
58 *
59 * <p>One crawler name is distinguished as the 'local name'; URIs mapped to
60 * this name are not diverted, but continue to be processed normally.
61 *
62 * <p>If using the JMX importUris operation importing URLs dropped by
63 * a {@link CrawlMapper} instance, use <code>recoveryLog</code> style.
64 *
65 * @author gojomo
66 * @version $Date: 2007-06-07 21:34:56 +0000 (Thu, 07 Jun 2007) $, $Revision: 5199 $
67 */
68 public abstract class CrawlMapper extends Processor implements FetchStatusCodes {
69 /***
70 * PrintWriter which remembers the File to which it writes.
71 */
72 private class FilePrintWriter extends PrintWriter {
73 File file;
74 public FilePrintWriter(File file) throws FileNotFoundException {
75 super(new BufferedOutputStream(new FileOutputStream(file)));
76 this.file = file;
77 }
78 public File getFile() {
79 return file;
80 }
81 }
82
83 /*** whether to map CrawlURI itself (if status nonpositive) */
84 public static final String ATTR_CHECK_URI = "check-uri";
85 public static final Boolean DEFAULT_CHECK_URI = Boolean.TRUE;
86
87 /*** whether to map CrawlURI's outlinks (if CandidateURIs) */
88 public static final String ATTR_CHECK_OUTLINKS = "check-outlinks";
89 public static final Boolean DEFAULT_CHECK_OUTLINKS = Boolean.TRUE;
90
91 /*** decide rules to determine if an outlink is subject to mapping */
92 public static final String ATTR_MAP_OUTLINK_DECIDE_RULES = "decide-rules";
93
94 /*** name of local crawler (URIs mapped to here are not diverted) */
95 public static final String ATTR_LOCAL_NAME = "local-name";
96 public static final String DEFAULT_LOCAL_NAME = ".";
97
98 /*** where to log diversions */
99 public static final String ATTR_DIVERSION_DIR = "diversion-dir";
100 public static final String DEFAULT_DIVERSION_DIR = "diversions";
101
102 /*** rotate logs when change occurs within this # of digits of timestamp */
103 public static final String ATTR_ROTATION_DIGITS = "rotation-digits";
104 public static final Integer DEFAULT_ROTATION_DIGITS = new Integer(10);
105
106 /***
107 * Mapping of target crawlers to logs (PrintWriters)
108 */
109 HashMap<String,PrintWriter> diversionLogs
110 = new HashMap<String,PrintWriter>();
111
112 /***
113 * Truncated timestamp prefix for diversion logs; when
114 * current time doesn't match, it's time to close all
115 * current logs.
116 */
117 String logGeneration = "";
118
119 /*** name of the enclosing crawler (URIs mapped here stay put) */
120 protected String localName;
121
122 protected ArrayLongFPCache cache;
123
124 /***
125 * Constructor.
126 * @param name Name of this processor.
127 */
128 public CrawlMapper(String name, String description) {
129 super(name, description);
130 addElementToDefinition(new SimpleType(ATTR_LOCAL_NAME,
131 "Name of local crawler node; mappings to this name " +
132 "result in normal processing (no diversion).",
133 DEFAULT_LOCAL_NAME));
134 addElementToDefinition(new SimpleType(ATTR_DIVERSION_DIR,
135 "Directory to write diversion logs.",
136 DEFAULT_DIVERSION_DIR));
137 addElementToDefinition(new SimpleType(ATTR_CHECK_URI,
138 "Whether to apply the mapping to a URI being processed " +
139 "itself, for example early in processing (while its " +
140 "status is still 'unattempted').",
141 DEFAULT_CHECK_URI));
142 addElementToDefinition(new SimpleType(ATTR_CHECK_OUTLINKS,
143 "Whether to apply the mapping to discovered outlinks, " +
144 "for example after extraction has occurred. ",
145 DEFAULT_CHECK_OUTLINKS));
146 addElementToDefinition(new DecideRuleSequence(
147 ATTR_MAP_OUTLINK_DECIDE_RULES));
148 addElementToDefinition(new SimpleType(ATTR_ROTATION_DIGITS,
149 "Number of timestamp digits to use as prefix of log " +
150 "names (grouping all diversions from that period in " +
151 "a single log). Default is 10 (hourly log rotation).",
152 DEFAULT_ROTATION_DIGITS));
153 }
154
155
156 protected void innerProcess(CrawlURI curi) {
157 String nowGeneration =
158 ArchiveUtils.get14DigitDate().substring(
159 0,
160 ((Integer) getUncheckedAttribute(null,
161 ATTR_ROTATION_DIGITS)).intValue());
162 if(!nowGeneration.equals(logGeneration)) {
163 updateGeneration(nowGeneration);
164 }
165
166 if (curi.getFetchStatus() <= 0
167 && ((Boolean) getUncheckedAttribute(null, ATTR_CHECK_URI))
168 .booleanValue()) {
169
170 String target = map(curi);
171 if(!localName.equals(target)) {
172
173 curi.setFetchStatus(S_BLOCKED_BY_CUSTOM_PROCESSOR);
174 curi.addAnnotation("to:"+target);
175 curi.skipToProcessorChain(getController().
176 getPostprocessorChain());
177 divertLog(curi,target);
178 } else {
179
180 }
181 }
182
183 if ((Boolean) getUncheckedAttribute(null, ATTR_CHECK_OUTLINKS)) {
184
185 Iterator<CandidateURI> iter = curi.getOutCandidates().iterator();
186 while(iter.hasNext()) {
187 CandidateURI cauri = iter.next();
188 if (decideToMapOutlink(cauri)) {
189
190 String target = map(cauri);
191 if(!localName.equals(target)) {
192
193 iter.remove();
194 divertLog(cauri,target);
195 } else {
196
197 }
198 }
199 }
200 }
201 }
202
203 protected boolean decideToMapOutlink(CandidateURI cauri) {
204 boolean rejected = getMapOutlinkDecideRule(cauri).decisionFor(cauri)
205 .equals(DecideRule.REJECT);
206 return !rejected;
207 }
208
209 protected DecideRule getMapOutlinkDecideRule(Object o) {
210 try {
211 return (DecideRule)getAttribute(o, ATTR_MAP_OUTLINK_DECIDE_RULES);
212 } catch (AttributeNotFoundException e) {
213 throw new RuntimeException(e);
214 }
215 }
216
217
218 /***
219 * Close and mark as finished all existing diversion logs, and
220 * arrange for new logs to use the new generation prefix.
221 *
222 * @param nowGeneration new generation (timestamp prefix) to use
223 */
224 protected synchronized void updateGeneration(String nowGeneration) {
225
226 Iterator iter = diversionLogs.values().iterator();
227 while(iter.hasNext()) {
228 FilePrintWriter writer = (FilePrintWriter) iter.next();
229 writer.close();
230 writer.getFile().renameTo(
231 new File(writer.getFile().getAbsolutePath()
232 .replaceFirst("//.open$", ".divert")));
233 }
234 diversionLogs.clear();
235 logGeneration = nowGeneration;
236 }
237
238 /***
239 * Look up the crawler node name to which the given CandidateURI
240 * should be mapped.
241 *
242 * @param cauri CandidateURI to consider
243 * @return String node name which should handle URI
244 */
245 protected abstract String map(CandidateURI cauri);
246
247
248 /***
249 * Note the given CandidateURI in the appropriate diversion log.
250 *
251 * @param cauri CandidateURI to append to a diversion log
252 * @param target String node name (log name) to receive URI
253 */
254 protected synchronized void divertLog(CandidateURI cauri, String target) {
255 if(recentlySeen(cauri)) {
256 return;
257 }
258 PrintWriter diversionLog = getDiversionLog(target);
259 cauri.singleLineReportTo(diversionLog);
260 diversionLog.println();
261 }
262
263 /***
264 * Consult the cache to determine if the given URI
265 * has been recently seen -- entering it if not.
266 *
267 * @param cauri CandidateURI to test
268 * @return true if URI was already in the cache; false otherwise
269 */
270 private boolean recentlySeen(CandidateURI cauri) {
271 long fp = FPGenerator.std64.fp(cauri.toString());
272 return ! cache.add(fp);
273 }
274
275 /***
276 * Get the diversion log for a given target crawler node node.
277 *
278 * @param target crawler node name of requested log
279 * @return PrintWriter open on an appropriately-named
280 * log file
281 */
282 protected PrintWriter getDiversionLog(String target) {
283 FilePrintWriter writer = (FilePrintWriter) diversionLogs.get(target);
284 if(writer == null) {
285 String divertDirPath = (String) getUncheckedAttribute(null,ATTR_DIVERSION_DIR);
286 File divertDir = new File(divertDirPath);
287 if (!divertDir.isAbsolute()) {
288 divertDir = new File(getSettingsHandler().getOrder()
289 .getController().getDisk(), divertDirPath);
290 }
291 divertDir.mkdirs();
292 File divertLog =
293 new File(divertDir,
294 logGeneration+"-"+localName+"-to-"+target+".open");
295 try {
296 writer = new FilePrintWriter(divertLog);
297 } catch (FileNotFoundException e) {
298
299 e.printStackTrace();
300 throw new RuntimeException(e);
301 }
302 diversionLogs.put(target,writer);
303 }
304 return writer;
305 }
306
307 protected void initialTasks() {
308 super.initialTasks();
309 localName = (String) getUncheckedAttribute(null, ATTR_LOCAL_NAME);
310 cache = new ArrayLongFPCache();
311 }
312 }