1   /* CrawlMapper
2    * 
3    * Created on Sep 30, 2005
4    *
5    * Copyright (C) 2005 Internet Archive.
6    * 
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    * 
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   * 
14   * Heritrix is distributed in the hope that it will be useful, 
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   * 
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.crawler.processor;
24  
25  import java.io.BufferedOutputStream;
26  import java.io.File;
27  import java.io.FileNotFoundException;
28  import java.io.FileOutputStream;
29  import java.io.PrintWriter;
30  import java.util.HashMap;
31  import java.util.Iterator;
32  
33  import javax.management.AttributeNotFoundException;
34  
35  import org.archive.crawler.datamodel.CandidateURI;
36  import org.archive.crawler.datamodel.CrawlURI;
37  import org.archive.crawler.datamodel.FetchStatusCodes;
38  import org.archive.crawler.deciderules.DecideRule;
39  import org.archive.crawler.deciderules.DecideRuleSequence;
40  import org.archive.crawler.framework.Processor;
41  import org.archive.crawler.settings.SimpleType;
42  import org.archive.util.ArchiveUtils;
43  import org.archive.util.fingerprint.ArrayLongFPCache;
44  
45  import st.ata.util.FPGenerator;
46  
47  /***
48   * A simple crawl splitter/mapper, dividing up CandidateURIs/CrawlURIs
49   * between crawlers by diverting some range of URIs to local log files
50   * (which can then be imported to other crawlers). 
51   * 
52   * May operate on a CrawlURI (typically early in the processing chain) or
53   * its CandidateURI outlinks (late in the processing chain, after 
54   * LinksScoper), or both (if inserted and configured in both places). 
55   * 
56   * <p>Applies a map() method, supplied by a concrete subclass, to
57   * classKeys to map URIs to crawlers by name. 
58   *
59   * <p>One crawler name is distinguished as the 'local name'; URIs mapped to
60   * this name are not diverted, but continue to be processed normally.
61   *
62   * <p>If using the JMX importUris operation importing URLs dropped by
63   * a {@link CrawlMapper} instance, use <code>recoveryLog</code> style.
64   * 
65   * @author gojomo
66   * @version $Date: 2007-06-07 21:34:56 +0000 (Thu, 07 Jun 2007) $, $Revision: 5199 $
67   */
68  public abstract class CrawlMapper extends Processor implements FetchStatusCodes {
69      /***
70       * PrintWriter which remembers the File to which it writes. 
71       */
72      private class FilePrintWriter extends PrintWriter {
73          File file; 
74          public FilePrintWriter(File file) throws FileNotFoundException {
75              super(new BufferedOutputStream(new FileOutputStream(file)));
76              this.file = file; 
77          }
78          public File getFile() {
79              return file;
80          }
81      }
82      
83      /*** whether to map CrawlURI itself (if status nonpositive) */
84      public static final String ATTR_CHECK_URI = "check-uri";
85      public static final Boolean DEFAULT_CHECK_URI = Boolean.TRUE;
86      
87      /*** whether to map CrawlURI's outlinks (if CandidateURIs) */
88      public static final String ATTR_CHECK_OUTLINKS = "check-outlinks";
89      public static final Boolean DEFAULT_CHECK_OUTLINKS = Boolean.TRUE;
90  
91      /*** decide rules to determine if an outlink is subject to mapping */ 
92      public static final String ATTR_MAP_OUTLINK_DECIDE_RULES = "decide-rules";
93  
94      /*** name of local crawler (URIs mapped to here are not diverted) */
95      public static final String ATTR_LOCAL_NAME = "local-name";
96      public static final String DEFAULT_LOCAL_NAME = ".";
97      
98      /*** where to log diversions  */
99      public static final String ATTR_DIVERSION_DIR = "diversion-dir";
100     public static final String DEFAULT_DIVERSION_DIR = "diversions";
101 
102     /*** rotate logs when change occurs within this # of digits of timestamp  */
103     public static final String ATTR_ROTATION_DIGITS = "rotation-digits";
104     public static final Integer DEFAULT_ROTATION_DIGITS = new Integer(10); // hourly
105     
106     /***
107      * Mapping of target crawlers to logs (PrintWriters)
108      */
109     HashMap<String,PrintWriter> diversionLogs
110      = new HashMap<String,PrintWriter>();
111 
112     /***
113      * Truncated timestamp prefix for diversion logs; when
114      * current time doesn't match, it's time to close all
115      * current logs. 
116      */
117     String logGeneration = "";
118     
119     /*** name of the enclosing crawler (URIs mapped here stay put) */
120     protected String localName;
121     
122     protected ArrayLongFPCache cache;
123     
124     /***
125      * Constructor.
126      * @param name Name of this processor.
127      */
128     public CrawlMapper(String name, String description) {
129         super(name, description);
130         addElementToDefinition(new SimpleType(ATTR_LOCAL_NAME,
131             "Name of local crawler node; mappings to this name " +
132             "result in normal processing (no diversion).",
133             DEFAULT_LOCAL_NAME));
134         addElementToDefinition(new SimpleType(ATTR_DIVERSION_DIR,
135             "Directory to write diversion logs.",
136             DEFAULT_DIVERSION_DIR));
137         addElementToDefinition(new SimpleType(ATTR_CHECK_URI,
138             "Whether to apply the mapping to a URI being processed " +
139             "itself, for example early in processing (while its " +
140             "status is still 'unattempted').",
141             DEFAULT_CHECK_URI));
142         addElementToDefinition(new SimpleType(ATTR_CHECK_OUTLINKS,
143             "Whether to apply the mapping to discovered outlinks, " +
144             "for example after extraction has occurred. ",
145             DEFAULT_CHECK_OUTLINKS));
146         addElementToDefinition(new DecideRuleSequence(
147                 ATTR_MAP_OUTLINK_DECIDE_RULES));
148         addElementToDefinition(new SimpleType(ATTR_ROTATION_DIGITS,
149                 "Number of timestamp digits to use as prefix of log " +
150                 "names (grouping all diversions from that period in " +
151                 "a single log). Default is 10 (hourly log rotation).",
152                 DEFAULT_ROTATION_DIGITS));
153     }
154 
155 
156     protected void innerProcess(CrawlURI curi) {
157         String nowGeneration = 
158             ArchiveUtils.get14DigitDate().substring(
159                         0,
160                         ((Integer) getUncheckedAttribute(null,
161                                 ATTR_ROTATION_DIGITS)).intValue());
162         if(!nowGeneration.equals(logGeneration)) {
163             updateGeneration(nowGeneration);
164         }
165         
166         if (curi.getFetchStatus() <= 0 // unfetched/unsuccessful
167                 && ((Boolean) getUncheckedAttribute(null, ATTR_CHECK_URI))
168                         .booleanValue()) {
169             // apply mapping to the CrawlURI itself
170             String target = map(curi);
171             if(!localName.equals(target)) {
172                 // CrawlURI is mapped to somewhere other than here
173                 curi.setFetchStatus(S_BLOCKED_BY_CUSTOM_PROCESSOR);
174                 curi.addAnnotation("to:"+target);
175                 curi.skipToProcessorChain(getController().
176                         getPostprocessorChain());
177                 divertLog(curi,target);
178             } else {
179                 // localName means keep locally; do nothing
180             }
181         }
182         
183         if ((Boolean) getUncheckedAttribute(null, ATTR_CHECK_OUTLINKS)) {
184             // consider outlinks for mapping
185             Iterator<CandidateURI> iter = curi.getOutCandidates().iterator();
186             while(iter.hasNext()) {
187                 CandidateURI cauri = iter.next();
188                 if (decideToMapOutlink(cauri)) {
189                     // apply mapping to the CandidateURI
190                     String target = map(cauri);
191                     if(!localName.equals(target)) {
192                         // CandidateURI is mapped to somewhere other than here
193                         iter.remove();
194                         divertLog(cauri,target);
195                     } else {
196                         // localName means keep locally; do nothing
197                     }
198                 }
199             }
200         }
201     }
202     
203     protected boolean decideToMapOutlink(CandidateURI cauri) {
204         boolean rejected = getMapOutlinkDecideRule(cauri).decisionFor(cauri)
205                 .equals(DecideRule.REJECT);
206         return !rejected;
207     }
208 
209     protected DecideRule getMapOutlinkDecideRule(Object o) {
210         try {
211             return (DecideRule)getAttribute(o, ATTR_MAP_OUTLINK_DECIDE_RULES);
212         } catch (AttributeNotFoundException e) {
213             throw new RuntimeException(e);
214         }
215     }
216     
217     
218     /***
219      * Close and mark as finished all existing diversion logs, and
220      * arrange for new logs to use the new generation prefix.
221      * 
222      * @param nowGeneration new generation (timestamp prefix) to use
223      */
224     protected synchronized void updateGeneration(String nowGeneration) {
225         // all existing logs are of a previous generation
226         Iterator iter = diversionLogs.values().iterator();
227         while(iter.hasNext()) {
228             FilePrintWriter writer = (FilePrintWriter) iter.next();
229             writer.close();
230             writer.getFile().renameTo(
231                     new File(writer.getFile().getAbsolutePath()
232                             .replaceFirst("//.open$", ".divert")));
233         }
234         diversionLogs.clear();
235         logGeneration = nowGeneration;
236     }
237 
238     /***
239      * Look up the crawler node name to which the given CandidateURI 
240      * should be mapped. 
241      * 
242      * @param cauri CandidateURI to consider
243      * @return String node name which should handle URI
244      */
245     protected abstract String map(CandidateURI cauri);
246 
247     
248     /***
249      * Note the given CandidateURI in the appropriate diversion log. 
250      * 
251      * @param cauri CandidateURI to append to a diversion log
252      * @param target String node name (log name) to receive URI
253      */
254     protected synchronized void divertLog(CandidateURI cauri, String target) {
255         if(recentlySeen(cauri)) {
256             return;
257         }
258         PrintWriter diversionLog = getDiversionLog(target);
259         cauri.singleLineReportTo(diversionLog);
260         diversionLog.println();
261     }
262     
263     /***
264      * Consult the cache to determine if the given URI
265      * has been recently seen -- entering it if not. 
266      * 
267      * @param cauri CandidateURI to test
268      * @return true if URI was already in the cache; false otherwise 
269      */
270     private boolean recentlySeen(CandidateURI cauri) {
271         long fp = FPGenerator.std64.fp(cauri.toString());
272         return ! cache.add(fp);
273     }
274 
275     /***
276      * Get the diversion log for a given target crawler node node. 
277      * 
278      * @param target crawler node name of requested log
279      * @return PrintWriter open on an appropriately-named 
280      * log file
281      */
282     protected PrintWriter getDiversionLog(String target) {
283         FilePrintWriter writer = (FilePrintWriter) diversionLogs.get(target);
284         if(writer == null) {
285             String divertDirPath = (String) getUncheckedAttribute(null,ATTR_DIVERSION_DIR);
286             File divertDir = new File(divertDirPath);
287             if (!divertDir.isAbsolute()) {
288                 divertDir = new File(getSettingsHandler().getOrder()
289                         .getController().getDisk(), divertDirPath);
290             }
291             divertDir.mkdirs();
292             File divertLog = 
293                 new File(divertDir,
294                          logGeneration+"-"+localName+"-to-"+target+".open");
295             try {
296                 writer = new FilePrintWriter(divertLog);
297             } catch (FileNotFoundException e) {
298                 // TODO Auto-generated catch block
299                 e.printStackTrace();
300                 throw new RuntimeException(e);
301             }
302             diversionLogs.put(target,writer);
303         } 
304         return writer;
305     }
306 
307     protected void initialTasks() {
308         super.initialTasks();
309         localName = (String) getUncheckedAttribute(null, ATTR_LOCAL_NAME);
310         cache = new ArrayLongFPCache();
311     }
312 }