View Javadoc

1   /* LexicalCrawlMapper
2    * 
3    * Created on Sep 30, 2005
4    *
5    * Copyright (C) 2005 Internet Archive.
6    * 
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    * 
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   * 
14   * Heritrix is distributed in the hope that it will be useful, 
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   * 
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.crawler.processor;
24  
25  import java.io.BufferedReader;
26  import java.io.File;
27  import java.io.FileReader;
28  import java.io.IOException;
29  import java.io.InputStreamReader;
30  import java.io.Reader;
31  import java.net.URL;
32  import java.net.URLConnection;
33  import java.util.Iterator;
34  import java.util.SortedMap;
35  import java.util.TreeMap;
36  
37  import org.archive.crawler.datamodel.CandidateURI;
38  import org.archive.crawler.settings.SimpleType;
39  import org.archive.util.iterator.LineReadingIterator;
40  import org.archive.util.iterator.RegexpLineIterator;
41  
42  
43  /***
44   * A simple crawl splitter/mapper, dividing up CandidateURIs/CrawlURIs
45   * between crawlers by diverting some range of URIs to local log files
46   * (which can then be imported to other crawlers). 
47   * 
48   * May operate on a CrawlURI (typically early in the processing chain) or
49   * its CandidateURI outlinks (late in the processing chain, after 
50   * LinksScoper), or both (if inserted and configured in both places). 
51   * 
52   * <p>Uses lexical comparisons of classKeys to map URIs to crawlers. The
53   * 'map' is specified via either a local or HTTP-fetchable file. Each
54   * line of this file should contain two space-separated tokens, the
55   * first a key and the second a crawler node name (which should be
56   * legal as part of a filename). All URIs will be mapped to the crawler
57   * node name associated with the nearest mapping key equal or subsequent 
58   * to the URI's own classKey. If there are no mapping keys equal or 
59   * after the classKey, the mapping 'wraps around' to the first mapping key.
60   * 
61   * <p>One crawler name is distinguished as the 'local name'; URIs mapped to
62   * this name are not diverted, but continue to be processed normally.
63   * 
64   * <p>For example, assume a SurtAuthorityQueueAssignmentPolicy and
65   * a simple mapping file:
66   * 
67   * <pre>
68   *  d crawlerA
69   *  ~ crawlerB
70   * </pre>
71   * <p>All URIs with "com," classKeys will find the 'd' key as the nearest
72   * subsequent mapping key, and thus be mapped to 'crawlerA'. If that's
73   * the 'local name', the URIs will be processed normally; otherwise, the
74   * URI will be written to a diversion log aimed for 'crawlerA'. 
75   * 
76   * <p>If using the JMX importUris operation importing URLs dropped by
77   * a {@link LexicalCrawlMapper} instance, use <code>recoveryLog</code> style.
78   * 
79   * @author gojomo
80   * @version $Date: 2006-09-26 20:38:48 +0000 (Tue, 26 Sep 2006) $, $Revision: 4667 $
81   */
82  public class LexicalCrawlMapper extends CrawlMapper {
83      private static final long serialVersionUID = 1L;
84      
85      /*** where to load map from */
86      public static final String ATTR_MAP_SOURCE = "map-source";
87      public static final String DEFAULT_MAP_SOURCE = "";
88      
89      /***
90       * Mapping of classKey ranges (as represented by their start) to 
91       * crawlers (by abstract name/filename)
92       */
93      TreeMap<String, String> map = new TreeMap<String, String>();
94  
95      /***
96       * Constructor.
97       * @param name Name of this processor.
98       */
99      public LexicalCrawlMapper(String name) {
100         super(name, "LexicalCrawlMapper. Maps URIs to a named " +
101                 "crawler by a lexical comparison of the URI's " +
102                 "classKey to a supplied ranges map.");
103         addElementToDefinition(new SimpleType(ATTR_MAP_SOURCE,
104             "Path (or HTTP URL) to map specification file. Each line " +
105             "should include 2 whitespace-separated tokens: the first a " +
106             "key indicating the end of a range, the second the crawler " +
107             "node to which URIs in the key range should be mapped.",
108             DEFAULT_MAP_SOURCE));
109     }
110 
111     /***
112      * Look up the crawler node name to which the given CandidateURI 
113      * should be mapped. 
114      * 
115      * @param cauri CandidateURI to consider
116      * @return String node name which should handle URI
117      */
118     protected String map(CandidateURI cauri) {
119         // get classKey, via frontier to generate if necessary
120         String classKey = getController().getFrontier().getClassKey(cauri);
121         SortedMap tail = map.tailMap(classKey);
122         if(tail.isEmpty()) {
123             // wraparound
124             tail = map;
125         }
126         // target node is value of nearest subsequent key
127         return (String) tail.get(tail.firstKey());
128     }
129 
130     protected void initialTasks() {
131         super.initialTasks();
132         try {
133             loadMap();
134         } catch (IOException e) {
135             e.printStackTrace();
136             throw new RuntimeException(e);
137         }
138     }
139 
140     /***
141      * Retrieve and parse the mapping specification from a local path or
142      * HTTP URL. 
143      * 
144      * @throws IOException
145      */
146     protected void loadMap() throws IOException {
147         map.clear();
148         String mapSource = (String) getUncheckedAttribute(null,ATTR_MAP_SOURCE);
149         Reader reader = null;
150         if(!mapSource.startsWith("http://")) {
151             // file-based source
152             File source = new File(mapSource);
153             if (!source.isAbsolute()) {
154                 source = new File(getSettingsHandler().getOrder()
155                         .getController().getDisk(), mapSource);
156             }
157             reader = new FileReader(source);
158         } else {
159             URLConnection conn = (new URL(mapSource)).openConnection();
160             reader = new InputStreamReader(conn.getInputStream());
161         }
162         reader = new BufferedReader(reader);
163         Iterator iter = 
164             new RegexpLineIterator(
165                     new LineReadingIterator((BufferedReader) reader),
166                     RegexpLineIterator.COMMENT_LINE,
167                     RegexpLineIterator.TRIMMED_ENTRY_TRAILING_COMMENT,
168                     RegexpLineIterator.ENTRY);
169         while (iter.hasNext()) {
170             String[] entry = ((String) iter.next()).split("//s+");
171             map.put(entry[0],entry[1]);
172         }
173         reader.close();
174     }
175 }