1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.crawler.processor;
24
25 import java.io.BufferedReader;
26 import java.io.File;
27 import java.io.FileReader;
28 import java.io.IOException;
29 import java.io.InputStreamReader;
30 import java.io.Reader;
31 import java.net.URL;
32 import java.net.URLConnection;
33 import java.util.Iterator;
34 import java.util.SortedMap;
35 import java.util.TreeMap;
36
37 import org.archive.crawler.datamodel.CandidateURI;
38 import org.archive.crawler.settings.SimpleType;
39 import org.archive.util.iterator.LineReadingIterator;
40 import org.archive.util.iterator.RegexpLineIterator;
41
42
43 /***
44 * A simple crawl splitter/mapper, dividing up CandidateURIs/CrawlURIs
45 * between crawlers by diverting some range of URIs to local log files
46 * (which can then be imported to other crawlers).
47 *
48 * May operate on a CrawlURI (typically early in the processing chain) or
49 * its CandidateURI outlinks (late in the processing chain, after
50 * LinksScoper), or both (if inserted and configured in both places).
51 *
52 * <p>Uses lexical comparisons of classKeys to map URIs to crawlers. The
53 * 'map' is specified via either a local or HTTP-fetchable file. Each
54 * line of this file should contain two space-separated tokens, the
55 * first a key and the second a crawler node name (which should be
56 * legal as part of a filename). All URIs will be mapped to the crawler
57 * node name associated with the nearest mapping key equal or subsequent
58 * to the URI's own classKey. If there are no mapping keys equal or
59 * after the classKey, the mapping 'wraps around' to the first mapping key.
60 *
61 * <p>One crawler name is distinguished as the 'local name'; URIs mapped to
62 * this name are not diverted, but continue to be processed normally.
63 *
64 * <p>For example, assume a SurtAuthorityQueueAssignmentPolicy and
65 * a simple mapping file:
66 *
67 * <pre>
68 * d crawlerA
69 * ~ crawlerB
70 * </pre>
71 * <p>All URIs with "com," classKeys will find the 'd' key as the nearest
72 * subsequent mapping key, and thus be mapped to 'crawlerA'. If that's
73 * the 'local name', the URIs will be processed normally; otherwise, the
74 * URI will be written to a diversion log aimed for 'crawlerA'.
75 *
76 * <p>If using the JMX importUris operation importing URLs dropped by
77 * a {@link LexicalCrawlMapper} instance, use <code>recoveryLog</code> style.
78 *
79 * @author gojomo
80 * @version $Date: 2006-09-26 20:38:48 +0000 (Tue, 26 Sep 2006) $, $Revision: 4667 $
81 */
82 public class LexicalCrawlMapper extends CrawlMapper {
83 private static final long serialVersionUID = 1L;
84
85 /*** where to load map from */
86 public static final String ATTR_MAP_SOURCE = "map-source";
87 public static final String DEFAULT_MAP_SOURCE = "";
88
89 /***
90 * Mapping of classKey ranges (as represented by their start) to
91 * crawlers (by abstract name/filename)
92 */
93 TreeMap<String, String> map = new TreeMap<String, String>();
94
95 /***
96 * Constructor.
97 * @param name Name of this processor.
98 */
99 public LexicalCrawlMapper(String name) {
100 super(name, "LexicalCrawlMapper. Maps URIs to a named " +
101 "crawler by a lexical comparison of the URI's " +
102 "classKey to a supplied ranges map.");
103 addElementToDefinition(new SimpleType(ATTR_MAP_SOURCE,
104 "Path (or HTTP URL) to map specification file. Each line " +
105 "should include 2 whitespace-separated tokens: the first a " +
106 "key indicating the end of a range, the second the crawler " +
107 "node to which URIs in the key range should be mapped.",
108 DEFAULT_MAP_SOURCE));
109 }
110
111 /***
112 * Look up the crawler node name to which the given CandidateURI
113 * should be mapped.
114 *
115 * @param cauri CandidateURI to consider
116 * @return String node name which should handle URI
117 */
118 protected String map(CandidateURI cauri) {
119
120 String classKey = getController().getFrontier().getClassKey(cauri);
121 SortedMap tail = map.tailMap(classKey);
122 if(tail.isEmpty()) {
123
124 tail = map;
125 }
126
127 return (String) tail.get(tail.firstKey());
128 }
129
130 protected void initialTasks() {
131 super.initialTasks();
132 try {
133 loadMap();
134 } catch (IOException e) {
135 e.printStackTrace();
136 throw new RuntimeException(e);
137 }
138 }
139
140 /***
141 * Retrieve and parse the mapping specification from a local path or
142 * HTTP URL.
143 *
144 * @throws IOException
145 */
146 protected void loadMap() throws IOException {
147 map.clear();
148 String mapSource = (String) getUncheckedAttribute(null,ATTR_MAP_SOURCE);
149 Reader reader = null;
150 if(!mapSource.startsWith("http://")) {
151
152 File source = new File(mapSource);
153 if (!source.isAbsolute()) {
154 source = new File(getSettingsHandler().getOrder()
155 .getController().getDisk(), mapSource);
156 }
157 reader = new FileReader(source);
158 } else {
159 URLConnection conn = (new URL(mapSource)).openConnection();
160 reader = new InputStreamReader(conn.getInputStream());
161 }
162 reader = new BufferedReader(reader);
163 Iterator iter =
164 new RegexpLineIterator(
165 new LineReadingIterator((BufferedReader) reader),
166 RegexpLineIterator.COMMENT_LINE,
167 RegexpLineIterator.TRIMMED_ENTRY_TRAILING_COMMENT,
168 RegexpLineIterator.ENTRY);
169 while (iter.hasNext()) {
170 String[] entry = ((String) iter.next()).split("//s+");
171 map.put(entry[0],entry[1]);
172 }
173 reader.close();
174 }
175 }