1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.crawler.processor;
24
25 import java.util.regex.Matcher;
26
27 import org.archive.crawler.datamodel.CandidateURI;
28 import org.archive.crawler.settings.SimpleType;
29 import org.archive.net.PublicSuffixes;
30 import org.archive.util.TextUtils;
31
32 import st.ata.util.FPGenerator;
33
34 /***
35 * Maps URIs to one of N crawler names by applying a hash to the
36 * URI's (possibly-transformed) classKey.
37 *
38 * @author gojomo
39 * @version $Date: 2007-06-19 02:00:24 +0000 (Tue, 19 Jun 2007) $, $Revision: 5215 $
40 */
41 public class HashCrawlMapper extends CrawlMapper {
42 private static final long serialVersionUID = 1L;
43
44 /*** count of crawlers */
45 public static final String ATTR_CRAWLER_COUNT = "crawler-count";
46 public static final Long DEFAULT_CRAWLER_COUNT = new Long(1);
47
48 /*** ruse publicsuffixes pattern for reducing classKey? */
49 public static final String ATTR_USE_PUBLICSUFFIX_REDUCE =
50 "use_publicsuffix_reduction";
51 public static final Boolean DEFAULT_USE_PUBLICSUFFIX_REDUCE = true;
52
53 /*** regex pattern for reducing classKey */
54 public static final String ATTR_REDUCE_PATTERN = "reduce-prefix-pattern";
55 public static final String DEFAULT_REDUCE_PATTERN = "";
56
57 long bucketCount = 1;
58 String reducePattern = null;
59
60 /***
61 * Constructor.
62 * @param name Name of this processor.
63 */
64 public HashCrawlMapper(String name) {
65 super(name, "HashCrawlMapper. Maps URIs to a numerically named " +
66 "crawler by hashing the URI's (possibly transfored) " +
67 "classKey to one of the specified number of buckets.");
68 addElementToDefinition(new SimpleType(ATTR_CRAWLER_COUNT,
69 "Number of crawlers among which to split up the URIs. " +
70 "Their names are assumed to be 0..N-1.",
71 DEFAULT_CRAWLER_COUNT));
72 addElementToDefinition(new SimpleType(ATTR_USE_PUBLICSUFFIX_REDUCE,
73 "Whether to use a built-in regular expression, built from " +
74 "the 'public suffix' list at publicsuffix.org, for " +
75 "reducing classKeys to mapping keys. If true, the default, " +
76 "then the '"+ATTR_REDUCE_PATTERN+"' setting is ignored.",
77 DEFAULT_USE_PUBLICSUFFIX_REDUCE));
78 addElementToDefinition(new SimpleType(ATTR_REDUCE_PATTERN,
79 "A regex pattern to apply to the classKey, using " +
80 "the first match as the mapping key. Ignored if '"+
81 ATTR_USE_PUBLICSUFFIX_REDUCE+"' is set true. If empty " +
82 "(the default), use the full classKey.",
83 DEFAULT_REDUCE_PATTERN));
84 }
85
86 /***
87 * Look up the crawler node name to which the given CandidateURI
88 * should be mapped.
89 *
90 * @param cauri CandidateURI to consider
91 * @return String node name which should handle URI
92 */
93 protected String map(CandidateURI cauri) {
94
95 String key = getController().getFrontier().getClassKey(cauri);
96 return mapString(key, reducePattern, bucketCount);
97 }
98
99 protected void initialTasks() {
100 super.initialTasks();
101 bucketCount = (Long) getUncheckedAttribute(null,ATTR_CRAWLER_COUNT);
102 kickUpdate();
103 }
104
105 @Override
106 public void kickUpdate() {
107 super.kickUpdate();
108 if ((Boolean) getUncheckedAttribute(null, ATTR_USE_PUBLICSUFFIX_REDUCE)) {
109 reducePattern = PublicSuffixes.getTopmostAssignedSurtPrefixRegex();
110 } else {
111 reducePattern = (String) getUncheckedAttribute(null,
112 ATTR_REDUCE_PATTERN);
113 }
114 }
115
116 public static String mapString(String key, String reducePattern, long bucketCount) {
117 if(reducePattern!=null && reducePattern.length()>0) {
118 Matcher matcher = TextUtils.getMatcher(reducePattern,key);
119 if(matcher.find()) {
120 key = matcher.group();
121 }
122 TextUtils.recycleMatcher(matcher);
123 }
124 long fp = FPGenerator.std64.fp(key);
125 long bucket = fp % bucketCount;
126 return Long.toString(bucket >= 0 ? bucket : -bucket);
127 }
128 }