View Javadoc

1   /* HashCrawlMapper
2    * 
3    * Created on Sep 30, 2005
4    *
5    * Copyright (C) 2005 Internet Archive.
6    * 
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    * 
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   * 
14   * Heritrix is distributed in the hope that it will be useful, 
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   * 
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.crawler.processor;
24  
25  import java.util.regex.Matcher;
26  
27  import org.archive.crawler.datamodel.CandidateURI;
28  import org.archive.crawler.settings.SimpleType;
29  import org.archive.net.PublicSuffixes;
30  import org.archive.util.TextUtils;
31  
32  import st.ata.util.FPGenerator;
33  
34  /***
35   * Maps URIs to one of N crawler names by applying a hash to the
36   * URI's (possibly-transformed) classKey. 
37   * 
38   * @author gojomo
39   * @version $Date: 2007-06-19 02:00:24 +0000 (Tue, 19 Jun 2007) $, $Revision: 5215 $
40   */
41  public class HashCrawlMapper extends CrawlMapper {
42      private static final long serialVersionUID = 1L;
43      
44      /*** count of crawlers */
45      public static final String ATTR_CRAWLER_COUNT = "crawler-count";
46      public static final Long DEFAULT_CRAWLER_COUNT = new Long(1);
47  
48      /*** ruse publicsuffixes pattern for reducing classKey? */
49      public static final String ATTR_USE_PUBLICSUFFIX_REDUCE = 
50          "use_publicsuffix_reduction";
51      public static final Boolean DEFAULT_USE_PUBLICSUFFIX_REDUCE = true;
52      
53      /*** regex pattern for reducing classKey */
54      public static final String ATTR_REDUCE_PATTERN = "reduce-prefix-pattern";
55      public static final String DEFAULT_REDUCE_PATTERN = "";
56   
57      long bucketCount = 1;
58      String reducePattern = null;
59   
60      /***
61       * Constructor.
62       * @param name Name of this processor.
63       */
64      public HashCrawlMapper(String name) {
65          super(name, "HashCrawlMapper. Maps URIs to a numerically named " +
66                  "crawler by hashing the URI's (possibly transfored) " +
67                  "classKey to one of the specified number of buckets.");
68          addElementToDefinition(new SimpleType(ATTR_CRAWLER_COUNT,
69              "Number of crawlers among which to split up the URIs. " +
70              "Their names are assumed to be 0..N-1.",
71              DEFAULT_CRAWLER_COUNT));
72          addElementToDefinition(new SimpleType(ATTR_USE_PUBLICSUFFIX_REDUCE,
73                  "Whether to use a built-in regular expression, built from " +
74                  "the 'public suffix' list at publicsuffix.org, for " +
75                  "reducing classKeys to mapping keys. If true, the default, " +
76                  "then the '"+ATTR_REDUCE_PATTERN+"' setting is ignored.",
77                  DEFAULT_USE_PUBLICSUFFIX_REDUCE));
78          addElementToDefinition(new SimpleType(ATTR_REDUCE_PATTERN,
79                  "A regex pattern to apply to the classKey, using " +
80                  "the first match as the mapping key. Ignored if '"+
81                  ATTR_USE_PUBLICSUFFIX_REDUCE+"' is set true. If empty " +
82                  "(the default), use the full classKey.",
83                  DEFAULT_REDUCE_PATTERN));
84      }
85  
86      /***
87       * Look up the crawler node name to which the given CandidateURI 
88       * should be mapped. 
89       * 
90       * @param cauri CandidateURI to consider
91       * @return String node name which should handle URI
92       */
93      protected String map(CandidateURI cauri) {
94          // get classKey, via frontier to generate if necessary
95          String key = getController().getFrontier().getClassKey(cauri);
96          return mapString(key, reducePattern, bucketCount); 
97      }
98  
99      protected void initialTasks() {
100         super.initialTasks();
101         bucketCount = (Long) getUncheckedAttribute(null,ATTR_CRAWLER_COUNT);
102         kickUpdate();
103     }
104 
105     @Override
106     public void kickUpdate() {
107         super.kickUpdate();
108         if ((Boolean) getUncheckedAttribute(null, ATTR_USE_PUBLICSUFFIX_REDUCE)) {
109             reducePattern = PublicSuffixes.getTopmostAssignedSurtPrefixRegex();
110         } else {
111             reducePattern = (String) getUncheckedAttribute(null,
112                     ATTR_REDUCE_PATTERN);
113         }
114     }
115     
116     public static String mapString(String key, String reducePattern, long bucketCount) {
117         if(reducePattern!=null && reducePattern.length()>0) {
118            Matcher matcher = TextUtils.getMatcher(reducePattern,key);
119            if(matcher.find()) {
120                key = matcher.group();
121            }
122            TextUtils.recycleMatcher(matcher);
123         }
124         long fp = FPGenerator.std64.fp(key);
125         long bucket = fp % bucketCount;
126         return Long.toString(bucket >= 0 ? bucket : -bucket);
127     }
128 }