1   /* OnDomainsDecideRule
2   *
3   * $Id: OnDomainsDecideRule.java 4649 2006-09-25 17:16:55Z paul_jack $
4   *
5   * Created on Apr 5, 2005
6   *
7   * Copyright (C) 2005 Internet Archive.
8   *
9   * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24  */
25  package org.archive.crawler.deciderules;
26  
27  
28  import org.archive.util.SurtPrefixSet;
29  
30  
31  /***
32   * Rule applies configured decision to any URIs that
33   * are on one of the domains in the configured set of
34   * domains, filled from the seed set. 
35   *
36   * @author gojomo
37   */
38  public class OnDomainsDecideRule extends SurtPrefixedDecideRule {
39  
40      private static final long serialVersionUID = -3872369060554558805L;
41      //private static final Logger logger =
42      //    Logger.getLogger(OnDomainsDecideRule.class.getName());
43      /***
44       * Usual constructor. 
45       * @param name
46       */
47      public OnDomainsDecideRule(String name) {
48          super(name);
49          setDescription(
50                   "OnDomainsDecideRule. Makes the configured decision " +
51                   "for any URI which is inside one of the domains in the " +
52                   "configured set of domains (derived from the seed" +
53                   "list, with 'www' removed when present).");
54          // disable direct setting of SURTs-related options
55         //getElementFromDefinition(ATTR_SEEDS_AS_SURT_PREFIXES).setTransient(true);
56         //getElementFromDefinition(ATTR_SURTS_SOURCE_FILE).setTransient(true);
57         // leaving surts-dump as option helpful for debugging/learning, for now
58         //getElementFromDefinition(ATTR_SURTS_DUMP_FILE).setTransient(true);
59      }
60  
61      /***
62       * Patch the SURT prefix set so that it only includes host-enforcing prefixes
63       * 
64       * @see org.archive.crawler.deciderules.SurtPrefixedDecideRule#readPrefixes()
65       */
66      protected void readPrefixes() {
67          buildSurtPrefixSet();
68          surtPrefixes.convertAllPrefixesToDomains();
69          dumpSurtPrefixSet();
70      }
71      
72  	protected String prefixFrom(String uri) {
73  		return SurtPrefixSet.convertPrefixToDomain(super.prefixFrom(uri));
74  	}
75  }