1 /* OnDomainsDecideRule 2 * 3 * $Id: OnDomainsDecideRule.java 4649 2006-09-25 17:16:55Z paul_jack $ 4 * 5 * Created on Apr 5, 2005 6 * 7 * Copyright (C) 2005 Internet Archive. 8 * 9 * This file is part of the Heritrix web crawler (crawler.archive.org). 10 * 11 * Heritrix is free software; you can redistribute it and/or modify 12 * it under the terms of the GNU Lesser Public License as published by 13 * the Free Software Foundation; either version 2.1 of the License, or 14 * any later version. 15 * 16 * Heritrix is distributed in the hope that it will be useful, 17 * but WITHOUT ANY WARRANTY; without even the implied warranty of 18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 * GNU Lesser Public License for more details. 20 * 21 * You should have received a copy of the GNU Lesser Public License 22 * along with Heritrix; if not, write to the Free Software 23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 24 */ 25 package org.archive.crawler.deciderules; 26 27 28 import org.archive.util.SurtPrefixSet; 29 30 31 /*** 32 * Rule applies configured decision to any URIs that 33 * are on one of the domains in the configured set of 34 * domains, filled from the seed set. 35 * 36 * @author gojomo 37 */ 38 public class OnDomainsDecideRule extends SurtPrefixedDecideRule { 39 40 private static final long serialVersionUID = -3872369060554558805L; 41 //private static final Logger logger = 42 // Logger.getLogger(OnDomainsDecideRule.class.getName()); 43 /*** 44 * Usual constructor. 45 * @param name 46 */ 47 public OnDomainsDecideRule(String name) { 48 super(name); 49 setDescription( 50 "OnDomainsDecideRule. Makes the configured decision " + 51 "for any URI which is inside one of the domains in the " + 52 "configured set of domains (derived from the seed" + 53 "list, with 'www' removed when present)."); 54 // disable direct setting of SURTs-related options 55 //getElementFromDefinition(ATTR_SEEDS_AS_SURT_PREFIXES).setTransient(true); 56 //getElementFromDefinition(ATTR_SURTS_SOURCE_FILE).setTransient(true); 57 // leaving surts-dump as option helpful for debugging/learning, for now 58 //getElementFromDefinition(ATTR_SURTS_DUMP_FILE).setTransient(true); 59 } 60 61 /*** 62 * Patch the SURT prefix set so that it only includes host-enforcing prefixes 63 * 64 * @see org.archive.crawler.deciderules.SurtPrefixedDecideRule#readPrefixes() 65 */ 66 protected void readPrefixes() { 67 buildSurtPrefixSet(); 68 surtPrefixes.convertAllPrefixesToDomains(); 69 dumpSurtPrefixSet(); 70 } 71 72 protected String prefixFrom(String uri) { 73 return SurtPrefixSet.convertPrefixToDomain(super.prefixFrom(uri)); 74 } 75 }