1   /* SurtPrefixedDecideRule
2   *
3   * $Id: SurtPrefixedDecideRule.java 4649 2006-09-25 17:16:55Z paul_jack $
4   *
5   * Created on Apr 5, 2005
6   *
7   * Copyright (C) 2005 Internet Archive.
8   *
9   * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24  */
25  package org.archive.crawler.deciderules;
26  
27  import java.io.File;
28  import java.io.FileReader;
29  import java.io.FileWriter;
30  import java.io.IOException;
31  
32  import org.archive.crawler.datamodel.CandidateURI;
33  import org.archive.crawler.framework.CrawlScope;
34  import org.archive.crawler.scope.SeedListener;
35  import org.archive.crawler.settings.SimpleType;
36  import org.archive.crawler.settings.Type;
37  import org.archive.util.SurtPrefixSet;
38  
39  
40  
41  /***
42   * Rule applies configured decision to any URIs that, when 
43   * expressed in SURT form, begin with one of the prefixes
44   * in the configured set. 
45   * 
46   * The set can be filled with SURT prefixes implied or
47   * listed in the seeds file, or another external file. 
48   *
49   * The "also-check-via" option to implement "one hop off" 
50   * scoping derives from a contribution by Shifra Raffel
51   * of the California Digital Library. 
52   * 
53   * @author gojomo
54   */
55  public class SurtPrefixedDecideRule extends PredicatedDecideRule 
56          implements SeedListener {
57  
58      private static final long serialVersionUID = 2075790126085405015L;
59  
60      //private static final Logger logger =
61      //    Logger.getLogger(SurtPrefixedDecideRule.class.getName());
62      
63      public static final String ATTR_SURTS_SOURCE_FILE = "surts-source-file";
64      public static final String ATTR_SEEDS_AS_SURT_PREFIXES =
65          "seeds-as-surt-prefixes";
66      public static final String ATTR_SURTS_DUMP_FILE = "surts-dump-file";
67      
68      private static final Boolean DEFAULT_SEEDS_AS_SURT_PREFIXES =
69          new Boolean(true);
70  
71      /***
72       * Whether every config change should trigger a 
73       * rebuilding of the prefix set.
74       */
75      public static final String 
76          ATTR_REBUILD_ON_RECONFIG = "rebuild-on-reconfig";
77      public static final Boolean
78          DEFAULT_REBUILD_ON_RECONFIG = Boolean.TRUE;
79      
80      /***
81       * Whether the 'via' of CrawlURIs should also be checked
82       * to see if it is prefixed by the set of SURT prefixes
83       */
84      public static final String 
85          ATTR_ALSO_CHECK_VIA = "also-check-via";
86      public static final Boolean
87          DEFAULT_ALSO_CHECK_VIA = Boolean.FALSE;
88      
89      protected SurtPrefixSet surtPrefixes = null;
90  
91      /***
92       * Usual constructor. 
93       * @param name
94       */
95      public SurtPrefixedDecideRule(String name) {
96          super(name);
97          setDescription("SurtPrefixedDecideRule. Makes the configured decision "
98                  + "for any URI which, when expressed in SURT form, begins "
99                  + "with any of the established prefixes (from either seeds "
100                 + "specification or an external file).");
101         addElementToDefinition(new SimpleType(ATTR_SURTS_SOURCE_FILE,
102                 "Source file from which to infer SURT prefixes. Any URLs " +
103                 "in file will be converted to the implied SURT prefix, and " +
104                 "literal SURT prefixes may be listed on lines beginning " +
105                 "with a '+' character.",
106                 ""));
107         addElementToDefinition(new SimpleType(ATTR_SEEDS_AS_SURT_PREFIXES,
108                 "Should seeds also be interpreted as SURT prefixes.",
109                 DEFAULT_SEEDS_AS_SURT_PREFIXES));
110         Type t = addElementToDefinition(new SimpleType(ATTR_SURTS_DUMP_FILE,
111                 "Dump file to save SURT prefixes actually used: " +
112                 "Useful debugging SURTs.", ""));
113         t.setExpertSetting(true);
114         t = addElementToDefinition(new SimpleType(ATTR_ALSO_CHECK_VIA,
115                 "Whether to also make the configured decision if a " +
116                 "URI's 'via' URI (the URI from which it was discovered) " +
117                 "in SURT form begins with any of the established prefixes. " +
118                 "For example, can be used to ACCEPT URIs that are 'one hop " +
119                 "off' URIs fitting the SURT prefixes. Default is false.",
120                 DEFAULT_ALSO_CHECK_VIA));
121         t.setOverrideable(false);
122         t.setExpertSetting(true);
123         t = addElementToDefinition(new SimpleType(ATTR_REBUILD_ON_RECONFIG,
124                 "Whether to rebuild the internal structures from source " +
125                 "files (including seeds if appropriate) every time any " +
126                 "configuration change occurs. If true, " +
127                 "rule is rebuilt from sources even when (for example) " +
128                 "unrelated new domain overrides are set. Rereading large" +
129                 "source files can take a long time.", 
130                 DEFAULT_REBUILD_ON_RECONFIG));
131         t.setOverrideable(false);
132         t.setExpertSetting(true);
133     }
134 
135     /***
136      * Evaluate whether given object's URI is covered by the SURT prefix set
137      * 
138      * @param object Item to evaluate.
139      * @return true if item, as SURT form URI, is prefixed by an item in the set
140      */
141     protected boolean evaluate(Object object) {
142         if ( (object instanceof CandidateURI) && 
143                 ((Boolean) getUncheckedAttribute(null, ATTR_ALSO_CHECK_VIA))
144                     .booleanValue()) {
145             if(evaluate(((CandidateURI)object).getVia())) {
146                 return true;
147             }
148         }
149         String candidateSurt;
150         candidateSurt = SurtPrefixSet.getCandidateSurt(object);
151         if (candidateSurt == null) {
152             return false;
153         }
154         return getPrefixes().containsPrefixOf(candidateSurt);
155     }
156 
157     /***
158      * Synchronized get of prefix set to use
159      * 
160      * @return SurtPrefixSet to use for check
161      */
162     private synchronized SurtPrefixSet getPrefixes() {
163         if (surtPrefixes == null) {
164             readPrefixes();
165         }
166         return surtPrefixes;
167     }
168 
169     protected void readPrefixes() {
170         buildSurtPrefixSet();
171         dumpSurtPrefixSet();
172     }
173     
174     /***
175      * Dump the current prefixes in use to configured dump file (if any)
176      */
177     protected void dumpSurtPrefixSet() {
178         // dump surts to file, if appropriate
179         String dumpPath = (String)getUncheckedAttribute(null,
180             ATTR_SURTS_DUMP_FILE);
181         if (dumpPath.length() > 0) {
182             File dump = new File(dumpPath);
183             if (!dump.isAbsolute()) {
184                 dump = new File(getSettingsHandler().getOrder().getController()
185                     .getDisk(), dumpPath);
186             }
187             try {
188                 FileWriter fw = new FileWriter(dump);
189                 try {
190                     surtPrefixes.exportTo(fw);
191                 } finally {
192                     fw.close();
193                 }
194             } catch (IOException e) {
195                 e.printStackTrace();
196                 throw new RuntimeException(e);
197             }
198         }
199     }
200 
201     /***
202      * Construct the set of prefixes to use, from the seed list (
203      * which may include both URIs and '+'-prefixed directives).
204      */
205     protected void buildSurtPrefixSet() {
206         SurtPrefixSet newSurtPrefixes = new SurtPrefixSet();
207         FileReader fr = null;
208 
209         // read SURTs from file, if appropriate
210         String sourcePath = (String)getUncheckedAttribute(null,
211                 ATTR_SURTS_SOURCE_FILE);
212         if (sourcePath.length() > 0) {
213             File source = new File(sourcePath);
214             if (!source.isAbsolute()) {
215                 source = new File(getSettingsHandler().getOrder()
216                     .getController().getDisk(), sourcePath);
217             }
218             try {
219                 fr = new FileReader(source);
220                 try {
221                     newSurtPrefixes.importFromMixed(fr, true);
222                 } finally {
223                     fr.close();
224                 }
225             } catch (IOException e) {
226                 e.printStackTrace();
227                 throw new RuntimeException(e);
228             }
229         }
230         
231         // interpret seeds as surts, if appropriate
232         boolean deduceFromSeeds = ((Boolean)getUncheckedAttribute(null,
233                 ATTR_SEEDS_AS_SURT_PREFIXES)).booleanValue();
234         if(deduceFromSeeds) {
235             try {
236                 fr = new FileReader(getSeedfile());
237                 try {
238                     newSurtPrefixes.importFromMixed(fr, deduceFromSeeds);
239                 } finally {
240                     fr.close();
241                 }
242             } catch (IOException e) {
243                 e.printStackTrace();
244                 throw new RuntimeException(e);
245             }
246         }
247 
248         surtPrefixes = newSurtPrefixes;
249     }
250 
251     /***
252      * Re-read prefixes after an update.
253      * 
254      * @see org.archive.crawler.framework.CrawlScope#kickUpdate()
255      */
256     public synchronized void kickUpdate() {
257         super.kickUpdate();
258         if (((Boolean) getUncheckedAttribute(null, ATTR_REBUILD_ON_RECONFIG))
259                 .booleanValue()) {
260             readPrefixes();
261         }
262         // TODO: make conditional on file having actually changed,
263         // perhaps by remembering mod-time
264     }
265 
266     /***
267      * Dig through everything to get the crawl-global seeds file. 
268      * Add self as listener while at it. 
269      * 
270      * @return Seed list file
271      */
272     protected File getSeedfile() {
273         CrawlScope scope =
274             getSettingsHandler().getOrder().getController().getScope();
275         scope.addSeedListener(this);
276         return scope.getSeedfile();
277     }
278 
279     public synchronized void addedSeed(final CandidateURI curi) {
280         SurtPrefixSet newSurtPrefixes = (SurtPrefixSet) surtPrefixes.clone();
281         newSurtPrefixes.add(prefixFrom(curi.toString()));
282         surtPrefixes = newSurtPrefixes;
283     }
284     
285     protected String prefixFrom(String uri) {
286     	return SurtPrefixSet.prefixFromPlain(uri);
287     }
288 }