1   /* Copyright (C) 2003 Internet Archive.
2    *
3    * This file is part of the Heritrix web crawler (crawler.archive.org).
4    *
5    * Heritrix is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU Lesser Public License as published by
7    * the Free Software Foundation; either version 2.1 of the License, or
8    * any later version.
9    *
10   * Heritrix is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU Lesser Public License for more details.
14   *
15   * You should have received a copy of the GNU Lesser Public License
16   * along with Heritrix; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   *
19   * SurtPrefixScope.java
20   * Created on Oct 1, 2003
21   *
22   * $Header$
23   */
24  package org.archive.crawler.scope;
25  
26  import java.io.File;
27  import java.io.FileReader;
28  import java.io.FileWriter;
29  import java.io.IOException;
30  
31  import org.archive.crawler.datamodel.CandidateURI;
32  import org.archive.crawler.deciderules.DecidingScope;
33  import org.archive.crawler.framework.CrawlController;
34  import org.archive.crawler.settings.SimpleType;
35  import org.archive.crawler.settings.Type;
36  import org.archive.util.SurtPrefixSet;
37  
38  /***
39   * A specialized CrawlScope suitable for the most common crawl needs.
40   * 
41   * Roughly, as with other existing CrawlScope variants, SurtPrefixScope's logic
42   * is that a URI is included if:
43   * <pre>
44   *  ( isSeed(uri) || focusFilter.accepts(uri) ) ||
45   *     transitiveFilter.accepts(uri) ) && ! excludeFilter.accepts(uri)
46   * </pre>
47   * Specifically, SurtPrefixScope uses a SurtFilter to test for focus-inclusion.
48   * 
49   * @author gojomo
50   * @deprecated As of release 1.10.0.  Replaced by {@link DecidingScope}.
51   */
52  public class SurtPrefixScope extends RefinedScope {
53  
54      private static final long serialVersionUID = 2652008287322770123L;
55  
56      public static final String ATTR_SURTS_SOURCE_FILE = "surts-source-file";
57      public static final String ATTR_SEEDS_AS_SURT_PREFIXES = "seeds-as-surt-prefixes";
58      public static final String ATTR_SURTS_DUMP_FILE = "surts-dump-file";
59      
60      private static final Boolean DEFAULT_SEEDS_AS_SURT_PREFIXES = new Boolean(true);
61  
62      /***
63       * Whether the 'via' of CrawlURIs should also be checked
64       * to see if it is prefixed by the set of SURT prefixes
65       */
66      public static final String 
67          ATTR_ALSO_CHECK_VIA = "also-check-via";
68      public static final Boolean
69          DEFAULT_ALSO_CHECK_VIA = Boolean.FALSE;
70      
71      SurtPrefixSet surtPrefixes = null;
72  
73      public SurtPrefixScope(String name) {
74          super(name);
75          setDescription(
76                  "SurtPrefixScope: A scope for crawls limited to regions of " +
77                  "the web defined by a set of SURT prefixes *Deprecated* " +
78                  "Use DecidingScope instead. (The SURT form of " +
79                  "a URI has its hostname reordered to ease sorting and "
80                  + "grouping by domain hierarchies.)");
81          addElementToDefinition(
82                  new SimpleType(ATTR_SURTS_SOURCE_FILE, 
83                  		"Source file from which to infer SURT prefixes. Any URLs " +
84                          "in file will be converted to the implied SURT prefix, and " +
85                          "literal SURT prefixes may be listed on lines beginning " +
86                          "with a '+' character.", 
87                          ""));
88          addElementToDefinition(
89                  new SimpleType(ATTR_SEEDS_AS_SURT_PREFIXES, 
90                          "Should seeds also be interpreted as SURT prefixes.", 
91                          DEFAULT_SEEDS_AS_SURT_PREFIXES));
92          
93          Type t = addElementToDefinition(
94                  new SimpleType(ATTR_SURTS_DUMP_FILE, 
95                          "Dump file to save SURT prefixes actually used.", 
96                          ""));
97          t.setExpertSetting(true);
98          t = addElementToDefinition(new SimpleType(ATTR_ALSO_CHECK_VIA,
99                  "Whether to also rule URI in-scope if a " +
100                 "URI's 'via' URI (the URI from which it was discovered) " +
101                 "in SURT form begins with any of the established prefixes. " +
102                 "For example, can be used to accept URIs that are 'one hop " +
103                 "off' URIs fitting the SURT prefixes. Default is false.",
104                 DEFAULT_ALSO_CHECK_VIA));
105         t.setOverrideable(false);
106         t.setExpertSetting(true);
107 
108     }
109 
110     
111     /* (non-Javadoc)
112      * @see org.archive.crawler.framework.CrawlScope#initialize(org.archive.crawler.framework.CrawlController)
113      */
114     public void initialize(CrawlController controller) {
115         super.initialize(controller);
116         readPrefixes();
117     }
118     
119     /***
120      * Check if a URI is part of this scope.
121      * 
122      * @param object
123      *            An instance of UURI or of CandidateURI.
124      * @return True if focus filter accepts passed object.
125      */
126     protected synchronized boolean focusAccepts(Object object) {
127         // TODO: eliminate duplication wrt/SurtPrefixedDecideRule.evaluate
128         if (surtPrefixes == null) {
129             readPrefixes();
130         }
131         if ( (object instanceof CandidateURI) && 
132                 ((Boolean) getUncheckedAttribute(null, ATTR_ALSO_CHECK_VIA))
133                     .booleanValue()) {
134             if(focusAccepts(((CandidateURI)object).getVia())) {
135                 return true;
136             }
137         }
138         String candidateSurt = SurtPrefixSet.getCandidateSurt(object);
139         if(candidateSurt == null) {
140             return false; 
141         }
142         return surtPrefixes.containsPrefixOf(candidateSurt);
143     }
144     
145     private void readPrefixes() {
146         surtPrefixes = new SurtPrefixSet(); 
147         FileReader fr = null;
148         
149         // read SURTs from file, if appropriate 
150         String sourcePath = (String) getUncheckedAttribute(null,
151                 ATTR_SURTS_SOURCE_FILE);
152         if(sourcePath.length()>0) {
153             File source = new File(sourcePath);
154             if (!source.isAbsolute()) {
155                 source = new File(getSettingsHandler().getOrder()
156                         .getController().getDisk(), sourcePath);
157             }
158             try {
159                 fr = new FileReader(source);
160                 try {
161                     surtPrefixes.importFromMixed(fr,true);
162                 } finally {
163                     fr.close();
164                 }
165         
166             } catch (IOException e) {
167                 e.printStackTrace();
168                 throw new RuntimeException(e);
169             } 
170         }
171         
172         // interpret seeds as surts, if appropriate
173         boolean deduceFromSeeds = 
174             ((Boolean) getUncheckedAttribute(null, ATTR_SEEDS_AS_SURT_PREFIXES))
175             .booleanValue();
176         try {
177             fr = new FileReader(getSeedfile());
178             try {
179                 surtPrefixes.importFromMixed(fr,deduceFromSeeds);
180             } finally {
181                 fr.close();
182             }
183         } catch (IOException e) {
184             e.printStackTrace();
185             throw new RuntimeException(e);
186         }  
187 
188         // dump surts to file, if appropriate
189         String dumpPath = (String) getUncheckedAttribute(null,
190                 ATTR_SURTS_DUMP_FILE);
191         if(dumpPath.length()>0) {
192             File dump = new File(dumpPath);
193             if (!dump.isAbsolute()) {
194                 dump = new File(getSettingsHandler().getOrder()
195                         .getController().getDisk(), dumpPath);
196             }
197             try {
198                 FileWriter fw = new FileWriter(dump);
199                 try {
200                     surtPrefixes.exportTo(fw);
201                 } finally {
202                     fw.close();
203                 }
204             } catch (IOException e) {
205                 e.printStackTrace();
206                 throw new RuntimeException(e);
207             }
208         }
209     }
210 
211     /***
212      * Re-read prefixes after an update. 
213      * 
214      * @see org.archive.crawler.framework.CrawlScope#kickUpdate()
215      */
216     public synchronized void kickUpdate() {
217         super.kickUpdate();
218         // TODO: make conditional on file having actually changed,
219         // perhaps by remembering mod-time
220         readPrefixes();
221     }
222 }