1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 package org.archive.crawler.scope;
25
26 import java.io.File;
27 import java.io.FileReader;
28 import java.io.FileWriter;
29 import java.io.IOException;
30
31 import org.archive.crawler.datamodel.CandidateURI;
32 import org.archive.crawler.deciderules.DecidingScope;
33 import org.archive.crawler.framework.CrawlController;
34 import org.archive.crawler.settings.SimpleType;
35 import org.archive.crawler.settings.Type;
36 import org.archive.util.SurtPrefixSet;
37
38 /***
39 * A specialized CrawlScope suitable for the most common crawl needs.
40 *
41 * Roughly, as with other existing CrawlScope variants, SurtPrefixScope's logic
42 * is that a URI is included if:
43 * <pre>
44 * ( isSeed(uri) || focusFilter.accepts(uri) ) ||
45 * transitiveFilter.accepts(uri) ) && ! excludeFilter.accepts(uri)
46 * </pre>
47 * Specifically, SurtPrefixScope uses a SurtFilter to test for focus-inclusion.
48 *
49 * @author gojomo
50 * @deprecated As of release 1.10.0. Replaced by {@link DecidingScope}.
51 */
52 public class SurtPrefixScope extends RefinedScope {
53
54 private static final long serialVersionUID = 2652008287322770123L;
55
56 public static final String ATTR_SURTS_SOURCE_FILE = "surts-source-file";
57 public static final String ATTR_SEEDS_AS_SURT_PREFIXES = "seeds-as-surt-prefixes";
58 public static final String ATTR_SURTS_DUMP_FILE = "surts-dump-file";
59
60 private static final Boolean DEFAULT_SEEDS_AS_SURT_PREFIXES = new Boolean(true);
61
62 /***
63 * Whether the 'via' of CrawlURIs should also be checked
64 * to see if it is prefixed by the set of SURT prefixes
65 */
66 public static final String
67 ATTR_ALSO_CHECK_VIA = "also-check-via";
68 public static final Boolean
69 DEFAULT_ALSO_CHECK_VIA = Boolean.FALSE;
70
71 SurtPrefixSet surtPrefixes = null;
72
73 public SurtPrefixScope(String name) {
74 super(name);
75 setDescription(
76 "SurtPrefixScope: A scope for crawls limited to regions of " +
77 "the web defined by a set of SURT prefixes *Deprecated* " +
78 "Use DecidingScope instead. (The SURT form of " +
79 "a URI has its hostname reordered to ease sorting and "
80 + "grouping by domain hierarchies.)");
81 addElementToDefinition(
82 new SimpleType(ATTR_SURTS_SOURCE_FILE,
83 "Source file from which to infer SURT prefixes. Any URLs " +
84 "in file will be converted to the implied SURT prefix, and " +
85 "literal SURT prefixes may be listed on lines beginning " +
86 "with a '+' character.",
87 ""));
88 addElementToDefinition(
89 new SimpleType(ATTR_SEEDS_AS_SURT_PREFIXES,
90 "Should seeds also be interpreted as SURT prefixes.",
91 DEFAULT_SEEDS_AS_SURT_PREFIXES));
92
93 Type t = addElementToDefinition(
94 new SimpleType(ATTR_SURTS_DUMP_FILE,
95 "Dump file to save SURT prefixes actually used.",
96 ""));
97 t.setExpertSetting(true);
98 t = addElementToDefinition(new SimpleType(ATTR_ALSO_CHECK_VIA,
99 "Whether to also rule URI in-scope if a " +
100 "URI's 'via' URI (the URI from which it was discovered) " +
101 "in SURT form begins with any of the established prefixes. " +
102 "For example, can be used to accept URIs that are 'one hop " +
103 "off' URIs fitting the SURT prefixes. Default is false.",
104 DEFAULT_ALSO_CHECK_VIA));
105 t.setOverrideable(false);
106 t.setExpertSetting(true);
107
108 }
109
110
111
112
113
114 public void initialize(CrawlController controller) {
115 super.initialize(controller);
116 readPrefixes();
117 }
118
119 /***
120 * Check if a URI is part of this scope.
121 *
122 * @param object
123 * An instance of UURI or of CandidateURI.
124 * @return True if focus filter accepts passed object.
125 */
126 protected synchronized boolean focusAccepts(Object object) {
127
128 if (surtPrefixes == null) {
129 readPrefixes();
130 }
131 if ( (object instanceof CandidateURI) &&
132 ((Boolean) getUncheckedAttribute(null, ATTR_ALSO_CHECK_VIA))
133 .booleanValue()) {
134 if(focusAccepts(((CandidateURI)object).getVia())) {
135 return true;
136 }
137 }
138 String candidateSurt = SurtPrefixSet.getCandidateSurt(object);
139 if(candidateSurt == null) {
140 return false;
141 }
142 return surtPrefixes.containsPrefixOf(candidateSurt);
143 }
144
145 private void readPrefixes() {
146 surtPrefixes = new SurtPrefixSet();
147 FileReader fr = null;
148
149
150 String sourcePath = (String) getUncheckedAttribute(null,
151 ATTR_SURTS_SOURCE_FILE);
152 if(sourcePath.length()>0) {
153 File source = new File(sourcePath);
154 if (!source.isAbsolute()) {
155 source = new File(getSettingsHandler().getOrder()
156 .getController().getDisk(), sourcePath);
157 }
158 try {
159 fr = new FileReader(source);
160 try {
161 surtPrefixes.importFromMixed(fr,true);
162 } finally {
163 fr.close();
164 }
165
166 } catch (IOException e) {
167 e.printStackTrace();
168 throw new RuntimeException(e);
169 }
170 }
171
172
173 boolean deduceFromSeeds =
174 ((Boolean) getUncheckedAttribute(null, ATTR_SEEDS_AS_SURT_PREFIXES))
175 .booleanValue();
176 try {
177 fr = new FileReader(getSeedfile());
178 try {
179 surtPrefixes.importFromMixed(fr,deduceFromSeeds);
180 } finally {
181 fr.close();
182 }
183 } catch (IOException e) {
184 e.printStackTrace();
185 throw new RuntimeException(e);
186 }
187
188
189 String dumpPath = (String) getUncheckedAttribute(null,
190 ATTR_SURTS_DUMP_FILE);
191 if(dumpPath.length()>0) {
192 File dump = new File(dumpPath);
193 if (!dump.isAbsolute()) {
194 dump = new File(getSettingsHandler().getOrder()
195 .getController().getDisk(), dumpPath);
196 }
197 try {
198 FileWriter fw = new FileWriter(dump);
199 try {
200 surtPrefixes.exportTo(fw);
201 } finally {
202 fw.close();
203 }
204 } catch (IOException e) {
205 e.printStackTrace();
206 throw new RuntimeException(e);
207 }
208 }
209 }
210
211 /***
212 * Re-read prefixes after an update.
213 *
214 * @see org.archive.crawler.framework.CrawlScope#kickUpdate()
215 */
216 public synchronized void kickUpdate() {
217 super.kickUpdate();
218
219
220 readPrefixes();
221 }
222 }