1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler.deciderules;
26
27 import java.io.File;
28 import java.io.FileReader;
29 import java.io.FileWriter;
30 import java.io.IOException;
31
32 import org.archive.crawler.datamodel.CandidateURI;
33 import org.archive.crawler.framework.CrawlScope;
34 import org.archive.crawler.scope.SeedListener;
35 import org.archive.crawler.settings.SimpleType;
36 import org.archive.crawler.settings.Type;
37 import org.archive.util.SurtPrefixSet;
38
39
40
41 /***
42 * Rule applies configured decision to any URIs that, when
43 * expressed in SURT form, begin with one of the prefixes
44 * in the configured set.
45 *
46 * The set can be filled with SURT prefixes implied or
47 * listed in the seeds file, or another external file.
48 *
49 * The "also-check-via" option to implement "one hop off"
50 * scoping derives from a contribution by Shifra Raffel
51 * of the California Digital Library.
52 *
53 * @author gojomo
54 */
55 public class SurtPrefixedDecideRule extends PredicatedDecideRule
56 implements SeedListener {
57
58 private static final long serialVersionUID = 2075790126085405015L;
59
60
61
62
63 public static final String ATTR_SURTS_SOURCE_FILE = "surts-source-file";
64 public static final String ATTR_SEEDS_AS_SURT_PREFIXES =
65 "seeds-as-surt-prefixes";
66 public static final String ATTR_SURTS_DUMP_FILE = "surts-dump-file";
67
68 private static final Boolean DEFAULT_SEEDS_AS_SURT_PREFIXES =
69 new Boolean(true);
70
71 /***
72 * Whether every config change should trigger a
73 * rebuilding of the prefix set.
74 */
75 public static final String
76 ATTR_REBUILD_ON_RECONFIG = "rebuild-on-reconfig";
77 public static final Boolean
78 DEFAULT_REBUILD_ON_RECONFIG = Boolean.TRUE;
79
80 /***
81 * Whether the 'via' of CrawlURIs should also be checked
82 * to see if it is prefixed by the set of SURT prefixes
83 */
84 public static final String
85 ATTR_ALSO_CHECK_VIA = "also-check-via";
86 public static final Boolean
87 DEFAULT_ALSO_CHECK_VIA = Boolean.FALSE;
88
89 protected SurtPrefixSet surtPrefixes = null;
90
91 /***
92 * Usual constructor.
93 * @param name
94 */
95 public SurtPrefixedDecideRule(String name) {
96 super(name);
97 setDescription("SurtPrefixedDecideRule. Makes the configured decision "
98 + "for any URI which, when expressed in SURT form, begins "
99 + "with any of the established prefixes (from either seeds "
100 + "specification or an external file).");
101 addElementToDefinition(new SimpleType(ATTR_SURTS_SOURCE_FILE,
102 "Source file from which to infer SURT prefixes. Any URLs " +
103 "in file will be converted to the implied SURT prefix, and " +
104 "literal SURT prefixes may be listed on lines beginning " +
105 "with a '+' character.",
106 ""));
107 addElementToDefinition(new SimpleType(ATTR_SEEDS_AS_SURT_PREFIXES,
108 "Should seeds also be interpreted as SURT prefixes.",
109 DEFAULT_SEEDS_AS_SURT_PREFIXES));
110 Type t = addElementToDefinition(new SimpleType(ATTR_SURTS_DUMP_FILE,
111 "Dump file to save SURT prefixes actually used: " +
112 "Useful debugging SURTs.", ""));
113 t.setExpertSetting(true);
114 t = addElementToDefinition(new SimpleType(ATTR_ALSO_CHECK_VIA,
115 "Whether to also make the configured decision if a " +
116 "URI's 'via' URI (the URI from which it was discovered) " +
117 "in SURT form begins with any of the established prefixes. " +
118 "For example, can be used to ACCEPT URIs that are 'one hop " +
119 "off' URIs fitting the SURT prefixes. Default is false.",
120 DEFAULT_ALSO_CHECK_VIA));
121 t.setOverrideable(false);
122 t.setExpertSetting(true);
123 t = addElementToDefinition(new SimpleType(ATTR_REBUILD_ON_RECONFIG,
124 "Whether to rebuild the internal structures from source " +
125 "files (including seeds if appropriate) every time any " +
126 "configuration change occurs. If true, " +
127 "rule is rebuilt from sources even when (for example) " +
128 "unrelated new domain overrides are set. Rereading large" +
129 "source files can take a long time.",
130 DEFAULT_REBUILD_ON_RECONFIG));
131 t.setOverrideable(false);
132 t.setExpertSetting(true);
133 }
134
135 /***
136 * Evaluate whether given object's URI is covered by the SURT prefix set
137 *
138 * @param object Item to evaluate.
139 * @return true if item, as SURT form URI, is prefixed by an item in the set
140 */
141 protected boolean evaluate(Object object) {
142 if ( (object instanceof CandidateURI) &&
143 ((Boolean) getUncheckedAttribute(null, ATTR_ALSO_CHECK_VIA))
144 .booleanValue()) {
145 if(evaluate(((CandidateURI)object).getVia())) {
146 return true;
147 }
148 }
149 String candidateSurt;
150 candidateSurt = SurtPrefixSet.getCandidateSurt(object);
151 if (candidateSurt == null) {
152 return false;
153 }
154 return getPrefixes().containsPrefixOf(candidateSurt);
155 }
156
157 /***
158 * Synchronized get of prefix set to use
159 *
160 * @return SurtPrefixSet to use for check
161 */
162 private synchronized SurtPrefixSet getPrefixes() {
163 if (surtPrefixes == null) {
164 readPrefixes();
165 }
166 return surtPrefixes;
167 }
168
169 protected void readPrefixes() {
170 buildSurtPrefixSet();
171 dumpSurtPrefixSet();
172 }
173
174 /***
175 * Dump the current prefixes in use to configured dump file (if any)
176 */
177 protected void dumpSurtPrefixSet() {
178
179 String dumpPath = (String)getUncheckedAttribute(null,
180 ATTR_SURTS_DUMP_FILE);
181 if (dumpPath.length() > 0) {
182 File dump = new File(dumpPath);
183 if (!dump.isAbsolute()) {
184 dump = new File(getSettingsHandler().getOrder().getController()
185 .getDisk(), dumpPath);
186 }
187 try {
188 FileWriter fw = new FileWriter(dump);
189 try {
190 surtPrefixes.exportTo(fw);
191 } finally {
192 fw.close();
193 }
194 } catch (IOException e) {
195 e.printStackTrace();
196 throw new RuntimeException(e);
197 }
198 }
199 }
200
201 /***
202 * Construct the set of prefixes to use, from the seed list (
203 * which may include both URIs and '+'-prefixed directives).
204 */
205 protected void buildSurtPrefixSet() {
206 SurtPrefixSet newSurtPrefixes = new SurtPrefixSet();
207 FileReader fr = null;
208
209
210 String sourcePath = (String)getUncheckedAttribute(null,
211 ATTR_SURTS_SOURCE_FILE);
212 if (sourcePath.length() > 0) {
213 File source = new File(sourcePath);
214 if (!source.isAbsolute()) {
215 source = new File(getSettingsHandler().getOrder()
216 .getController().getDisk(), sourcePath);
217 }
218 try {
219 fr = new FileReader(source);
220 try {
221 newSurtPrefixes.importFromMixed(fr, true);
222 } finally {
223 fr.close();
224 }
225 } catch (IOException e) {
226 e.printStackTrace();
227 throw new RuntimeException(e);
228 }
229 }
230
231
232 boolean deduceFromSeeds = ((Boolean)getUncheckedAttribute(null,
233 ATTR_SEEDS_AS_SURT_PREFIXES)).booleanValue();
234 if(deduceFromSeeds) {
235 try {
236 fr = new FileReader(getSeedfile());
237 try {
238 newSurtPrefixes.importFromMixed(fr, deduceFromSeeds);
239 } finally {
240 fr.close();
241 }
242 } catch (IOException e) {
243 e.printStackTrace();
244 throw new RuntimeException(e);
245 }
246 }
247
248 surtPrefixes = newSurtPrefixes;
249 }
250
251 /***
252 * Re-read prefixes after an update.
253 *
254 * @see org.archive.crawler.framework.CrawlScope#kickUpdate()
255 */
256 public synchronized void kickUpdate() {
257 super.kickUpdate();
258 if (((Boolean) getUncheckedAttribute(null, ATTR_REBUILD_ON_RECONFIG))
259 .booleanValue()) {
260 readPrefixes();
261 }
262
263
264 }
265
266 /***
267 * Dig through everything to get the crawl-global seeds file.
268 * Add self as listener while at it.
269 *
270 * @return Seed list file
271 */
272 protected File getSeedfile() {
273 CrawlScope scope =
274 getSettingsHandler().getOrder().getController().getScope();
275 scope.addSeedListener(this);
276 return scope.getSeedfile();
277 }
278
279 public synchronized void addedSeed(final CandidateURI curi) {
280 SurtPrefixSet newSurtPrefixes = (SurtPrefixSet) surtPrefixes.clone();
281 newSurtPrefixes.add(prefixFrom(curi.toString()));
282 surtPrefixes = newSurtPrefixes;
283 }
284
285 protected String prefixFrom(String uri) {
286 return SurtPrefixSet.prefixFromPlain(uri);
287 }
288 }