1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 package org.archive.crawler.framework;
25
26 import java.io.BufferedReader;
27 import java.io.File;
28 import java.io.FileReader;
29 import java.io.FileWriter;
30 import java.io.IOException;
31 import java.io.Writer;
32 import java.util.HashSet;
33 import java.util.Iterator;
34 import java.util.List;
35 import java.util.Set;
36 import java.util.logging.Logger;
37
38 import javax.management.AttributeNotFoundException;
39 import javax.management.MBeanException;
40 import javax.management.ReflectionException;
41
42 import org.apache.commons.httpclient.URIException;
43 import org.archive.crawler.datamodel.CandidateURI;
44 import org.archive.crawler.scope.SeedFileIterator;
45 import org.archive.crawler.scope.SeedListener;
46 import org.archive.crawler.settings.CrawlerSettings;
47 import org.archive.crawler.settings.SimpleType;
48 import org.archive.crawler.settings.Type;
49 import org.archive.net.UURI;
50 import org.archive.util.DevUtils;
51
/**
 * A CrawlScope instance defines which URIs are "in"
 * a particular crawl.
 *
 * It is essentially a Filter which determines, looking at
 * the totality of information available about a
 * CandidateURI/CrawlURI instance, if that URI should be
 * scheduled for crawling.
 *
 * Dynamic information inherent in the discovery of the
 * URI -- such as the path by which it was discovered --
 * may be considered.
 *
 * Dynamic information which requires the consultation
 * of external and potentially volatile information --
 * such as current robots.txt requests and the history
 * of attempts to crawl the same URI -- should NOT be
 * considered. Those potentially high-latency decisions
 * should be made at another step.
 *
 * @author gojomo
 *
 */
75 public class CrawlScope extends Filter {
76
77 private static final long serialVersionUID = -3321533224526211277L;
78
79 private static final Logger logger =
80 Logger.getLogger(CrawlScope.class.getName());
81 public static final String ATTR_NAME = "scope";
82 public static final String ATTR_SEEDS = "seedsfile";
83
84 /***
85 * Whether every configu change should trigger a
86 * rereading of the original seeds spec/file.
87 */
88 public static final String
89 ATTR_REREAD_SEEDS_ON_CONFIG = "reread-seeds-on-config";
90 public static final Boolean
91 DEFAULT_REREAD_SEEDS_ON_CONFIG = Boolean.TRUE;
92
93 protected Set<SeedListener> seedListeners = new HashSet<SeedListener>();
94
95 /*** Constructs a new CrawlScope.
96 *
97 * @param name the name is ignored since it always have to be the value of
98 * the constant ATT_NAME.
99 */
100 public CrawlScope(String name) {
101
102 super(ATTR_NAME, "Crawl scope");
103 Type t;
104 t = addElementToDefinition(new SimpleType(ATTR_SEEDS,
105 "File from which to extract seeds.", "seeds.txt"));
106 t.setOverrideable(false);
107 t.setExpertSetting(true);
108 t = addElementToDefinition(new SimpleType(ATTR_REREAD_SEEDS_ON_CONFIG,
109 "Whether to reread the seeds specification, whether it has " +
110 "changed or not, every time any configuration change occurs. " +
111 "If true, seeds are reread even when (for example) new " +
112 "domain overrides are set. Rereading the seeds can take a " +
113 "long time with large seed lists.",
114 DEFAULT_REREAD_SEEDS_ON_CONFIG));
115 t.setOverrideable(false);
116 t.setExpertSetting(true);
117
118 }
119
/**
 * Default constructor; delegates to {@link #CrawlScope(String)} with
 * {@link #ATTR_NAME} as the name.
 */
public CrawlScope() {
    this(ATTR_NAME);
}
125
126 /***
127 * Initialize is called just before the crawler starts to run.
128 *
129 * The settings system is up and initialized so can be used. This
130 * initialize happens after {@link #earlyInitialize(CrawlerSettings)}.
131 *
132 * @param controller Controller object.
133 */
134 public void initialize(CrawlController controller) {
135
136 }
137
138 public String toString() {
139 return "CrawlScope<" + getName() + ">";
140 }
141
142 /***
143 * Refresh seeds.
144 *
145 */
146 public void refreshSeeds() {
147
148 }
149
150 /***
151 * @return Seed list file or null if problem getting settings file.
152 */
153 public File getSeedfile() {
154 File file = null;
155 try {
156 file = getSettingsHandler().getPathRelativeToWorkingDirectory(
157 (String)getAttribute(ATTR_SEEDS));
158 if (!file.exists() || !file.canRead()) {
159 throw new IOException("Seeds file " +
160 file.getAbsolutePath() + " does not exist or unreadable.");
161 }
162 } catch (IOException e) {
163 DevUtils.warnHandle(e, "problem reading seeds");
164 } catch (AttributeNotFoundException e) {
165 DevUtils.warnHandle(e, "problem reading seeds");
166 } catch (MBeanException e) {
167 DevUtils.warnHandle(e, "problem reading seeds");
168 e.printStackTrace();
169 } catch (ReflectionException e) {
170 DevUtils.warnHandle(e, "problem reading seeds");
171 e.printStackTrace();
172 }
173
174 return file;
175 }
176
177 /*** Check if a URI is in the seeds.
178 *
179 * @param o the URI to check.
180 * @return true if URI is a seed.
181 */
182 protected boolean isSeed(Object o) {
183 return o instanceof CandidateURI && ((CandidateURI) o).isSeed();
184 }
185
186 /***
187 * @param a First UURI of compare.
188 * @param b Second UURI of compare.
189 * @return True if UURIs are of same host.
190 */
191 protected boolean isSameHost(UURI a, UURI b) {
192 boolean isSameHost = false;
193 if (a != null && b != null) {
194
195
196 try {
197 if (a.getReferencedHost() != null && b.getReferencedHost() != null) {
198 if (a.getReferencedHost().equals(b.getReferencedHost())) {
199 isSameHost = true;
200 }
201 }
202 }
203 catch (URIException e) {
204 logger.severe("Failed compare of " + a + " " + b + ": " +
205 e.getMessage());
206 }
207 }
208 return isSameHost;
209 }
210
211
212
213
214
215
216 public void listUsedFiles(List<String> list){
217
218 try {
219 File file = getSettingsHandler().getPathRelativeToWorkingDirectory(
220 (String)getAttribute(ATTR_SEEDS));
221 list.add(file.getAbsolutePath());
222 } catch (AttributeNotFoundException e) {
223
224 e.printStackTrace();
225 } catch (MBeanException e) {
226
227 e.printStackTrace();
228 } catch (ReflectionException e) {
229
230 e.printStackTrace();
231 }
232 }
233
234 /***
235 * Take note of a situation (such as settings edit) where
236 * involved reconfiguration (such as reading from external
237 * files) may be necessary.
238 */
239 public void kickUpdate() {
240
241
242
243 if (((Boolean) getUncheckedAttribute(null, ATTR_REREAD_SEEDS_ON_CONFIG))
244 .booleanValue()) {
245 refreshSeeds();
246 getSettingsHandler().getOrder().getController().getFrontier().loadSeeds();
247 }
248 }
249
250 /***
251 * Gets an iterator over all configured seeds. Subclasses
252 * which cache seeds in memory can override with more
253 * efficient implementation.
254 *
255 * @return Iterator, perhaps over a disk file, of seeds
256 */
257 public Iterator<UURI> seedsIterator() {
258 return seedsIterator(null);
259 }
260
261 /***
262 * Gets an iterator over all configured seeds. Subclasses
263 * which cache seeds in memory can override with more
264 * efficient implementation.
265 *
266 * @param ignoredItemWriter optional writer to get ignored seed items report
267 * @return Iterator, perhaps over a disk file, of seeds
268 */
269 public Iterator<UURI> seedsIterator(Writer ignoredItemWriter) {
270 BufferedReader br;
271 try {
272 br = new BufferedReader(new FileReader(getSeedfile()));
273 } catch (IOException e) {
274 throw new RuntimeException(e);
275 }
276 return new SeedFileIterator(br,ignoredItemWriter);
277 }
278
279 /***
280 * Convenience method to close SeedFileIterator, if appropriate.
281 *
282 * @param iter Iterator to check if SeedFileIterator needing closing
283 */
284 protected void checkClose(Iterator iter) {
285 if(iter instanceof SeedFileIterator) {
286 ((SeedFileIterator)iter).close();
287 }
288 }
289
290 /***
291 * Add a new seed to scope. By default, simply appends
292 * to seeds file, though subclasses may handle differently.
293 *
294 * <p>This method is *not* sufficient to get the new seed
295 * scheduled in the Frontier for crawling -- it only
296 * affects the Scope's seed record (and decisions which
297 * flow from seeds).
298 *
299 * @param curi CandidateUri to add
300 * @return true if successful, false if add failed for any reason
301 */
302 public boolean addSeed(final CandidateURI curi) {
303 File f = getSeedfile();
304 if (f != null) {
305 try {
306 FileWriter fw = new FileWriter(f, true);
307
308 fw.write("\n");
309 fw.write("# Heritrix added seed " +
310 ((curi.getVia() != null) ? "redirect from " + curi.getVia():
311 "(JMX)") + ".\n");
312 fw.write(curi.toString());
313 fw.flush();
314 fw.close();
315 Iterator iter = seedListeners.iterator();
316 while(iter.hasNext()) {
317 ((SeedListener)iter.next()).addedSeed(curi);
318 }
319 return true;
320 } catch (IOException e) {
321 DevUtils.warnHandle(e, "problem writing new seed");
322 }
323 }
324 return false;
325 }
326
327 public void addSeedListener(SeedListener sl) {
328 seedListeners.add(sl);
329 }
330 }