1   /* Copyright (C) 2003 Internet Archive.
2    *
3    * This file is part of the Heritrix web crawler (crawler.archive.org).
4    *
5    * Heritrix is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU Lesser Public License as published by
7    * the Free Software Foundation; either version 2.1 of the License, or
8    * any later version.
9    *
10   * Heritrix is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU Lesser Public License for more details.
14   *
15   * You should have received a copy of the GNU Lesser Public License
16   * along with Heritrix; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   *
19   * CrawlScope.java
20   * Created on Oct 1, 2003
21   *
22   * $Header$
23   */
24  package org.archive.crawler.framework;
25  
26  import java.io.BufferedReader;
27  import java.io.File;
28  import java.io.FileReader;
29  import java.io.FileWriter;
30  import java.io.IOException;
31  import java.io.Writer;
32  import java.util.HashSet;
33  import java.util.Iterator;
34  import java.util.List;
35  import java.util.Set;
36  import java.util.logging.Logger;
37  
38  import javax.management.AttributeNotFoundException;
39  import javax.management.MBeanException;
40  import javax.management.ReflectionException;
41  
42  import org.apache.commons.httpclient.URIException;
43  import org.archive.crawler.datamodel.CandidateURI;
44  import org.archive.crawler.scope.SeedFileIterator;
45  import org.archive.crawler.scope.SeedListener;
46  import org.archive.crawler.settings.CrawlerSettings;
47  import org.archive.crawler.settings.SimpleType;
48  import org.archive.crawler.settings.Type;
49  import org.archive.net.UURI;
50  import org.archive.util.DevUtils;
51  
52  /***
53   * A CrawlScope instance defines which URIs are "in"
54   * a particular crawl.
55   *
56   * It is essentially a Filter which determines, looking at
57   * the totality of information available about a
58   * CandidateURI/CrawlURI instamce, if that URI should be
59   * scheduled for crawling.
60   *
61   * Dynamic information inherent in the discovery of the
62   * URI -- such as the path by which it was discovered --
63   * may be considered.
64   *
65   * Dynamic information which requires the consultation
66   * of external and potentially volatile information --
67   * such as current robots.txt requests and the history
68   * of attempts to crawl the same URI -- should NOT be
69   * considered. Those potentially high-latency decisions
70   * should be made at another step.
71   *
72   * @author gojomo
73   *
74   */
75  public class CrawlScope extends Filter {
76  
77      private static final long serialVersionUID = -3321533224526211277L;
78  
79      private static final Logger logger =
80          Logger.getLogger(CrawlScope.class.getName());
81      public static final String ATTR_NAME = "scope";
82      public static final String ATTR_SEEDS = "seedsfile";
83      
84      /***
85       * Whether every configu change should trigger a 
86       * rereading of the original seeds spec/file.
87       */
88      public static final String 
89          ATTR_REREAD_SEEDS_ON_CONFIG = "reread-seeds-on-config";
90      public static final Boolean
91          DEFAULT_REREAD_SEEDS_ON_CONFIG = Boolean.TRUE;
92      
93      protected Set<SeedListener> seedListeners = new HashSet<SeedListener>();
94  
95      /*** Constructs a new CrawlScope.
96       *
97       * @param name the name is ignored since it always have to be the value of
98       *        the constant ATT_NAME.
99       */
100     public CrawlScope(String name) {
101         // 'name' is never used.
102         super(ATTR_NAME, "Crawl scope");
103         Type t;
104         t = addElementToDefinition(new SimpleType(ATTR_SEEDS,
105                 "File from which to extract seeds.", "seeds.txt"));
106         t.setOverrideable(false);
107         t.setExpertSetting(true);
108         t = addElementToDefinition(new SimpleType(ATTR_REREAD_SEEDS_ON_CONFIG,
109                 "Whether to reread the seeds specification, whether it has " +
110                 "changed or not, every time any configuration change occurs. " +
111                 "If true, seeds are reread even when (for example) new " +
112                 "domain overrides are set. Rereading the seeds can take a " +
113                 "long time with large seed lists.", 
114                 DEFAULT_REREAD_SEEDS_ON_CONFIG));
115         t.setOverrideable(false);
116         t.setExpertSetting(true);
117 
118     }
119 
120     /*** Default constructor.
121      */
122     public CrawlScope() {
123         this(ATTR_NAME);
124     }
125 
126     /***
127      * Initialize is called just before the crawler starts to run.
128      *
129      * The settings system is up and initialized so can be used.  This
130      * initialize happens after {@link #earlyInitialize(CrawlerSettings)}.
131      *
132      * @param controller Controller object.
133      */
134     public void initialize(CrawlController controller) {
135         // by default do nothing (subclasses override)
136     }
137 
138     public String toString() {
139         return "CrawlScope<" + getName() + ">";
140     }
141 
142     /***
143      * Refresh seeds.
144      *
145      */
146     public void refreshSeeds() {
147         // by default do nothing (subclasses which cache should override)
148     }
149 
150     /***
151      * @return Seed list file or null if problem getting settings file.
152      */
153     public File getSeedfile() {
154         File file = null;
155         try {
156             file = getSettingsHandler().getPathRelativeToWorkingDirectory(
157                 (String)getAttribute(ATTR_SEEDS));
158             if (!file.exists() || !file.canRead()) {
159                 throw new IOException("Seeds file " +
160                     file.getAbsolutePath() + " does not exist or unreadable.");
161             }
162         } catch (IOException e) {
163             DevUtils.warnHandle(e, "problem reading seeds");
164         } catch (AttributeNotFoundException e) {
165             DevUtils.warnHandle(e, "problem reading seeds");
166         } catch (MBeanException e) {
167             DevUtils.warnHandle(e, "problem reading seeds");
168             e.printStackTrace();
169         } catch (ReflectionException e) {
170             DevUtils.warnHandle(e, "problem reading seeds");
171             e.printStackTrace();
172         }
173 
174         return file;
175     }
176 
177     /*** Check if a URI is in the seeds.
178      *
179      * @param o the URI to check.
180      * @return true if URI is a seed.
181      */
182     protected boolean isSeed(Object o) {
183         return o instanceof CandidateURI && ((CandidateURI) o).isSeed();
184     }
185 
186     /***
187      * @param a First UURI of compare.
188      * @param b Second UURI of compare.
189      * @return True if UURIs are of same host.
190      */
191     protected boolean isSameHost(UURI a, UURI b) {
192         boolean isSameHost = false;
193         if (a != null && b != null) {
194             // getHost can come back null.  See
195             // "[ 910120 ] java.net.URI#getHost fails when leading digit"
196             try {
197                 if (a.getReferencedHost() != null && b.getReferencedHost() != null) {
198                     if (a.getReferencedHost().equals(b.getReferencedHost())) {
199                         isSameHost = true;
200                     }
201                 }
202             }
203             catch (URIException e) {
204                 logger.severe("Failed compare of " + a + " " + b + ": " +
205                     e.getMessage());
206             }
207         }
208         return isSameHost;
209     }
210 
211 
212 
213     /* (non-Javadoc)
214      * @see org.archive.crawler.settings.ModuleType#listUsedFiles(java.util.List)
215      */
216     public void listUsedFiles(List<String> list){
217         // Add seed file
218         try {
219             File file = getSettingsHandler().getPathRelativeToWorkingDirectory(
220                     (String)getAttribute(ATTR_SEEDS));
221             list.add(file.getAbsolutePath());
222         } catch (AttributeNotFoundException e) {
223             // TODO Auto-generated catch block
224             e.printStackTrace();
225         } catch (MBeanException e) {
226             // TODO Auto-generated catch block
227             e.printStackTrace();
228         } catch (ReflectionException e) {
229             // TODO Auto-generated catch block
230             e.printStackTrace();
231         }
232     }
233 
234     /***
235      * Take note of a situation (such as settings edit) where
236      * involved reconfiguration (such as reading from external
237      * files) may be necessary.
238      */
239     public void kickUpdate() {
240         // TODO: further improve this so that case with hundreds of
241         // thousands or millions of seeds works better without requiring
242         // this specific settings check 
243         if (((Boolean) getUncheckedAttribute(null, ATTR_REREAD_SEEDS_ON_CONFIG))
244                 .booleanValue()) {
245             refreshSeeds();
246             getSettingsHandler().getOrder().getController().getFrontier().loadSeeds();
247         }
248     }
249 
250     /***
251      * Gets an iterator over all configured seeds. Subclasses
252      * which cache seeds in memory can override with more
253      * efficient implementation. 
254      *
255      * @return Iterator, perhaps over a disk file, of seeds
256      */
257     public Iterator<UURI> seedsIterator() {
258         return seedsIterator(null);
259     }
260     
261     /***
262      * Gets an iterator over all configured seeds. Subclasses
263      * which cache seeds in memory can override with more
264      * efficient implementation. 
265      *
266      * @param ignoredItemWriter optional writer to get ignored seed items report
267      * @return Iterator, perhaps over a disk file, of seeds
268      */
269     public Iterator<UURI> seedsIterator(Writer ignoredItemWriter) {
270         BufferedReader br;
271         try {
272             br = new BufferedReader(new FileReader(getSeedfile()));
273         } catch (IOException e) {
274             throw new RuntimeException(e);
275         }
276         return new SeedFileIterator(br,ignoredItemWriter);
277     }
278     
279     /***
280      * Convenience method to close SeedFileIterator, if appropriate.
281      * 
282      * @param iter Iterator to check if SeedFileIterator needing closing
283      */
284     protected void checkClose(Iterator iter) {
285         if(iter instanceof SeedFileIterator) {
286             ((SeedFileIterator)iter).close();
287         }
288     }
289     
290     /***
291      * Add a new seed to scope. By default, simply appends
292      * to seeds file, though subclasses may handle differently.
293      *
294      * <p>This method is *not* sufficient to get the new seed 
295      * scheduled in the Frontier for crawling -- it only 
296      * affects the Scope's seed record (and decisions which
297      * flow from seeds). 
298      *
299      * @param curi CandidateUri to add
300      * @return true if successful, false if add failed for any reason
301      */
302     public boolean addSeed(final CandidateURI curi) {
303         File f = getSeedfile();
304         if (f != null) {
305             try {
306                 FileWriter fw = new FileWriter(f, true);
307                 // Write to new (last) line the URL.
308                 fw.write("\n");
309                 fw.write("# Heritrix added seed " +
310                     ((curi.getVia() != null) ? "redirect from " + curi.getVia():
311                         "(JMX)") + ".\n");
312                 fw.write(curi.toString());
313                 fw.flush();
314                 fw.close();
315                 Iterator iter = seedListeners.iterator();
316                 while(iter.hasNext()) {
317                     ((SeedListener)iter.next()).addedSeed(curi);
318                 }
319                 return true;
320             } catch (IOException e) {
321                 DevUtils.warnHandle(e, "problem writing new seed");
322             }
323         }
324         return false; 
325     }
326     
327     public void addSeedListener(SeedListener sl) {
328         seedListeners.add(sl);
329     }
330 }