View Javadoc

1   /* SupplementaryLinksScoper
2    * 
3    * $Id: SupplementaryLinksScoper.java 4911 2007-02-18 19:55:55Z gojomo $
4    *
5    * Created on Oct 2, 2003
6    * 
7    * Copyright (C) 2003 Internet Archive.
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   *
25   */
26  package org.archive.crawler.postprocessor;
27  
28  import java.util.Collection;
29  import java.util.HashSet;
30  import java.util.logging.Level;
31  import java.util.logging.Logger;
32  
33  import javax.management.AttributeNotFoundException;
34  
35  import org.archive.crawler.datamodel.CandidateURI;
36  import org.archive.crawler.datamodel.CrawlURI;
37  import org.archive.crawler.deciderules.DecideRule;
38  import org.archive.crawler.deciderules.DecideRuleSequence;
39  import org.archive.crawler.framework.Filter;
40  import org.archive.crawler.framework.Scoper;
41  import org.archive.crawler.settings.MapType;
42  
43  /***
44   * Run CandidateURI links carried in the passed CrawlURI through a filter
45   * and 'handle' rejections.
46   * Used to do supplementary processing of links after they've been scope
47   * processed and ruled 'in-scope' by LinkScoper.  An example of
48   * 'supplementary processing' would check that a Link is intended for
49   * this host to crawl in a multimachine crawl setting. Configure filters to
50   * rule on links.  Default handler writes rejected URLs to disk.  Subclass
51   * to handle rejected URLs otherwise.
52   * @author stack
53   */
54  public class SupplementaryLinksScoper extends Scoper {
55  
56      private static final long serialVersionUID = -775819977752790418L;
57  
58      private static Logger LOGGER =
59          Logger.getLogger(SupplementaryLinksScoper.class.getName());
60      
61      public static final String ATTR_LINKS_DECIDE_RULES = "link-rules";
62  
63      /***
64       * @param name Name of this filter.
65       */
66      public SupplementaryLinksScoper(String name) {
67          super(name, "SupplementaryLinksScoper. Use to do supplementary " +
68              "processing of in-scope links.  Will run each link through " +
69              "configured filters.  Must be run after LinkScoper and " +
70              "before FrontierScheduler. " +
71              "Optionally logs rejected links (Enable " +
72              ATTR_OVERRIDE_LOGGER_ENABLED + " and set logger level " +
73              "at INFO or above).");
74          
75          addElementToDefinition(
76                  new DecideRuleSequence(ATTR_LINKS_DECIDE_RULES,
77                      "DecideRules which if their final decision on a link is " +
78                      "REJECT, cause the link to be ruled out-of-scope, even " +
79                      "if it had previously been accepted by the main scope."));
80      }
81  
82      protected void innerProcess(final CrawlURI curi) {
83          // If prerequisites or no links, nothing to be done in here.
84          if (curi.hasPrerequisiteUri() || curi.outlinksSize() <= 0) {
85              return;
86          }
87          
88          Collection<CandidateURI> inScopeLinks = new HashSet<CandidateURI>();
89          for (CandidateURI cauri: curi.getOutCandidates()) {
90              if (isInScope(cauri)) {
91                  inScopeLinks.add(cauri);
92              }
93          }
94          // Replace current links collection w/ inscopeLinks.  May be
95          // an empty collection.
96          curi.replaceOutlinks(inScopeLinks);
97      }
98      
99      protected boolean isInScope(CandidateURI caUri) {
100         // TODO: Fix filters so work on CandidateURI.
101         CrawlURI curi = (caUri instanceof CrawlURI)?
102             (CrawlURI)caUri:
103             new CrawlURI(caUri.getUURI());
104         boolean result = false;
105         if (rulesAccept(getLinkRules(curi), curi)) {
106             result = true;
107             if (LOGGER.isLoggable(Level.FINER)) {
108                 LOGGER.finer("Accepted: " + caUri);
109             }
110         } else {
111             outOfScope(caUri);
112         }
113         return result;
114     }
115     
116     protected DecideRule getLinkRules(Object o) {
117         try {
118             return (DecideRule)getAttribute(o, ATTR_LINKS_DECIDE_RULES);
119         } catch (AttributeNotFoundException e) {
120             throw new RuntimeException(e);
121         }
122     }
123     
124     /***
125      * Called when a CandidateUri is ruled out of scope.
126      * @param caUri CandidateURI that is out of scope.
127      */
128     protected void outOfScope(CandidateURI caUri) {
129         if (!LOGGER.isLoggable(Level.INFO)) {
130             return;
131         }
132         LOGGER.info(caUri.getUURI().toString());
133     }
134 }