1   /* LinksScoper
2    * 
3    * $Id: LinksScoper.java 4911 2007-02-18 19:55:55Z gojomo $
4    *
5    * Created on Oct 2, 2003
6    * 
7    * Copyright (C) 2003 Internet Archive.
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   *
25   */
26  package org.archive.crawler.postprocessor;
27  
28  import java.util.Collection;
29  import java.util.HashSet;
30  import java.util.Iterator;
31  import java.util.logging.Level;
32  import java.util.logging.Logger;
33  
34  import javax.management.AttributeNotFoundException;
35  
36  import org.apache.commons.httpclient.URIException;
37  import org.archive.crawler.datamodel.CandidateURI;
38  import org.archive.crawler.datamodel.CrawlURI;
39  import org.archive.crawler.datamodel.FetchStatusCodes;
40  import org.archive.crawler.deciderules.DecideRule;
41  import org.archive.crawler.deciderules.DecideRuleSequence;
42  import org.archive.crawler.extractor.Link;
43  import org.archive.crawler.framework.Filter;
44  import org.archive.crawler.framework.Scoper;
45  import org.archive.crawler.settings.MapType;
46  import org.archive.crawler.settings.SimpleType;
47  import org.archive.crawler.settings.Type;
48  
49  /***
50   * Determine which extracted links are within scope.
51   * TODO: To test scope, requires that Link be converted to
52   * a CandidateURI.  Make it so don't have to make a CandidateURI to test
53   * if Link is in scope.
54   * <p>Since this scoper has to create CandidateURIs, no sense
55   * discarding them since later in the processing chain CandidateURIs rather
56   * than Links are whats needed scheduling extracted links w/ the
57   * Frontier (Frontier#schedule expects CandidateURI, not Link).  This class
58   * replaces Links w/ the CandidateURI that wraps the Link in the CrawlURI.
59   *
60   * @author gojomo
61   * @author stack
62   */
63  public class LinksScoper extends Scoper
64  implements FetchStatusCodes {
65  
66      private static final long serialVersionUID = -4074442117992496793L;
67  
68      private static Logger LOGGER =
69          Logger.getLogger(LinksScoper.class.getName());
70  
71      private final static String ATTR_SEED_REDIRECTS_NEW_SEEDS =
72          "seed-redirects-new-seed";
73      
74      private final static Boolean DEFAULT_SEED_REDIRECTS_NEW_SEEDS =
75          new Boolean(true);
76      
77      public static final String ATTR_REJECTLOG_DECIDE_RULES =
78          "scope-rejected-url-rules";
79      
80      public static final String ATTR_PREFERENCE_DEPTH_HOPS =
81          "preference-depth-hops";
82  
83      private final static Integer DEFAULT_PREFERENCE_DEPTH_HOPS =
84          new Integer(-1);
85      
86      /***
87       * Instance of rejected uris log filters.
88       */
89      private MapType rejectLogFilters = null;
90      
91      /***
92       * @param name Name of this filter.
93       */
94      public LinksScoper(String name) {
95          super(name, "LinksScoper. Rules on which extracted links " +
96              "are within configured scope.");
97          
98          Type t;
99          t = addElementToDefinition(
100             new SimpleType(ATTR_SEED_REDIRECTS_NEW_SEEDS,
101             "If enabled, any URL found because a seed redirected to it " +
102             "(original seed returned 301 or 302), will also be treated " +
103             "as a seed.", DEFAULT_SEED_REDIRECTS_NEW_SEEDS));
104         t.setExpertSetting(true);
105 
106         t = addElementToDefinition(new SimpleType(ATTR_PREFERENCE_DEPTH_HOPS,
107             "Number of hops (of any sort) from a seed up to which a URI has higher " +
108         "priority scheduling than any remaining seed. For example, if set to 1 items one " + 
109         "hop (link, embed, redirect, etc.) away from a seed will be scheduled " + 
110         "with HIGH priority. If set to -1, no " + 
111         "preferencing will occur, and a breadth-first search with seeds " + 
112         "processed before discovered links will proceed. If set to zero, a " + 
113         "purely depth-first search will proceed, with all discovered links processed " + 
114         "before remaining seeds.  Seed redirects are treated as one hop from a seed.",
115         DEFAULT_PREFERENCE_DEPTH_HOPS));
116         t.setExpertSetting(true);
117         
118         addElementToDefinition(
119             new DecideRuleSequence(ATTR_REJECTLOG_DECIDE_RULES,
120                 "DecideRules which, if their final decision on a link is " +
121                 "not REJECT, cause the otherwise scope-rejected links to " +
122                 "be logged"));
123 
124     }
125 
126     protected void innerProcess(final CrawlURI curi) {
127         if (LOGGER.isLoggable(Level.FINEST)) {
128             LOGGER.finest(getName() + " processing " + curi);
129         }
130         
131         // If prerequisites, nothing to be done in here.
132         if (curi.hasPrerequisiteUri()) {
133             handlePrerequisite(curi);
134             return;
135         }
136         
137         // Don't extract links of error pages.
138         if (curi.getFetchStatus() < 200 || curi.getFetchStatus() >= 400) {
139             curi.clearOutlinks();
140             return;
141         }
142         
143         if (curi.outlinksSize() <= 0) {
144             // No outlinks to process.
145             return;
146         }
147 
148         final boolean redirectsNewSeeds = ((Boolean)getUncheckedAttribute(curi,
149             ATTR_SEED_REDIRECTS_NEW_SEEDS)).booleanValue();
150         int preferenceDepthHops = ((Integer)getUncheckedAttribute(curi,
151             ATTR_PREFERENCE_DEPTH_HOPS)).intValue();
152         Collection<CandidateURI> inScopeLinks = new HashSet<CandidateURI>();
153         for (final Iterator i = curi.getOutObjects().iterator(); i.hasNext();) {
154             Object o = i.next();
155             if(o instanceof Link){
156                 final Link wref = (Link)o;
157                 try {
158                     final int directive = getSchedulingFor(curi, wref, 
159                         preferenceDepthHops);
160                     final CandidateURI caURI =
161                         curi.createCandidateURI(curi.getBaseURI(), wref, 
162                             directive, 
163                             considerAsSeed(curi, wref, redirectsNewSeeds));
164                     if (isInScope(caURI)) {
165                         inScopeLinks.add(caURI);
166                     }
167                 } catch (URIException e) {
168                     getController().logUriError(e, curi.getUURI(), 
169                         wref.getDestination().toString());
170                 }
171             } else if(o instanceof CandidateURI){
172                 CandidateURI caURI = (CandidateURI)o;
173                 if(isInScope(caURI)){
174                     inScopeLinks.add(caURI);
175                 }
176             } else {
177                 LOGGER.severe("Unexpected type: " + o);
178             }
179         }
180         // Replace current links collection w/ inscopeLinks.  May be
181         // an empty collection.
182         curi.replaceOutlinks(inScopeLinks);
183     }
184     
185     /***
186      * The CrawlURI has a prerequisite; apply scoping and update
187      * Link to CandidateURI in manner analogous to outlink handling. 
188      * @param curi CrawlURI with prereq to consider
189      */
190     protected void handlePrerequisite(CrawlURI curi) {
191         try {
192             // Create prerequisite CandidateURI
193             CandidateURI caUri =
194                 curi.createCandidateURI(curi.getBaseURI(),
195                     (Link) curi.getPrerequisiteUri());
196             int prereqPriority = curi.getSchedulingDirective() - 1;
197             if (prereqPriority < 0) {
198                 prereqPriority = 0;
199                 LOGGER.severe("Unable to promote prerequisite " + caUri +
200                     " above " + curi);
201             }
202             caUri.setSchedulingDirective(prereqPriority);
203             caUri.setForceFetch(true);
204             if(isInScope(caUri)) {
205                 // replace link with CandidateURI
206                 curi.setPrerequisiteUri(caUri);
207             } else {
208                 // prerequisite is out-of-scope; mark CrawlURI as error,
209                 // preventinting normal S_DEFERRED handling
210                 curi.setFetchStatus(S_PREREQUISITE_UNSCHEDULABLE_FAILURE);
211             }
212        } catch (URIException ex) {
213             Object[] array = {curi, curi.getPrerequisiteUri()};
214             getController().uriErrors.log(Level.INFO,ex.getMessage(), array);
215         } catch (NumberFormatException e) {
216             // UURI.createUURI will occasionally throw this error.
217             Object[] array = {curi, curi.getPrerequisiteUri()};
218             getController().uriErrors.log(Level.INFO,e.getMessage(), array);
219         }
220     }
221 
222     protected void outOfScope(CandidateURI caUri) {
223         super.outOfScope(caUri);
224         if (!LOGGER.isLoggable(Level.INFO)) {
225             return;
226         }
227         // TODO: Fix filters so work on CandidateURI.
228         CrawlURI curi = (caUri instanceof CrawlURI)?
229             (CrawlURI)caUri:
230             new CrawlURI(caUri.getUURI());
231         if (rulesAccept(getRejectLogRules(curi), curi)) {
232             LOGGER.info(curi.getUURI().toString());
233         }
234     }
235     
236     protected DecideRule getRejectLogRules(Object o) {
237         try {
238             return (DecideRule)getAttribute(o, ATTR_REJECTLOG_DECIDE_RULES);
239         } catch (AttributeNotFoundException e) {
240             throw new RuntimeException(e);
241         }
242     }
243     
244     private boolean considerAsSeed(final CrawlURI curi, final Link wref,
245             final boolean redirectsNewSeeds) {
246         // Check if this is a seed with a 301 or 302.
247         if (curi.isSeed()
248                 && (curi.getFetchStatus() == 301 ||
249                     curi.getFetchStatus() == 302)
250                 && wref.getHopType() == Link.REFER_HOP) {
251             // Check if redirects from seeds should be treated as seeds.
252             if (redirectsNewSeeds) {
253                 return true;
254             }
255         }
256         return false;
257     }
258     
259     /***
260      * Determine scheduling for the  <code>curi</code>.
261      * As with the LinksScoper in general, this only handles extracted links,
262      * seeds do not pass through here, but are given MEDIUM priority.  
263      * Imports into the frontier similarly do not pass through here, 
264      * but are given NORMAL priority.
265      */
266     protected int getSchedulingFor(final CrawlURI curi, final Link wref,
267             final int preferenceDepthHops) {
268         final char c = wref.getHopType();
269         if (LOGGER.isLoggable(Level.FINEST)) {
270             LOGGER.finest(curi + " with path=" + curi.getPathFromSeed() +
271                 " isSeed=" + curi.isSeed() + " with fetchStatus=" +
272                 curi.getFetchStatus() + " -> " + wref.getDestination() +
273                 " type " + c + " with context=" + wref.getContext());
274         }
275 
276         switch (c) {
277             case Link.REFER_HOP:
278                 // Treat redirects somewhat urgently
279                 // This also ensures seed redirects remain seed priority
280                 return (preferenceDepthHops >= 0 ? CandidateURI.HIGH :
281                     CandidateURI.MEDIUM);
282             default:
283                 if (preferenceDepthHops == 0)
284                     return CandidateURI.HIGH;
285                     // this implies seed redirects are treated as path
286                     // length 1, which I belive is standard.
287                     // curi.getPathFromSeed() can never be null here, because
288                     // we're processing a link extracted from curi
289                 if (preferenceDepthHops > 0 && 
290                     curi.getPathFromSeed().length() + 1 <= preferenceDepthHops)
291                     return CandidateURI.HIGH;
292                 // Everything else normal (at least for now)
293                 return CandidateURI.NORMAL;
294         }
295     }
296 }