View Javadoc

1   /* Scoper
2    * 
3    * Created on Jun 6, 2005
4    *
5    * Copyright (C) 2005 Internet Archive.
6    * 
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    * 
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   * 
14   * Heritrix is distributed in the hope that it will be useful, 
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   * 
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.crawler.framework;
24  
25  import java.util.logging.Level;
26  import java.util.logging.Logger;
27  
28  import javax.management.AttributeNotFoundException;
29  
30  import org.archive.crawler.datamodel.CandidateURI;
31  import org.archive.crawler.settings.SimpleType;
32  import org.archive.crawler.settings.Type;
33  import org.archive.crawler.util.LogUtils;
34  
35  /***
36   * Base class for Scopers.
37   * Scopers test CandidateURIs against a scope.
38   * Scopers allow logging of rejected CandidateURIs.
39   * @author stack
40   * @version $Date: 2006-09-25 23:59:43 +0000 (Mon, 25 Sep 2006) $, $Revision: 4664 $
41   */
42  public abstract class Scoper extends Processor {
43      private static Logger LOGGER =
44          Logger.getLogger(Scoper.class.getName());
45      
46      /***
47       * Protected so avaiilable to subclasses.
48       */
49      protected static final String ATTR_OVERRIDE_LOGGER_ENABLED =
50          "override-logger";
51      
52      /***
53       * Constructor.
54       * @param name
55       * @param description
56       */
57      public Scoper(String name, String description) {
58          super(name, description);
59          Type t = addElementToDefinition(
60              new SimpleType(ATTR_OVERRIDE_LOGGER_ENABLED,
61              "If enabled, override default logger for this class (Default " +
62              "logger writes the console).  Override " +
63              "logger will instead send all logging to a file named for this " +
64              "class in the job log directory. Set the logging level and " +
65              "other " +
66              "characteristics of the override logger such as rotation size, " +
67              "suffix pattern, etc. in heritrix.properties. This attribute " +
68              "is only checked once, on startup of a job.",
69              new Boolean(false)));
70          t.setExpertSetting(true);
71      }
72      
73      protected void initialTasks() {
74          super.initialTasks();
75          if (!isOverrideLogger(null)) {
76              return;
77          }
78          // Set up logger for this instance.  May have special directives
79          // since this class can log scope-rejected URLs.
80          LogUtils.createFileLogger(getController().getLogsDir(),
81              this.getClass().getName(),
82              Logger.getLogger(this.getClass().getName()));
83      }
84      
85      /***
86       * @param context Context to use looking up attribute.
87       * @return True if we are to override default logger (default logs
88       * to console) with a logger that writes all loggings to a file
89       * named for this class.
90       */
91      protected boolean isOverrideLogger(Object context) {
92          boolean result = true;
93          try {
94              Boolean b = (Boolean)getAttribute(context,
95                  ATTR_OVERRIDE_LOGGER_ENABLED);
96              if (b != null) {
97                  result = b.booleanValue();
98              }
99          } catch (AttributeNotFoundException e) {
100             LOGGER.warning("Failed get of 'enabled' attribute.");
101         }
102 
103         return result;
104     }
105     
106     /***
107      * Schedule the given {@link CandidateURI CandidateURI} with the Frontier.
108      * @param caUri The CandidateURI to be scheduled.
109      * @return true if CandidateURI was accepted by crawl scope, false
110      * otherwise.
111      */
112     protected boolean isInScope(CandidateURI caUri) {
113         boolean result = false;
114         if (getController().getScope().accepts(caUri)) {
115             result = true;
116             if (LOGGER.isLoggable(Level.FINER)) {
117                 LOGGER.finer("Accepted: " + caUri);
118             }
119         } else {
120             outOfScope(caUri);
121         }
122         return result;
123     }
124     
125     /***
126      * Called when a CandidateUri is ruled out of scope.
127      * Override if you don't want logs as coming from this class.
128      * @param caUri CandidateURI that is out of scope.
129      */
130     protected void outOfScope(CandidateURI caUri) {
131         if (!LOGGER.isLoggable(Level.INFO)) {
132             return;
133         }
134         LOGGER.info(caUri.getUURI().toString());
135     }
136 }