1   /* BaseRule
2    * 
3    * Created on Oct 5, 2004
4    *
5    * Copyright (C) 2004 Internet Archive.
6    * 
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    * 
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   * 
14   * Heritrix is distributed in the hope that it will be useful, 
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   * 
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.crawler.url.canonicalize;
24  
25  import java.util.logging.Logger;
26  import java.util.regex.Matcher;
27  
28  import javax.management.AttributeNotFoundException;
29  
30  import org.archive.crawler.settings.ModuleType;
31  import org.archive.crawler.settings.SimpleType;
32  import org.archive.crawler.url.CanonicalizationRule;
33  
34  /***
35   * Base of all rules applied canonicalizing a URL that are configurable
36   * via the Heritrix settings system.
37   * 
38   * This base class is abstract.  Subclasses must implement the
39   * {@link CanonicalizationRule#canonicalize(String, Object)} method.
40   * 
41   * @author stack
42   * @version $Date: 2005-11-04 23:00:23 +0000 (Fri, 04 Nov 2005) $, $Revision: 3932 $
43   */
44  public abstract class BaseRule
45  extends ModuleType
46  implements CanonicalizationRule {
47      private static Logger logger =
48          Logger.getLogger(BaseRule.class.getName());
49      public static final String ATTR_ENABLED = "enabled";
50      
51      /***
52       * Constructor.
53       * @param name Name of this canonicalization rule.
54       * @param description Description of what this rule does.
55       */
56      public BaseRule(String name, String description) {
57          super(name, description);
58          setExpertSetting(true);
59          setOverrideable(true);
60          Object [] possibleValues = {Boolean.TRUE, Boolean.FALSE};
61          addElementToDefinition(new SimpleType(ATTR_ENABLED,
62              "Rule is enabled.", new Boolean(true), possibleValues));
63      }
64      
65      public boolean isEnabled(Object context) {
66          boolean result = true;
67          try {
68              Boolean b = (Boolean)getAttribute(context, ATTR_ENABLED);
69              if (b != null) {
70                  result = b.booleanValue();
71              }
72          } catch (AttributeNotFoundException e) {
73              logger.warning("Failed get of 'enabled' attribute.");
74          }
75  
76          return result;
77      }
78      
79      /***
80       * Run a regex that strips elements of a string.
81       * 
82       * Assumes the regex has a form that wants to strip elements of the passed
83       * string.  Assumes that if a match, appending group 1
84       * and group 2 yields desired result.
85       * @param url Url to search in.
86       * @param matcher Matcher whose form yields a group 1 and group 2 if a
87       * match (non-null.
88       * @return Original <code>url</code> else concatenization of group 1
89       * and group 2.
90       */
91      protected String doStripRegexMatch(String url, Matcher matcher) {
92          return (matcher != null && matcher.matches())?
93              checkForNull(matcher.group(1)) + checkForNull(matcher.group(2)):
94              url;
95      }
96  
97      /***
98       * @param string String to check.
99       * @return <code>string</code> if non-null, else empty string ("").
100      */
101     private String checkForNull(String string) {
102         return (string != null)? string: "";
103     }
104 }