1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.crawler.url.canonicalize;
24
25 import java.util.logging.Logger;
26 import java.util.regex.Matcher;
27
28 import javax.management.AttributeNotFoundException;
29
30 import org.archive.crawler.settings.ModuleType;
31 import org.archive.crawler.settings.SimpleType;
32 import org.archive.crawler.url.CanonicalizationRule;
33
34 /***
35 * Base of all rules applied canonicalizing a URL that are configurable
36 * via the Heritrix settings system.
37 *
38 * This base class is abstact. Subclasses must implement the
39 * {@link CanonicalizationRule#canonicalize(String, Object)} method.
40 *
41 * @author stack
42 * @version $Date: 2005-11-04 23:00:23 +0000 (Fri, 04 Nov 2005) $, $Revision: 3932 $
43 */
44 public abstract class BaseRule
45 extends ModuleType
46 implements CanonicalizationRule {
47 private static Logger logger =
48 Logger.getLogger(BaseRule.class.getName());
49 public static final String ATTR_ENABLED = "enabled";
50
51 /***
52 * Constructor.
53 * @param name Name of this canonicalization rule.
54 * @param description Description of what this rule does.
55 */
56 public BaseRule(String name, String description) {
57 super(name, description);
58 setExpertSetting(true);
59 setOverrideable(true);
60 Object [] possibleValues = {Boolean.TRUE, Boolean.FALSE};
61 addElementToDefinition(new SimpleType(ATTR_ENABLED,
62 "Rule is enabled.", new Boolean(true), possibleValues));
63 }
64
65 public boolean isEnabled(Object context) {
66 boolean result = true;
67 try {
68 Boolean b = (Boolean)getAttribute(context, ATTR_ENABLED);
69 if (b != null) {
70 result = b.booleanValue();
71 }
72 } catch (AttributeNotFoundException e) {
73 logger.warning("Failed get of 'enabled' attribute.");
74 }
75
76 return result;
77 }
78
79 /***
80 * Run a regex that strips elements of a string.
81 *
82 * Assumes the regex has a form that wants to strip elements of the passed
83 * string. Assumes that if a match, appending group 1
84 * and group 2 yields desired result.
85 * @param url Url to search in.
86 * @param matcher Matcher whose form yields a group 1 and group 2 if a
87 * match (non-null.
88 * @return Original <code>url</code> else concatenization of group 1
89 * and group 2.
90 */
91 protected String doStripRegexMatch(String url, Matcher matcher) {
92 return (matcher != null && matcher.matches())?
93 checkForNull(matcher.group(1)) + checkForNull(matcher.group(2)):
94 url;
95 }
96
97 /***
98 * @param string String to check.
99 * @return <code>string</code> if non-null, else empty string ("").
100 */
101 private String checkForNull(String string) {
102 return (string != null)? string: "";
103 }
104 }