1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 package org.archive.crawler.deciderules;
28
29 import java.util.logging.Logger;
30
31 import javax.management.AttributeNotFoundException;
32
33 import org.archive.crawler.settings.SimpleType;
34
35 /***
36 * Compares suffix of a passed CrawlURI, UURI, or String against a regular
37 * expression pattern, applying its configured decision to all matches.
38 *
39 * Several predefined patterns are available for convenience. Choosing
40 * 'custom' makes this the same as a regular MatchesRegExpDecideRule.
41 *
42 * @author Igor Ranitovic
43 */
44 public class MatchesFilePatternDecideRule extends MatchesRegExpDecideRule {
45
46 private static final long serialVersionUID = -4182743018517062411L;
47
48 private static final Logger logger =
49 Logger.getLogger(MatchesFilePatternDecideRule.class.getName());
50 public static final String ATTR_USE_PRESET = "use-preset-pattern";
51 public static final String IMAGES_PATTERNS =
52 ".*(?i)(//.(bmp|gif|jpe?g|png|tiff?))$";
53 public static final String AUDIO_PATTERNS =
54 ".*(?i)(//.(mid|mp2|mp3|mp4|wav))$";
55 public static final String VIDEO_PATTERNS =
56 ".*(?i)(//.(avi|mov|mpeg|ram|rm|smil|wmv))$";
57 public static final String MISC_PATTERNS =
58 ".*(?i)(//.(doc|pdf|ppt|swf))$";
59 public static final String ALL_DEFAULT_PATTERNS =
60 ".*(?i)(//.(bmp|gif|jpe?g|png|tiff?|mid|mp2|mp3|mp4|wav|avi|mov|mpeg" +
61 "|ram|rm|smil|wmv|doc|pdf|ppt|swf))$";
62
63 public static final String ALL = "All";
64 public static final String IMAGES = "Images";
65 public static final String AUDIO = "Audio";
66 public static final String VIDEO = "Video";
67 public static final String MISC = "Miscellaneous";
68 public static final String CUSTOM = "Custom";
69
70 /***
71 * Usual constructor.
72 * @param name
73 */
74 public MatchesFilePatternDecideRule(String name) {
75 super(name);
76 setDescription("MatchesFilePatternDecideRule. Applies its decision " + "to all URIs that end with the specified pattern(s). Anything " +
77 " that does not match is let PASS. " +
78 " Default file patterns are: .avi, .bmp, " +
79 ".doc, .gif, .jp(e)g, .mid, .mov, .mp2, .mp3, .mp4, .mpeg, " +
80 ".pdf, .png, .ppt, .ram, .rm,.smil, .swf, .tif(f), .wav, .wmv. " +
81 "It is also possible to specify a custom regular expression, " +
82 "in which case this behaves exactly like the " +
83 " MatchesRegExpDecideRule. See also " +
84 "NotMatchesFilePatternDecideRule.");
85
86 String[] options = new String[] {ALL, IMAGES, AUDIO, VIDEO, MISC,
87 CUSTOM};
88
89 addElementToDefinition(
90 new SimpleType(ATTR_USE_PRESET, "URIs that match selected file " +
91 "patterns will have the decision applied. Default file " +
92 "patterns are:\n" +
93 "Images: .bmp, .gif, .jp(e)g, .png, .tif(f)\n" +
94 "Audio: .mid, mp2, .mp3, .mp4, .wav\n" +
95 "Video: .avi, .mov, .mpeg, .ram, .rm, .smil, .wmv\n" +
96 "Miscellaneous: .doc, .pdf, .ppt, .swf\n" +
97 "All: All above patterns\n" +
98 "Choose 'Custom' to specify your own pattern. Preset " +
99 "patterns are case insensitive.",
100 "All", options));
101
102 addElementToDefinition(
103 new SimpleType(ATTR_REGEXP, "Custom java regular expression. " +
104 "This regular expression will be used instead of the " +
105 "supplied pattern groups for matching. An example " +
106 "of such a regular expression (Miscellaneous): " +
107 ".*(?i)(//.(doc|pdf|ppt|swf))$ " +
108 "Any arbitrary regular expression may be entered and " +
109 "will be applied to the URI.", ""));
110 }
111
112 /***
113 * Use a preset if configured to do so.
114 * @param o Context
115 * @return Regex to use.
116 *
117 * @see org.archive.crawler.filter.URIRegExpFilter#getRegexp(Object)
118 */
119 protected String getRegexp(Object o) {
120 try {
121 String patternType = (String) getAttribute(o, ATTR_USE_PRESET);
122 if (patternType.equals(ALL)) {
123 return ALL_DEFAULT_PATTERNS;
124 } else if (patternType.equals(IMAGES)) {
125 return IMAGES_PATTERNS;
126 } else if (patternType.equals(AUDIO)) {
127 return AUDIO_PATTERNS;
128 } else if (patternType.equals(VIDEO)) {
129 return VIDEO_PATTERNS;
130 } else if (patternType.equals(MISC)) {
131 return MISC_PATTERNS;
132 } else if (patternType.equals(CUSTOM)) {
133 return super.getRegexp(o);
134 } else {
135 assert false : "Unrecognized pattern type " + patternType
136 + ". Should never happen!";
137 }
138 } catch (AttributeNotFoundException e) {
139 logger.severe(e.getMessage());
140 }
141 return null;
142 }
143 }