1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.crawler.filter;
24
25 import org.archive.crawler.datamodel.CrawlURI;
26 import org.archive.util.TextUtils;
27
28 /***
29 * Compares the content-type of the passed CrawlURI to a regular expression.
30 *
31 * @author Tom Emerson
32 * @version $Date: 2006-09-25 18:41:10 +0000 (Mon, 25 Sep 2006) $, $Revision: 4652 $
33 * @deprecated As of release 1.10.0. To be replaced by an equivalent
34 * {@link DecideRule}.
35 */
36 public class ContentTypeRegExpFilter extends URIRegExpFilter {
37
38 private static final long serialVersionUID = 206378978342655106L;
39
40 private static final String DESCRIPTION = "ContentType regexp filter" +
41 "*Deprecated* To be replaced by an equivalent DecideRule. " +
42 "Cannot be used until after fetcher processors. Only then is the" +
43 " Content-Type known. A good place for this filter is at" +
44 " the writer step processing. If the content-type is null," +
45 " 301s usually have no content-type, the filter returns true.";
46
47 /***
48 * @param name Filter name.
49 */
50 public ContentTypeRegExpFilter(String name) {
51 super
52 (name, DESCRIPTION, "");
53 }
54
55 public ContentTypeRegExpFilter(String name, String regexp) {
56 super(name, DESCRIPTION, regexp);
57 }
58
59 protected boolean innerAccepts(Object o) {
60
61 if (!(o instanceof CrawlURI)) {
62 return false;
63 }
64 String content_type = ((CrawlURI)o).getContentType();
65 String regexp = getRegexp(o);
66 return (regexp == null)? false:
67 (content_type == null)? true:
68 TextUtils.matches(getRegexp(o), content_type);
69 }
70 }