1   /* ContentTypeRegExpFilter.java
2    *
3    * Created on Sep 13, 2004
4    *
5    * Copyright (C) 2004 Tom Emerson.
6    *
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    *
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   *
14   * Heritrix is distributed in the hope that it will be useful,
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   *
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.crawler.filter;
24  
25  import org.archive.crawler.datamodel.CrawlURI;
26  import org.archive.util.TextUtils;
27  
28  /***
29   * Compares the content-type of the passed CrawlURI to a regular expression.
30   *
31   * @author Tom Emerson
32   * @version $Date: 2006-09-25 18:41:10 +0000 (Mon, 25 Sep 2006) $, $Revision: 4652 $
33   * @deprecated As of release 1.10.0.  To be replaced by an equivalent
34   * {@link DecideRule}.
35   */
36  public class ContentTypeRegExpFilter extends URIRegExpFilter {
37  
38      private static final long serialVersionUID = 206378978342655106L;
39  
40      private static final String DESCRIPTION = "ContentType regexp filter" +
41      		"*Deprecated* To be replaced by an equivalent DecideRule. " +
42          "Cannot be used until after fetcher processors. Only then is the" +
43          " Content-Type known. A good place for this filter is at" +
44          " the writer step processing.  If the content-type is null," +
45          " 301s usually have no content-type, the filter returns true.";
46  
47      /***
48       * @param name Filter name.
49       */
50      public ContentTypeRegExpFilter(String name) {
51          super
52          (name, DESCRIPTION, "");
53      }
54  
55      public ContentTypeRegExpFilter(String name, String regexp) {
56          super(name, DESCRIPTION, regexp);
57      }
58      
59      protected boolean innerAccepts(Object o) {
60          // FIXME: can o ever be anything but a CrawlURI?
61          if (!(o instanceof CrawlURI)) {
62              return false;
63          }
64          String content_type = ((CrawlURI)o).getContentType();
65          String regexp = getRegexp(o);
66          return (regexp == null)? false:
67              (content_type == null)? true:
68                  TextUtils.matches(getRegexp(o), content_type);
69      }
70  }