1   /*
2    * ExtractorCSS
3    *
4    * $Id: RegexpCSSLinkExtractor.java 4646 2006-09-22 17:23:04Z paul_jack $
5    *
6    * Created on Mar 29, 2005
7    *
8    * Copyright (C) 2005 Internet Archive.
9    *
10   * This file is part of the Heritrix web crawler (crawler.archive.org).
11   *
12   * Heritrix is free software; you can redistribute it and/or modify
13   * it under the terms of the GNU Lesser Public License as published by
14   * the Free Software Foundation; either version 2.1 of the License, or
15   * any later version.
16   *
17   * Heritrix is distributed in the hope that it will be useful,
18   * but WITHOUT ANY WARRANTY; without even the implied warranty of
19   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20   * GNU Lesser Public License for more details.
21   *
22   * You should have received a copy of the GNU Lesser Public License
23   * along with Heritrix; if not, write to the Free Software
24   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
25   */
26  
27  package org.archive.extractor;
28  
29  import java.util.regex.Matcher;
30  
31  import org.apache.commons.httpclient.URIException;
32  import org.archive.crawler.extractor.Link;
33  import org.archive.net.UURIFactory;
34  import org.archive.util.DevUtils;
35  import org.archive.util.TextUtils;
36  
37  /***
38   * This extractor is parsing URIs from CSS type files.
39   * The format of a CSS URL value is 'url(' followed by optional white space
40   * followed by an optional single quote (') or double quote (") character
41   * followed by the URL itself followed by an optional single quote (') or
42   * double quote (") character followed by optional white space followed by ')'.
43   * Parentheses, commas, white space characters, single quotes (') and double
44   * quotes (") appearing in a URL must be escaped with a backslash:
45   * '\(', '\)', '\,'. Partial URLs are interpreted relative to the source of
46   * the style sheet, not relative to the document. <a href="http://www.w3.org/TR/REC-CSS1#url">
47   * Source: www.w3.org</a>
48   *
49   * ROUGH DRAFT IN PROGRESS / incomplete... untested... major changes likely
50   *
51   * @author igor gojomo
52   *
53   **/
54  
55  public class RegexpCSSLinkExtractor extends CharSequenceLinkExtractor {
56  
57      // private static Logger logger =
58      //    Logger.getLogger(RegexpCSSLinkExtractor.class.getName());
59  
60      private static String ESCAPED_AMP = "&amp";
61      // CSS escapes: "Parentheses, commas, whitespace characters, single
62      // quotes (') and double quotes (") appearing in a URL must be
63      // escaped with a backslash"
64      static final String CSS_BACKSLASH_ESCAPE = "////([,'\"//(//)//s])";
65  
66      protected Matcher uris;
67  
68      /***
69       *  CSS URL extractor pattern.
70       *
71       *  This pattern extracts URIs for CSS files
72       **/
73      static final String CSS_URI_EXTRACTOR =
74      "(?:@import (?:url[(]|)|url[(])//s*([//\"\']?)([^//\"\'].*?)//1//s*[);]";
75  
76      protected boolean findNextLink() {
77          if (uris == null) {
78              uris = TextUtils.getMatcher(CSS_URI_EXTRACTOR, sourceContent);
79              // NOTE: this matcher can't be recycled in this method because
80              // it is reused on rentry
81          }
82          String cssUri;
83          try {
84              while (uris.find()) {
85                  cssUri = uris.group(2);
86                  // TODO: Escape more HTML Entities.
87                  cssUri = TextUtils.replaceAll(ESCAPED_AMP, cssUri, "&");
88                  // Remove backslashes when used as escape character in CSS URL
89                  cssUri = TextUtils.replaceAll(CSS_BACKSLASH_ESCAPE, cssUri, "$1");
90                  // TODO: handle relative URIs?
91                  try {
92                      Link link = new Link(source, UURIFactory.getInstance(base,
93                              cssUri), Link.EMBED_MISC, Link.EMBED_HOP);
94                      next.addLast(link);
95                  } catch (URIException e) {
96                      extractErrorListener.noteExtractError(e, source, cssUri);
97                  }
98                  return true;
99              }
100         } catch (StackOverflowError e) {
101             DevUtils.warnHandle(e, "RegexpCSSLinkExtractor StackOverflowError");
102         }
103         return false;
104     }
105 
106     public void reset() {
107         super.reset();
108         TextUtils.recycleMatcher(uris);
109         uris = null;
110     }
111     
112     protected static CharSequenceLinkExtractor newDefaultInstance() {
113         return new RegexpCSSLinkExtractor();
114     }
115 }