1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 package org.archive.extractor;
28
29 import java.util.regex.Matcher;
30
31 import org.apache.commons.httpclient.URIException;
32 import org.archive.crawler.extractor.Link;
33 import org.archive.net.UURIFactory;
34 import org.archive.util.DevUtils;
35 import org.archive.util.TextUtils;
36
/**
 * This extractor parses URIs from CSS files.
 * The format of a CSS URL value is 'url(' followed by optional white space
 * followed by an optional single quote (') or double quote (") character
 * followed by the URL itself followed by an optional single quote (') or
 * double quote (") character followed by optional white space followed by ')'.
 * Parentheses, commas, white space characters, single quotes (') and double
 * quotes (") appearing in a URL must be escaped with a backslash, for example:
 * '\(', '\)', '\,'. Partial URLs are interpreted relative to the source of
 * the style sheet, not relative to the document.
 * <a href="http://www.w3.org/TR/REC-CSS1#url">Source: www.w3.org</a>
 *
 * ROUGH DRAFT IN PROGRESS / incomplete... untested... major changes likely
 *
 * @author igor gojomo
 */
54
55 public class RegexpCSSLinkExtractor extends CharSequenceLinkExtractor {
56
57
58
59
60 private static String ESCAPED_AMP = "&";
61
62
63
64 static final String CSS_BACKSLASH_ESCAPE = "////([,'\"//(//)//s])";
65
66 protected Matcher uris;
67
68 /***
69 * CSS URL extractor pattern.
70 *
71 * This pattern extracts URIs for CSS files
72 **/
73 static final String CSS_URI_EXTRACTOR =
74 "(?:@import (?:url[(]|)|url[(])//s*([//\"\']?)([^//\"\'].*?)//1//s*[);]";
75
76 protected boolean findNextLink() {
77 if (uris == null) {
78 uris = TextUtils.getMatcher(CSS_URI_EXTRACTOR, sourceContent);
79
80
81 }
82 String cssUri;
83 try {
84 while (uris.find()) {
85 cssUri = uris.group(2);
86
87 cssUri = TextUtils.replaceAll(ESCAPED_AMP, cssUri, "&");
88
89 cssUri = TextUtils.replaceAll(CSS_BACKSLASH_ESCAPE, cssUri, "$1");
90
91 try {
92 Link link = new Link(source, UURIFactory.getInstance(base,
93 cssUri), Link.EMBED_MISC, Link.EMBED_HOP);
94 next.addLast(link);
95 } catch (URIException e) {
96 extractErrorListener.noteExtractError(e, source, cssUri);
97 }
98 return true;
99 }
100 } catch (StackOverflowError e) {
101 DevUtils.warnHandle(e, "RegexpCSSLinkExtractor StackOverflowError");
102 }
103 return false;
104 }
105
106 public void reset() {
107 super.reset();
108 TextUtils.recycleMatcher(uris);
109 uris = null;
110 }
111
112 protected static CharSequenceLinkExtractor newDefaultInstance() {
113 return new RegexpCSSLinkExtractor();
114 }
115 }