package org.archive.crawler.extractor;

import java.io.IOException;
import java.util.logging.Logger;
import java.util.regex.Matcher;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.framework.CrawlController;
import org.archive.io.ReplayCharSequence;
import org.archive.net.UURI;
import org.archive.util.DevUtils;
import org.archive.util.TextUtils;

/**
 * Processes JavaScript files for strings that are likely to be
 * crawlable URIs.
 *
 * @author gojomo
 */
public class ExtractorJS extends Extractor implements CoreAttributeConstants {

    private static final long serialVersionUID = -2231962381454717720L;

    private static Logger LOGGER =
        Logger.getLogger("org.archive.crawler.extractor.ExtractorJS");

    static final String AMP = "&";
    static final String ESCAPED_AMP = "&amp;";
    static final String WHITESPACE = "\\s";

    // Finds whitespace-free strings in JavaScript: the text between a
    // paired ' or " (possibly preceded by backslashes), captured as
    // group 2.
    static final String JAVASCRIPT_STRING_EXTRACTOR =
        "(\\\\{0,8}+(?:\"|\'))(\\S{0,"+UURI.MAX_URL_LENGTH+"}?)(?:\\1)";

    // Determines whether a string is likely a URI: no whitespace or '<' '>',
    // at least one internal dot or slash, beginning with a word character
    // or a (possibly dot-prefixed) slash, and ending with a word character
    // or slash.
    static final String STRING_URI_DETECTOR =
        "(?:\\w|[\\.]{0,2}/)[\\S&&[^<>]]*(?:\\.|/)[\\S&&[^<>]]*(?:\\w|/)";
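
    // A hypothetical snippet (not taken from any real crawl) illustrating
    // how the two patterns combine. Given JavaScript such as
    //
    //     var img = "/images/nav_" + i + ".gif";
    //
    // JAVASCRIPT_STRING_EXTRACTOR captures the quoted literals
    // "/images/nav_" and ".gif"; STRING_URI_DETECTOR then accepts
    // "/images/nav_" (leading slash plus an internal slash) as a
    // speculative link and rejects ".gif" (no slash and no leading
    // word character).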
71
72 protected long numberOfCURIsHandled = 0;
73 protected static long numberOfLinksExtracted = 0;
74
75 /***
76 * @param name
77 */
78 public ExtractorJS(String name) {
79 super(name, "JavaScript extractor. Link extraction on JavaScript" +
80 " files (.js).");
81 }
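
    /**
     * Determines whether the given CrawlURI looks like JavaScript content
     * and, if so, scans it for likely URIs.
     *
     * @param curi CrawlURI to process.
     */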
    public void extract(CrawlURI curi) {
        if (!isHttpTransactionContentToProcess(curi)) {
            return;
        }
        String contentType = curi.getContentType();
        if (contentType == null) {
            return;
        }

        // Process only if the content type looks like JavaScript, the URI
        // ends in ".js", or the link was found in a 'script' via-context.
        if ((contentType.indexOf("javascript") < 0) &&
            (contentType.indexOf("jscript") < 0) &&
            (contentType.indexOf("ecmascript") < 0) &&
            (!curi.toString().toLowerCase().endsWith(".js")) &&
            (curi.getViaContext() == null || !curi.getViaContext().
                toString().toLowerCase().startsWith("script"))) {
            return;
        }

        this.numberOfCURIsHandled++;

        ReplayCharSequence cs = null;
        try {
            cs = curi.getHttpRecorder().getReplayCharSequence();
        } catch (IOException e) {
            curi.addLocalizedError(this.getName(), e,
                "Failed get of replay char sequence.");
        }
        if (cs == null) {
            LOGGER.warning("Failed getting ReplayCharSequence: " +
                curi.toString());
            return;
        }

        try {
            try {
                numberOfLinksExtracted += considerStrings(curi, cs,
                    getController(), true);
            } catch (StackOverflowError e) {
                DevUtils.warnHandle(e, "ExtractorJS StackOverflowError");
            }
            // Set flag to indicate that link extraction is completed.
            curi.linkExtractorFinished();
        } finally {
            // Release the ReplayCharSequence and its underlying resources.
            if (cs != null) {
                try {
                    cs.close();
                } catch (IOException ioe) {
                    LOGGER.warning(TextUtils.exceptionToString(
                        "Failed close of ReplayCharSequence.", ioe));
                }
            }
        }
    }

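    /**
     * Scans the given character sequence for quoted strings that look like
     * URIs, adding each candidate as a speculative link on the CrawlURI.
     * Strings that do not themselves look like URIs are recursively
     * re-scanned for nested quoted strings.
     *
     * @param curi CrawlURI to add discovered links to.
     * @param cs character sequence to scan (JavaScript source or an inline
     *        script).
     * @param controller controller used to log URI errors; may be null.
     * @param handlingJSFile true if processing a standalone .js file, in
     *        which case links are resolved relative to the via URI rather
     *        than the base URI.
     * @return number of links extracted.
     */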
    public static long considerStrings(CrawlURI curi, CharSequence cs,
            CrawlController controller, boolean handlingJSFile) {
        long foundLinks = 0;
        Matcher strings =
            TextUtils.getMatcher(JAVASCRIPT_STRING_EXTRACTOR, cs);
        while (strings.find()) {
            CharSequence subsequence =
                cs.subSequence(strings.start(2), strings.end(2));
            Matcher uri =
                TextUtils.getMatcher(STRING_URI_DETECTOR, subsequence);
            if (uri.matches()) {
                String string = uri.group();
                string = TextUtils.replaceAll(ESCAPED_AMP, string, AMP);
                foundLinks++;
                try {
                    if (handlingJSFile) {
                        curi.createAndAddLinkRelativeToVia(string,
                            Link.JS_MISC, Link.SPECULATIVE_HOP);
                    } else {
                        curi.createAndAddLinkRelativeToBase(string,
                            Link.JS_MISC, Link.SPECULATIVE_HOP);
                    }
                } catch (URIException e) {
                    // There may not be a controller (e.g. when run from the
                    // standalone extractor tool), so fall back to the logger.
                    if (controller != null) {
                        controller.logUriError(e, curi.getUURI(), string);
                    } else {
                        LOGGER.info(curi + ", " + string + ": " +
                            e.getMessage());
                    }
                }
            } else {
                // Not itself a likely URI; scan it for nested strings.
                foundLinks += considerStrings(curi, subsequence,
                    controller, handlingJSFile);
            }
            TextUtils.recycleMatcher(uri);
        }
        TextUtils.recycleMatcher(strings);
        return foundLinks;
    }
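
    /**
     * Compiles a human-readable report of how many CrawlURIs this processor
     * handled and how many links it extracted.
     */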
    public String report() {
        StringBuffer ret = new StringBuffer();
        ret.append("Processor: org.archive.crawler.extractor.ExtractorJS\n");
        ret.append("  Function:          Link extraction on JavaScript code\n");
        ret.append("  CrawlURIs handled: " + numberOfCURIsHandled + "\n");
        ret.append("  Links extracted:   " + numberOfLinksExtracted + "\n\n");

        return ret.toString();
    }
}