1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 package org.archive.extractor;
25
26 import java.util.LinkedList;
27 import java.util.regex.Matcher;
28 import java.util.regex.Pattern;
29
30 import org.apache.commons.httpclient.URIException;
31 import org.archive.crawler.extractor.Link;
32 import org.archive.net.UURI;
33 import org.archive.net.UURIFactory;
34 import org.archive.util.TextUtils;
35
36 /***
37 * Uses regular expressions to find likely URIs inside Javascript.
38 *
39 * ROUGH DRAFT IN PROGRESS / incomplete... untested...
40 *
41 * @author gojomo
42 */
43 public class RegexpJSLinkExtractor extends CharSequenceLinkExtractor {
44
45
46
47 static final String AMP = "&";
48 static final String ESCAPED_AMP = "&";
49 static final String WHITESPACE = "//s";
50
51
52
53
54 static final Pattern JAVASCRIPT_STRING_EXTRACTOR = Pattern.compile(
55 "(////{0,8}+(?:\"|\'))(.+?)(?://1)");
56
57
58
59
60 static final Pattern STRING_URI_DETECTOR = Pattern.compile(
61 "(?://w|[//.]{0,2}/)[//S&&[^<>]]*(?://.|/)[//S&&[^<>]]*(?://w|/)");
62
63 Matcher strings;
64 LinkedList<Matcher> matcherStack = new LinkedList<Matcher>();
65
66 protected boolean findNextLink() {
67 if(strings==null) {
68 strings = JAVASCRIPT_STRING_EXTRACTOR.matcher(sourceContent);
69 }
70 while(strings!=null) {
71 while(strings.find()) {
72 CharSequence subsequence =
73 sourceContent.subSequence(strings.start(2), strings.end(2));
74 Matcher uri = STRING_URI_DETECTOR.matcher(subsequence);
75 if ((subsequence.length() <= UURI.MAX_URL_LENGTH) && uri.matches()) {
76 String string = uri.group();
77 string = TextUtils.replaceAll(ESCAPED_AMP, string, AMP);
78 try {
79 Link link = new Link(source, UURIFactory.getInstance(
80 source, string), Link.JS_MISC, Link.SPECULATIVE_HOP);
81 next.add(link);
82 return true;
83 } catch (URIException e) {
84 extractErrorListener.noteExtractError(e,source,string);
85 }
86 } else {
87
88 matcherStack.addFirst(strings);
89
90 strings = JAVASCRIPT_STRING_EXTRACTOR.matcher(subsequence);
91 }
92 }
93
94 strings = (Matcher) (matcherStack.isEmpty() ? null : matcherStack.removeFirst());
95 }
96 return false;
97 }
98
99
100
101
102
103 public void reset() {
104 super.reset();
105 matcherStack.clear();
106 strings = null;
107 }
108
109 protected static CharSequenceLinkExtractor newDefaultInstance() {
110 return new RegexpJSLinkExtractor();
111 }
112 }