1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 package org.archive.extractor;
25
26 import java.util.ArrayList;
27 import java.util.Iterator;
28 import java.util.LinkedList;
29 import java.util.logging.Level;
30 import java.util.logging.Logger;
31 import java.util.regex.Matcher;
32
33 import org.apache.commons.httpclient.URIException;
34 import org.archive.crawler.extractor.Link;
35 import org.archive.net.UURI;
36 import org.archive.net.UURIFactory;
37 import org.archive.util.DevUtils;
38 import org.archive.util.TextUtils;
39
40
41 /***
42 * Basic link-extraction, from an HTML content-body,
43 * using regular expressions.
44 *
45 * ROUGH DRAFT IN PROGRESS / incomplete... untested...
46 *
47 * @author gojomo
48 */
49 public class RegexpHTMLLinkExtractor extends CharSequenceLinkExtractor {
50 private static Logger logger =
51 Logger.getLogger(RegexpHTMLLinkExtractor.class.getName());
52
53 boolean honorRobots = true;
54 boolean extractInlineCss = true;
55 boolean extractInlineJs = true;
56
57 protected LinkedList<Link> next = new LinkedList<Link>();
58 protected Matcher tags;
59
60
61
62
63 protected boolean findNextLink() {
64 if (tags == null) {
65 tags = TextUtils.getMatcher(RELEVANT_TAG_EXTRACTOR, sourceContent);
66 }
67 while(tags.find()) {
68 if(Thread.interrupted()){
69
70 break;
71 }
72 if (tags.start(8) > 0) {
73
74
75 } else if (tags.start(7) > 0) {
76
77 int start = tags.start(5);
78 int end = tags.end(5);
79 processMeta(sourceContent.subSequence(start, end));
80 } else if (tags.start(5) > 0) {
81
82 int start5 = tags.start(5);
83 int end5 = tags.end(5);
84 int start6 = tags.start(6);
85 int end6 = tags.end(6);
86 processGeneralTag(sourceContent.subSequence(start6, end6),
87 sourceContent.subSequence(start5, end5));
88 } else if (tags.start(1) > 0) {
89
90 int start = tags.start(1);
91 int end = tags.end(1);
92 processScript(sourceContent.subSequence(start, end),
93 tags.end(2) - start);
94 } else if (tags.start(3) > 0){
95
96 int start = tags.start(3);
97 int end = tags.end(3);
98 processStyle(sourceContent.subSequence(start, end),
99 tags.end(4) - start);
100 }
101 if(!next.isEmpty()) {
102
103 return true;
104 }
105 }
106
107 return false;
108 }
109
110 /***
111 * Compiled relevant tag extractor.
112 *
113 * <p>
114 * This pattern extracts either:
115 * <li> (1) whole <script>...</script> or
116 * <li> (2) <style>...</style> or
117 * <li> (3) <meta ...> or
118 * <li> (4) any other open-tag with at least one attribute
119 * (eg matches "<a href='boo'>" but not "</a>" or "<br>")
120 * <p>
121 * groups:
122 * <li> 1: SCRIPT SRC=foo>boo</SCRIPT
123 * <li> 2: just script open tag
124 * <li> 3: STYLE TYPE=moo>zoo</STYLE
125 * <li> 4: just style open tag
126 * <li> 5: entire other tag, without '<' '>'
127 * <li> 6: element
128 * <li> 7: META
129 * <li> 8: !-- comment --
130 */
131 static final String RELEVANT_TAG_EXTRACTOR =
132 "(?is)<(?:((script[^>]*+)>.*?</script)|((style[^>]*+)>[^<]*+</style)|(((meta)|(?://w+))//s+[^>]*+)|(!--.*?--))>";
133
134
135
136
137 static final String EACH_ATTRIBUTE_EXTRACTOR =
138 "(?is)//s((href)|(action)|(on//w*)"
139 +"|((?:src)|(?:lowsrc)|(?:background)|(?:cite)|(?:longdesc)"
140 +"|(?:usemap)|(?:profile)|(?:datasrc)|(?:for))"
141 +"|(codebase)|((?:classid)|(?:data))|(archive)|(code)"
142 +"|(value)|([-//w]+))"
143 +"//s*=//s*"
144 +"(?:(?:\"(.*?)(?:\"|$))"
145 +"|(?:'(.*?)(?:'|$))"
146 +"|(//S+))";
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171 static final String LIKELY_URI_PATH =
172 "(//.{0,2}[^//.//n//r//s\"']*(//.[^//.//n//r//s\"']+)+)";
173 static final String ESCAPED_AMP = "&";
174 static final String AMP ="&";
175 static final String WHITESPACE = "//s";
176 static final String CLASSEXT =".class";
177 static final String APPLET = "applet";
178 static final String BASE = "base";
179 static final String LINK = "link";
180
181 protected boolean processGeneralTag(CharSequence element, CharSequence cs) {
182
183 Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs);
184
185
186 String codebase = null;
187 ArrayList<String> resources = null;
188 long tally = next.size();
189
190 while (attr.find()) {
191 int valueGroup =
192 (attr.start(12) > -1) ? 12 : (attr.start(13) > -1) ? 13 : 14;
193 int start = attr.start(valueGroup);
194 int end = attr.end(valueGroup);
195 CharSequence value = cs.subSequence(start, end);
196 if (attr.start(2) > -1) {
197
198 CharSequence context = Link.elementContext(element, attr.group(2));
199 if(element.toString().equalsIgnoreCase(LINK)) {
200
201 processEmbed(value, context);
202 } else {
203 if (element.toString().equalsIgnoreCase(BASE)) {
204 try {
205 base = UURIFactory.getInstance(value.toString());
206 } catch (URIException e) {
207 extractErrorListener.noteExtractError(e,source,value);
208 }
209 }
210
211 processLink(value, context);
212 }
213 } else if (attr.start(3) > -1) {
214
215 CharSequence context = Link.elementContext(element, attr.group(3));
216 processLink(value, context);
217 } else if (attr.start(4) > -1) {
218
219 processScriptCode(value);
220 } else if (attr.start(5) > -1) {
221
222 CharSequence context = Link.elementContext(element, attr.group(5));
223 processEmbed(value, context);
224 } else if (attr.start(6) > -1) {
225
226
227 codebase = TextUtils.replaceAll(ESCAPED_AMP, value, AMP);
228 CharSequence context = Link.elementContext(element,attr.group(6));
229 processEmbed(codebase, context);
230 } else if (attr.start(7) > -1) {
231
232 if (resources == null) {
233 resources = new ArrayList<String>();
234 }
235 resources.add(value.toString());
236 } else if (attr.start(8) > -1) {
237
238 if (resources==null) {
239 resources = new ArrayList<String>();
240 }
241 String[] multi = TextUtils.split(WHITESPACE, value);
242 for(int i = 0; i < multi.length; i++ ) {
243 resources.add(multi[i]);
244 }
245 } else if (attr.start(9) > -1) {
246
247 if (resources==null) {
248 resources = new ArrayList<String>();
249 }
250
251
252 if (element.toString().toLowerCase().equals(APPLET) &&
253 !value.toString().toLowerCase().endsWith(CLASSEXT)) {
254 resources.add(value.toString() + CLASSEXT);
255 } else {
256 resources.add(value.toString());
257 }
258
259 } else if (attr.start(10) > -1) {
260
261 if(TextUtils.matches(LIKELY_URI_PATH, value)) {
262 CharSequence context = Link.elementContext(element, attr.group(10));
263 processLink(value, context);
264 }
265
266 } else if (attr.start(11) > -1) {
267
268
269
270
271
272 }
273 }
274 TextUtils.recycleMatcher(attr);
275
276
277 if (resources == null) {
278 return (tally-next.size())>0;
279 }
280 Iterator iter = resources.iterator();
281 UURI codebaseURI = null;
282 String res = null;
283 try {
284 if (codebase != null) {
285
286 codebaseURI = UURIFactory.getInstance(base, codebase);
287 }
288 while(iter.hasNext()) {
289 res = iter.next().toString();
290
291 res = TextUtils.replaceAll(ESCAPED_AMP, res, AMP);
292 if (codebaseURI != null) {
293 res = codebaseURI.resolve(res).toString();
294 }
295 processEmbed(res, element);
296 }
297 } catch (URIException e) {
298 extractErrorListener.noteExtractError(e,source,codebase);
299 } catch (IllegalArgumentException e) {
300 DevUtils.logger.log(Level.WARNING, "processGeneralTag()\n" +
301 "codebase=" + codebase + " res=" + res + "\n" +
302 DevUtils.extraInfo(), e);
303 }
304 return (tally-next.size())>0;
305 }
306
307 /***
308 * @param cs
309 */
310 protected void processScriptCode(CharSequence cs) {
311 RegexpJSLinkExtractor.extract(cs, source, base, next,
312 extractErrorListener);
313 }
314
315 static final String JAVASCRIPT = "(?i)^javascript:.*";
316
317 /***
318 * @param value
319 * @param context
320 */
321 protected void processLink(CharSequence value, CharSequence context) {
322 String link = TextUtils.replaceAll(ESCAPED_AMP, value, "&");
323
324 if(TextUtils.matches(JAVASCRIPT, link)) {
325 processScriptCode(value.subSequence(11, value.length()));
326 } else {
327 addLinkFromString(link, context,Link.NAVLINK_HOP);
328 }
329 }
330
331 /***
332 * @param uri
333 * @param context
334 */
335 private void addLinkFromString(String uri, CharSequence context, char hopType) {
336 try {
337 Link link = new Link(source, UURIFactory.getInstance(
338 base, uri), context, hopType);
339 next.addLast(link);
340 } catch (URIException e) {
341 extractErrorListener.noteExtractError(e,source,uri);
342 }
343 }
344
345 protected long processEmbed(CharSequence value, CharSequence context) {
346 String embed = TextUtils.replaceAll(ESCAPED_AMP, value, "&");
347 addLinkFromString(embed, context,Link.EMBED_HOP);
348 return 1;
349 }
350
351 static final String NON_HTML_PATH_EXTENSION =
352 "(?i)(gif)|(jp(e)?g)|(png)|(tif(f)?)|(bmp)|(avi)|(mov)|(mp(e)?g)"+
353 "|(mp3)|(mp4)|(swf)|(wav)|(au)|(aiff)|(mid)";
354
355 protected void processScript(CharSequence sequence, int endOfOpenTag) {
356
357
358 processGeneralTag(sequence.subSequence(0,6),
359 sequence.subSequence(0,endOfOpenTag));
360
361
362
363 processScriptCode(sequence.subSequence(endOfOpenTag, sequence.length()));
364 }
365
366 protected void processMeta(CharSequence cs) {
367 Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs);
368
369 String name = null;
370 String httpEquiv = null;
371 String content = null;
372
373 while (attr.find()) {
374 int valueGroup =
375 (attr.start(12) > -1) ? 12 : (attr.start(13) > -1) ? 13 : 14;
376 CharSequence value =
377 cs.subSequence(attr.start(valueGroup), attr.end(valueGroup));
378 if (attr.group(1).equalsIgnoreCase("name")) {
379 name = value.toString();
380 } else if (attr.group(1).equalsIgnoreCase("http-equiv")) {
381 httpEquiv = value.toString();
382 } else if (attr.group(1).equalsIgnoreCase("content")) {
383 content = value.toString();
384 }
385
386 }
387 TextUtils.recycleMatcher(attr);
388
389
390 if("robots".equalsIgnoreCase(name) && content != null ) {
391 if (getHonorRobots()) {
392 String contentLower = content.toLowerCase();
393 if ((contentLower.indexOf("nofollow") >= 0
394 || contentLower.indexOf("none") >= 0)) {
395
396
397 logger.fine("HTML extraction skipped due to robots meta-tag for: "
398 + source);
399 cancelFurtherExtraction();
400 return;
401 }
402 }
403 } else if ("refresh".equalsIgnoreCase(httpEquiv) && content != null) {
404 String refreshUri = content.substring(content.indexOf("=") + 1);
405 try {
406 Link refreshLink = new Link(source, UURIFactory.getInstance(base,refreshUri), Link.elementContext("meta",httpEquiv),Link.REFER_HOP);
407 next.addLast(refreshLink);
408 } catch (URIException e) {
409 extractErrorListener.noteExtractError(e,source,refreshUri);
410 }
411 }
412 }
413
414 /***
415 * @return whether to honor internal robots directives (eg meta robots)
416 */
417 private boolean getHonorRobots() {
418 return honorRobots;
419 }
420
421 /***
422 * Ensure no further Links are extracted (by setting matcher up to fail)
423 */
424 private void cancelFurtherExtraction() {
425
426
427 tags.reset("");
428 }
429
430 /***
431 * @param sequence
432 * @param endOfOpenTag
433 */
434 protected void processStyle(CharSequence sequence,
435 int endOfOpenTag)
436 {
437
438 processGeneralTag(sequence.subSequence(0,6),
439 sequence.subSequence(0,endOfOpenTag));
440
441
442 RegexpCSSLinkExtractor.extract(sequence.subSequence(endOfOpenTag,
443 sequence.length()), source, base, next, extractErrorListener);
444 }
445
446 /***
447 * Discard all state. Another setup() is required to use again.
448 */
449 public void reset() {
450 super.reset();
451 TextUtils.recycleMatcher(tags);
452 tags = null;
453 }
454
455 protected static CharSequenceLinkExtractor newDefaultInstance() {
456 return new RegexpHTMLLinkExtractor();
457 }
458 }
459