1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 package org.archive.crawler.extractor;
25
26 import java.io.IOException;
27 import java.util.ArrayList;
28 import java.util.Iterator;
29 import java.util.logging.Level;
30 import java.util.logging.Logger;
31 import java.util.regex.Matcher;
32
33 import org.apache.commons.httpclient.URIException;
34 import org.archive.crawler.datamodel.CoreAttributeConstants;
35 import org.archive.crawler.datamodel.CrawlURI;
36 import org.archive.crawler.datamodel.RobotsHonoringPolicy;
37 import org.archive.crawler.settings.SimpleType;
38 import org.archive.crawler.settings.Type;
39 import org.archive.io.ReplayCharSequence;
40 import org.archive.net.UURI;
41 import org.archive.net.UURIFactory;
42 import org.archive.util.DevUtils;
43 import org.archive.util.HttpRecorder;
44 import org.archive.util.TextUtils;
45
46 /***
47 * Basic link-extraction, from an HTML content-body,
48 * using regular expressions.
49 *
50 * @author gojomo
51 *
52 */
53 public class ExtractorHTML extends Extractor
54 implements CoreAttributeConstants {
55
56 private static final long serialVersionUID = 5855731422080471017L;
57
58 private static Logger logger =
59 Logger.getLogger(ExtractorHTML.class.getName());
60
61 /***
62 * Compiled relevant tag extractor.
63 *
64 * <p>
65 * This pattern extracts either:
66 * <li> (1) whole <script>...</script> or
67 * <li> (2) <style>...</style> or
68 * <li> (3) <meta ...> or
69 * <li> (4) any other open-tag with at least one attribute
70 * (eg matches "<a href='boo'>" but not "</a>" or "<br>")
71 * <p>
72 * groups:
73 * <li> 1: SCRIPT SRC=foo>boo</SCRIPT
74 * <li> 2: just script open tag
75 * <li> 3: STYLE TYPE=moo>zoo</STYLE
76 * <li> 4: just style open tag
77 * <li> 5: entire other tag, without '<' '>'
78 * <li> 6: element
79 * <li> 7: META
80 * <li> 8: !-- comment --
81 */
82
83 private static final int MAX_ELEMENT_LENGTH =
84 Integer.parseInt(System.getProperty(ExtractorHTML.class.getName() +
85 ".maxElementNameLength", "1024"));
86
87 static final String RELEVANT_TAG_EXTRACTOR =
88 "(?is)<(?:((script[^>]*+)>.*?</script)" +
89 "|((style[^>]*+)>.*?</style)" +
90 "|(((meta)|(?://w{1,"+MAX_ELEMENT_LENGTH+"}))//s+[^>]*+)" +
91 "|(!--.*?--))>";
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108 private static final int MAX_ATTR_NAME_LENGTH =
109 Integer.parseInt(System.getProperty(ExtractorHTML.class.getName() +
110 ".maxAttributeNameLength", "1024"));
111
112 static final int MAX_ATTR_VAL_LENGTH =
113 Integer.parseInt(System.getProperty(ExtractorHTML.class.getName() +
114 ".maxAttributeValueLength", "16384"));
115
116
117
118
119
120
121 static final String EACH_ATTRIBUTE_EXTRACTOR =
122 "(?is)//s?((href)|(action)|(on//w*)"
123 +"|((?:src)|(?:lowsrc)|(?:background)|(?:cite)|(?:longdesc)"
124 +"|(?:usemap)|(?:profile)|(?:datasrc))"
125 +"|(codebase)|((?:classid)|(?:data))|(archive)|(code)"
126 +"|(value)|(style)|(method)"
127 +"|([-//w]{1,"+MAX_ATTR_NAME_LENGTH+"}))"
128 +"//s*=//s*"
129 +"(?:(?:\"(.{0,"+MAX_ATTR_VAL_LENGTH+"}?)(?:\"|$))"
130 +"|(?:'(.{0,"+MAX_ATTR_VAL_LENGTH+"}?)(?:'|$))"
131 +"|(//S{1,"+MAX_ATTR_VAL_LENGTH+"}))";
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158 static final String LIKELY_URI_PATH =
159 "(//.{0,2}[^//.//n//r//s\"']*(//.[^//.//n//r//s\"']+)+)";
160 static final String WHITESPACE = "//s";
161 static final String CLASSEXT =".class";
162 static final String APPLET = "applet";
163 static final String BASE = "base";
164 static final String LINK = "link";
165 static final String FRAME = "frame";
166 static final String IFRAME = "iframe";
167
168 public static final String ATTR_TREAT_FRAMES_AS_EMBED_LINKS =
169 "treat-frames-as-embed-links";
170
171 public static final String ATTR_IGNORE_FORM_ACTION_URLS =
172 "ignore-form-action-urls";
173
174 public static final String ATTR_EXTRACT_ONLY_FORM_GETS =
175 "extract-only-form-gets";
176
177 /*** whether to try finding links in Javscript; default true */
178 public static final String ATTR_EXTRACT_JAVASCRIPT =
179 "extract-javascript";
180
181 public static final String EXTRACT_VALUE_ATTRIBUTES =
182 "extract-value-attributes";
183
184 public static final String ATTR_IGNORE_UNEXPECTED_HTML =
185 "ignore-unexpected-html";
186
187
188 protected long numberOfCURIsHandled = 0;
189 protected long numberOfLinksExtracted = 0;
190
/**
 * Create an HTML extractor carrying the stock description.
 *
 * @param name name of this processor
 */
public ExtractorHTML(String name) {
    this(name, "HTML extractor. Extracts links from HTML documents");
}

/**
 * Create an HTML extractor and declare its configurable settings. All six
 * settings are boolean and flagged as expert settings.
 *
 * @param name name of this processor
 * @param description description handed to the superclass
 */
public ExtractorHTML(String name, String description) {
    super(name, description);

    registerExpertFlag(ATTR_EXTRACT_JAVASCRIPT,
        "If true, in-page Javascript is scanned for strings that " +
        "appear likely to be URIs. This typically finds both valid " +
        "and invalid URIs, and attempts to fetch the invalid URIs " +
        "sometimes generates webmaster concerns over odd crawler " +
        "behavior. Default is true.",
        Boolean.TRUE);

    registerExpertFlag(ATTR_TREAT_FRAMES_AS_EMBED_LINKS,
        "If true, FRAME/IFRAME SRC-links are treated as embedded " +
        "resources (like IMG, 'E' hop-type), otherwise they are " +
        "treated as navigational links. Default is true.",
        Boolean.TRUE);

    registerExpertFlag(ATTR_IGNORE_FORM_ACTION_URLS,
        "If true, URIs appearing as the ACTION attribute in " +
        "HTML FORMs are ignored. Default is false.",
        Boolean.FALSE);

    registerExpertFlag(ATTR_EXTRACT_ONLY_FORM_GETS,
        "If true, only HTML FORM ACTIONs associated with the GET "+
        "method are extracted. (Form ACTIONs with method POST "+
        "will be ignored. Default is true",
        Boolean.TRUE);

    registerExpertFlag(EXTRACT_VALUE_ATTRIBUTES,
        "If true, strings that look like URIs found in element VALUE " +
        "attributes (which are sometimes used as URIs by in-page " +
        "Javascript or server-side redirects) will be extracted. " +
        "This typically finds both valid and invalid URIs, and " +
        "attempts to fetch the invalid URIs sometimes generate " +
        "webmaster concerns over odd crawler behavior. Default " +
        "is true.",
        Boolean.TRUE);

    registerExpertFlag(ATTR_IGNORE_UNEXPECTED_HTML,
        "If true, URIs which end in typical non-HTML extensions " +
        "(such as .gif) will not be scanned as if it were HTML. " +
        "Default is true.",
        Boolean.TRUE);
}

/** Add one boolean setting to the definition and flag it as expert. */
private void registerExpertFlag(String settingName,
        String settingDescription, Boolean defaultValue) {
    Type setting = addElementToDefinition(
        new SimpleType(settingName, settingDescription, defaultValue));
    setting.setExpertSetting(true);
}
241
242 protected void processGeneralTag(CrawlURI curi, CharSequence element,
243 CharSequence cs) {
244
245 Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs);
246
247
248 String codebase = null;
249 ArrayList<String> resources = null;
250
251
252 CharSequence action = null;
253 CharSequence actionContext = null;
254 CharSequence method = null;
255
256 final boolean framesAsEmbeds = ((Boolean)getUncheckedAttribute(curi,
257 ATTR_TREAT_FRAMES_AS_EMBED_LINKS)).booleanValue();
258
259 final boolean ignoreFormActions = ((Boolean)getUncheckedAttribute(curi,
260 ATTR_IGNORE_FORM_ACTION_URLS)).booleanValue();
261
262 final boolean extractValueAttributes = ((Boolean)getUncheckedAttribute
263 (curi, EXTRACT_VALUE_ATTRIBUTES)).booleanValue();
264
265 final String elementStr = element.toString();
266
267 while (attr.find()) {
268 int valueGroup =
269 (attr.start(14) > -1) ? 14 : (attr.start(15) > -1) ? 15 : 16;
270 int start = attr.start(valueGroup);
271 int end = attr.end(valueGroup);
272 assert start >= 0: "Start is: " + start + ", " + curi;
273 assert end >= 0: "End is :" + end + ", " + curi;
274 CharSequence value = cs.subSequence(start, end);
275 value = TextUtils.unescapeHtml(value);
276 if (attr.start(2) > -1) {
277
278 CharSequence context =
279 Link.elementContext(element, attr.group(2));
280 if(elementStr.equalsIgnoreCase(LINK)) {
281
282 processEmbed(curi, value, context);
283 } else {
284
285 processLink(curi, value, context);
286 }
287 if (elementStr.equalsIgnoreCase(BASE)) {
288 try {
289 curi.setBaseURI(value.toString());
290 } catch (URIException e) {
291 if (getController() != null) {
292
293
294 getController().logUriError(e, curi.getUURI(),
295 value.toString());
296 } else {
297 logger.info("Failed set base uri: " +
298 curi + ", " + value.toString() + ": " +
299 e.getMessage());
300 }
301 }
302 }
303 } else if (attr.start(3) > -1) {
304
305 if (!ignoreFormActions) {
306 action = value;
307 actionContext = Link.elementContext(element,
308 attr.group(3));
309
310 }
311 } else if (attr.start(4) > -1) {
312
313 processScriptCode(curi, value);
314 } else if (attr.start(5) > -1) {
315
316 CharSequence context = Link.elementContext(element,
317 attr.group(5));
318
319
320 final char hopType;
321
322 if(!framesAsEmbeds
323 && (elementStr.equalsIgnoreCase(FRAME) || elementStr
324 .equalsIgnoreCase(IFRAME))) {
325 hopType = Link.NAVLINK_HOP;
326 } else {
327 hopType = Link.EMBED_HOP;
328 }
329 processEmbed(curi, value, context, hopType);
330 } else if (attr.start(6) > -1) {
331
332 codebase = (value instanceof String)?
333 (String)value: value.toString();
334 CharSequence context = Link.elementContext(element,
335 attr.group(6));
336 processEmbed(curi, codebase, context);
337 } else if (attr.start(7) > -1) {
338
339 if (resources == null) {
340 resources = new ArrayList<String>();
341 }
342 resources.add(value.toString());
343 } else if (attr.start(8) > -1) {
344
345 if (resources==null) {
346 resources = new ArrayList<String>();
347 }
348 String[] multi = TextUtils.split(WHITESPACE, value);
349 for(int i = 0; i < multi.length; i++ ) {
350 resources.add(multi[i]);
351 }
352 } else if (attr.start(9) > -1) {
353
354 if (resources==null) {
355 resources = new ArrayList<String>();
356 }
357
358
359 if (elementStr.equalsIgnoreCase(APPLET) &&
360 !value.toString().toLowerCase().endsWith(CLASSEXT)) {
361 resources.add(value.toString() + CLASSEXT);
362 } else {
363 resources.add(value.toString());
364 }
365 } else if (attr.start(10) > -1) {
366
367 if (extractValueAttributes
368 && TextUtils.matches(LIKELY_URI_PATH, value)) {
369 CharSequence context = Link.elementContext(element,
370 attr.group(10));
371 processLink(curi,value, context);
372 }
373
374 } else if (attr.start(11) > -1) {
375
376
377 this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(
378 curi, value, getController());
379
380 } else if (attr.start(12) > -1) {
381
382 method = value;
383
384 } else if (attr.start(13) > -1) {
385
386
387
388
389
390 }
391 }
392 TextUtils.recycleMatcher(attr);
393
394
395 if (resources != null) {
396 Iterator iter = resources.iterator();
397 UURI codebaseURI = null;
398 String res = null;
399 try {
400 if (codebase != null) {
401
402 codebaseURI = UURIFactory.
403 getInstance(curi.getUURI(), codebase);
404 }
405 while(iter.hasNext()) {
406 res = iter.next().toString();
407 res = (String) TextUtils.unescapeHtml(res);
408 if (codebaseURI != null) {
409 res = codebaseURI.resolve(res).toString();
410 }
411 processEmbed(curi, res, element);
412 }
413 } catch (URIException e) {
414 curi.addLocalizedError(getName(), e, "BAD CODEBASE " + codebase);
415 } catch (IllegalArgumentException e) {
416 DevUtils.logger.log(Level.WARNING, "processGeneralTag()\n" +
417 "codebase=" + codebase + " res=" + res + "\n" +
418 DevUtils.extraInfo(), e);
419 }
420 }
421
422
423 if(action != null) {
424 if(method == null || "GET".equalsIgnoreCase(method.toString())
425 || ! ((Boolean)getUncheckedAttribute(curi,
426 ATTR_EXTRACT_ONLY_FORM_GETS)).booleanValue()) {
427 processLink(curi, action, actionContext);
428 }
429 }
430 }
431
432 /***
433 * Extract the (java)script source in the given CharSequence.
434 *
435 * @param curi source CrawlURI
436 * @param cs CharSequence of javascript code
437 */
438 protected void processScriptCode(CrawlURI curi, CharSequence cs) {
439 if((Boolean)getUncheckedAttribute(curi, ATTR_EXTRACT_JAVASCRIPT)) {
440 this.numberOfLinksExtracted +=
441 ExtractorJS.considerStrings(curi, cs, getController(), false);
442 }
443 }
444
445 static final String JAVASCRIPT = "(?i)^javascript:.*";
446
447 /***
448 * Handle generic HREF cases.
449 *
450 * @param curi
451 * @param value
452 * @param context
453 */
454 protected void processLink(CrawlURI curi, final CharSequence value,
455 CharSequence context) {
456 if (TextUtils.matches(JAVASCRIPT, value)) {
457 processScriptCode(curi, value. subSequence(11, value.length()));
458 } else {
459 if (logger.isLoggable(Level.FINEST)) {
460 logger.finest("link: " + value.toString() + " from " + curi);
461 }
462 addLinkFromString(curi,
463 (value instanceof String)?
464 (String)value: value.toString(),
465 context, Link.NAVLINK_HOP);
466 this.numberOfLinksExtracted++;
467 }
468 }
469
470 private void addLinkFromString(CrawlURI curi, String uri,
471 CharSequence context, char hopType) {
472 try {
473
474
475
476
477 curi.createAndAddLinkRelativeToBase(uri, context.toString(),
478 hopType);
479 } catch (URIException e) {
480 if (getController() != null) {
481 getController().logUriError(e, curi.getUURI(), uri);
482 } else {
483 logger.info("Failed createAndAddLinkRelativeToBase " +
484 curi + ", " + uri + ", " + context + ", " + hopType +
485 ": " + e);
486 }
487 }
488 }
489
490 protected final void processEmbed(CrawlURI curi, CharSequence value,
491 CharSequence context) {
492 processEmbed(curi, value, context, Link.EMBED_HOP);
493 }
494
495 protected void processEmbed(CrawlURI curi, final CharSequence value,
496 CharSequence context, char hopType) {
497 if (logger.isLoggable(Level.FINEST)) {
498 logger.finest("embed (" + hopType + "): " + value.toString() +
499 " from " + curi);
500 }
501 addLinkFromString(curi,
502 (value instanceof String)?
503 (String)value: value.toString(),
504 context, hopType);
505 this.numberOfLinksExtracted++;
506 }
507
508 public void extract(CrawlURI curi) {
509 if (!isHttpTransactionContentToProcess(curi) ||
510 ! (isExpectedMimeType(curi.getContentType(), "text/html")
511 || isExpectedMimeType(curi.getContentType(), "application/xhtml"))) {
512 return;
513 }
514
515 final boolean ignoreUnexpectedHTML =
516 ((Boolean)getUncheckedAttribute(curi,
517 ATTR_IGNORE_UNEXPECTED_HTML)).booleanValue();
518
519 if (ignoreUnexpectedHTML) {
520 try {
521 if(!isHtmlExpectedHere(curi)) {
522
523
524 return;
525 }
526 } catch (URIException e) {
527 logger.severe("Failed expectedHTML test: " + e.getMessage());
528 }
529 }
530
531 this.numberOfCURIsHandled++;
532
533 ReplayCharSequence cs = null;
534
535 try {
536 HttpRecorder hr = curi.getHttpRecorder();
537 if (hr == null) {
538 throw new IOException("Why is recorder null here?");
539 }
540 cs = hr.getReplayCharSequence();
541 } catch (IOException e) {
542 curi.addLocalizedError(this.getName(), e,
543 "Failed get of replay char sequence " + curi.toString() +
544 " " + e.getMessage());
545 logger.log(Level.SEVERE,"Failed get of replay char sequence in " +
546 Thread.currentThread().getName(), e);
547 }
548
549 if (cs == null) {
550 return;
551 }
552
553
554
555 try {
556
557 extract(curi, cs);
558
559 curi.linkExtractorFinished();
560 } finally {
561 if (cs != null) {
562 try {
563 cs.close();
564 } catch (IOException ioe) {
565 logger.warning(TextUtils.exceptionToString(
566 "Failed close of ReplayCharSequence.", ioe));
567 }
568 }
569 }
570 }
571
572 /***
573 * Run extractor.
574 * This method is package visible to ease testing.
575 * @param curi CrawlURI we're processing.
576 * @param cs Sequence from underlying ReplayCharSequence. This
577 * is TRANSIENT data. Make a copy if you want the data to live outside
578 * of this extractors' lifetime.
579 */
580 void extract(CrawlURI curi, CharSequence cs) {
581 Matcher tags = TextUtils.getMatcher(RELEVANT_TAG_EXTRACTOR, cs);
582 while(tags.find()) {
583 if(Thread.interrupted()){
584 break;
585 }
586 if (tags.start(8) > 0) {
587
588
589 } else if (tags.start(7) > 0) {
590
591 int start = tags.start(5);
592 int end = tags.end(5);
593 assert start >= 0: "Start is: " + start + ", " + curi;
594 assert end >= 0: "End is :" + end + ", " + curi;
595 if (processMeta(curi,
596 cs.subSequence(start, end))) {
597
598
599 break;
600 }
601 } else if (tags.start(5) > 0) {
602
603 int start5 = tags.start(5);
604 int end5 = tags.end(5);
605 assert start5 >= 0: "Start is: " + start5 + ", " + curi;
606 assert end5 >= 0: "End is :" + end5 + ", " + curi;
607 int start6 = tags.start(6);
608 int end6 = tags.end(6);
609 assert start6 >= 0: "Start is: " + start6 + ", " + curi;
610 assert end6 >= 0: "End is :" + end6 + ", " + curi;
611 processGeneralTag(curi,
612 cs.subSequence(start6, end6),
613 cs.subSequence(start5, end5));
614
615 } else if (tags.start(1) > 0) {
616
617 int start = tags.start(1);
618 int end = tags.end(1);
619 assert start >= 0: "Start is: " + start + ", " + curi;
620 assert end >= 0: "End is :" + end + ", " + curi;
621 assert tags.end(2) >= 0: "Tags.end(2) illegal " + tags.end(2) +
622 ", " + curi;
623 processScript(curi, cs.subSequence(start, end),
624 tags.end(2) - start);
625
626 } else if (tags.start(3) > 0){
627
628 int start = tags.start(3);
629 int end = tags.end(3);
630 assert start >= 0: "Start is: " + start + ", " + curi;
631 assert end >= 0: "End is :" + end + ", " + curi;
632 assert tags.end(4) >= 0: "Tags.end(4) illegal " + tags.end(4) +
633 ", " + curi;
634 processStyle(curi, cs.subSequence(start, end),
635 tags.end(4) - start);
636 }
637 }
638 TextUtils.recycleMatcher(tags);
639 }
640
641
642 static final String NON_HTML_PATH_EXTENSION =
643 "(?i)(gif)|(jp(e)?g)|(png)|(tif(f)?)|(bmp)|(avi)|(mov)|(mp(e)?g)"+
644 "|(mp3)|(mp4)|(swf)|(wav)|(au)|(aiff)|(mid)";
645
646 /***
647 * Test whether this HTML is so unexpected (eg in place of a GIF URI)
648 * that it shouldn't be scanned for links.
649 *
650 * @param curi CrawlURI to examine.
651 * @return True if HTML is acceptable/expected here
652 * @throws URIException
653 */
654 protected boolean isHtmlExpectedHere(CrawlURI curi) throws URIException {
655 String path = curi.getUURI().getPath();
656 if(path==null) {
657
658 return true;
659 }
660 int dot = path.lastIndexOf('.');
661 if (dot < 0) {
662
663 return true;
664 }
665 if(dot<(path.length()-5)) {
666
667 return true;
668 }
669 String ext = path.substring(dot+1);
670 return ! TextUtils.matches(NON_HTML_PATH_EXTENSION, ext);
671 }
672
673 protected void processScript(CrawlURI curi, CharSequence sequence,
674 int endOfOpenTag) {
675
676
677 processGeneralTag(curi,sequence.subSequence(0,6),
678 sequence.subSequence(0,endOfOpenTag));
679
680
681
682 processScriptCode(
683 curi, sequence.subSequence(endOfOpenTag, sequence.length()));
684 }
685
686 /***
687 * Process metadata tags.
688 * @param curi CrawlURI we're processing.
689 * @param cs Sequence from underlying ReplayCharSequence. This
690 * is TRANSIENT data. Make a copy if you want the data to live outside
691 * of this extractors' lifetime.
692 * @return True robots exclusion metatag.
693 */
694 protected boolean processMeta(CrawlURI curi, CharSequence cs) {
695 Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs);
696 String name = null;
697 String httpEquiv = null;
698 String content = null;
699 while (attr.find()) {
700 int valueGroup =
701 (attr.start(14) > -1) ? 14 : (attr.start(15) > -1) ? 15 : 16;
702 CharSequence value =
703 cs.subSequence(attr.start(valueGroup), attr.end(valueGroup));
704 if (attr.group(1).equalsIgnoreCase("name")) {
705 name = value.toString();
706 } else if (attr.group(1).equalsIgnoreCase("http-equiv")) {
707 httpEquiv = value.toString();
708 } else if (attr.group(1).equalsIgnoreCase("content")) {
709 content = value.toString();
710 }
711
712 }
713 TextUtils.recycleMatcher(attr);
714
715
716 if("robots".equalsIgnoreCase(name) && content != null ) {
717 curi.putString(A_META_ROBOTS, content);
718 RobotsHonoringPolicy policy =
719 getSettingsHandler().getOrder().getRobotsHonoringPolicy();
720 String contentLower = content.toLowerCase();
721 if ((policy == null
722 || (!policy.isType(curi, RobotsHonoringPolicy.IGNORE)
723 && !policy.isType(curi, RobotsHonoringPolicy.CUSTOM)))
724 && (contentLower.indexOf("nofollow") >= 0
725 || contentLower.indexOf("none") >= 0)) {
726
727
728 logger.fine("HTML extraction skipped due to robots meta-tag for: "
729 + curi.toString());
730 return true;
731 }
732 } else if ("refresh".equalsIgnoreCase(httpEquiv) && content != null) {
733 String refreshUri = content.substring(content.indexOf("=") + 1);
734 try {
735 curi.createAndAddLinkRelativeToBase(refreshUri, "meta",
736 Link.REFER_HOP);
737 } catch (URIException e) {
738 if (getController() != null) {
739 getController().logUriError(e, curi.getUURI(), refreshUri);
740 } else {
741 logger.info("Failed createAndAddLinkRelativeToBase " +
742 curi + ", " + cs + ", " + refreshUri + ": " + e);
743 }
744 }
745 }
746 return false;
747 }
748
749 /***
750 * Process style text.
751 * @param curi CrawlURI we're processing.
752 * @param sequence Sequence from underlying ReplayCharSequence. This
753 * is TRANSIENT data. Make a copy if you want the data to live outside
754 * of this extractors' lifetime.
755 * @param endOfOpenTag
756 */
757 protected void processStyle(CrawlURI curi, CharSequence sequence,
758 int endOfOpenTag) {
759
760 processGeneralTag(curi, sequence.subSequence(0,6),
761 sequence.subSequence(0,endOfOpenTag));
762
763
764 this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(
765 curi, sequence.subSequence(endOfOpenTag,sequence.length()),
766 getController());
767 }
768
769
770
771
772
773
774 public String report() {
775 StringBuffer ret = new StringBuffer();
776 ret.append("Processor: org.archive.crawler.extractor.ExtractorHTML\n");
777 ret.append(" Function: Link extraction on HTML documents\n");
778 ret.append(" CrawlURIs handled: " + this.numberOfCURIsHandled + "\n");
779 ret.append(" Links extracted: " + this.numberOfLinksExtracted +
780 "\n\n");
781 return ret.toString();
782 }
783 }
784