View Javadoc

1   /* Copyright (C) 2003 Internet Archive.
2    *
3    * This file is part of the Heritrix web crawler (crawler.archive.org).
4    *
5    * Heritrix is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU Lesser Public License as published by
7    * the Free Software Foundation; either version 2.1 of the License, or
8    * any later version.
9    *
10   * Heritrix is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU Lesser Public License for more details.
14   *
15   * You should have received a copy of the GNU Lesser Public License
16   * along with Heritrix; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   *
19   * SimpleHTMLExtractor.java
20   * Created on Jun 5, 2003
21   *
22   * $Header$
23   */
24  package org.archive.crawler.extractor;
25  
26  import java.io.IOException;
27  import java.util.ArrayList;
28  import java.util.Iterator;
29  import java.util.logging.Level;
30  import java.util.logging.Logger;
31  import java.util.regex.Matcher;
32  
33  import org.apache.commons.httpclient.URIException;
34  import org.archive.crawler.datamodel.CoreAttributeConstants;
35  import org.archive.crawler.datamodel.CrawlURI;
36  import org.archive.crawler.datamodel.RobotsHonoringPolicy;
37  import org.archive.crawler.settings.SimpleType;
38  import org.archive.crawler.settings.Type;
39  import org.archive.io.ReplayCharSequence;
40  import org.archive.net.UURI;
41  import org.archive.net.UURIFactory;
42  import org.archive.util.DevUtils;
43  import org.archive.util.HttpRecorder;
44  import org.archive.util.TextUtils;
45  
/**
 * Basic link-extraction, from an HTML content-body,
 * using regular expressions.
 *
 * @author gojomo
 */
53  public class ExtractorHTML extends Extractor
54  implements CoreAttributeConstants {
55  
56      private static final long serialVersionUID = 5855731422080471017L;
57  
58      private static Logger logger =
59          Logger.getLogger(ExtractorHTML.class.getName());
60  
61      /***
62       * Compiled relevant tag extractor.
63       *
64       * <p>
65       * This pattern extracts either:
66       * <li> (1) whole &lt;script&gt;...&lt;/script&gt; or
67       * <li> (2) &lt;style&gt;...&lt;/style&gt; or
68       * <li> (3) &lt;meta ...&gt; or
69       * <li> (4) any other open-tag with at least one attribute
70       * (eg matches "&lt;a href='boo'&gt;" but not "&lt;/a&gt;" or "&lt;br&gt;")
71       * <p>
72       * groups:
73       * <li> 1: SCRIPT SRC=foo&gt;boo&lt;/SCRIPT
74       * <li> 2: just script open tag
75       * <li> 3: STYLE TYPE=moo&gt;zoo&lt;/STYLE
76       * <li> 4: just style open tag
77       * <li> 5: entire other tag, without '<' '>'
78       * <li> 6: element
79       * <li> 7: META
80       * <li> 8: !-- comment --
81       */
82  // version w/ less unnecessary backtracking
83        private static final int MAX_ELEMENT_LENGTH =
84            Integer.parseInt(System.getProperty(ExtractorHTML.class.getName() +
85                ".maxElementNameLength", "1024"));
86        
87        static final String RELEVANT_TAG_EXTRACTOR =
88            "(?is)<(?:((script[^>]*+)>.*?</script)" + // 1, 2
89            "|((style[^>]*+)>.*?</style)" + // 3, 4
90            "|(((meta)|(?://w{1,"+MAX_ELEMENT_LENGTH+"}))//s+[^>]*+)" + // 5, 6, 7
91            "|(!--.*?--))>"; // 8 
92  
//    version w/ problems with unclosed script tags
//    static final String RELEVANT_TAG_EXTRACTOR =
//    "(?is)<(?:((script.*?)>.*?</script)|((style.*?)>.*?</style)|(((meta)|(?:\\w+))\\s+.*?)|(!--.*?--))>";
96  
97  
98        
//    // this pattern extracts 'href' or 'src' attributes from
//    // any open-tag innards matched by the above
//    static Pattern RELEVANT_ATTRIBUTE_EXTRACTOR = Pattern.compile(
//     "(?is)(\\w+)(?:\\s+|(?:\\s.*?\\s))(?:(href)|(src))\\s*=(?:(?:\\s*\"(.+?)\")|(?:\\s*'(.+?)')|(\\S+))");
//
//    // this pattern extracts 'robots' attributes
//    static Pattern ROBOTS_ATTRIBUTE_EXTRACTOR = Pattern.compile(
//     "(?is)(\\w+)\\s+.*?(?:(robots))\\s*=(?:(?:\\s*\"(.+)\")|(?:\\s*'(.+)')|(\\S+))");
107 
108       private static final int MAX_ATTR_NAME_LENGTH =
109           Integer.parseInt(System.getProperty(ExtractorHTML.class.getName() +
110               ".maxAttributeNameLength", "1024")); // 1K; 
111       
112       static final int MAX_ATTR_VAL_LENGTH = 
113           Integer.parseInt(System.getProperty(ExtractorHTML.class.getName() +
114               ".maxAttributeValueLength", "16384")); // 16K; 
115       
116     // TODO: perhaps cut to near MAX_URI_LENGTH
117     
118     // this pattern extracts attributes from any open-tag innards
119     // matched by the above. attributes known to be URIs of various
120     // sorts are matched specially
121     static final String EACH_ATTRIBUTE_EXTRACTOR =
122       "(?is)//s?((href)|(action)|(on//w*)" // 1, 2, 3, 4 
123      +"|((?:src)|(?:lowsrc)|(?:background)|(?:cite)|(?:longdesc)" // ...
124      +"|(?:usemap)|(?:profile)|(?:datasrc))" // 5
125      +"|(codebase)|((?:classid)|(?:data))|(archive)|(code)" // 6, 7, 8, 9
126      +"|(value)|(style)|(method)" // 10, 11, 12
127      +"|([-//w]{1,"+MAX_ATTR_NAME_LENGTH+"}))" // 13
128      +"//s*=//s*"
129      +"(?:(?:\"(.{0,"+MAX_ATTR_VAL_LENGTH+"}?)(?:\"|$))" // 14
130      +"|(?:'(.{0,"+MAX_ATTR_VAL_LENGTH+"}?)(?:'|$))" // 15
131      +"|(//S{1,"+MAX_ATTR_VAL_LENGTH+"}))"; // 16
132     // groups:
133     // 1: attribute name
134     // 2: HREF - single URI relative to doc base, or occasionally javascript:
135     // 3: ACTION - single URI relative to doc base, or occasionally javascript:
136     // 4: ON[WHATEVER] - script handler
137     // 5: SRC,LOWSRC,BACKGROUND,CITE,LONGDESC,USEMAP,PROFILE, or DATASRC
138     //    single URI relative to doc base
139     // 6: CODEBASE - a single URI relative to doc base, affecting other
140     //    attributes
141     // 7: CLASSID, DATA - a single URI relative to CODEBASE (if supplied)
142     // 8: ARCHIVE - one or more space-delimited URIs relative to CODEBASE
143     //    (if supplied)
144     // 9: CODE - a single URI relative to the CODEBASE (is specified).
145     // 10: VALUE - often includes a uri path on forms
146     // 11: STYLE - inline attribute style info
147     // 12: METHOD - form GET/POST
148     // 13: any other attribute
149     // 14: double-quote delimited attr value
150     // 15: single-quote delimited attr value
151     // 16: space-delimited attr value
152 
153 
154     // much like the javascript likely-URI extractor, but
155     // without requiring quotes -- this can indicate whether
156     // an HTML tag attribute that isn't definitionally a
157     // URI might be one anyway, as in form-tag VALUE attributes
158     static final String LIKELY_URI_PATH =
159      "(//.{0,2}[^//.//n//r//s\"']*(//.[^//.//n//r//s\"']+)+)";
160     static final String WHITESPACE = "//s";
161     static final String CLASSEXT =".class";
162     static final String APPLET = "applet";
163     static final String BASE = "base";
164     static final String LINK = "link";
165     static final String FRAME = "frame";
166     static final String IFRAME = "iframe";
167 
168     public static final String ATTR_TREAT_FRAMES_AS_EMBED_LINKS =
169         "treat-frames-as-embed-links";
170     
171     public static final String ATTR_IGNORE_FORM_ACTION_URLS =
172         "ignore-form-action-urls";
173 
174     public static final String ATTR_EXTRACT_ONLY_FORM_GETS =
175         "extract-only-form-gets";
176 
177     /*** whether to try finding links in Javscript; default true */
178     public static final String ATTR_EXTRACT_JAVASCRIPT =
179         "extract-javascript";
180 
181     public static final String EXTRACT_VALUE_ATTRIBUTES =
182         "extract-value-attributes";
183     
184     public static final String ATTR_IGNORE_UNEXPECTED_HTML = 
185         "ignore-unexpected-html";
186 
187     
188     protected long numberOfCURIsHandled = 0;
189     protected long numberOfLinksExtracted = 0;
190 
191     public ExtractorHTML(String name) {
192         this(name, "HTML extractor. Extracts links from HTML documents");
193     }
194     
195     public ExtractorHTML(String name, String description) {
196         super(name, description);
197         Type t = addElementToDefinition(
198             new SimpleType(ATTR_EXTRACT_JAVASCRIPT,
199             "If true, in-page Javascript is scanned for strings that " +
200             "appear likely to be URIs. This typically finds both valid " +
201             "and invalid URIs, and attempts to fetch the invalid URIs " +
202             "sometimes generates webmaster concerns over odd crawler " +
203             "behavior. Default is true.",
204             Boolean.TRUE));
205         t.setExpertSetting(true);
206         t = addElementToDefinition(
207             new SimpleType(ATTR_TREAT_FRAMES_AS_EMBED_LINKS,
208             "If true, FRAME/IFRAME SRC-links are treated as embedded " +
209             "resources (like IMG, 'E' hop-type), otherwise they are " +
210             "treated as navigational links. Default is true.", Boolean.TRUE));
211         t.setExpertSetting(true);
212         t = addElementToDefinition(
213             new SimpleType(ATTR_IGNORE_FORM_ACTION_URLS,
214             "If true, URIs appearing as the ACTION attribute in " +
215             "HTML FORMs are ignored. Default is false.", Boolean.FALSE));
216         t.setExpertSetting(true);
217         t = addElementToDefinition(
218                 new SimpleType(ATTR_EXTRACT_ONLY_FORM_GETS,
219                 "If true, only HTML FORM ACTIONs associated with the GET "+ 
220                 "method are extracted. (Form ACTIONs with method POST "+
221                 "will be ignored. Default is true", Boolean.TRUE));
222         t.setExpertSetting(true);
223         t = addElementToDefinition(
224             new SimpleType(EXTRACT_VALUE_ATTRIBUTES,
225             "If true, strings that look like URIs found in element VALUE " +
226             "attributes (which are sometimes used as URIs by in-page " +
227             "Javascript or server-side redirects) will be extracted. " +
228             "This typically finds both valid and invalid URIs, and " +
229             "attempts to fetch the invalid URIs sometimes generate " +
230             "webmaster concerns over odd crawler behavior. Default " +
231             "is true.",
232             Boolean.TRUE));
233         t.setExpertSetting(true);
234         t = addElementToDefinition(
235             new SimpleType(ATTR_IGNORE_UNEXPECTED_HTML,
236             "If true, URIs which end in typical non-HTML extensions " +
237             "(such as .gif) will not be scanned as if it were HTML. " +
238             "Default is true.", Boolean.TRUE));
239         t.setExpertSetting(true);
240     }
241 
242     protected void processGeneralTag(CrawlURI curi, CharSequence element,
243             CharSequence cs) {
244 
245         Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs);
246 
247         // Just in case it's an OBJECT or APPLET tag
248         String codebase = null;
249         ArrayList<String> resources = null;
250         
251         // Just in case it's a FORM
252         CharSequence action = null;
253         CharSequence actionContext = null;
254         CharSequence method = null; 
255         
256         final boolean framesAsEmbeds = ((Boolean)getUncheckedAttribute(curi,
257             ATTR_TREAT_FRAMES_AS_EMBED_LINKS)).booleanValue();
258 
259         final boolean ignoreFormActions = ((Boolean)getUncheckedAttribute(curi,
260                 ATTR_IGNORE_FORM_ACTION_URLS)).booleanValue();
261         
262         final boolean extractValueAttributes = ((Boolean)getUncheckedAttribute
263                 (curi, EXTRACT_VALUE_ATTRIBUTES)).booleanValue();
264         
265         final String elementStr = element.toString();
266 
267         while (attr.find()) {
268             int valueGroup =
269                 (attr.start(14) > -1) ? 14 : (attr.start(15) > -1) ? 15 : 16;
270             int start = attr.start(valueGroup);
271             int end = attr.end(valueGroup);
272             assert start >= 0: "Start is: " + start + ", " + curi;
273             assert end >= 0: "End is :" + end + ", " + curi;
274             CharSequence value = cs.subSequence(start, end);
275             value = TextUtils.unescapeHtml(value);
276             if (attr.start(2) > -1) {
277                 // HREF
278                 CharSequence context =
279                     Link.elementContext(element, attr.group(2));
280                 if(elementStr.equalsIgnoreCase(LINK)) {
281                     // <LINK> elements treated as embeds (css, ico, etc)
282                     processEmbed(curi, value, context);
283                 } else {
284                     // other HREFs treated as links
285                     processLink(curi, value, context);
286                 }
287                 if (elementStr.equalsIgnoreCase(BASE)) {
288                     try {
289                         curi.setBaseURI(value.toString());
290                     } catch (URIException e) {
291                         if (getController() != null) {
292                             // Controller can be null: e.g. when running
293                             // ExtractorTool.
294                             getController().logUriError(e, curi.getUURI(),
295                                 value.toString());
296                         } else {
297                             logger.info("Failed set base uri: " +
298                                 curi + ", " + value.toString() + ": " +
299                                 e.getMessage());
300                         }
301                     }
302                 }
303             } else if (attr.start(3) > -1) {
304                 // ACTION
305                 if (!ignoreFormActions) {
306                     action = value; 
307                     actionContext = Link.elementContext(element,
308                         attr.group(3));
309                     // handling finished only at end (after METHOD also collected)
310                 }
311             } else if (attr.start(4) > -1) {
312                 // ON____
313                 processScriptCode(curi, value); // TODO: context?
314             } else if (attr.start(5) > -1) {
315                 // SRC etc.
316                 CharSequence context = Link.elementContext(element,
317                     attr.group(5));
318                 
319                 // true, if we expect another HTML page instead of an image etc.
320                 final char hopType;
321                 
322                 if(!framesAsEmbeds
323                     && (elementStr.equalsIgnoreCase(FRAME) || elementStr
324                         .equalsIgnoreCase(IFRAME))) {
325                     hopType = Link.NAVLINK_HOP;
326                 } else {
327                     hopType = Link.EMBED_HOP;
328                 }
329                 processEmbed(curi, value, context, hopType);
330             } else if (attr.start(6) > -1) {
331                 // CODEBASE
332                 codebase = (value instanceof String)?
333                     (String)value: value.toString();
334                 CharSequence context = Link.elementContext(element,
335                     attr.group(6));
336                 processEmbed(curi, codebase, context);
337             } else if (attr.start(7) > -1) {
338                 // CLASSID, DATA
339                 if (resources == null) {
340                     resources = new ArrayList<String>();
341                 }
342                 resources.add(value.toString());
343             } else if (attr.start(8) > -1) {
344                 // ARCHIVE
345                 if (resources==null) {
346                     resources = new ArrayList<String>();
347                 }
348                 String[] multi = TextUtils.split(WHITESPACE, value);
349                 for(int i = 0; i < multi.length; i++ ) {
350                     resources.add(multi[i]);
351                 }
352             } else if (attr.start(9) > -1) {
353                 // CODE
354                 if (resources==null) {
355                     resources = new ArrayList<String>();
356                 }
357                 // If element is applet and code value does not end with
358                 // '.class' then append '.class' to the code value.
359                 if (elementStr.equalsIgnoreCase(APPLET) &&
360                         !value.toString().toLowerCase().endsWith(CLASSEXT)) {
361                     resources.add(value.toString() + CLASSEXT);
362                 } else {
363                     resources.add(value.toString());
364                 }
365             } else if (attr.start(10) > -1) {
366                 // VALUE, with possibility of URI
367                 if (extractValueAttributes 
368                         && TextUtils.matches(LIKELY_URI_PATH, value)) {
369                     CharSequence context = Link.elementContext(element,
370                         attr.group(10));
371                     processLink(curi,value, context);
372                 }
373 
374             } else if (attr.start(11) > -1) {
375                 // STYLE inline attribute
376                 // then, parse for URIs
377                 this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(
378                     curi, value, getController());
379                 
380             } else if (attr.start(12) > -1) {
381                 // METHOD
382                 method = value;
383                 // form processing finished at end (after ACTION also collected)
384             } else if (attr.start(13) > -1) {
385                 // any other attribute
386                 // ignore for now
387                 // could probe for path- or script-looking strings, but
388                 // those should be vanishingly rare in other attributes,
389                 // and/or symptomatic of page bugs
390             }
391         }
392         TextUtils.recycleMatcher(attr);
393 
394         // finish handling codebase/resources now that all available
395         if (resources != null) {
396             Iterator iter = resources.iterator();
397             UURI codebaseURI = null;
398             String res = null;
399             try {
400                 if (codebase != null) {
401                     // TODO: Pass in the charset.
402                     codebaseURI = UURIFactory.
403                         getInstance(curi.getUURI(), codebase);
404                 }
405                 while(iter.hasNext()) {
406                     res = iter.next().toString();
407                     res = (String) TextUtils.unescapeHtml(res);
408                     if (codebaseURI != null) {
409                         res = codebaseURI.resolve(res).toString();
410                     }
411                     processEmbed(curi, res, element); // TODO: include attribute too
412                 }
413             } catch (URIException e) {
414                 curi.addLocalizedError(getName(), e, "BAD CODEBASE " + codebase);
415             } catch (IllegalArgumentException e) {
416                 DevUtils.logger.log(Level.WARNING, "processGeneralTag()\n" +
417                     "codebase=" + codebase + " res=" + res + "\n" +
418                     DevUtils.extraInfo(), e);
419             }
420         }
421         
422         // finish handling form action, now method is available
423         if(action != null) {
424             if(method == null || "GET".equalsIgnoreCase(method.toString()) 
425                     || ! ((Boolean)getUncheckedAttribute(curi,
426                             ATTR_EXTRACT_ONLY_FORM_GETS)).booleanValue()) {
427                 processLink(curi, action, actionContext);
428             }
429         }
430     }
431 
432     /***
433      * Extract the (java)script source in the given CharSequence. 
434      * 
435      * @param curi source CrawlURI
436      * @param cs CharSequence of javascript code
437      */
438     protected void processScriptCode(CrawlURI curi, CharSequence cs) {
439         if((Boolean)getUncheckedAttribute(curi, ATTR_EXTRACT_JAVASCRIPT)) {
440             this.numberOfLinksExtracted +=
441                 ExtractorJS.considerStrings(curi, cs, getController(), false);
442         } // else do nothing
443     }
444 
445     static final String JAVASCRIPT = "(?i)^javascript:.*";
446 
447     /***
448      * Handle generic HREF cases.
449      * 
450      * @param curi
451      * @param value
452      * @param context
453      */
454     protected void processLink(CrawlURI curi, final CharSequence value,
455             CharSequence context) {
456         if (TextUtils.matches(JAVASCRIPT, value)) {
457             processScriptCode(curi, value. subSequence(11, value.length()));
458         } else {    
459             if (logger.isLoggable(Level.FINEST)) {
460                 logger.finest("link: " + value.toString() + " from " + curi);
461             }
462             addLinkFromString(curi,
463                 (value instanceof String)?
464                     (String)value: value.toString(),
465                 context, Link.NAVLINK_HOP);
466             this.numberOfLinksExtracted++;
467         }
468     }
469 
470     private void addLinkFromString(CrawlURI curi, String uri,
471             CharSequence context, char hopType) {
472         try {
473             // We do a 'toString' on context because its a sequence from
474             // the underlying ReplayCharSequence and the link its about
475             // to become a part of is expected to outlive the current
476             // ReplayCharSequence.
477             curi.createAndAddLinkRelativeToBase(uri, context.toString(),
478                 hopType);
479         } catch (URIException e) {
480             if (getController() != null) {
481                 getController().logUriError(e, curi.getUURI(), uri);
482             } else {
483                 logger.info("Failed createAndAddLinkRelativeToBase " +
484                     curi + ", " + uri + ", " + context + ", " + hopType +
485                     ": " + e);
486             }
487         }
488     }
489 
490     protected final void processEmbed(CrawlURI curi, CharSequence value,
491             CharSequence context) {
492         processEmbed(curi, value, context, Link.EMBED_HOP);
493     }
494 
495     protected void processEmbed(CrawlURI curi, final CharSequence value,
496             CharSequence context, char hopType) {
497         if (logger.isLoggable(Level.FINEST)) {
498             logger.finest("embed (" + hopType + "): " + value.toString() +
499                 " from " + curi);
500         }
501         addLinkFromString(curi,
502             (value instanceof String)?
503                 (String)value: value.toString(),
504             context, hopType);
505         this.numberOfLinksExtracted++;
506     }
507 
508     public void extract(CrawlURI curi) {
509         if (!isHttpTransactionContentToProcess(curi) ||
510                 ! (isExpectedMimeType(curi.getContentType(), "text/html")
511                    || isExpectedMimeType(curi.getContentType(), "application/xhtml"))) {
512             return;
513         }
514 
515         final boolean ignoreUnexpectedHTML =
516              ((Boolean)getUncheckedAttribute(curi, 
517                  ATTR_IGNORE_UNEXPECTED_HTML)).booleanValue();        
518 
519         if (ignoreUnexpectedHTML) {
520             try {
521                 if(!isHtmlExpectedHere(curi)) {
522                     // HTML was not expected (eg a GIF was expected) so ignore
523                     // (as if a soft 404)
524                     return;
525                 }
526             } catch (URIException e) {
527                 logger.severe("Failed expectedHTML test: " + e.getMessage());
528             }
529         }
530 
531         this.numberOfCURIsHandled++;
532 
533         ReplayCharSequence cs = null;
534         
535         try {
536            HttpRecorder hr = curi.getHttpRecorder();
537            if (hr == null) {
538                throw new IOException("Why is recorder null here?");
539            }
540            cs = hr.getReplayCharSequence();
541         } catch (IOException e) {
542             curi.addLocalizedError(this.getName(), e,
543                 "Failed get of replay char sequence " + curi.toString() +
544                     " " + e.getMessage());
545             logger.log(Level.SEVERE,"Failed get of replay char sequence in " +
546                 Thread.currentThread().getName(), e);
547         }
548         
549         if (cs == null) {
550             return;
551         }
552 
553         // We have a ReplayCharSequence open.  Wrap all in finally so we
554         // for sure close it before we leave.
555         try {
556             // Extract all links from the charsequence
557             extract(curi, cs);
558             // Set flag to indicate that link extraction is completed.
559             curi.linkExtractorFinished();
560         } finally {
561             if (cs != null) {
562                 try {
563                     cs.close();
564                 } catch (IOException ioe) {
565                     logger.warning(TextUtils.exceptionToString(
566                         "Failed close of ReplayCharSequence.", ioe));
567                 }
568             }
569         }
570     }
571 
572     /***
573      * Run extractor.
574      * This method is package visible to ease testing.
575      * @param curi CrawlURI we're processing.
576      * @param cs Sequence from underlying ReplayCharSequence. This
577      * is TRANSIENT data. Make a copy if you want the data to live outside
578      * of this extractors' lifetime.
579      */
580     void extract(CrawlURI curi, CharSequence cs) {
581         Matcher tags = TextUtils.getMatcher(RELEVANT_TAG_EXTRACTOR, cs);
582         while(tags.find()) {
583             if(Thread.interrupted()){
584                 break;
585             }
586             if (tags.start(8) > 0) {
587                 // comment match
588                 // for now do nothing
589             } else if (tags.start(7) > 0) {
590                 // <meta> match
591                 int start = tags.start(5);
592                 int end = tags.end(5);
593                 assert start >= 0: "Start is: " + start + ", " + curi;
594                 assert end >= 0: "End is :" + end + ", " + curi;
595                 if (processMeta(curi,
596                     cs.subSequence(start, end))) {
597 
598                     // meta tag included NOFOLLOW; abort processing
599                     break;
600                 }
601             } else if (tags.start(5) > 0) {
602                 // generic <whatever> match
603                 int start5 = tags.start(5);
604                 int end5 = tags.end(5);
605                 assert start5 >= 0: "Start is: " + start5 + ", " + curi;
606                 assert end5 >= 0: "End is :" + end5 + ", " + curi;
607                 int start6 = tags.start(6);
608                 int end6 = tags.end(6);
609                 assert start6 >= 0: "Start is: " + start6 + ", " + curi;
610                 assert end6 >= 0: "End is :" + end6 + ", " + curi;
611                 processGeneralTag(curi,
612                     cs.subSequence(start6, end6),
613                     cs.subSequence(start5, end5));
614 
615             } else if (tags.start(1) > 0) {
616                 // <script> match
617                 int start = tags.start(1);
618                 int end = tags.end(1);
619                 assert start >= 0: "Start is: " + start + ", " + curi;
620                 assert end >= 0: "End is :" + end + ", " + curi;
621                 assert tags.end(2) >= 0: "Tags.end(2) illegal " + tags.end(2) +
622                     ", " + curi;
623                 processScript(curi, cs.subSequence(start, end),
624                     tags.end(2) - start);
625 
626             } else if (tags.start(3) > 0){
627                 // <style... match
628                 int start = tags.start(3);
629                 int end = tags.end(3);
630                 assert start >= 0: "Start is: " + start + ", " + curi;
631                 assert end >= 0: "End is :" + end + ", " + curi;
632                 assert tags.end(4) >= 0: "Tags.end(4) illegal " + tags.end(4) +
633                     ", " + curi;
634                 processStyle(curi, cs.subSequence(start, end),
635                     tags.end(4) - start);
636             }
637         }
638         TextUtils.recycleMatcher(tags);
639     }
640 
641 
642     static final String NON_HTML_PATH_EXTENSION =
643         "(?i)(gif)|(jp(e)?g)|(png)|(tif(f)?)|(bmp)|(avi)|(mov)|(mp(e)?g)"+
644         "|(mp3)|(mp4)|(swf)|(wav)|(au)|(aiff)|(mid)";
645 
646     /***
647      * Test whether this HTML is so unexpected (eg in place of a GIF URI)
648      * that it shouldn't be scanned for links.
649      *
650      * @param curi CrawlURI to examine.
651      * @return True if HTML is acceptable/expected here
652      * @throws URIException
653      */
654     protected boolean isHtmlExpectedHere(CrawlURI curi) throws URIException {
655         String path = curi.getUURI().getPath();
656         if(path==null) {
657             // no path extension, HTML is fine
658             return true;
659         }
660         int dot = path.lastIndexOf('.');
661         if (dot < 0) {
662             // no path extension, HTML is fine
663             return true;
664         }
665         if(dot<(path.length()-5)) {
666             // extension too long to recognize, HTML is fine
667             return true;
668         }
669         String ext = path.substring(dot+1);
670         return ! TextUtils.matches(NON_HTML_PATH_EXTENSION, ext);
671     }
672 
673     protected void processScript(CrawlURI curi, CharSequence sequence,
674             int endOfOpenTag) {
675         // first, get attributes of script-open tag
676         // as per any other tag
677         processGeneralTag(curi,sequence.subSequence(0,6),
678             sequence.subSequence(0,endOfOpenTag));
679 
680         // then, apply best-effort string-analysis heuristics
681         // against any code present (false positives are OK)
682         processScriptCode(
683             curi, sequence.subSequence(endOfOpenTag, sequence.length()));
684     }
685 
686     /***
687      * Process metadata tags.
688      * @param curi CrawlURI we're processing.
689      * @param cs Sequence from underlying ReplayCharSequence. This
690      * is TRANSIENT data. Make a copy if you want the data to live outside
691      * of this extractors' lifetime.
692      * @return True robots exclusion metatag.
693      */
694     protected boolean processMeta(CrawlURI curi, CharSequence cs) {
695         Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs);
696         String name = null;
697         String httpEquiv = null;
698         String content = null;
699         while (attr.find()) {
700             int valueGroup =
701                 (attr.start(14) > -1) ? 14 : (attr.start(15) > -1) ? 15 : 16;
702             CharSequence value =
703                 cs.subSequence(attr.start(valueGroup), attr.end(valueGroup));
704             if (attr.group(1).equalsIgnoreCase("name")) {
705                 name = value.toString();
706             } else if (attr.group(1).equalsIgnoreCase("http-equiv")) {
707                 httpEquiv = value.toString();
708             } else if (attr.group(1).equalsIgnoreCase("content")) {
709                 content = value.toString();
710             }
711             // TODO: handle other stuff
712         }
713         TextUtils.recycleMatcher(attr);
714 
715         // Look for the 'robots' meta-tag
716         if("robots".equalsIgnoreCase(name) && content != null ) {
717             curi.putString(A_META_ROBOTS, content);
718             RobotsHonoringPolicy policy =
719                 getSettingsHandler().getOrder().getRobotsHonoringPolicy();
720             String contentLower = content.toLowerCase();
721             if ((policy == null
722                 || (!policy.isType(curi, RobotsHonoringPolicy.IGNORE)
723                     && !policy.isType(curi, RobotsHonoringPolicy.CUSTOM)))
724                 && (contentLower.indexOf("nofollow") >= 0
725                     || contentLower.indexOf("none") >= 0)) {
726                 // if 'nofollow' or 'none' is specified and the
727                 // honoring policy is not IGNORE or CUSTOM, end html extraction
728                 logger.fine("HTML extraction skipped due to robots meta-tag for: "
729                                 + curi.toString());
730                 return true;
731             }
732         } else if ("refresh".equalsIgnoreCase(httpEquiv) && content != null) {
733             String refreshUri = content.substring(content.indexOf("=") + 1);
734             try {
735                 curi.createAndAddLinkRelativeToBase(refreshUri, "meta",
736                     Link.REFER_HOP);
737             } catch (URIException e) {
738                 if (getController() != null) {
739                     getController().logUriError(e, curi.getUURI(), refreshUri);
740                 } else {
741                     logger.info("Failed createAndAddLinkRelativeToBase " +
742                         curi + ", " + cs + ", " + refreshUri + ": " + e);
743                 }
744             }
745         }
746         return false;
747     }
748 
749     /***
750      * Process style text.
751      * @param curi CrawlURI we're processing.
752      * @param sequence Sequence from underlying ReplayCharSequence. This
753      * is TRANSIENT data. Make a copy if you want the data to live outside
754      * of this extractors' lifetime.
755      * @param endOfOpenTag
756      */
757     protected void processStyle(CrawlURI curi, CharSequence sequence,
758             int endOfOpenTag) {
759         // First, get attributes of script-open tag as per any other tag.
760         processGeneralTag(curi, sequence.subSequence(0,6),
761             sequence.subSequence(0,endOfOpenTag));
762 
763         // then, parse for URIs
764         this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(
765             curi, sequence.subSequence(endOfOpenTag,sequence.length()),
766                 getController());
767     }
768     
769 
770 
771     /* (non-Javadoc)
772      * @see org.archive.crawler.framework.Processor#report()
773      */
774     public String report() {
775         StringBuffer ret = new StringBuffer();
776         ret.append("Processor: org.archive.crawler.extractor.ExtractorHTML\n");
777         ret.append("  Function:          Link extraction on HTML documents\n");
778         ret.append("  CrawlURIs handled: " + this.numberOfCURIsHandled + "\n");
779         ret.append("  Links extracted:   " + this.numberOfLinksExtracted +
780             "\n\n");
781         return ret.toString();
782     }
783 }
784