1   /* JerichoExtractorHTML
2    * 
3    * Copyright (C) 2006 Olaf Freyer
4    *
5    * This file is part of the Heritrix web crawler (crawler.archive.org).
6    *
7    * Heritrix is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU Lesser Public License as published by
9    * the Free Software Foundation; either version 2.1 of the License, or
10   * any later version.
11   *
12   * Heritrix is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU Lesser Public License for more details.
16   *
17   * You should have received a copy of the GNU Lesser Public License
18   * along with Heritrix; if not, write to the Free Software
19   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
20   *
21   * $Id: JerichoExtractorHTML.java 5757 2008-02-06 07:44:20Z Gojomo $
22   */
23  package org.archive.crawler.extractor;
24  
25  import java.util.ArrayList;
26  import java.util.Collection;
27  import java.util.Iterator;
28  import java.util.LinkedList;
29  import java.util.List;
30  import java.util.logging.Level;
31  import java.util.logging.Logger;
32  
33  import org.apache.commons.httpclient.URIException;
34  import org.apache.commons.lang.StringEscapeUtils;
35  import org.apache.commons.lang.StringUtils;
36  import org.archive.crawler.datamodel.CoreAttributeConstants;
37  import org.archive.crawler.datamodel.CrawlURI;
38  import org.archive.crawler.datamodel.RobotsHonoringPolicy;
39  import org.archive.net.UURI;
40  import org.archive.net.UURIFactory;
41  import org.archive.util.DevUtils;
42  import org.archive.util.TextUtils;
43  
44  import au.id.jericho.lib.html.Attribute;
45  import au.id.jericho.lib.html.Attributes;
46  import au.id.jericho.lib.html.Element;
47  import au.id.jericho.lib.html.FormControl;
48  import au.id.jericho.lib.html.FormControlType;
49  import au.id.jericho.lib.html.FormField;
50  import au.id.jericho.lib.html.FormFields;
51  import au.id.jericho.lib.html.HTMLElementName;
52  import au.id.jericho.lib.html.Source;
53  import au.id.jericho.lib.html.StartTagType;
54  
55  /***
56   * Improved link-extraction from an HTML content-body using jericho-html parser.
57   * This extractor extends ExtractorHTML and mimics its workflow - but has some
58   * substantial differences when it comes to internal implementation. Instead
59   * of heavily relying upon java regular expressions it uses a real html parser
60   * library - namely Jericho HTML Parser (http://jerichohtml.sourceforge.net).
61   * Using this parser it can better handle broken html (i.e. missing quotes)
62   * and also offer improved extraction of HTML form URLs (not only extract
63   * the action of a form, but also its default values).
64   * Unfortunately this parser also has one major drawback - it has to read the
65   * whole document into memory for parsing, thus has an inherent OOME risk.
66   * This OOME risk can be reduced/eleminated by limiting the size of documents
67   * to be parsed (i.e. using NotExceedsDocumentLengthTresholdDecideRule).
68   * Also note that this extractor seems to have a lower overall memory 
69   * consumption compared to ExtractorHTML. (still to be confirmed on a larger 
70   * scale crawl) 
71   * 
72   * @author Olaf Freyer
73   * @version $Date: 2008-02-06 07:44:20 +0000 (Wed, 06 Feb 2008) $ $Revision: 5757 $
74   */
75  public class JerichoExtractorHTML extends ExtractorHTML implements
76          CoreAttributeConstants {
77  
78      private static final long serialVersionUID = 1684681316546343615L;
79  
80      private Logger logger = Logger.getLogger(this.getClass().getName());
81  
82      protected long numberOfFormsProcessed = 0;
83  
84      public JerichoExtractorHTML(String name) {
85          this(name, "Jericho-HTML extractor. Extracts links from HTML " +
86                  "documents using Jericho HTML Parser. Offers same " + 
87                  "basic functionality as ExtractorHTML but better " +
88                  "handles broken HTML and extraction of default " +
89                  "values from HTML forms. A word of warning: the used " +
90                  "parser, the Jericho HTML Parser, reads the whole " +
91                  "document into memory for " +
92                  "parsing - thus this extractor has an inherent OOME risk. " +
93                  "This OOME risk can be reduced/eleminated by limiting the " +
94                  "size of documents to be parsed (i.e. using " +
95                  "NotExceedsDocumentLengthTresholdDecideRule). ");
96      }
97  
98      public JerichoExtractorHTML(String name, String description) {
99          super(name, description);
100     }
101 
102     private static List<Attribute> findOnAttributes(Attributes attributes) {
103         List<Attribute> result = new LinkedList<Attribute>();
104         for (Iterator attrIter = attributes.iterator(); attrIter.hasNext();) {
105             Attribute attr = (Attribute) attrIter.next();
106             if (attr.getKey().startsWith("on"))
107                 result.add(attr);
108         }
109         return result;
110     }
111 
112     protected void processGeneralTag(CrawlURI curi, Element element,
113             Attributes attributes) {
114         Attribute attr;
115         String attrValue;
116         List attrList;
117         String elementName = element.getName();
118 
119         // Just in case it's an OBJECT or APPLET tag
120         String codebase = null;
121         ArrayList<String> resources = null;
122 
123         final boolean framesAsEmbeds = ((Boolean) getUncheckedAttribute(curi,
124                 ATTR_TREAT_FRAMES_AS_EMBED_LINKS)).booleanValue();
125 
126         final boolean ignoreFormActions = ((Boolean) getUncheckedAttribute(
127                 curi, ATTR_IGNORE_FORM_ACTION_URLS)).booleanValue();
128 
129         final boolean overlyEagerLinkDetection =
130             ((Boolean)getUncheckedAttribute(
131                 curi, EXTRACT_VALUE_ATTRIBUTES)).booleanValue();
132 
133         // HREF
134         if (((attr = attributes.get("href")) != null) &&
135             ((attrValue = attr.getValue()) != null)) {
136             CharSequence context = Link.elementContext(elementName, attr
137                     .getKey());
138             if ("link".equals(elementName)) {
139                 // <LINK> elements treated as embeds (css, ico, etc)
140                 processEmbed(curi, attrValue, context);
141             } else {
142                 // other HREFs treated as links
143                 processLink(curi, attrValue, context);
144             }
145             if ("base".equals(elementName)) {
146                 try {
147                     curi.setBaseURI(attrValue);
148                 } catch (URIException e) {
149                     if (getController() != null) {
150                         // Controller can be null: e.g. when running
151                         // ExtractorTool.
152                         getController().logUriError(e, curi.getUURI(),
153                                 attrValue);
154                     } else {
155                         logger.info("Failed set base uri: " + curi + ", "
156                                 + attrValue + ": " + e.getMessage());
157                     }
158                 }
159             }
160         }
161         // ACTION
162         if (((attr = attributes.get("action")) != null) &&
163                  ((attrValue = attr.getValue()) != null)) {
164             if (!ignoreFormActions) {
165                 CharSequence context = Link.elementContext(elementName, attr
166                         .getKey());
167                 processLink(curi, attrValue, context);
168             }
169         }
170         // ON_
171         if ((attrList = findOnAttributes(attributes)).size() != 0) {
172             for (Iterator attrIter = attrList.iterator(); attrIter.hasNext();) {
173                 attr = (Attribute) attrIter.next();
174                 CharSequence valueSegment = attr.getValueSegment();
175                 if (valueSegment != null)
176                     processScriptCode(curi, valueSegment);
177 
178             }
179         }
180         // SRC atc.
181         if ((((attr = attributes.get("src")) != null)
182                 || ((attr = attributes.get("lowsrc")) != null)
183                 || ((attr = attributes.get("background")) != null)
184                 || ((attr = attributes.get("cite")) != null)
185                 || ((attr = attributes.get("longdesc")) != null)
186                 || ((attr = attributes.get("usemap")) != null)
187                 || ((attr = attributes.get("profile")) != null)
188                 || ((attr = attributes.get("datasrc")) != null)) &&
189                    ((attrValue = attr.getValue()) != null)) {
190 
191             final char hopType;
192             CharSequence context = Link.elementContext(elementName, attr
193                     .getKey());
194 
195             if (!framesAsEmbeds
196                     && ("frame".equals(elementName) || "iframe"
197                             .equals(elementName)))
198                 hopType = Link.NAVLINK_HOP;
199             else
200                 hopType = Link.EMBED_HOP;
201 
202             processEmbed(curi, attrValue, context, hopType);
203         }
204         // CODEBASE
205         if (((attr = attributes.get("codebase")) != null) &&
206                  ((attrValue = attr.getValue()) != null)) {
207             codebase = StringEscapeUtils.unescapeHtml(attrValue);
208             CharSequence context = Link.elementContext(elementName, attr
209                     .getKey());
210             processEmbed(curi, codebase, context);
211         }
212         // CLASSID DATA
213         if ((((attr = attributes.get("classid")) != null)
214                 || ((attr = attributes.get("data")) != null)) &&
215                    ((attrValue = attr.getValue()) != null)) {
216             if (resources == null)
217                 resources = new ArrayList<String>();
218             resources.add(attrValue);
219         }
220         // ARCHIVE
221         if (((attr = attributes.get("archive")) != null) &&
222                  ((attrValue = attr.getValue()) != null)) {
223             if (resources == null)
224                 resources = new ArrayList<String>();
225             String[] multi = TextUtils.split(WHITESPACE, attrValue);
226             for (int i = 0; i < multi.length; i++) {
227                 resources.add(multi[i]);
228             }
229         }
230         // CODE
231         if (((attr = attributes.get("code")) != null) &&
232                  ((attrValue = attr.getValue()) != null)) {
233             if (resources == null)
234                 resources = new ArrayList<String>();
235             // If element is applet and code value does not end with
236             // '.class' then append '.class' to the code value.
237             if (APPLET.equals(elementName) && !attrValue.endsWith(CLASSEXT)) {
238                 resources.add(attrValue + CLASSEXT);
239             } else {
240                 resources.add(attrValue);
241             }
242         }
243         // VALUE
244         if (((attr = attributes.get("value")) != null) &&
245                  ((attrValue = attr.getValue()) != null)) {
246             if (TextUtils.matches(LIKELY_URI_PATH, attrValue)
247                     && overlyEagerLinkDetection) {
248                 CharSequence context = Link.elementContext(elementName, attr
249                         .getKey());
250                 processLink(curi, attrValue, context);
251             }
252 
253         }
254         // STYLE
255         if (((attr = attributes.get("style")) != null) &&
256                  ((attrValue = attr.getValue()) != null)) {
257             // STYLE inline attribute
258             // then, parse for URIs
259             this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(curi,
260                     attrValue, getController());
261         }
262 
263         // handle codebase/resources
264         if (resources == null)
265             return;
266 
267         Iterator<String> iter = resources.iterator();
268         UURI codebaseURI = null;
269         String res = null;
270         try {
271             if (codebase != null) {
272                 // TODO: Pass in the charset.
273                 codebaseURI = UURIFactory.getInstance(curi.getUURI(), codebase);
274             }
275             while (iter.hasNext()) {
276                 res = iter.next();
277                 res = StringEscapeUtils.unescapeHtml(res);
278                 if (codebaseURI != null) {
279                     res = codebaseURI.resolve(res).toString();
280                 }
281                 processEmbed(curi, res, element); // TODO: include attribute
282                                                     // too
283             }
284         } catch (URIException e) {
285             curi.addLocalizedError(getName(), e, "BAD CODEBASE " + codebase);
286         } catch (IllegalArgumentException e) {
287             DevUtils.logger.log(Level.WARNING, "processGeneralTag()\n"
288                     + "codebase=" + codebase + " res=" + res + "\n"
289                     + DevUtils.extraInfo(), e);
290         }
291     }
292 
293     protected boolean processMeta(CrawlURI curi, Element element) {
294         String name = element.getAttributeValue("name");
295         String httpEquiv = element.getAttributeValue("http-equiv");
296         String content = element.getAttributeValue("content");
297 
298         if ("robots".equals(name) && content != null) {
299             curi.putString(A_META_ROBOTS, content);
300             RobotsHonoringPolicy policy = getSettingsHandler().getOrder()
301                     .getRobotsHonoringPolicy();
302             String contentLower = content.toLowerCase();
303             if ((policy == null || (!policy.isType(curi,
304                     RobotsHonoringPolicy.IGNORE) && !policy.isType(curi,
305                     RobotsHonoringPolicy.CUSTOM)))
306                     && (contentLower.indexOf("nofollow") >= 0 || contentLower
307                             .indexOf("none") >= 0)) {
308                 // if 'nofollow' or 'none' is specified and the
309                 // honoring policy is not IGNORE or CUSTOM, end html extraction
310                 logger.fine("HTML extraction skipped due to robots meta-tag " +
311                     "for: " + curi.toString());
312                 return true;
313             }
314         }
315         if ("refresh".equals(httpEquiv) && content != null) {
316             String refreshUri = content.substring(content.indexOf("=") + 1);
317             try {
318                 curi.createAndAddLinkRelativeToBase(refreshUri, "meta",
319                         Link.REFER_HOP);
320             } catch (URIException e) {
321                 if (getController() != null) {
322                     getController().logUriError(e, curi.getUURI(), refreshUri);
323                 } else {
324                     logger.info("Failed createAndAddLinkRelativeToBase " + curi
325                             + ", " + element.toString() + ", " + refreshUri
326                             + ": " + e);
327                 }
328             }
329         }
330         return false;
331     }
332 
333     protected void processScript(CrawlURI curi, Element element) {
334         // first, get attributes of script-open tag
335         // as per any other tag
336         processGeneralTag(curi, element, element.getAttributes());
337 
338         // then, apply best-effort string-analysis heuristics
339         // against any code present (false positives are OK)
340         processScriptCode(curi, element.getContent());
341 
342     }
343 
344     protected void processStyle(CrawlURI curi, Element element) {
345         // First, get attributes of script-open tag as per any other tag.
346         processGeneralTag(curi, element, element.getAttributes());
347 
348         // then, parse for URIs
349         this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(curi,
350                 element.getContent(), getController());
351     }
352 
353     protected void processForm(CrawlURI curi, Element element) {
354         String action = element.getAttributeValue("action");
355         String name = element.getAttributeValue("name");
356         String queryURL = "";
357 
358         final boolean ignoreFormActions = ((Boolean) getUncheckedAttribute(
359                 curi, ATTR_IGNORE_FORM_ACTION_URLS)).booleanValue();
360 
361         if (ignoreFormActions) {
362             return;
363         }
364         
365         // method-sensitive extraction
366         String method = StringUtils.defaultIfEmpty(
367                 element.getAttributeValue("method"), "GET");
368         if(((Boolean)getUncheckedAttribute(curi,
369                  ATTR_EXTRACT_ONLY_FORM_GETS)).booleanValue() 
370                  && ! "GET".equalsIgnoreCase(method)) {
371              return;
372         }
373 
374         numberOfFormsProcessed++;
375 
376         // get all form fields
377         FormFields formFields = element.findFormFields();
378         for (Iterator fieldsIter = formFields.iterator(); fieldsIter.hasNext();) {
379             // for each form field
380             FormField formField = (FormField) fieldsIter.next();
381 
382             // for each form control
383             for (Iterator controlIter = formField.getFormControls().iterator();
384                 controlIter.hasNext();) {
385                 FormControl formControl = (FormControl) controlIter.next();
386 
387                 // get name of control element (and URLEncode it)
388                 String controlName = formControl.getName();
389 
390                 // retrieve list of values - submit needs special handling
391                 Collection controlValues;
392                 if (!(formControl.getFormControlType() ==
393                         FormControlType.SUBMIT)) {
394                     controlValues = formControl.getValues();
395                 } else {
396                     controlValues = formControl.getPredefinedValues();
397                 }
398 
399                 if (controlValues.size() > 0) {
400                     // for each value set
401                     for (Iterator valueIter = controlValues.iterator();
402                             valueIter.hasNext();) {
403                         String value = (String) valueIter.next();
404                         queryURL += "&" + controlName + "=" + value;
405                     }
406                 } else {
407                     queryURL += "&" + controlName + "=";
408                 }
409             }
410         }
411 
412         // clean up url
413         if (action == null) {
414             queryURL = queryURL.replaceFirst("&", "?");
415         } else {
416             if (!action.contains("?"))
417                 queryURL = queryURL.replaceFirst("&", "?");
418             queryURL = action + queryURL;
419         }
420 
421         CharSequence context = Link.elementContext(element.getName(),
422             "name=" + name);
423         processLink(curi, queryURL, context);
424 
425     }
426 
427     /***
428      * Run extractor. This method is package visible to ease testing.
429      * 
430      * @param curi
431      *            CrawlURI we're processing.
432      * @param cs
433      *            Sequence from underlying ReplayCharSequence.
434      */
435     void extract(CrawlURI curi, CharSequence cs) {
436         Source source = new Source(cs);
437         List elements = source.findAllElements(StartTagType.NORMAL);
438         for (Iterator elementIter = elements.iterator();
439                 elementIter.hasNext();) {
440             Element element = (Element) elementIter.next();
441             String elementName = element.getName();
442             Attributes attributes;
443             if (elementName.equals(HTMLElementName.META)) {
444                 if (processMeta(curi, element)) {
445                     // meta tag included NOFOLLOW; abort processing
446                     break;
447                 }
448             } else if (elementName.equals(HTMLElementName.SCRIPT)) {
449                 processScript(curi, element);
450             } else if (elementName.equals(HTMLElementName.STYLE)) {
451                 processStyle(curi, element);
452             } else if (elementName.equals(HTMLElementName.FORM)) {
453                 processForm(curi, element);
454             } else if (!(attributes = element.getAttributes()).isEmpty()) {
455                 processGeneralTag(curi, element, attributes);
456             }
457         }
458     }
459 
460     /*
461      * (non-Javadoc)
462      * 
463      * @see org.archive.crawler.framework.Processor#report()
464      */
465     public String report() {
466         StringBuffer ret = new StringBuffer();
467         ret.append("Processor: org.archive.crawler.extractor.JerichoExtractorHTML\n");
468         ret.append("  Function:          Link extraction on HTML documents\n");
469         ret.append("  CrawlURIs handled: " + this.numberOfCURIsHandled + "\n");
470         ret.append("  Forms processed:   " + this.numberOfFormsProcessed + "\n");
471         ret.append("  Links extracted:   " + this.numberOfLinksExtracted + "\n\n");
472         return ret.toString();
473     }
474 }