1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.crawler.extractor;
24
25 import java.util.ArrayList;
26 import java.util.Collection;
27 import java.util.Iterator;
28 import java.util.LinkedList;
29 import java.util.List;
30 import java.util.logging.Level;
31 import java.util.logging.Logger;
32
33 import org.apache.commons.httpclient.URIException;
34 import org.apache.commons.lang.StringEscapeUtils;
35 import org.apache.commons.lang.StringUtils;
36 import org.archive.crawler.datamodel.CoreAttributeConstants;
37 import org.archive.crawler.datamodel.CrawlURI;
38 import org.archive.crawler.datamodel.RobotsHonoringPolicy;
39 import org.archive.net.UURI;
40 import org.archive.net.UURIFactory;
41 import org.archive.util.DevUtils;
42 import org.archive.util.TextUtils;
43
44 import au.id.jericho.lib.html.Attribute;
45 import au.id.jericho.lib.html.Attributes;
46 import au.id.jericho.lib.html.Element;
47 import au.id.jericho.lib.html.FormControl;
48 import au.id.jericho.lib.html.FormControlType;
49 import au.id.jericho.lib.html.FormField;
50 import au.id.jericho.lib.html.FormFields;
51 import au.id.jericho.lib.html.HTMLElementName;
52 import au.id.jericho.lib.html.Source;
53 import au.id.jericho.lib.html.StartTagType;
54
55 /***
56 * Improved link-extraction from an HTML content-body using jericho-html parser.
57 * This extractor extends ExtractorHTML and mimics its workflow - but has some
58 * substantial differences when it comes to internal implementation. Instead
59 * of heavily relying upon java regular expressions it uses a real html parser
60 * library - namely Jericho HTML Parser (http://jerichohtml.sourceforge.net).
61 * Using this parser it can better handle broken html (i.e. missing quotes)
62 * and also offer improved extraction of HTML form URLs (not only extract
63 * the action of a form, but also its default values).
64 * Unfortunately this parser also has one major drawback - it has to read the
65 * whole document into memory for parsing, thus has an inherent OOME risk.
66 * This OOME risk can be reduced/eleminated by limiting the size of documents
67 * to be parsed (i.e. using NotExceedsDocumentLengthTresholdDecideRule).
68 * Also note that this extractor seems to have a lower overall memory
69 * consumption compared to ExtractorHTML. (still to be confirmed on a larger
70 * scale crawl)
71 *
72 * @author Olaf Freyer
73 * @version $Date: 2008-02-06 07:44:20 +0000 (Wed, 06 Feb 2008) $ $Revision: 5757 $
74 */
75 public class JerichoExtractorHTML extends ExtractorHTML implements
76 CoreAttributeConstants {
77
78 private static final long serialVersionUID = 1684681316546343615L;
79
80 private Logger logger = Logger.getLogger(this.getClass().getName());
81
82 protected long numberOfFormsProcessed = 0;
83
84 public JerichoExtractorHTML(String name) {
85 this(name, "Jericho-HTML extractor. Extracts links from HTML " +
86 "documents using Jericho HTML Parser. Offers same " +
87 "basic functionality as ExtractorHTML but better " +
88 "handles broken HTML and extraction of default " +
89 "values from HTML forms. A word of warning: the used " +
90 "parser, the Jericho HTML Parser, reads the whole " +
91 "document into memory for " +
92 "parsing - thus this extractor has an inherent OOME risk. " +
93 "This OOME risk can be reduced/eleminated by limiting the " +
94 "size of documents to be parsed (i.e. using " +
95 "NotExceedsDocumentLengthTresholdDecideRule). ");
96 }
97
98 public JerichoExtractorHTML(String name, String description) {
99 super(name, description);
100 }
101
102 private static List<Attribute> findOnAttributes(Attributes attributes) {
103 List<Attribute> result = new LinkedList<Attribute>();
104 for (Iterator attrIter = attributes.iterator(); attrIter.hasNext();) {
105 Attribute attr = (Attribute) attrIter.next();
106 if (attr.getKey().startsWith("on"))
107 result.add(attr);
108 }
109 return result;
110 }
111
112 protected void processGeneralTag(CrawlURI curi, Element element,
113 Attributes attributes) {
114 Attribute attr;
115 String attrValue;
116 List attrList;
117 String elementName = element.getName();
118
119
120 String codebase = null;
121 ArrayList<String> resources = null;
122
123 final boolean framesAsEmbeds = ((Boolean) getUncheckedAttribute(curi,
124 ATTR_TREAT_FRAMES_AS_EMBED_LINKS)).booleanValue();
125
126 final boolean ignoreFormActions = ((Boolean) getUncheckedAttribute(
127 curi, ATTR_IGNORE_FORM_ACTION_URLS)).booleanValue();
128
129 final boolean overlyEagerLinkDetection =
130 ((Boolean)getUncheckedAttribute(
131 curi, EXTRACT_VALUE_ATTRIBUTES)).booleanValue();
132
133
134 if (((attr = attributes.get("href")) != null) &&
135 ((attrValue = attr.getValue()) != null)) {
136 CharSequence context = Link.elementContext(elementName, attr
137 .getKey());
138 if ("link".equals(elementName)) {
139
140 processEmbed(curi, attrValue, context);
141 } else {
142
143 processLink(curi, attrValue, context);
144 }
145 if ("base".equals(elementName)) {
146 try {
147 curi.setBaseURI(attrValue);
148 } catch (URIException e) {
149 if (getController() != null) {
150
151
152 getController().logUriError(e, curi.getUURI(),
153 attrValue);
154 } else {
155 logger.info("Failed set base uri: " + curi + ", "
156 + attrValue + ": " + e.getMessage());
157 }
158 }
159 }
160 }
161
162 if (((attr = attributes.get("action")) != null) &&
163 ((attrValue = attr.getValue()) != null)) {
164 if (!ignoreFormActions) {
165 CharSequence context = Link.elementContext(elementName, attr
166 .getKey());
167 processLink(curi, attrValue, context);
168 }
169 }
170
171 if ((attrList = findOnAttributes(attributes)).size() != 0) {
172 for (Iterator attrIter = attrList.iterator(); attrIter.hasNext();) {
173 attr = (Attribute) attrIter.next();
174 CharSequence valueSegment = attr.getValueSegment();
175 if (valueSegment != null)
176 processScriptCode(curi, valueSegment);
177
178 }
179 }
180
181 if ((((attr = attributes.get("src")) != null)
182 || ((attr = attributes.get("lowsrc")) != null)
183 || ((attr = attributes.get("background")) != null)
184 || ((attr = attributes.get("cite")) != null)
185 || ((attr = attributes.get("longdesc")) != null)
186 || ((attr = attributes.get("usemap")) != null)
187 || ((attr = attributes.get("profile")) != null)
188 || ((attr = attributes.get("datasrc")) != null)) &&
189 ((attrValue = attr.getValue()) != null)) {
190
191 final char hopType;
192 CharSequence context = Link.elementContext(elementName, attr
193 .getKey());
194
195 if (!framesAsEmbeds
196 && ("frame".equals(elementName) || "iframe"
197 .equals(elementName)))
198 hopType = Link.NAVLINK_HOP;
199 else
200 hopType = Link.EMBED_HOP;
201
202 processEmbed(curi, attrValue, context, hopType);
203 }
204
205 if (((attr = attributes.get("codebase")) != null) &&
206 ((attrValue = attr.getValue()) != null)) {
207 codebase = StringEscapeUtils.unescapeHtml(attrValue);
208 CharSequence context = Link.elementContext(elementName, attr
209 .getKey());
210 processEmbed(curi, codebase, context);
211 }
212
213 if ((((attr = attributes.get("classid")) != null)
214 || ((attr = attributes.get("data")) != null)) &&
215 ((attrValue = attr.getValue()) != null)) {
216 if (resources == null)
217 resources = new ArrayList<String>();
218 resources.add(attrValue);
219 }
220
221 if (((attr = attributes.get("archive")) != null) &&
222 ((attrValue = attr.getValue()) != null)) {
223 if (resources == null)
224 resources = new ArrayList<String>();
225 String[] multi = TextUtils.split(WHITESPACE, attrValue);
226 for (int i = 0; i < multi.length; i++) {
227 resources.add(multi[i]);
228 }
229 }
230
231 if (((attr = attributes.get("code")) != null) &&
232 ((attrValue = attr.getValue()) != null)) {
233 if (resources == null)
234 resources = new ArrayList<String>();
235
236
237 if (APPLET.equals(elementName) && !attrValue.endsWith(CLASSEXT)) {
238 resources.add(attrValue + CLASSEXT);
239 } else {
240 resources.add(attrValue);
241 }
242 }
243
244 if (((attr = attributes.get("value")) != null) &&
245 ((attrValue = attr.getValue()) != null)) {
246 if (TextUtils.matches(LIKELY_URI_PATH, attrValue)
247 && overlyEagerLinkDetection) {
248 CharSequence context = Link.elementContext(elementName, attr
249 .getKey());
250 processLink(curi, attrValue, context);
251 }
252
253 }
254
255 if (((attr = attributes.get("style")) != null) &&
256 ((attrValue = attr.getValue()) != null)) {
257
258
259 this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(curi,
260 attrValue, getController());
261 }
262
263
264 if (resources == null)
265 return;
266
267 Iterator<String> iter = resources.iterator();
268 UURI codebaseURI = null;
269 String res = null;
270 try {
271 if (codebase != null) {
272
273 codebaseURI = UURIFactory.getInstance(curi.getUURI(), codebase);
274 }
275 while (iter.hasNext()) {
276 res = iter.next();
277 res = StringEscapeUtils.unescapeHtml(res);
278 if (codebaseURI != null) {
279 res = codebaseURI.resolve(res).toString();
280 }
281 processEmbed(curi, res, element);
282
283 }
284 } catch (URIException e) {
285 curi.addLocalizedError(getName(), e, "BAD CODEBASE " + codebase);
286 } catch (IllegalArgumentException e) {
287 DevUtils.logger.log(Level.WARNING, "processGeneralTag()\n"
288 + "codebase=" + codebase + " res=" + res + "\n"
289 + DevUtils.extraInfo(), e);
290 }
291 }
292
293 protected boolean processMeta(CrawlURI curi, Element element) {
294 String name = element.getAttributeValue("name");
295 String httpEquiv = element.getAttributeValue("http-equiv");
296 String content = element.getAttributeValue("content");
297
298 if ("robots".equals(name) && content != null) {
299 curi.putString(A_META_ROBOTS, content);
300 RobotsHonoringPolicy policy = getSettingsHandler().getOrder()
301 .getRobotsHonoringPolicy();
302 String contentLower = content.toLowerCase();
303 if ((policy == null || (!policy.isType(curi,
304 RobotsHonoringPolicy.IGNORE) && !policy.isType(curi,
305 RobotsHonoringPolicy.CUSTOM)))
306 && (contentLower.indexOf("nofollow") >= 0 || contentLower
307 .indexOf("none") >= 0)) {
308
309
310 logger.fine("HTML extraction skipped due to robots meta-tag " +
311 "for: " + curi.toString());
312 return true;
313 }
314 }
315 if ("refresh".equals(httpEquiv) && content != null) {
316 String refreshUri = content.substring(content.indexOf("=") + 1);
317 try {
318 curi.createAndAddLinkRelativeToBase(refreshUri, "meta",
319 Link.REFER_HOP);
320 } catch (URIException e) {
321 if (getController() != null) {
322 getController().logUriError(e, curi.getUURI(), refreshUri);
323 } else {
324 logger.info("Failed createAndAddLinkRelativeToBase " + curi
325 + ", " + element.toString() + ", " + refreshUri
326 + ": " + e);
327 }
328 }
329 }
330 return false;
331 }
332
333 protected void processScript(CrawlURI curi, Element element) {
334
335
336 processGeneralTag(curi, element, element.getAttributes());
337
338
339
340 processScriptCode(curi, element.getContent());
341
342 }
343
344 protected void processStyle(CrawlURI curi, Element element) {
345
346 processGeneralTag(curi, element, element.getAttributes());
347
348
349 this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(curi,
350 element.getContent(), getController());
351 }
352
353 protected void processForm(CrawlURI curi, Element element) {
354 String action = element.getAttributeValue("action");
355 String name = element.getAttributeValue("name");
356 String queryURL = "";
357
358 final boolean ignoreFormActions = ((Boolean) getUncheckedAttribute(
359 curi, ATTR_IGNORE_FORM_ACTION_URLS)).booleanValue();
360
361 if (ignoreFormActions) {
362 return;
363 }
364
365
366 String method = StringUtils.defaultIfEmpty(
367 element.getAttributeValue("method"), "GET");
368 if(((Boolean)getUncheckedAttribute(curi,
369 ATTR_EXTRACT_ONLY_FORM_GETS)).booleanValue()
370 && ! "GET".equalsIgnoreCase(method)) {
371 return;
372 }
373
374 numberOfFormsProcessed++;
375
376
377 FormFields formFields = element.findFormFields();
378 for (Iterator fieldsIter = formFields.iterator(); fieldsIter.hasNext();) {
379
380 FormField formField = (FormField) fieldsIter.next();
381
382
383 for (Iterator controlIter = formField.getFormControls().iterator();
384 controlIter.hasNext();) {
385 FormControl formControl = (FormControl) controlIter.next();
386
387
388 String controlName = formControl.getName();
389
390
391 Collection controlValues;
392 if (!(formControl.getFormControlType() ==
393 FormControlType.SUBMIT)) {
394 controlValues = formControl.getValues();
395 } else {
396 controlValues = formControl.getPredefinedValues();
397 }
398
399 if (controlValues.size() > 0) {
400
401 for (Iterator valueIter = controlValues.iterator();
402 valueIter.hasNext();) {
403 String value = (String) valueIter.next();
404 queryURL += "&" + controlName + "=" + value;
405 }
406 } else {
407 queryURL += "&" + controlName + "=";
408 }
409 }
410 }
411
412
413 if (action == null) {
414 queryURL = queryURL.replaceFirst("&", "?");
415 } else {
416 if (!action.contains("?"))
417 queryURL = queryURL.replaceFirst("&", "?");
418 queryURL = action + queryURL;
419 }
420
421 CharSequence context = Link.elementContext(element.getName(),
422 "name=" + name);
423 processLink(curi, queryURL, context);
424
425 }
426
427 /***
428 * Run extractor. This method is package visible to ease testing.
429 *
430 * @param curi
431 * CrawlURI we're processing.
432 * @param cs
433 * Sequence from underlying ReplayCharSequence.
434 */
435 void extract(CrawlURI curi, CharSequence cs) {
436 Source source = new Source(cs);
437 List elements = source.findAllElements(StartTagType.NORMAL);
438 for (Iterator elementIter = elements.iterator();
439 elementIter.hasNext();) {
440 Element element = (Element) elementIter.next();
441 String elementName = element.getName();
442 Attributes attributes;
443 if (elementName.equals(HTMLElementName.META)) {
444 if (processMeta(curi, element)) {
445
446 break;
447 }
448 } else if (elementName.equals(HTMLElementName.SCRIPT)) {
449 processScript(curi, element);
450 } else if (elementName.equals(HTMLElementName.STYLE)) {
451 processStyle(curi, element);
452 } else if (elementName.equals(HTMLElementName.FORM)) {
453 processForm(curi, element);
454 } else if (!(attributes = element.getAttributes()).isEmpty()) {
455 processGeneralTag(curi, element, attributes);
456 }
457 }
458 }
459
460
461
462
463
464
465 public String report() {
466 StringBuffer ret = new StringBuffer();
467 ret.append("Processor: org.archive.crawler.extractor.JerichoExtractorHTML\n");
468 ret.append(" Function: Link extraction on HTML documents\n");
469 ret.append(" CrawlURIs handled: " + this.numberOfCURIsHandled + "\n");
470 ret.append(" Forms processed: " + this.numberOfFormsProcessed + "\n");
471 ret.append(" Links extracted: " + this.numberOfLinksExtracted + "\n\n");
472 return ret.toString();
473 }
474 }