1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 package org.archive.crawler.extractor;
28
29 import java.io.IOException;
30 import java.util.logging.Logger;
31 import java.util.regex.Matcher;
32
33 import org.apache.commons.httpclient.URIException;
34 import org.archive.crawler.datamodel.CoreAttributeConstants;
35 import org.archive.crawler.datamodel.CrawlURI;
36 import org.archive.crawler.framework.CrawlController;
37 import org.archive.io.ReplayCharSequence;
38 import org.archive.util.TextUtils;
39
/**
 * A simple extractor which finds HTTP URIs inside XML/RSS files,
 * inside attribute values and simple elements (those with only
 * whitespace + HTTP URI + whitespace as contents).
 *
 * @author gojomo
 */
48
49 public class ExtractorXML extends Extractor implements CoreAttributeConstants {
50
51 private static final long serialVersionUID = 3101230586822401584L;
52
53 private static Logger logger =
54 Logger.getLogger(ExtractorXML.class.getName());
55
56 private static String ESCAPED_AMP = "&";
57
58 static final String XML_URI_EXTRACTOR =
59 "(?i)[\"\'>]//s*(http:[^//s\"\'<>]+)//s*[\"\'<]";
60
61
62
63 private long numberOfCURIsHandled = 0;
64 private long numberOfLinksExtracted = 0;
65
66 /***
67 * @param name
68 */
69 public ExtractorXML(String name) {
70 super(name, "XML Extractor. Extracts links from XML/RSS.");
71 }
72
73 /***
74 * @param curi Crawl URI to process.
75 */
76 public void extract(CrawlURI curi) {
77 if (!isHttpTransactionContentToProcess(curi)) {
78 return;
79 }
80 String mimeType = curi.getContentType();
81 if (mimeType == null) {
82 return;
83 }
84 if ((mimeType.toLowerCase().indexOf("xml") < 0)
85 && (!curi.toString().toLowerCase().endsWith(".rss"))
86 && (!curi.toString().toLowerCase().endsWith(".xml"))) {
87 return;
88 }
89 this.numberOfCURIsHandled++;
90
91 ReplayCharSequence cs = null;
92 try {
93 cs = curi.getHttpRecorder().getReplayCharSequence();
94 } catch (IOException e) {
95 logger.severe("Failed getting ReplayCharSequence: " + e.getMessage());
96 }
97 if (cs == null) {
98 logger.severe("Failed getting ReplayCharSequence: " +
99 curi.toString());
100 return;
101 }
102 try {
103 this.numberOfLinksExtracted += processXml(curi, cs,
104 getController());
105
106 curi.linkExtractorFinished();
107 } finally {
108 if (cs != null) {
109 try {
110 cs.close();
111 } catch (IOException ioe) {
112 logger.warning(TextUtils.exceptionToString(
113 "Failed close of ReplayCharSequence.", ioe));
114 }
115 }
116 }
117 }
118
119 public static long processXml(CrawlURI curi, CharSequence cs,
120 CrawlController controller) {
121 long foundLinks = 0;
122 Matcher uris = null;
123 String xmlUri;
124 uris = TextUtils.getMatcher(XML_URI_EXTRACTOR, cs);
125 while (uris.find()) {
126 xmlUri = uris.group(1);
127
128 xmlUri = TextUtils.replaceAll(ESCAPED_AMP, xmlUri, "&");
129 foundLinks++;
130 try {
131
132
133
134 curi.createAndAddLink(xmlUri,Link.SPECULATIVE_MISC,
135 Link.SPECULATIVE_HOP);
136 } catch (URIException e) {
137
138
139 if (controller != null) {
140 controller.logUriError(e, curi.getUURI(), xmlUri);
141 } else {
142 logger.info(curi + ", " + xmlUri + ": " +
143 e.getMessage());
144 }
145 }
146 }
147 TextUtils.recycleMatcher(uris);
148 return foundLinks;
149 }
150
151 public String report() {
152 StringBuffer ret = new StringBuffer();
153 ret.append("Processor: org.archive.crawler.extractor.ExtractorXML\n");
154 ret.append(" Function: Link extraction on XML/RSS\n");
155 ret.append(" CrawlURIs handled: " + numberOfCURIsHandled + "\n");
156 ret.append(" Links extracted: " + numberOfLinksExtracted + "\n\n");
157
158 return ret.toString();
159 }
160 }