1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.crawler.extractor;
24
25 import java.io.File;
26 import java.io.FileOutputStream;
27 import java.io.IOException;
28 import java.net.URL;
29 import java.util.Collection;
30 import java.util.Iterator;
31
32 import javax.management.AttributeNotFoundException;
33 import javax.management.InvalidAttributeValueException;
34 import javax.management.MBeanException;
35 import javax.management.ReflectionException;
36
37 import org.apache.commons.collections.CollectionUtils;
38 import org.apache.commons.collections.Predicate;
39 import org.apache.commons.httpclient.URIException;
40 import org.archive.crawler.datamodel.CoreAttributeConstants;
41 import org.archive.crawler.datamodel.CrawlOrder;
42 import org.archive.crawler.datamodel.CrawlURI;
43 import org.archive.crawler.settings.MapType;
44 import org.archive.crawler.settings.SettingsHandler;
45 import org.archive.crawler.settings.XMLSettingsHandler;
46 import org.archive.net.UURI;
47 import org.archive.net.UURIFactory;
48 import org.archive.util.HttpRecorder;
49 import org.archive.util.TmpDirTestCase;
50
51
52 /***
53 * Test html extractor.
54 *
55 * @author stack
56 * @version $Revision: 5756 $, $Date: 2008-02-06 07:43:26 +0000 (Wed, 06 Feb 2008) $
57 */
58 public class ExtractorHTMLTest
59 extends TmpDirTestCase
60 implements CoreAttributeConstants {
61 private final String ARCHIVE_DOT_ORG = "archive.org";
62 private final String LINK_TO_FIND = "http://www.hewlett.org/";
63 private HttpRecorder recorder = null;
64 private ExtractorHTML extractor = null;
65
66 protected ExtractorHTML createExtractor()
67 throws InvalidAttributeValueException, AttributeNotFoundException,
68 MBeanException, ReflectionException {
69
70
71
72
73
74
75 final String name = this.getClass().getName();
76 SettingsHandler handler = new XMLSettingsHandler(
77 new File(getTmpDir(), name + ".order.xml"));
78 handler.initialize();
79 return (ExtractorHTML)((MapType)handler.getOrder().
80 getAttribute(CrawlOrder.ATTR_RULES)).addElement(handler.
81 getSettingsObject(null), new ExtractorHTML(name));
82 }
83
84 protected void setUp() throws Exception {
85 super.setUp();
86 this.extractor = createExtractor();
87 final boolean USE_NET = false;
88 URL url = null;
89 if (USE_NET) {
90 url = new URL("http://" + this.ARCHIVE_DOT_ORG);
91 } else {
92 File f = new File(getTmpDir(), this.ARCHIVE_DOT_ORG + ".html");
93 url = f.toURI().toURL();
94 FileOutputStream fos = new FileOutputStream(f);
95 fos.write(("<html><head><title>test</title><body>" +
96 "<a href=" + this.LINK_TO_FIND + ">Hewlett Foundation</a>" +
97 "</body></html>").getBytes());
98 fos.flush();
99 fos.close();
100 }
101 this.recorder = HttpRecorder.wrapInputStreamWithHttpRecord(getTmpDir(),
102 this.getClass().getName(), url.openStream(), null);
103 }
104
105
106
107
108 protected void tearDown() throws Exception {
109 super.tearDown();
110 }
111
112 public void testInnerProcess() throws IOException {
113 UURI uuri = UURIFactory.getInstance("http://" + this.ARCHIVE_DOT_ORG);
114 CrawlURI curi = setupCrawlURI(this.recorder, uuri.toString());
115 this.extractor.innerProcess(curi);
116 Collection links = curi.getOutLinks();
117 boolean foundLinkToHewlettFoundation = false;
118 for (Iterator i = links.iterator(); i.hasNext();) {
119 Link link = (Link)i.next();
120 if (link.getDestination().toString().equals(this.LINK_TO_FIND)) {
121 foundLinkToHewlettFoundation = true;
122 break;
123 }
124 }
125 assertTrue("Did not find gif url", foundLinkToHewlettFoundation);
126 }
127
128 private CrawlURI setupCrawlURI(HttpRecorder rec, String url)
129 throws URIException {
130 CrawlURI curi = new CrawlURI(UURIFactory.getInstance(url));
131 curi.setContentSize(this.recorder.getRecordedInput().getSize());
132 curi.setContentType("text/html");
133 curi.setFetchStatus(200);
134 curi.setHttpRecorder(rec);
135
136 curi.putObject(CoreAttributeConstants.A_HTTP_TRANSACTION,
137 new Object());
138 return curi;
139 }
140
141 /***
142 * Test single net or local filesystem page parse.
143 * Set the uuri to be a net url or instead put in place a file
144 * named for this class under the unit test directory.
145 * @throws IOException
146 * @throws ReflectionException
147 * @throws MBeanException
148 * @throws AttributeNotFoundException
149 * @throws InvalidAttributeValueException
150 */
151 public void testPageParse()
152 throws InvalidAttributeValueException, AttributeNotFoundException,
153 MBeanException, ReflectionException, IOException {
154 UURI uuri = null;
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171 if (uuri != null) {
172 runExtractor(uuri);
173 }
174 }
175
176 protected UURI getUURI(String url) throws URIException {
177 url = (url.indexOf("://") > 0)? url: "file://" + url;
178 return UURIFactory.getInstance(url);
179 }
180
181 protected void runExtractor(UURI baseUURI)
182 throws InvalidAttributeValueException, AttributeNotFoundException,
183 MBeanException, ReflectionException, IOException {
184 runExtractor(baseUURI, null);
185 }
186
187 protected void runExtractor(UURI baseUURI, String encoding)
188 throws IOException, InvalidAttributeValueException,
189 AttributeNotFoundException, MBeanException, ReflectionException {
190 if (baseUURI == null) {
191 return;
192 }
193 this.extractor = createExtractor();
194 URL url = new URL(baseUURI.toString());
195 this.recorder = HttpRecorder.
196 wrapInputStreamWithHttpRecord(getTmpDir(),
197 this.getClass().getName(), url.openStream(), encoding);
198 CrawlURI curi = setupCrawlURI(this.recorder, url.toString());
199 this.extractor.innerProcess(curi);
200
201 System.out.println("+" + this.extractor.report());
202 int count = 0;
203 Collection links = curi.getOutLinks();
204 System.out.println("+HTML Links (hopType="+Link.NAVLINK_HOP+"):");
205 if (links != null) {
206 for (Iterator i = links.iterator(); i.hasNext();) {
207 Link link = (Link)i.next();
208 if (link.getHopType()==Link.NAVLINK_HOP) {
209 count++;
210 System.out.println(link.getDestination());
211 }
212 }
213 }
214 System.out.println("+HTML Embeds (hopType="+Link.EMBED_HOP+"):");
215 if (links != null) {
216 for (Iterator i = links.iterator(); i.hasNext();) {
217 Link link = (Link)i.next();
218 if (link.getHopType()==Link.EMBED_HOP) {
219 count++;
220 System.out.println(link.getDestination());
221 }
222 }
223 }
224 System.out.
225 println("+HTML Speculative Embeds (hopType="+Link.SPECULATIVE_HOP+"):");
226 if (links != null) {
227 for (Iterator i = links.iterator(); i.hasNext();) {
228 Link link = (Link)i.next();
229 if (link.getHopType()==Link.SPECULATIVE_HOP) {
230 count++;
231 System.out.println(link.getDestination());
232 }
233 }
234 }
235 System.out.
236 println("+HTML Other (all other hopTypes):");
237 if (links != null) {
238 for (Iterator i = links.iterator(); i.hasNext();) {
239 Link link = (Link) i.next();
240 if (link.getHopType() != Link.SPECULATIVE_HOP
241 && link.getHopType() != Link.NAVLINK_HOP
242 && link.getHopType() != Link.EMBED_HOP) {
243 count++;
244 System.out.println(link.getHopType() + " "
245 + link.getDestination());
246 }
247 }
248 }
249 System.out.println("TOTAL URIS EXTRACTED: "+count);
250 }
251
252 /***
253 * Test a particular <embed src=...> construct that was suspicious in
254 * the No10GovUk crawl.
255 *
256 * @throws URIException
257 */
258 public void testEmbedSrc() throws URIException {
259 CrawlURI curi=
260 new CrawlURI(UURIFactory.getInstance("http://www.example.org"));
261
262 CharSequence cs = "<embed src=\"/documents/prem/18/1/graphics/qtvr/" +
263 "hall.mov\" width=\"320\" height=\"212\" controller=\"true\" " +
264 "CORRECTION=\"FULL\" pluginspage=\"http://www.apple.com/" +
265 "quicktime/download/\" /> ";
266 this.extractor.extract(curi,cs);
267 assertTrue(CollectionUtils.exists(curi.getOutLinks(), new Predicate() {
268 public boolean evaluate(Object object) {
269 return ((Link) object).getDestination().toString().indexOf(
270 "/documents/prem/18/1/graphics/qtvr/hall.mov")>=0;
271 }
272 }));
273 }
274
275 /***
276 * Test a whitespace issue found in href.
277 *
278 * See [ 963965 ] Either UURI or ExtractHTML should strip whitespace better.
279 * https://sourceforge.net/tracker/?func=detail&atid=539099&aid=963965&group_id=73833
280 *
281 * @throws URIException
282 */
283 public void testHrefWhitespace() throws URIException {
284 CrawlURI curi =
285 new CrawlURI(UURIFactory.getInstance("http://www.carsound.dk"));
286 CharSequence cs = "<a href=\"http://www.carsound.dk\n\n\n" +
287 "\"\ntarget=\"_blank\">C.A.R. Sound\n\n\n\n</a>";
288 this.extractor.extract(curi,cs);
289 curi.getOutLinks();
290 assertTrue("Not stripping new lines", CollectionUtils.exists(curi
291 .getOutLinks(), new Predicate() {
292 public boolean evaluate(Object object) {
293 return ((Link) object).getDestination().toString().indexOf(
294 "http://www.carsound.dk/")>=0;
295 }
296 }));
297 }
298
299 /***
300 * Test a missing whitespace issue found in form
301 *
302 * [HER-1128] ExtractorHTML fails to extract FRAME SRC link without
303 * whitespace before SRC http://webteam.archive.org/jira/browse/HER-1128
304 */
305 public void testNoWhitespaceBeforeValidAttribute() throws URIException {
306 CrawlURI curi = new CrawlURI(UURIFactory
307 .getInstance("http://www.example.com"));
308 CharSequence cs = "<frame name=\"main\"src=\"http://www.example.com/\"> ";
309 this.extractor.extract(curi, cs);
310 Link[] links = curi.getOutLinks().toArray(new Link[0]);
311 assertTrue("no links found",links.length==1);
312 assertTrue("expected link not found",
313 links[0].getDestination().toString().equals("http://www.example.com/"));
314 }
315
316 /***
317 * Test only extract FORM ACTIONS with METHOD GET
318 *
319 * [HER-1280] do not by default GET form action URLs declared as POST,
320 * because it can cause problems/complaints
321 * http://webteam.archive.org/jira/browse/HER-1280
322 */
323 public void testOnlyExtractFormGets() throws URIException {
324 CrawlURI curi = new CrawlURI(UURIFactory
325 .getInstance("http://www.example.com"));
326 CharSequence cs =
327 "<form method=\"get\" action=\"http://www.example.com/ok1\"> "+
328 "<form action=\"http://www.example.com/ok2\" method=\"get\"> "+
329 "<form method=\"post\" action=\"http://www.example.com/notok\"> "+
330 "<form action=\"http://www.example.com/ok3\"> ";
331 this.extractor.extract(curi, cs);
332 Link[] links = curi.getOutLinks().toArray(new Link[0]);
333 assertTrue("incorrect number of links found",links.length==3);
334 }
335
336 public static void main(String[] args) throws Exception {
337 if (args.length != 1 && args.length != 2) {
338 System.err.println("Usage: " + ExtractorHTMLTest.class.getName() +
339 " URL|PATH [ENCODING]");
340 System.exit(1);
341 }
342 ExtractorHTMLTest testCase = new ExtractorHTMLTest();
343 testCase.setUp();
344 try {
345 testCase.runExtractor(testCase.getUURI(args[0]),
346 (args.length == 2)? args[1]: null);
347 } finally {
348 testCase.tearDown();
349 }
350 }
351 }