View Javadoc

1   /* ExtractorHTMLTest
2    *
3    * Created on May 19, 2004
4    *
5    * Copyright (C) 2004 Internet Archive.
6    *
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    *
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   *
14   * Heritrix is distributed in the hope that it will be useful,
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   *
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.crawler.extractor;
24  
25  import java.io.File;
26  import java.io.FileOutputStream;
27  import java.io.IOException;
28  import java.net.URL;
29  import java.util.Collection;
30  import java.util.Iterator;
31  
32  import javax.management.AttributeNotFoundException;
33  import javax.management.InvalidAttributeValueException;
34  import javax.management.MBeanException;
35  import javax.management.ReflectionException;
36  
37  import org.apache.commons.collections.CollectionUtils;
38  import org.apache.commons.collections.Predicate;
39  import org.apache.commons.httpclient.URIException;
40  import org.archive.crawler.datamodel.CoreAttributeConstants;
41  import org.archive.crawler.datamodel.CrawlOrder;
42  import org.archive.crawler.datamodel.CrawlURI;
43  import org.archive.crawler.settings.MapType;
44  import org.archive.crawler.settings.SettingsHandler;
45  import org.archive.crawler.settings.XMLSettingsHandler;
46  import org.archive.net.UURI;
47  import org.archive.net.UURIFactory;
48  import org.archive.util.HttpRecorder;
49  import org.archive.util.TmpDirTestCase;
50  
51  
52  /***
53   * Test html extractor.
54   *
55   * @author stack
56   * @version $Revision: 5756 $, $Date: 2008-02-06 07:43:26 +0000 (Wed, 06 Feb 2008) $
57   */
58  public class ExtractorHTMLTest
59  extends TmpDirTestCase
60  implements CoreAttributeConstants {
61      private final String ARCHIVE_DOT_ORG = "archive.org";
62      private final String LINK_TO_FIND = "http://www.hewlett.org/";
63      private HttpRecorder recorder = null;
64      private ExtractorHTML extractor = null;
65      
66      protected ExtractorHTML createExtractor()
67      throws InvalidAttributeValueException, AttributeNotFoundException,
68      MBeanException, ReflectionException {
69          // Hack in a settings handler.  Do this by adding this extractor
70          // to the order file (I'm adding it to a random MapType; seemingly
71          // can only add to MapTypes post-construction). This takes care
72          // of setting a valid SettingsHandler into the ExtractorHTML (This
73          // shouldn't be so difficult).  Of note, the order file below is
74          // not written to disk.
75          final String name = this.getClass().getName();
76          SettingsHandler handler = new XMLSettingsHandler(
77              new File(getTmpDir(), name + ".order.xml"));
78          handler.initialize();
79          return (ExtractorHTML)((MapType)handler.getOrder().
80              getAttribute(CrawlOrder.ATTR_RULES)).addElement(handler.
81                  getSettingsObject(null), new ExtractorHTML(name));
82      }
83      
84      protected void setUp() throws Exception {
85          super.setUp();
86          this.extractor = createExtractor();
87          final boolean USE_NET = false;
88          URL url = null;
89          if (USE_NET) {
90              url = new URL("http://" + this.ARCHIVE_DOT_ORG);
91          } else {
92              File f = new File(getTmpDir(), this.ARCHIVE_DOT_ORG + ".html");
93              url = f.toURI().toURL();
94              FileOutputStream fos = new FileOutputStream(f);
95              fos.write(("<html><head><title>test</title><body>" +
96                  "<a href=" + this.LINK_TO_FIND + ">Hewlett Foundation</a>" +
97                  "</body></html>").getBytes());
98              fos.flush();
99              fos.close();
100         }
101         this.recorder = HttpRecorder.wrapInputStreamWithHttpRecord(getTmpDir(),
102             this.getClass().getName(), url.openStream(), null);
103     }
104 
105     /*
106      * @see TestCase#tearDown()
107      */
108     protected void tearDown() throws Exception {
109         super.tearDown();
110     }
111 
112     public void testInnerProcess() throws IOException {
113         UURI uuri = UURIFactory.getInstance("http://" + this.ARCHIVE_DOT_ORG);
114         CrawlURI curi = setupCrawlURI(this.recorder, uuri.toString());
115         this.extractor.innerProcess(curi);
116         Collection links = curi.getOutLinks();
117         boolean foundLinkToHewlettFoundation = false;
118         for (Iterator i = links.iterator(); i.hasNext();) {
119             Link link = (Link)i.next();
120             if (link.getDestination().toString().equals(this.LINK_TO_FIND)) {
121                 foundLinkToHewlettFoundation = true;
122                 break;
123             }
124         }
125         assertTrue("Did not find gif url", foundLinkToHewlettFoundation);
126     }
127     
128     private CrawlURI setupCrawlURI(HttpRecorder rec, String url)
129     		throws URIException {
130         CrawlURI curi = new CrawlURI(UURIFactory.getInstance(url));
131         curi.setContentSize(this.recorder.getRecordedInput().getSize());
132         curi.setContentType("text/html");
133         curi.setFetchStatus(200);
134         curi.setHttpRecorder(rec);
135         // Fake out the extractor that this is a HTTP transaction.
136         curi.putObject(CoreAttributeConstants.A_HTTP_TRANSACTION,
137             new Object());
138         return curi;
139     }
140     
141     /***
142      * Test single net or local filesystem page parse.
143      * Set the uuri to be a net url or instead put in place a file
144      * named for this class under the unit test directory.
145      * @throws IOException
146      * @throws ReflectionException
147      * @throws MBeanException
148      * @throws AttributeNotFoundException
149      * @throws InvalidAttributeValueException
150      */
151     public void testPageParse()
152     throws InvalidAttributeValueException, AttributeNotFoundException,
153     MBeanException, ReflectionException, IOException {
154         UURI uuri = null;
155         
156 // DO
157 //      uuri = UURIFactory.getInstance("http://www.xjmu.edu.cn/");
158 // OR
159 //        File f = new File(getTmpDir(), this.getClass().getName() +
160 //        ".html");
161 //        if (f.exists()) {
162 //        	uuri = UURIFactory.getInstance("file://" +
163 //        			f.getAbsolutePath());
164 //        }
165 // OR 
166 //      uuri = getUURI(URL or PATH)
167 //
168 // OR 
169 //      Use the main method below and pass this class an argument.
170 //     
171         if (uuri != null) {
172         	runExtractor(uuri);
173         }
174     }
175     
176     protected UURI getUURI(String url) throws URIException {
177         url = (url.indexOf("://") > 0)? url: "file://" + url;
178         return UURIFactory.getInstance(url);
179     }
180     
181     protected void runExtractor(UURI baseUURI)
182     throws InvalidAttributeValueException, AttributeNotFoundException,
183     MBeanException, ReflectionException, IOException {
184         runExtractor(baseUURI, null);
185     }
186     
187     protected void runExtractor(UURI baseUURI, String encoding)
188     throws IOException, InvalidAttributeValueException,
189     AttributeNotFoundException, MBeanException, ReflectionException {
190         if (baseUURI == null) {
191         	return;
192         }
193         this.extractor = createExtractor();
194         URL url = new URL(baseUURI.toString());
195         this.recorder = HttpRecorder.
196             wrapInputStreamWithHttpRecord(getTmpDir(),
197             this.getClass().getName(), url.openStream(), encoding);
198         CrawlURI curi = setupCrawlURI(this.recorder, url.toString());
199         this.extractor.innerProcess(curi);
200         
201         System.out.println("+" + this.extractor.report());
202         int count = 0; 
203         Collection links = curi.getOutLinks();
204         System.out.println("+HTML Links (hopType="+Link.NAVLINK_HOP+"):");
205         if (links != null) {
206             for (Iterator i = links.iterator(); i.hasNext();) {
207                 Link link = (Link)i.next();
208                 if (link.getHopType()==Link.NAVLINK_HOP) {
209                     count++;
210                     System.out.println(link.getDestination());
211                 }
212             }
213         }
214         System.out.println("+HTML Embeds (hopType="+Link.EMBED_HOP+"):");
215         if (links != null) {
216             for (Iterator i = links.iterator(); i.hasNext();) {
217                 Link link = (Link)i.next();
218                 if (link.getHopType()==Link.EMBED_HOP) {
219                     count++;
220                     System.out.println(link.getDestination());
221                 }
222             }
223         }
224         System.out.
225             println("+HTML Speculative Embeds (hopType="+Link.SPECULATIVE_HOP+"):");
226         if (links != null) {
227             for (Iterator i = links.iterator(); i.hasNext();) {
228                 Link link = (Link)i.next();
229                 if (link.getHopType()==Link.SPECULATIVE_HOP) {
230                     count++;
231                     System.out.println(link.getDestination());
232                 }
233             }
234         }
235         System.out.
236             println("+HTML Other (all other hopTypes):");
237         if (links != null) {
238             for (Iterator i = links.iterator(); i.hasNext();) {
239                 Link link = (Link) i.next();
240                 if (link.getHopType() != Link.SPECULATIVE_HOP
241                         && link.getHopType() != Link.NAVLINK_HOP
242                         && link.getHopType() != Link.EMBED_HOP) {
243                     count++;
244                     System.out.println(link.getHopType() + " "
245                             + link.getDestination());
246                 }
247             }
248         }
249         System.out.println("TOTAL URIS EXTRACTED: "+count);
250     }
251 
252     /***
253      * Test a particular <embed src=...> construct that was suspicious in
254      * the No10GovUk crawl.
255      *
256      * @throws URIException
257      */
258     public void testEmbedSrc() throws URIException {
259         CrawlURI curi=
260             new CrawlURI(UURIFactory.getInstance("http://www.example.org"));
261         // An example from http://www.records.pro.gov.uk/documents/prem/18/1/default.asp?PageId=62&qt=true
262         CharSequence cs = "<embed src=\"/documents/prem/18/1/graphics/qtvr/" +
263             "hall.mov\" width=\"320\" height=\"212\" controller=\"true\" " +
264             "CORRECTION=\"FULL\" pluginspage=\"http://www.apple.com/" +
265             "quicktime/download/\" /> ";
266         this.extractor.extract(curi,cs);
267         assertTrue(CollectionUtils.exists(curi.getOutLinks(), new Predicate() {
268             public boolean evaluate(Object object) {
269                 return ((Link) object).getDestination().toString().indexOf(
270                         "/documents/prem/18/1/graphics/qtvr/hall.mov")>=0;
271             }
272         }));
273     }
274     
275     /***
276      * Test a whitespace issue found in href.
277      * 
278      * See [ 963965 ] Either UURI or ExtractHTML should strip whitespace better.
279      * https://sourceforge.net/tracker/?func=detail&atid=539099&aid=963965&group_id=73833
280      *
281      * @throws URIException
282      */
283     public void testHrefWhitespace() throws URIException {
284         CrawlURI curi =
285             new CrawlURI(UURIFactory.getInstance("http://www.carsound.dk"));
286         CharSequence cs = "<a href=\"http://www.carsound.dk\n\n\n" +
287         	"\"\ntarget=\"_blank\">C.A.R. Sound\n\n\n\n</a>";   
288         this.extractor.extract(curi,cs);
289         curi.getOutLinks();
290         assertTrue("Not stripping new lines", CollectionUtils.exists(curi
291                 .getOutLinks(), new Predicate() {
292             public boolean evaluate(Object object) {
293                 return ((Link) object).getDestination().toString().indexOf(
294                         "http://www.carsound.dk/")>=0;
295             }
296         }));
297     }
298     
299     /***
300      * Test a missing whitespace issue found in form
301      * 
302      * [HER-1128] ExtractorHTML fails to extract FRAME SRC link without
303      * whitespace before SRC http://webteam.archive.org/jira/browse/HER-1128
304      */
305     public void testNoWhitespaceBeforeValidAttribute() throws URIException {
306         CrawlURI curi = new CrawlURI(UURIFactory
307                 .getInstance("http://www.example.com"));
308         CharSequence cs = "<frame name=\"main\"src=\"http://www.example.com/\"> ";
309         this.extractor.extract(curi, cs);
310         Link[] links = curi.getOutLinks().toArray(new Link[0]);
311         assertTrue("no links found",links.length==1);
312         assertTrue("expected link not found", 
313                 links[0].getDestination().toString().equals("http://www.example.com/"));
314     }
315     
316     /***
317      * Test only extract FORM ACTIONS with METHOD GET 
318      * 
319      * [HER-1280] do not by default GET form action URLs declared as POST, 
320      * because it can cause problems/complaints 
321      * http://webteam.archive.org/jira/browse/HER-1280
322      */
323     public void testOnlyExtractFormGets() throws URIException {
324         CrawlURI curi = new CrawlURI(UURIFactory
325                 .getInstance("http://www.example.com"));
326         CharSequence cs = 
327             "<form method=\"get\" action=\"http://www.example.com/ok1\"> "+
328             "<form action=\"http://www.example.com/ok2\" method=\"get\"> "+
329             "<form method=\"post\" action=\"http://www.example.com/notok\"> "+
330             "<form action=\"http://www.example.com/ok3\"> ";
331         this.extractor.extract(curi, cs);
332         Link[] links = curi.getOutLinks().toArray(new Link[0]);
333         assertTrue("incorrect number of links found",links.length==3);
334     }
335     
336     public static void main(String[] args) throws Exception {
337         if (args.length != 1 && args.length != 2) {
338             System.err.println("Usage: " + ExtractorHTMLTest.class.getName() +
339                 " URL|PATH [ENCODING]");
340             System.exit(1);
341         }
342         ExtractorHTMLTest testCase = new ExtractorHTMLTest();
343         testCase.setUp();
344         try {
345             testCase.runExtractor(testCase.getUURI(args[0]),
346                 (args.length == 2)? args[1]: null);
347         } finally {
348             testCase.tearDown();
349         }
350     }
351 }