1   /*
2    * ExtractorXML
3    *
4    * $Id: ExtractorXML.java 4653 2006-09-25 18:58:50Z paul_jack $
5    *
6    * Created on Sep 27, 2005
7    *
8    * Copyright (C) 2005 Internet Archive.
9    *
10   * This file is part of the Heritrix web crawler (crawler.archive.org).
11   *
12   * Heritrix is free software; you can redistribute it and/or modify
13   * it under the terms of the GNU Lesser Public License as published by
14   * the Free Software Foundation; either version 2.1 of the License, or
15   * any later version.
16   *
17   * Heritrix is distributed in the hope that it will be useful,
18   * but WITHOUT ANY WARRANTY; without even the implied warranty of
19   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20   * GNU Lesser Public License for more details.
21   *
22   * You should have received a copy of the GNU Lesser Public License
23   * along with Heritrix; if not, write to the Free Software
24   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
25   */
26  
27  package org.archive.crawler.extractor;
28  
29  import java.io.IOException;
30  import java.util.logging.Logger;
31  import java.util.regex.Matcher;
32  
33  import org.apache.commons.httpclient.URIException;
34  import org.archive.crawler.datamodel.CoreAttributeConstants;
35  import org.archive.crawler.datamodel.CrawlURI;
36  import org.archive.crawler.framework.CrawlController;
37  import org.archive.io.ReplayCharSequence;
38  import org.archive.util.TextUtils;
39  
40  /***
41   * A simple extractor which finds HTTP URIs inside XML/RSS files,
42   * inside attribute values and simple elements (those with only
43   * whitespace + HTTP URI + whitespace as contents)
44   *
45   * @author gojomo
46   *
47   **/
48  
49  public class ExtractorXML extends Extractor implements CoreAttributeConstants {
50  
51      private static final long serialVersionUID = 3101230586822401584L;
52  
53      private static Logger logger =
54          Logger.getLogger(ExtractorXML.class.getName());
55  
56      private static String ESCAPED_AMP = "&amp";
57  
58      static final String XML_URI_EXTRACTOR =    
59      "(?i)[\"\'>]//s*(http:[^//s\"\'<>]+)//s*[\"\'<]"; 
60      // GROUPS:
61      // (G1) URI
62      
63      private long numberOfCURIsHandled = 0;
64      private long numberOfLinksExtracted = 0;
65  
66      /***
67       * @param name
68       */
69      public ExtractorXML(String name) {
70          super(name, "XML Extractor. Extracts links from XML/RSS.");
71      }
72  
73      /***
74       * @param curi Crawl URI to process.
75       */
76      public void extract(CrawlURI curi) {
77          if (!isHttpTransactionContentToProcess(curi)) {
78              return;
79          }
80          String mimeType = curi.getContentType();
81          if (mimeType == null) {
82              return;
83          }
84          if ((mimeType.toLowerCase().indexOf("xml") < 0) 
85                  && (!curi.toString().toLowerCase().endsWith(".rss"))
86                  && (!curi.toString().toLowerCase().endsWith(".xml"))) {
87              return;
88          }
89          this.numberOfCURIsHandled++;
90  
91          ReplayCharSequence cs = null;
92          try {
93              cs = curi.getHttpRecorder().getReplayCharSequence();
94          } catch (IOException e) {
95              logger.severe("Failed getting ReplayCharSequence: " + e.getMessage());
96          }
97          if (cs == null) {
98              logger.severe("Failed getting ReplayCharSequence: " +
99                  curi.toString());
100             return;
101         }
102         try {
103             this.numberOfLinksExtracted += processXml(curi, cs,
104                 getController());
105             // Set flag to indicate that link extraction is completed.
106             curi.linkExtractorFinished();
107         } finally {
108             if (cs != null) {
109                 try {
110                     cs.close();
111                 } catch (IOException ioe) {
112                     logger.warning(TextUtils.exceptionToString(
113                             "Failed close of ReplayCharSequence.", ioe));
114                 }
115             }
116         }
117     }
118 
119     public static long processXml(CrawlURI curi, CharSequence cs,
120             CrawlController controller) {
121         long foundLinks = 0;
122         Matcher uris = null;
123         String xmlUri;
124         uris = TextUtils.getMatcher(XML_URI_EXTRACTOR, cs);
125         while (uris.find()) {
126             xmlUri = uris.group(1);
127             // TODO: Escape more HTML Entities.
128             xmlUri = TextUtils.replaceAll(ESCAPED_AMP, xmlUri, "&");
129             foundLinks++;
130             try {
131                 // treat as speculative, as whether context really 
132                 // intends to create a followable/fetchable URI is
133                 // unknown
134                 curi.createAndAddLink(xmlUri,Link.SPECULATIVE_MISC,
135                         Link.SPECULATIVE_HOP);
136             } catch (URIException e) {
137                 // There may not be a controller (e.g. If we're being run
138                 // by the extractor tool).
139                 if (controller != null) {
140                     controller.logUriError(e, curi.getUURI(), xmlUri);
141                 } else {
142                     logger.info(curi + ", " + xmlUri + ": " +
143                         e.getMessage());
144                 }
145             }
146         }
147         TextUtils.recycleMatcher(uris);
148         return foundLinks;
149     }
150 
151     public String report() {
152         StringBuffer ret = new StringBuffer();
153         ret.append("Processor: org.archive.crawler.extractor.ExtractorXML\n");
154         ret.append("  Function:          Link extraction on XML/RSS\n");
155         ret.append("  CrawlURIs handled: " + numberOfCURIsHandled + "\n");
156         ret.append("  Links extracted:   " + numberOfLinksExtracted + "\n\n");
157 
158         return ret.toString();
159     }
160 }