1   /* Copyright (C) 2003 Internet Archive.
2    *
3    * This file is part of the Heritrix web crawler (crawler.archive.org).
4    *
5    * Heritrix is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU Lesser Public License as published by
7    * the Free Software Foundation; either version 2.1 of the License, or
8    * any later version.
9    *
10   * Heritrix is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU Lesser Public License for more details.
14   *
15   * You should have received a copy of the GNU Lesser Public License
16   * along with Heritrix; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   *
19   * Created on Nov 17, 2003
20   *
21   * To change the template for this generated file go to
22   * Window>Preferences>Java>Code Generation>Code and Comments
23   */
24  package org.archive.crawler.extractor;
25  
26  import java.io.IOException;
27  import java.util.logging.Logger;
28  import java.util.regex.Matcher;
29  
30  import org.apache.commons.httpclient.URIException;
31  import org.archive.crawler.datamodel.CoreAttributeConstants;
32  import org.archive.crawler.datamodel.CrawlURI;
33  import org.archive.crawler.framework.CrawlController;
34  import org.archive.io.ReplayCharSequence;
35  import org.archive.net.UURI;
36  import org.archive.util.DevUtils;
37  import org.archive.util.TextUtils;
38  
/**
 * Processes JavaScript files for strings that are likely to be
 * crawlable URIs.
 *
 * @author gojomo
 */
46  public class ExtractorJS extends Extractor implements CoreAttributeConstants {
47  
48      private static final long serialVersionUID = -2231962381454717720L;
49  
50      private static Logger LOGGER =
51          Logger.getLogger("org.archive.crawler.extractor.ExtractorJS");
52  
53      static final String AMP = "&";
54      static final String ESCAPED_AMP = "&";
55      static final String WHITESPACE = "//s";
56  
57      // finds whitespace-free strings in Javascript
58      // (areas between paired ' or " characters, possibly backslash-quoted
59      // on the ends, but not in the middle)
60      static final String JAVASCRIPT_STRING_EXTRACTOR =
61          "(////{0,8}+(?:\"|\'))(//S{0,"+UURI.MAX_URL_LENGTH+"}?)(?://1)";
62      // GROUPS:
63      // (G1) ' or " with optional leading backslashes
64      // (G2) whitespace-free string delimited on boths ends by G1
65  
66      // determines whether a string is likely URI
67      // (no whitespace or '<' '>',  has an internal dot or some slash,
68      // begins and ends with either '/' or a word-char)
69      static final String STRING_URI_DETECTOR =
70          "(?://w|[//.]{0,2}/)[//S&&[^<>]]*(?://.|/)[//S&&[^<>]]*(?://w|/)";
71  
72      protected long numberOfCURIsHandled = 0;
73      protected static long numberOfLinksExtracted = 0;
74  
75      /***
76       * @param name
77       */
78      public ExtractorJS(String name) {
79          super(name, "JavaScript extractor. Link extraction on JavaScript" +
80                  " files (.js).");
81      }
82  
83      /* (non-Javadoc)
84       * @see org.archive.crawler.framework.Processor#process(org.archive.crawler.datamodel.CrawlURI)
85       */
86      public void extract(CrawlURI curi) {
87          if (!isHttpTransactionContentToProcess(curi)) {
88              return;
89          }
90          String contentType = curi.getContentType();
91          if ((contentType == null)) {
92              return;
93          }
94          // If content type is not js and if the viaContext
95          // does not begin with 'script', return.
96          if((contentType.indexOf("javascript") < 0) &&
97              (contentType.indexOf("jscript") < 0) &&
98              (contentType.indexOf("ecmascript") < 0) &&
99              (!curi.toString().toLowerCase().endsWith(".js")) &&
100             (curi.getViaContext() == null || !curi.getViaContext().
101                 toString().toLowerCase().startsWith("script"))) {
102             return;
103         }
104 
105         this.numberOfCURIsHandled++;
106 
107         ReplayCharSequence cs = null;
108         try {
109             cs = curi.getHttpRecorder().getReplayCharSequence();
110         } catch (IOException e) {
111             curi.addLocalizedError(this.getName(), e,
112             	"Failed get of replay char sequence.");
113         }
114         if (cs == null) {
115             LOGGER.warning("Failed getting ReplayCharSequence: " +
116                 curi.toString());
117             return;
118         }
119 
120         try {
121             try {
122                 numberOfLinksExtracted += considerStrings(curi, cs,
123                         getController(), true);
124             } catch (StackOverflowError e) {
125                 DevUtils.warnHandle(e, "ExtractorJS StackOverflowError");
126             }
127             // Set flag to indicate that link extraction is completed.
128             curi.linkExtractorFinished();
129         } finally {
130             // Done w/ the ReplayCharSequence. Close it.
131             if (cs != null) {
132                 try {
133                     cs.close();
134                 } catch (IOException ioe) {
135                     LOGGER.warning(TextUtils.exceptionToString(
136                         "Failed close of ReplayCharSequence.", ioe));
137                 }
138             }
139         }
140     }
141 
142     public static long considerStrings(CrawlURI curi, CharSequence cs,
143             CrawlController controller, boolean handlingJSFile) {
144         long foundLinks = 0;
145         Matcher strings =
146             TextUtils.getMatcher(JAVASCRIPT_STRING_EXTRACTOR, cs);
147         while(strings.find()) {
148             CharSequence subsequence =
149                 cs.subSequence(strings.start(2), strings.end(2));
150             Matcher uri =
151                 TextUtils.getMatcher(STRING_URI_DETECTOR, subsequence);
152             if(uri.matches()) {
153                 String string = uri.group();
154                 string = TextUtils.replaceAll(ESCAPED_AMP, string, AMP);
155                 foundLinks++;
156                 try {
157                     if (handlingJSFile) {
158                         curi.createAndAddLinkRelativeToVia(string,
159                             Link.JS_MISC, Link.SPECULATIVE_HOP);
160                     } else {
161                         curi.createAndAddLinkRelativeToBase(string,
162                             Link.JS_MISC, Link.SPECULATIVE_HOP);
163                     }
164                 } catch (URIException e) {
165                     // There may not be a controller (e.g. If we're being run
166                     // by the extractor tool).
167                     if (controller != null) {
168                         controller.logUriError(e, curi.getUURI(), string);
169                     } else {
170                         LOGGER.info(curi + ", " + string + ": " +
171                             e.getMessage());
172                     }
173                 }
174             } else {
175                foundLinks += considerStrings(curi, subsequence,
176                    controller, handlingJSFile);
177             }
178             TextUtils.recycleMatcher(uri);
179         }
180         TextUtils.recycleMatcher(strings);
181         return foundLinks;
182     }
183 
184     /* (non-Javadoc)
185      * @see org.archive.crawler.framework.Processor#report()
186      */
187     public String report() {
188         StringBuffer ret = new StringBuffer();
189         ret.append("Processor: org.archive.crawler.extractor.ExtractorJS\n");
190         ret.append("  Function:          Link extraction on JavaScript code\n");
191         ret.append("  CrawlURIs handled: " + numberOfCURIsHandled + "\n");
192         ret.append("  Links extracted:   " + numberOfLinksExtracted + "\n\n");
193 
194         return ret.toString();
195     }
196 }