View Javadoc

1   /* Copyright (C) 2003 Internet Archive.
2    *
3    * This file is part of the Heritrix web crawler (crawler.archive.org).
4    *
5    * Heritrix is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU Lesser Public License as published by
7    * the Free Software Foundation; either version 2.1 of the License, or
8    * any later version.
9    *
10   * Heritrix is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU Lesser Public License for more details.
14   *
15   * You should have received a copy of the GNU Lesser Public License
16   * along with Heritrix; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   *
19   * Created on Nov 17, 2003
20   *
21   * To change the template for this generated file go to
22   * Window>Preferences>Java>Code Generation>Code and Comments
23   */
24  package org.archive.extractor;
25  
26  import java.util.LinkedList;
27  import java.util.regex.Matcher;
28  import java.util.regex.Pattern;
29  
30  import org.apache.commons.httpclient.URIException;
31  import org.archive.crawler.extractor.Link;
32  import org.archive.net.UURI;
33  import org.archive.net.UURIFactory;
34  import org.archive.util.TextUtils;
35  
36  /***
37   * Uses regular expressions to find likely URIs inside Javascript.
38   *
39   * ROUGH DRAFT IN PROGRESS / incomplete... untested...
40   * 
41   * @author gojomo
42   */
43  public class RegexpJSLinkExtractor extends CharSequenceLinkExtractor {
44  //    private static Logger logger =
45  //        Logger.getLogger(RegexpJSLinkExtractor.class.getName());
46  
47      static final String AMP = "&";
48      static final String ESCAPED_AMP = "&";
49      static final String WHITESPACE = "//s";
50  
51      // finds whitespace-free strings in Javascript
52      // (areas between paired ' or " characters, possibly backslash-quoted
53      // on the ends, but not in the middle)
54      static final Pattern JAVASCRIPT_STRING_EXTRACTOR = Pattern.compile(
55          "(////{0,8}+(?:\"|\'))(.+?)(?://1)");
56  
57      // determines whether a string is likely URI
58      // (no whitespace or '<' '>',  has an internal dot or some slash,
59      // begins and ends with either '/' or a word-char)
60      static final Pattern STRING_URI_DETECTOR = Pattern.compile(
61          "(?://w|[//.]{0,2}/)[//S&&[^<>]]*(?://.|/)[//S&&[^<>]]*(?://w|/)");
62  
63      Matcher strings;
64      LinkedList<Matcher> matcherStack = new LinkedList<Matcher>();
65  
66      protected boolean findNextLink() {
67          if(strings==null) {
68               strings = JAVASCRIPT_STRING_EXTRACTOR.matcher(sourceContent);
69          }
70          while(strings!=null) {
71              while(strings.find()) {
72                  CharSequence subsequence =
73                      sourceContent.subSequence(strings.start(2), strings.end(2));
74                  Matcher uri = STRING_URI_DETECTOR.matcher(subsequence);
75                  if ((subsequence.length() <= UURI.MAX_URL_LENGTH) && uri.matches()) {
76                      String string = uri.group();
77                      string = TextUtils.replaceAll(ESCAPED_AMP, string, AMP);
78                      try {
79                          Link link = new Link(source, UURIFactory.getInstance(
80                                  source, string), Link.JS_MISC, Link.SPECULATIVE_HOP);
81                          next.add(link);
82                          return true;
83                      } catch (URIException e) {
84                          extractErrorListener.noteExtractError(e,source,string);
85                      }
86                  } else {
87                     //  push current range
88                     matcherStack.addFirst(strings);
89                     // start looking inside string
90                     strings = JAVASCRIPT_STRING_EXTRACTOR.matcher(subsequence);
91                  }
92              }
93              // continue at enclosing range, if available
94              strings = (Matcher) (matcherStack.isEmpty() ? null : matcherStack.removeFirst());
95          }
96          return false;
97      }
98  
99  
100     /* (non-Javadoc)
101      * @see org.archive.extractor.LinkExtractor#reset()
102      */
103     public void reset() {
104         super.reset();
105         matcherStack.clear();
106         strings = null;
107     }
108 
109     protected static CharSequenceLinkExtractor newDefaultInstance() {
110         return new RegexpJSLinkExtractor();
111     }
112 }