View Javadoc

1   /* PublicSuffixes.java
2    *
3    * $Id: BloomFilter32bitSplit.java 5197 2007-06-06 01:31:46Z gojomo $
4    *
5    * Created on Jun 13, 2007
6    *
7    * Copyright (C) 2007 Internet Archive
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  
26  package org.archive.net;
27  
28  import java.io.BufferedReader;
29  import java.io.BufferedWriter;
30  import java.io.FileReader;
31  import java.io.FileWriter;
32  import java.io.IOException;
33  import java.io.InputStreamReader;
34  import java.io.OutputStreamWriter;
35  import java.util.ArrayList;
36  import java.util.Collections;
37  import java.util.Iterator;
38  import java.util.List;
39  import java.util.SortedSet;
40  import java.util.TreeSet;
41  import java.util.regex.Matcher;
42  import java.util.regex.Pattern;
43  
44  import org.apache.commons.io.IOUtils;
45  import org.archive.util.TextUtils;
46  
/**
 * Utility class for making use of the information about 'public suffixes' at
 * http://publicsuffix.org.
 *
 * The public suffix list (once known as 'effective TLDs') was motivated by the
 * need to decide on which broader domains a subdomain was allowed to set
 * cookies. For example, a server at 'www.example.com' can set cookies for
 * 'www.example.com' or 'example.com' but not 'com'. 'www.example.co.uk' can set
 * cookies for 'www.example.co.uk' or 'example.co.uk' but not 'co.uk' or 'uk'.
 * The number of rules for all top-level-domains and 2nd- or 3rd- level domains
 * has become quite long; essentially the broadest domain a subdomain may assign
 * to is the one that was sold/registered to a specific name registrant.
 *
 * This concept should be useful in other contexts, too. Grouping URIs (or
 * queues of URIs to crawl) together with others sharing the same registered
 * suffix may be useful for applying the same rules to all, such as assigning
 * them to the same queue or crawler in a multi-machine setup.
 *
 * The two static caches below are only read and written inside the
 * synchronized accessor methods of this class.
 *
 * @author Gojomo
 */
public class PublicSuffixes {
    /** Lazily compiled form of the regex built from the bundled list. */
    protected static Pattern topmostAssignedSurtPrefixPattern;
    /** Lazily built regex string derived from the bundled list. */
    protected static String topmostAssignedSurtPrefixRegex;

    /**
     * Utility method for dumping a regex String, based on a published public
     * suffix list, which matches any SURT-form hostname up through the broadest
     * 'private' (assigned/sold) domain-segment. That is, for any of the
     * SURT-form hostnames...
     *
     * com,example, com,example,www, com,example,california,www
     *
     * ...the regex will match 'com,example,'.
     *
     * @param args optional; args[0] is the name of a suffix-list file to read
     *            (absent or "=" selects the bundled list), args[1] is the name
     *            of a file to write the regex to (absent selects stdout)
     * @throws IOException if the input file cannot be opened or the output
     *             cannot be written
     */
    public static void main(String args[]) throws IOException {
        String regex;
        if (args.length == 0 || "=".equals(args[0])) {
            // use bundled list
            regex = getTopmostAssignedSurtPrefixRegex();
        } else {
            // use specified filename; close the reader even if parsing fails
            BufferedReader reader = new BufferedReader(new FileReader(args[0]));
            try {
                regex = getTopmostAssignedSurtPrefixRegex(reader);
            } finally {
                closeQuietly(reader);
            }
        }

        if (args.length >= 2) {
            // write to specified file; close() also flushes
            BufferedWriter writer = new BufferedWriter(new FileWriter(args[1]));
            try {
                writer.append(regex);
            } finally {
                writer.close();
            }
        } else {
            // write to stdout; flush only, so System.out stays open
            BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
                    System.out));
            writer.append(regex);
            writer.flush();
        }
    }

    /**
     * Reads a file of the format promulgated by publicsuffix.org, skipping
     * blank lines and '//' comment lines, and converting domain segments to
     * SURT-ordering (reversed, each segment followed by ','). Leaves
     * glob-style '*' wildcarding in place; a '!' (exception marker) is
     * rewritten to '+' so that exception entries sort after the wildcard
     * entry they modify (and therefore before it in the reversed ordering
     * used when the regex is built).
     *
     * @param reader source of the suffix list; not closed by this method
     * @return sorted list of unique SURT-ordered prefixes
     * @throws IOException if reading from the supplied reader fails
     */
    public static List<String> readPublishedFileToSurtList(BufferedReader reader)
            throws IOException {
        // a TreeSet yields the sorted, duplicate-free result directly
        SortedSet<String> unique = new TreeSet<String>();
        String line;
        while ((line = reader.readLine()) != null) {

            // discard whitespace, empty lines, comments
            line = line.trim();
            if (line.length() == 0 || line.startsWith("//")) {
                continue;
            }
            // discard any notation following the entry (keep first token only)
            line = line.split("\\s+")[0];
            // fixed locale: default-locale case rules (e.g. Turkish dotless i)
            // must not alter hostnames
            line = line.toLowerCase(java.util.Locale.ENGLISH);

            // SURT-order domain segments
            String[] segs = line.split("\\.");
            StringBuilder surt = new StringBuilder();
            for (int i = segs.length - 1; i >= 0; i--) {
                if (segs[i].length() > 0) {
                    // current list has a stray '?' in a .no domain;
                    // '!' becomes '+' to mark lookahead-for-exceptions
                    String fixed = segs[i].replace('?', '_').replace('!', '+');
                    surt.append(fixed).append(',');
                }
            }
            // an entry made only of dots contributes nothing
            if (surt.length() > 0) {
                unique.add(surt.toString());
            }
        }
        return new ArrayList<String>(unique);
    }

    /**
     * Converts SURT-ordered list of public prefixes into a Java regex which
     * matches the public-portion "plus one" segment, giving the domain on which
     * cookies can be set or other policy grouping should occur. Also adds to
     * regex a fallback matcher that for any new/unknown TLDs assumes the
     * second-level domain is assignable. (Eg: 'zzz,example,').
     *
     * The regex starts with the embedded flags (?ix), so it is
     * case-insensitive and the line breaks and spaces inserted for
     * readability are ignored when it is compiled.
     *
     * @param list SURT-ordered prefixes, as returned by
     *            {@link #readPublishedFileToSurtList(BufferedReader)}
     * @return regex source string
     */
    private static String surtPrefixRegexFromSurtList(List<String> list) {
        StringBuilder regex = new StringBuilder();
        regex.append("(?ix)^\n");
        TreeSet<String> prefixes = new TreeSet<String>(Collections
                .reverseOrder());
        prefixes.addAll(list);
        prefixes.add("*,"); // for new/unknown TLDs
        buildRegex("", regex, prefixes);
        // the "plus one" segment: one more run of hyphen/word chars and a comma
        regex.append("\n([\\-\\w]+,)");
        // convert glob-stars to word-char-runs (literal, non-regex replace)
        return regex.toString().replace("*", "[\\-\\w]+");
    }

    /**
     * Recursively appends to <code>regex</code> a non-capturing alternation
     * covering every entry of <code>prefixes</code> beyond the shared
     * <code>stem</code>, one character per nesting level.
     *
     * @param stem leading characters already emitted for all given prefixes
     * @param regex buffer being appended to
     * @param prefixes entries beginning with <code>stem</code>; the caller in
     *            this class passes a set in reverse natural order, which the
     *            range computations below rely on
     */
    protected static void buildRegex(String stem, StringBuilder regex,
            SortedSet<String> prefixes) {
        if (prefixes.isEmpty()) {
            return;
        }
        if (prefixes.size() == 1 && prefixes.first().equals(stem)) {
            // avoid unnecessary "(?:)"
            return;
        }
        regex.append("(?:");
        if (stem.length() == 0) {
            regex.append("\n "); // linebreak-space before first character
        }
        Iterator<String> iter = prefixes.iterator();
        char c = 0; // last branching character emitted at this level
        while (iter.hasNext()) {
            String s = iter.next();
            if (s.length() > stem.length()) {
                char d = s.charAt(stem.length());

                if (d == '+') {
                    // convert exception to zero-width-positive-lookahead
                    regex.append("(?=" + s.substring(stem.length() + 1) + ")");
                } else {
                    if (d == c) {
                        // already covered by the recursion for this character
                        continue;
                    }
                    c = d;
                    regex.append(c);
                    String newStem = s.substring(0, stem.length() + 1);
                    // first entry at/after newStem in set order that is not
                    // newStem itself bounds the run of entries sharing newStem
                    SortedSet<String> tail = prefixes.tailSet(newStem);
                    SortedSet<String> range = null;
                    for (String candidate : tail) {
                        if (!candidate.equals(newStem)) {
                            range = prefixes.subSet(s, candidate);
                            break;
                        }
                    }
                    if (range == null) {
                        range = prefixes.tailSet(s);
                    }
                    buildRegex(newStem, regex, range);
                }
                regex.append('|');
            } else {
                // empty suffix; insert dummy to be eaten when loop exits
                regex.append('@');
            }
        }
        // eat the trailing '|' (if no empty '@') or dummy
        regex.deleteCharAt(regex.length() - 1);
        regex.append(')');
        if (stem.length() == 1) {
            regex.append('\n'); // linebreak for TLDs
        }
    }

    /**
     * Returns the compiled pattern for the bundled suffix list, compiling and
     * caching it on first use.
     *
     * @return compiled form of {@link #getTopmostAssignedSurtPrefixRegex()}
     */
    public static synchronized Pattern getTopmostAssignedSurtPrefixPattern() {
        if (topmostAssignedSurtPrefixPattern == null) {
            topmostAssignedSurtPrefixPattern = Pattern
                    .compile(getTopmostAssignedSurtPrefixRegex());
        }
        return topmostAssignedSurtPrefixPattern;
    }

    /**
     * Returns the regex built from the bundled 'effective_tld_names.dat'
     * classpath resource, building and caching it on first use.
     *
     * @return regex source string
     * @throws IllegalStateException if the bundled resource cannot be found
     */
    public static synchronized String getTopmostAssignedSurtPrefixRegex() {
        if (topmostAssignedSurtPrefixRegex == null) {
            // use bundled list
            java.io.InputStream bundled = PublicSuffixes.class.getClassLoader()
                    .getResourceAsStream("effective_tld_names.dat");
            if (bundled == null) {
                throw new IllegalStateException("bundled public suffix list "
                        + "'effective_tld_names.dat' not found on classpath");
            }
            BufferedReader reader = new BufferedReader(new InputStreamReader(
                    bundled));
            try {
                topmostAssignedSurtPrefixRegex = getTopmostAssignedSurtPrefixRegex(reader);
            } finally {
                closeQuietly(reader);
            }
        }
        return topmostAssignedSurtPrefixRegex;
    }

    /**
     * Builds the regex from a suffix list supplied by the caller. Nothing is
     * cached and the reader is not closed.
     *
     * @param reader source of a publicsuffix.org-format list
     * @return regex source string
     * @throws RuntimeException wrapping any IOException raised while reading
     */
    public static String getTopmostAssignedSurtPrefixRegex(BufferedReader reader) {
        List<String> list;
        try {
            list = readPublishedFileToSurtList(reader);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return surtPrefixRegexFromSurtList(list);
    }

    /**
     * Truncate SURT to its topmost assigned domain segment; that is,
     * the public suffix plus one segment, but as a SURT-ordered prefix.
     *
     * If the pattern doesn't match, the passed-in SURT is returned.
     *
     * @param surt SURT to truncate
     * @return truncated-to-topmost-assigned SURT prefix
     */
    public static String reduceSurtToTopmostAssigned(String surt) {
        // reuse the cached compiled pattern rather than recompiling/looking
        // up by the (very long) regex string
        Matcher matcher = getTopmostAssignedSurtPrefixPattern().matcher(surt);
        return matcher.find() ? matcher.group() : surt;
    }

    /** Closes the reader, discarding any failure raised by close(). */
    private static void closeQuietly(BufferedReader reader) {
        try {
            reader.close();
        } catch (IOException ignored) {
            // best-effort close; the data has already been consumed
        }
    }
}