1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26 package org.archive.net;
27
28 import java.io.BufferedReader;
29 import java.io.BufferedWriter;
30 import java.io.FileReader;
31 import java.io.FileWriter;
32 import java.io.IOException;
33 import java.io.InputStreamReader;
34 import java.io.OutputStreamWriter;
35 import java.util.ArrayList;
36 import java.util.Collections;
37 import java.util.Iterator;
38 import java.util.List;
39 import java.util.SortedSet;
40 import java.util.TreeSet;
41 import java.util.regex.Matcher;
42 import java.util.regex.Pattern;
43
44 import org.apache.commons.io.IOUtils;
45 import org.archive.util.TextUtils;
46
/**
 * Utility class for making use of the information about 'public suffixes' at
 * http://publicsuffix.org.
 *
 * <p>The public suffix list (once known as 'effective TLDs') was motivated by
 * the need to decide on which broader domains a subdomain was allowed to set
 * cookies. For example, a server at 'www.example.com' can set cookies for
 * 'www.example.com' or 'example.com' but not 'com'. 'www.example.co.uk' can
 * set cookies for 'www.example.co.uk' or 'example.co.uk' but not 'co.uk' or
 * 'uk'. The number of rules for all top-level domains and 2nd- or 3rd-level
 * domains has become quite long; essentially the broadest domain a subdomain
 * may assign to is the one that was sold/registered to a specific name
 * registrant.
 *
 * <p>This concept should be useful in other contexts, too. Grouping URIs (or
 * queues of URIs to crawl) together with others sharing the same registered
 * suffix may be useful for applying the same rules to all, such as assigning
 * them to the same queue or crawler in a multi-machine setup.
 *
 * <p>The two cached static fields are only assigned inside
 * {@code synchronized} static accessors.
 *
 * @author Gojomo
 */
public class PublicSuffixes {
    /** Classpath resource holding the bundled public suffix list. */
    private static final String BUNDLED_LIST_RESOURCE = "effective_tld_names.dat";

    /** Regex fragment matching one SURT hostname segment (word chars or '-'). */
    private static final String SEGMENT_REGEX = "[\\-\\w]+";

    /** Separates a rule from any whitespace-delimited annotation after it. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /** Lazily compiled form of {@link #topmostAssignedSurtPrefixRegex}. */
    protected static Pattern topmostAssignedSurtPrefixPattern;

    /** Lazily built regex derived from the bundled list resource. */
    protected static String topmostAssignedSurtPrefixRegex;

    /**
     * Utility method for dumping a regex String, based on a published public
     * suffix list, which matches any SURT-form hostname up through the
     * broadest 'private' (assigned/sold) domain-segment. That is, for any of
     * the SURT-form hostnames...
     *
     * <pre>com,example, com,example,www, com,example,california,www</pre>
     *
     * ...the regex will match 'com,example,'.
     *
     * @param args optional; {@code args[0]} is the path of a list file to
     *            read (absent or {@code "="} means the bundled classpath
     *            resource), {@code args[1]} is the path of a file to write
     *            the regex to (absent means standard output)
     * @throws IOException if the input file cannot be opened or the output
     *             cannot be written
     */
    public static void main(String[] args) throws IOException {
        String regex;
        if (args.length == 0 || "=".equals(args[0])) {
            regex = getTopmostAssignedSurtPrefixRegex();
        } else {
            BufferedReader reader = new BufferedReader(new FileReader(args[0]));
            try {
                regex = getTopmostAssignedSurtPrefixRegex(reader);
            } finally {
                // release the file handle even when parsing fails
                closeQuietly(reader);
            }
        }

        if (args.length >= 2) {
            BufferedWriter writer = new BufferedWriter(new FileWriter(args[1]));
            try {
                writer.append(regex);
            } finally {
                // close() also flushes; always release the output file
                writer.close();
            }
        } else {
            // System.out is not ours to close: flush only
            BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(System.out));
            writer.append(regex);
            writer.flush();
        }
    }

    /**
     * Reads a file of the format promulgated by publicsuffix.org and converts
     * each rule to SURT ordering (segments reversed, each followed by ',').
     *
     * <p>Blank lines and lines starting with {@code //} are skipped. Only the
     * first whitespace-delimited token of a line is used, lower-cased. Within
     * a segment a '?' is rewritten to '_' and the '!' exception marker is
     * rewritten to '+', which {@link #buildRegex} later turns into a
     * lookahead. Glob-style '*' wildcards are left in place.
     *
     * @param reader source of list lines; not closed by this method
     * @return sorted (natural String order) list of unique SURT-ordered
     *         prefixes
     * @throws IOException if reading from {@code reader} fails
     */
    public static List<String> readPublishedFileToSurtList(BufferedReader reader)
            throws IOException {
        // A sorted set gives both the ordering and the de-duplication.
        SortedSet<String> surts = new TreeSet<String>();
        String line;
        while ((line = reader.readLine()) != null) {
            line = line.trim();
            if (line.length() == 0 || line.startsWith("//")) {
                continue;
            }
            // keep the rule itself; drop anything after the first whitespace
            line = WHITESPACE.split(line)[0];
            // fixed locale: default-locale lower-casing mangles 'I' in e.g. Turkish
            line = line.toLowerCase(java.util.Locale.ENGLISH);

            // SURT-order the domain segments: last segment first
            String[] segs = line.split("\\.");
            StringBuilder surt = new StringBuilder();
            for (int i = segs.length - 1; i >= 0; i--) {
                if (segs[i].length() > 0) {
                    // '+' sorts after '*', so in the descending set used when
                    // building the regex an exception precedes its wildcard
                    surt.append(segs[i].replace('?', '_').replace('!', '+')).append(',');
                }
            }
            // a line made only of dots carries no rule; do not emit ""
            if (surt.length() > 0) {
                surts.add(surt.toString());
            }
        }
        return new ArrayList<String>(surts);
    }

    /**
     * Converts SURT-ordered list of public prefixes into a Java regex which
     * matches the public-portion "plus one" segment, giving the domain on
     * which cookies can be set or other policy grouping should occur. Also
     * adds to the regex a fallback matcher that for any new/unknown TLDs
     * assumes the second-level domain is assignable. (Eg: 'zzz,example,').
     *
     * @param list SURT-ordered public prefixes, as produced by
     *            {@link #readPublishedFileToSurtList(BufferedReader)}
     * @return regex source using the case-insensitive and comments flags
     */
    private static String surtPrefixRegexFromSurtList(List<String> list) {
        StringBuilder regex = new StringBuilder();
        regex.append("(?ix)^\n");
        // descending order: longer entries precede the stems they extend
        TreeSet<String> prefixes = new TreeSet<String>(Collections.reverseOrder());
        prefixes.addAll(list);
        // fallback: any single unknown top-level segment counts as public
        prefixes.add("*,");
        buildRegex("", regex, prefixes);
        // the "plus one" segment that follows the public portion
        regex.append("\n(" + SEGMENT_REGEX + ",)");
        // expand the glob wildcards only now, as literal text, so the
        // recursive builder can treat '*' as an ordinary character
        return regex.toString().replace("*", SEGMENT_REGEX);
    }

    /**
     * Recursively appends to {@code regex} a non-capturing group with one
     * alternative per distinct character that follows {@code stem} among
     * {@code prefixes}.
     *
     * <p>An entry whose next character is '+' contributes a lookahead for the
     * remainder of that entry instead of a literal branch. An entry no longer
     * than {@code stem} contributes a placeholder that the final
     * delete-last-character step removes, leaving an empty alternative.
     * Newlines are appended at stem lengths 0 and 1; the generated regex is
     * compiled with the comments flag, under which they are ignored.
     *
     * @param stem characters already emitted for every entry in
     *            {@code prefixes}
     * @param regex buffer receiving the regex text
     * @param prefixes entries sharing {@code stem}; the caller in this class
     *            supplies a set in descending order
     */
    protected static void buildRegex(String stem, StringBuilder regex,
            SortedSet<String> prefixes) {
        if (prefixes.isEmpty()) {
            return;
        }
        if (prefixes.size() == 1 && prefixes.first().equals(stem)) {
            // the stem itself is the only entry: nothing further to emit
            return;
        }
        regex.append("(?:");
        if (stem.length() == 0) {
            regex.append("\n ");
        }
        char c = 0; // last branch character emitted at this level
        for (String s : prefixes) {
            if (s.length() > stem.length()) {
                char d = s.charAt(stem.length());
                if (d == '+') {
                    // exception entry: zero-width check for the rest of it
                    regex.append("(?=").append(s.substring(stem.length() + 1)).append(")");
                } else {
                    if (d == c) {
                        // already covered by the branch opened for this character
                        continue;
                    }
                    c = d;
                    regex.append(c);
                    String newStem = s.substring(0, stem.length() + 1);
                    // Entries sharing newStem run from s up to (excluding) the
                    // first later entry that is not newStem itself.
                    SortedSet<String> range = null;
                    for (String candidate : prefixes.tailSet(newStem)) {
                        if (!candidate.equals(newStem)) {
                            range = prefixes.subSet(s, candidate);
                            break;
                        }
                    }
                    if (range == null) {
                        range = prefixes.tailSet(s);
                    }
                    buildRegex(newStem, regex, range);
                }
                regex.append('|');
            } else {
                // empty suffix; placeholder to be eaten by deleteCharAt below
                regex.append('@');
            }
        }
        // drop the trailing '|' or placeholder
        regex.deleteCharAt(regex.length() - 1);
        regex.append(')');
        if (stem.length() == 1) {
            regex.append('\n');
        }
    }

    /**
     * Returns the compiled form of {@link #getTopmostAssignedSurtPrefixRegex()},
     * compiling it on first use and caching it afterwards.
     *
     * @return cached compiled pattern for the bundled list
     */
    public static synchronized Pattern getTopmostAssignedSurtPrefixPattern() {
        if (topmostAssignedSurtPrefixPattern == null) {
            topmostAssignedSurtPrefixPattern =
                    Pattern.compile(getTopmostAssignedSurtPrefixRegex());
        }
        return topmostAssignedSurtPrefixPattern;
    }

    /**
     * Returns the regex built from the bundled list resource, building it on
     * first use and caching it afterwards.
     *
     * <p>NOTE(review): the resource is decoded with the platform default
     * charset; confirm that matches the encoding of the bundled file.
     *
     * @return cached regex source for the bundled list
     * @throws IllegalStateException if the bundled resource is not on the
     *             classpath
     */
    public static synchronized String getTopmostAssignedSurtPrefixRegex() {
        if (topmostAssignedSurtPrefixRegex == null) {
            java.io.InputStream stream = PublicSuffixes.class.getClassLoader()
                    .getResourceAsStream(BUNDLED_LIST_RESOURCE);
            if (stream == null) {
                // fail with the resource name rather than a bare NullPointerException
                throw new IllegalStateException(
                        "classpath resource not found: " + BUNDLED_LIST_RESOURCE);
            }
            BufferedReader reader = new BufferedReader(new InputStreamReader(stream));
            try {
                topmostAssignedSurtPrefixRegex = getTopmostAssignedSurtPrefixRegex(reader);
            } finally {
                closeQuietly(reader);
            }
        }
        return topmostAssignedSurtPrefixRegex;
    }

    /**
     * Builds the topmost-assigned-prefix regex from a public suffix list
     * supplied by the caller.
     *
     * @param reader source of list lines; not closed by this method
     * @return regex source for the supplied list
     * @throws RuntimeException wrapping any {@link IOException} raised while
     *             reading
     */
    public static String getTopmostAssignedSurtPrefixRegex(BufferedReader reader) {
        List<String> list;
        try {
            list = readPublishedFileToSurtList(reader);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return surtPrefixRegexFromSurtList(list);
    }

    /**
     * Truncate SURT to its topmost assigned domain segment; that is, the
     * public suffix plus one segment, but as a SURT-ordered prefix.
     *
     * <p>If the pattern doesn't match, the passed-in SURT is returned.
     *
     * @param surt SURT to truncate
     * @return truncated-to-topmost-assigned SURT prefix
     */
    public static String reduceSurtToTopmostAssigned(String surt) {
        // reuse the compiled pattern cached by this class; a Matcher is per call
        Matcher matcher = getTopmostAssignedSurtPrefixPattern().matcher(surt);
        return matcher.find() ? matcher.group() : surt;
    }

    /** Closes {@code reader}, ignoring any failure to do so. */
    private static void closeQuietly(BufferedReader reader) {
        try {
            reader.close();
        } catch (IOException ignored) {
            // best-effort close of a stream that was only read from
        }
    }
}