1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.util;
26
27 import java.io.BufferedInputStream;
28 import java.io.BufferedOutputStream;
29 import java.io.BufferedReader;
30 import java.io.FileInputStream;
31 import java.io.FileOutputStream;
32 import java.io.FileWriter;
33 import java.io.IOException;
34 import java.io.InputStream;
35 import java.io.InputStreamReader;
36 import java.io.PrintStream;
37 import java.io.Reader;
38 import java.util.Iterator;
39 import java.util.SortedSet;
40 import java.util.TreeSet;
41
42 import org.apache.commons.httpclient.URIException;
43 import org.archive.net.UURI;
44 import org.archive.net.UURIFactory;
45 import org.archive.util.iterator.LineReadingIterator;
46 import org.archive.util.iterator.RegexpLineIterator;
47
48 /***
49 * Specialized TreeSet for keeping a set of String prefixes.
50 *
51 * Redundant prefixes (those that are themselves prefixed
52 * by other set entries) are eliminated.
53 *
54 * @author gojomo
55 */
56 public class SurtPrefixSet extends TreeSet<String> {
57
58 private static final long serialVersionUID = 2598365040524933110L;
59
60 private static final String SURT_PREFIX_DIRECTIVE = "+";
61
62 /***
63 * Test whether the given String is prefixed by one
64 * of this set's entries.
65 *
66 * @param s
67 * @return True if contains prefix.
68 */
69 public boolean containsPrefixOf(String s) {
70 SortedSet sub = headSet(s);
71
72
73 if (!sub.isEmpty() && s.startsWith((String)sub.last())) {
74 return true;
75 }
76 return contains(s);
77 }
78
79 /***
80 * Maintains additional invariant: if one entry is a
81 * prefix of another, keep only the prefix.
82 *
83 * @see java.util.Collection#add(java.lang.Object)
84 */
85 public boolean add(String s) {
86 SortedSet sub = headSet(s);
87 if (!sub.isEmpty() && s.startsWith((String)sub.last())) {
88
89 return false;
90 }
91 boolean retVal = super.add(s);
92 sub = tailSet(s+"\0");
93 while(!sub.isEmpty() && ((String)sub.first()).startsWith(s)) {
94
95 sub.remove(sub.first());
96 }
97 return retVal;
98 }
99
100
101 /***
102 * Read a set of SURT prefixes from a reader source; keep sorted and
103 * with redundant entries removed.
104 *
105 * @param r reader over file of SURT_format strings
106 * @throws IOException
107 */
108 public void importFrom(Reader r) {
109 BufferedReader reader = new BufferedReader(r);
110 String s;
111
112 Iterator iter =
113 new RegexpLineIterator(
114 new LineReadingIterator(reader),
115 RegexpLineIterator.COMMENT_LINE,
116 RegexpLineIterator.NONWHITESPACE_ENTRY_TRAILING_COMMENT,
117 RegexpLineIterator.ENTRY);
118
119 while (iter.hasNext()) {
120 s = (String) iter.next();
121 add(s.toLowerCase());
122 }
123 }
124
125 /***
126 * @param r Where to read from.
127 */
128 public void importFromUris(Reader r) {
129 BufferedReader reader = new BufferedReader(r);
130 String s;
131
132 Iterator iter =
133 new RegexpLineIterator(
134 new LineReadingIterator(reader),
135 RegexpLineIterator.COMMENT_LINE,
136 RegexpLineIterator.NONWHITESPACE_ENTRY_TRAILING_COMMENT,
137 RegexpLineIterator.ENTRY);
138
139 while (iter.hasNext()) {
140 s = (String) iter.next();
141
142 addFromPlain(s);
143 }
144 }
145
146 /***
147 * Import SURT prefixes from a reader with mixed URI and SURT prefix
148 * format.
149 *
150 * @param r the reader to import the prefixes from
151 * @param deduceFromSeeds true to also import SURT prefixes implied
152 * from normal URIs/hostname seeds
153 */
154 public void importFromMixed(Reader r, boolean deduceFromSeeds) {
155 BufferedReader reader = new BufferedReader(r);
156 String s;
157
158 Iterator iter =
159 new RegexpLineIterator(
160 new LineReadingIterator(reader),
161 RegexpLineIterator.COMMENT_LINE,
162 RegexpLineIterator.NONWHITESPACE_ENTRY_TRAILING_COMMENT,
163 RegexpLineIterator.ENTRY);
164
165 while (iter.hasNext()) {
166 s = (String) iter.next();
167 if(s.startsWith(SURT_PREFIX_DIRECTIVE)) {
168
169 String u = s.substring(SURT_PREFIX_DIRECTIVE.length()).trim();
170 if(u.indexOf("(")>0) {
171
172 add(u.toLowerCase());
173 } else {
174
175
176 addFromPlain(u);
177 }
178
179 continue;
180 } else {
181 if(deduceFromSeeds) {
182
183
184 addFromPlain(s);
185 }
186 }
187 }
188 }
189
190 /***
191 * Given a plain URI or hostname, deduce an implied SURT prefix from
192 * it and add to active prefixes.
193 *
194 * @param u String of URI or hostname
195 */
196 private void addFromPlain(String u) {
197 u = prefixFromPlain(u);
198 add(u);
199 }
200
201 /***
202 * Given a plain URI or hostname/hostname+path, deduce an implied SURT
203 * prefix from it. Results may be unpredictable on strings that cannot
204 * be interpreted as URIs.
205 *
206 * UURI 'fixup' is applied to the URI that is built.
207 *
208 * @param u URI or almost-URI to consider
209 * @return implied SURT prefix form
210 */
211 public static String prefixFromPlain(String u) {
212 u = ArchiveUtils.addImpliedHttpIfNecessary(u);
213 u = coerceFromHttpsForComparison(u);
214 boolean trailingSlash = u.endsWith("/");
215
216 try {
217 u = UURIFactory.getInstance(u).toString();
218 } catch (URIException e) {
219 e.printStackTrace();
220
221 }
222
223
224
225
226 if(!trailingSlash && u.endsWith("/")) {
227 u = u.substring(0,u.length()-1);
228 }
229
230 u = SURT.fromURI(u);
231
232 u = SurtPrefixSet.asPrefix(u);
233 return u;
234 }
235
236 /***
237 * For SURT comparisons -- prefixes or candidates being checked against
238 * those prefixes -- we treat https URIs as if they were http.
239 *
240 * @param u string to coerce if it has https scheme
241 * @return string converted to http scheme, or original if not necessary
242 */
243 private static String coerceFromHttpsForComparison(String u) {
244 if (u.startsWith("https://")) {
245 u = "http" + u.substring("https".length());
246 }
247 return u;
248 }
249
250 /***
251 * Utility method for truncating a SURT that came from a
252 * full URI (as a seed, for example) into a prefix
253 * for determining inclusion.
254 *
255 * This involves:
256 * <pre>
257 * (1) removing the last path component, if any
258 * (anything after the last '/', if there are
259 * at least 3 '/'s)
260 * (2) removing a trailing ')', if present, opening
261 * the possibility of proper subdomains. (This
262 * means that the presence or absence of a
263 * trailing '/' after a hostname in a seed list
264 * is significant for the how the SURT prefix is
265 * created, even though it is not signficant for
266 * the URI's treatment as a seed.)
267 * </pre>
268 *
269 * @param s String to work on.
270 * @return As prefix.
271 */
272 private static String asPrefix(String s) {
273
274 s = s.replaceAll("^(.*//.*/)[^/]*","$1");
275
276 if (!s.endsWith("/")) {
277 s = s.replaceAll("^(.*)//)","$1");
278 }
279 return s;
280 }
281
282 /***
283 * Calculate the SURT form URI to use as a candidate against prefixes
284 * from the given Object (CandidateURI or UURI)
285 *
286 * @param object CandidateURI or UURI
287 * @return SURT form of URI for evaluation, or null if unavailable
288 */
289 public static String getCandidateSurt(Object object) {
290 UURI u = UURI.from(object);
291 if (u == null) {
292 return null;
293 }
294 String candidateSurt = u.getSurtForm();
295
296 candidateSurt = coerceFromHttpsForComparison(candidateSurt);
297 return candidateSurt;
298 }
299 /***
300 * @param fw
301 * @throws IOException
302 */
303 public void exportTo(FileWriter fw) throws IOException {
304 Iterator iter = this.iterator();
305 while(iter.hasNext()) {
306 fw.write((String)iter.next() + "\n");
307 }
308 }
309
310 /***
311 * Changes all prefixes so that they enforce an exact host. For
312 * prefixes that already include a ')', this means discarding
313 * anything after ')' (path info). For prefixes that don't include
314 * a ')' -- domain prefixes open to subdomains -- add the closing
315 * ')' (or ",)").
316 */
317 public void convertAllPrefixesToHosts() {
318 SurtPrefixSet iterCopy = (SurtPrefixSet) this.clone();
319 Iterator iter = iterCopy.iterator();
320 while (iter.hasNext()) {
321 String prefix = (String) iter.next();
322 String convPrefix = convertPrefixToHost(prefix);
323 if(prefix!=convPrefix) {
324
325 this.remove(prefix);
326 this.add(convPrefix);
327 }
328 }
329 }
330
331 public static String convertPrefixToHost(String prefix) {
332 if(prefix.endsWith(")")) {
333 return prefix;
334 }
335 if(prefix.indexOf(')')<0) {
336
337 if(!prefix.endsWith(",")) {
338 prefix += ",";
339 }
340 prefix += ")";
341 } else {
342
343 prefix = prefix.substring(0,prefix.indexOf(')')+1);
344 }
345 return prefix;
346 }
347
348 /***
349 * Changes all prefixes so that they only enforce a general
350 * domain (allowing subdomains).For prefixes that don't include
351 * a ')', no change is necessary. For others, truncate everything
352 * from the ')' onward. Additionally, truncate off "www," if it
353 * appears.
354 */
355 public void convertAllPrefixesToDomains() {
356 SurtPrefixSet iterCopy = (SurtPrefixSet) this.clone();
357 Iterator iter = iterCopy.iterator();
358 while (iter.hasNext()) {
359 String prefix = (String) iter.next();
360 String convPrefix = convertPrefixToDomain(prefix);
361 if(prefix!=convPrefix) {
362
363 this.remove(prefix);
364 this.add(convPrefix);
365 }
366 }
367 }
368
369 public static String convertPrefixToDomain(String prefix) {
370 if(prefix.indexOf(')')>=0) {
371 prefix = prefix.substring(0,prefix.indexOf(')'));
372 }
373
374 if(prefix.endsWith("www,")) {
375 prefix = prefix.substring(0,prefix.length()-4);
376 }
377 return prefix;
378 }
379
380 /***
381 * Allow class to be used as a command-line tool for converting
382 * URL lists (or naked host or host/path fragments implied
383 * to be HTTP URLs) to implied SURT prefix form.
384 *
385 * Read from stdin or first file argument. Writes to stdout.
386 *
387 * @param args cmd-line arguments: may include input file
388 * @throws IOException
389 */
390 public static void main(String[] args) throws IOException {
391 InputStream in = args.length > 0 ? new BufferedInputStream(
392 new FileInputStream(args[0])) : System.in;
393 PrintStream out = args.length > 1 ? new PrintStream(
394 new BufferedOutputStream(new FileOutputStream(args[1])))
395 : System.out;
396 BufferedReader br =
397 new BufferedReader(new InputStreamReader(in));
398 String line;
399 while((line = br.readLine())!=null) {
400 if(line.indexOf("#")>0) line=line.substring(0,line.indexOf("#"));
401 line = line.trim();
402 if(line.length()==0) continue;
403 out.println(prefixFromPlain(line));
404 }
405 br.close();
406 out.close();
407 }
408 }