1   /* SURTPrefixSet
2   *
3   * $Id: SurtPrefixSet.java 4644 2006-09-20 22:40:21Z paul_jack $
4   *
5   * Created on Jul 23, 2004
6   *
7   * Copyright (C) 2004 Internet Archive.
8   *
9   * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24  */ 
25  package org.archive.util;
26  
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.io.Reader;
import java.util.Iterator;
import java.util.Locale;
import java.util.SortedSet;
import java.util.TreeSet;

import org.apache.commons.httpclient.URIException;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.util.iterator.LineReadingIterator;
import org.archive.util.iterator.RegexpLineIterator;
47  
48  /***
49   * Specialized TreeSet for keeping a set of String prefixes. 
50   * 
51   * Redundant prefixes (those that are themselves prefixed
52   * by other set entries) are eliminated.
53   * 
54   * @author gojomo
55   */
56  public class SurtPrefixSet extends TreeSet<String> {
57  
58      private static final long serialVersionUID = 2598365040524933110L;
59  
60      private static final String SURT_PREFIX_DIRECTIVE = "+";
61  
62      /***
63       * Test whether the given String is prefixed by one
64       * of this set's entries. 
65       * 
66       * @param s
67       * @return True if contains prefix.
68       */
69      public boolean containsPrefixOf(String s) {
70          SortedSet sub = headSet(s);
71          // because redundant prefixes have been eliminated,
72          // only a test against last item in headSet is necessary
73          if (!sub.isEmpty() && s.startsWith((String)sub.last())) {
74              return true; // prefix substring exists
75          } // else: might still exist exactly (headSet does not contain boundary)
76          return contains(s); // exact string exists, or no prefix is there
77      }
78      
79      /*** 
80       * Maintains additional invariant: if one entry is a 
81       * prefix of another, keep only the prefix. 
82       * 
83       * @see java.util.Collection#add(java.lang.Object)
84       */
85      public boolean add(String s) {
86          SortedSet sub = headSet(s);
87          if (!sub.isEmpty() && s.startsWith((String)sub.last())) {
88              // no need to add; prefix is already present
89              return false;
90          }
91          boolean retVal = super.add(s);
92          sub = tailSet(s+"\0");
93          while(!sub.isEmpty() && ((String)sub.first()).startsWith(s)) {
94              // remove redundant entries
95              sub.remove(sub.first());
96          }
97          return retVal;
98      }
99      
100     
101     /***
102      * Read a set of SURT prefixes from a reader source; keep sorted and 
103      * with redundant entries removed.
104      * 
105      * @param r reader over file of SURT_format strings
106      * @throws IOException
107      */
108     public void importFrom(Reader r) {
109         BufferedReader reader = new BufferedReader(r);
110         String s;
111         
112         Iterator iter = 
113             new RegexpLineIterator(
114                     new LineReadingIterator(reader),
115                     RegexpLineIterator.COMMENT_LINE,
116                     RegexpLineIterator.NONWHITESPACE_ENTRY_TRAILING_COMMENT,
117                     RegexpLineIterator.ENTRY);
118 
119         while (iter.hasNext()) {
120             s = (String) iter.next();
121             add(s.toLowerCase());
122         }
123     }
124 
125     /***
126      * @param r Where to read from.
127      */
128     public void importFromUris(Reader r) {
129         BufferedReader reader = new BufferedReader(r);
130         String s;
131         
132         Iterator iter = 
133             new RegexpLineIterator(
134                     new LineReadingIterator(reader),
135                     RegexpLineIterator.COMMENT_LINE,
136                     RegexpLineIterator.NONWHITESPACE_ENTRY_TRAILING_COMMENT,
137                     RegexpLineIterator.ENTRY);
138 
139         while (iter.hasNext()) {
140             s = (String) iter.next();
141             // s is a URI (or even fragmentary hostname), not a SURT
142             addFromPlain(s);
143         }
144     }
145 
146     /***
147      * Import SURT prefixes from a reader with mixed URI and SURT prefix
148      * format. 
149      * 
150      * @param r  the reader to import the prefixes from
151      * @param deduceFromSeeds   true to also import SURT prefixes implied
152      *                          from normal URIs/hostname seeds
153      */
154     public void importFromMixed(Reader r, boolean deduceFromSeeds) {
155         BufferedReader reader = new BufferedReader(r);
156         String s;
157         
158         Iterator iter = 
159             new RegexpLineIterator(
160                     new LineReadingIterator(reader),
161                     RegexpLineIterator.COMMENT_LINE,
162                     RegexpLineIterator.NONWHITESPACE_ENTRY_TRAILING_COMMENT,
163                     RegexpLineIterator.ENTRY);
164 
165         while (iter.hasNext()) {
166             s = (String) iter.next();
167             if(s.startsWith(SURT_PREFIX_DIRECTIVE)) {
168                 // it's specifically a SURT prefix line
169                 String u = s.substring(SURT_PREFIX_DIRECTIVE.length()).trim();
170                 if(u.indexOf("(")>0) {
171                     // formal SURT prefix; toLowerCase just in case
172                     add(u.toLowerCase());
173                 } else {
174                     // hostname/normal form URI from which 
175                     // to deduce SURT prefix
176                     addFromPlain(u);
177                 }
178                 
179                 continue; 
180             } else {
181                 if(deduceFromSeeds) {
182                     // also deducing 'implied' SURT prefixes 
183                     // from normal URIs/hostname seeds
184                     addFromPlain(s);
185                 }
186             }
187         }
188     }
189     
190     /***
191      * Given a plain URI or hostname, deduce an implied SURT prefix from
192      * it and add to active prefixes. 
193      * 
194      * @param u String of URI or hostname
195      */
196     private void addFromPlain(String u) {
197         u = prefixFromPlain(u);
198         add(u);
199     }
200 
201     /***
202      * Given a plain URI or hostname/hostname+path, deduce an implied SURT 
203      * prefix from it. Results may be unpredictable on strings that cannot
204      * be interpreted as URIs. 
205      * 
206      * UURI 'fixup' is applied to the URI that is built. 
207      *
208      * @param u URI or almost-URI to consider
209      * @return implied SURT prefix form
210      */
211     public static String prefixFromPlain(String u) {
212         u = ArchiveUtils.addImpliedHttpIfNecessary(u);
213         u = coerceFromHttpsForComparison(u);
214         boolean trailingSlash = u.endsWith("/");
215         // ensure all typical UURI cleanup (incl. IDN-punycoding) is done
216         try {
217             u = UURIFactory.getInstance(u).toString();
218         } catch (URIException e) {
219             e.printStackTrace();
220             // allow to continue with original string uri
221         }
222         // except: don't let UURI-fixup add a trailing slash
223         // if it wasn't already there (presence or absence of
224         // such slash has special meaning specifying implied
225         // SURT prefixes)
226         if(!trailingSlash && u.endsWith("/")) {
227             u = u.substring(0,u.length()-1);
228         }
229         // convert to full SURT
230         u = SURT.fromURI(u);
231         // truncate to implied prefix
232         u = SurtPrefixSet.asPrefix(u);
233         return u;
234     }
235 
236     /***
237      * For SURT comparisons -- prefixes or candidates being checked against
238      * those prefixes -- we treat https URIs as if they were http.
239      * 
240      * @param u string to coerce if it has https scheme
241      * @return string converted to http scheme, or original if not necessary
242      */
243     private static String coerceFromHttpsForComparison(String u) {
244         if (u.startsWith("https://")) {
245             u = "http" + u.substring("https".length());
246         }
247         return u;
248     }
249 
250     /***
251      * Utility method for truncating a SURT that came from a 
252      * full URI (as a seed, for example) into a prefix
253      * for determining inclusion.
254      * 
255      * This involves: 
256      * <pre>
257      *    (1) removing the last path component, if any
258      *        (anything after the last '/', if there are
259      *        at least 3 '/'s)
260      *    (2) removing a trailing ')', if present, opening
261      *        the possibility of proper subdomains. (This
262      *        means that the presence or absence of a
263      *        trailing '/' after a hostname in a seed list
264      *        is significant for the how the SURT prefix is 
265      *        created, even though it is not signficant for 
266      *        the URI's treatment as a seed.)
267      * </pre>
268      *
269      * @param s String to work on.
270      * @return As prefix.
271      */
272     private static String asPrefix(String s) {
273         // Strip last path-segment, if more than 3 slashes
274         s = s.replaceAll("^(.*//.*/)[^/]*","$1");
275         // Strip trailing ")", if present and NO path (no 3rd slash).
276         if (!s.endsWith("/")) {
277             s = s.replaceAll("^(.*)//)","$1");
278         }
279         return s;
280     }
281 
282     /***
283      * Calculate the SURT form URI to use as a candidate against prefixes
284      * from the given Object (CandidateURI or UURI)
285      * 
286      * @param object CandidateURI or UURI
287      * @return SURT form of URI for evaluation, or null if unavailable
288      */
289     public static String getCandidateSurt(Object object) {
290         UURI u = UURI.from(object);
291         if (u == null) {
292             return null;
293         }
294         String candidateSurt = u.getSurtForm();
295         // also want to treat https as http
296         candidateSurt = coerceFromHttpsForComparison(candidateSurt);
297         return candidateSurt;
298     }
299     /***
300      * @param fw
301      * @throws IOException
302      */
303     public void exportTo(FileWriter fw) throws IOException {
304         Iterator iter = this.iterator();
305         while(iter.hasNext()) {
306             fw.write((String)iter.next() + "\n");
307         }
308     }
309 
310     /***
311      * Changes all prefixes so that they enforce an exact host. For
312      * prefixes that already include a ')', this means discarding 
313      * anything after ')' (path info). For prefixes that don't include
314      * a ')' -- domain prefixes open to subdomains -- add the closing
315      * ')' (or ",)").  
316      */
317     public void convertAllPrefixesToHosts() {
318         SurtPrefixSet iterCopy = (SurtPrefixSet) this.clone();
319         Iterator iter = iterCopy.iterator();
320         while (iter.hasNext()) {
321             String prefix = (String) iter.next();
322             String convPrefix = convertPrefixToHost(prefix);
323             if(prefix!=convPrefix) {
324             	// if returned value not unchanged, update set
325             	this.remove(prefix);
326             	this.add(convPrefix);
327             }
328         }
329     }
330     
331     public static String convertPrefixToHost(String prefix) {
332         if(prefix.endsWith(")")) {
333             return prefix; // no change necessary
334         }
335         if(prefix.indexOf(')')<0) {
336             // open-ended domain prefix
337             if(!prefix.endsWith(",")) {
338                 prefix += ",";
339             }
340             prefix += ")";
341         } else {
342             // prefix with excess path-info
343             prefix = prefix.substring(0,prefix.indexOf(')')+1);
344         }
345         return prefix;
346     }
347 
348     /***
349      * Changes all prefixes so that they only enforce a general
350      * domain (allowing subdomains).For prefixes that don't include
351      * a ')', no change is necessary. For others, truncate everything
352      * from the ')' onward. Additionally, truncate off "www," if it
353      * appears.
354      */
355     public void convertAllPrefixesToDomains() {
356         SurtPrefixSet iterCopy = (SurtPrefixSet) this.clone();
357         Iterator iter = iterCopy.iterator();
358         while (iter.hasNext()) {
359             String prefix = (String) iter.next();
360             String convPrefix = convertPrefixToDomain(prefix);
361             if(prefix!=convPrefix) {
362             	// if returned value not unchanged, update set
363             	this.remove(prefix);
364             	this.add(convPrefix);
365             }
366         } 
367     }
368     
369     public static String convertPrefixToDomain(String prefix) {
370         if(prefix.indexOf(')')>=0) {
371             prefix = prefix.substring(0,prefix.indexOf(')'));
372         }
373         // strip 'www,' when present
374         if(prefix.endsWith("www,")) {
375             prefix = prefix.substring(0,prefix.length()-4);
376         }
377         return prefix;
378     }
379     
380     /***
381      * Allow class to be used as a command-line tool for converting 
382      * URL lists (or naked host or host/path fragments implied
383      * to be HTTP URLs) to implied SURT prefix form. 
384      * 
385      * Read from stdin or first file argument. Writes to stdout. 
386      *
387      * @param args cmd-line arguments: may include input file
388      * @throws IOException
389      */
390     public static void main(String[] args) throws IOException {
391         InputStream in = args.length > 0 ? new BufferedInputStream(
392                 new FileInputStream(args[0])) : System.in;
393         PrintStream out = args.length > 1 ? new PrintStream(
394                 new BufferedOutputStream(new FileOutputStream(args[1])))
395                 : System.out;
396         BufferedReader br =
397             new BufferedReader(new InputStreamReader(in));
398         String line;
399         while((line = br.readLine())!=null) {
400             if(line.indexOf("#")>0) line=line.substring(0,line.indexOf("#"));
401             line = line.trim();
402             if(line.length()==0) continue;
403             out.println(prefixFromPlain(line));
404         }
405         br.close();
406         out.close();
407     }
408 }