View Javadoc

1   /* UURIFactory
2    *
3    * $Id: UURIFactory.java 5371 2007-08-02 04:21:57Z gojomo $
4    *
5    * Created on July 16, 2004
6    *
7    * Copyright (C) 2003 Internet Archive.
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.net;
26  
27  import gnu.inet.encoding.IDNA;
28  import gnu.inet.encoding.IDNAException;
29  import it.unimi.dsi.mg4j.util.MutableString;
30  
31  import java.io.UnsupportedEncodingException;
32  import java.util.Arrays;
33  import java.util.BitSet;
34  import java.util.logging.Level;
35  import java.util.logging.Logger;
36  import java.util.regex.Matcher;
37  import java.util.regex.Pattern;
38  
39  import org.apache.commons.httpclient.URI;
40  import org.apache.commons.httpclient.URIException;
41  import org.archive.util.TextUtils;
42  
43  
44  /***
45   * Factory that returns UURIs.
46   * 
47   * Does escaping and fixup on URIs massaging in accordance with RFC2396
48   * and to match browser practice. For example, it removes any
49   * '..' if first thing in the path as per IE,  converts backslashes to forward
50   * slashes, and discards any 'fragment'/anchor portion of the URI. This
51   * class will also fail URIs if they are longer than IE's allowed maximum
52   * length.
53   * 
54   * <p>TODO: Test logging.
55   * 
56   * @author stack
57   */
58  public class UURIFactory extends URI {
59      
60      private static final long serialVersionUID = -6146295130382209042L;
61  
62      /***
63       * Logging instance.
64       */
65      private static Logger logger =
66          Logger.getLogger(UURIFactory.class.getName());
67      
68      /***
69       * The single instance of this factory.
70       */
71      private static final UURIFactory factory = new UURIFactory();
72      
73      /***
74       * RFC 2396-inspired regex.
75       *
76       * From the RFC Appendix B:
77       * <pre>
78       * URI Generic Syntax                August 1998
79       *
80       * B. Parsing a URI Reference with a Regular Expression
81       *
82       * As described in Section 4.3, the generic URI syntax is not sufficient
83       * to disambiguate the components of some forms of URI.  Since the
84       * "greedy algorithm" described in that section is identical to the
85       * disambiguation method used by POSIX regular expressions, it is
86       * natural and commonplace to use a regular expression for parsing the
87       * potential four components and fragment identifier of a URI reference.
88       *
89       * The following line is the regular expression for breaking-down a URI
90       * reference into its components.
91       *
92       * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
93       * 12            3  4          5       6  7        8 9
94       *
95       * The numbers in the second line above are only to assist readability;
96       * they indicate the reference points for each subexpression (i.e., each
97       * paired parenthesis).  We refer to the value matched for subexpression
98       * <n> as $<n>.  For example, matching the above expression to
99       *
100      * http://www.ics.uci.edu/pub/ietf/uri/#Related
101      *
102      * results in the following subexpression matches:
103      *
104      * $1 = http:
105      * $2 = http
106      * $3 = //www.ics.uci.edu
107      * $4 = www.ics.uci.edu
108      * $5 = /pub/ietf/uri/
109      * $6 = <undefined>
110      * $7 = <undefined>
111      * $8 = #Related
112      * $9 = Related
113      *
114      * where <undefined> indicates that the component is not present, as is
115      * the case for the query component in the above example.  Therefore, we
116      * can determine the value of the four components and fragment as
117      *
118      * scheme    = $2
119      * authority = $4
120      * path      = $5
121      * query     = $7
122      * fragment  = $9
123      * </pre>
124      *
125      * -- 
126      * <p>Below differs from the rfc regex in that it has java escaping of
127      * regex characters and we allow a URI made of a fragment only (Added extra
128      * group so indexing is off by one after scheme).
129      */
130     final static Pattern RFC2396REGEX = Pattern.compile(
131         "^(([^:/?#]+):)?((//([^/?#]*))?([^?#]*)(//?([^#]*))?)?(#(.*))?");
132     //    12            34  5          6       7   8          9 A
133     //              2 1             54        6          87 3      A9
134     // 1: scheme
135     // 2: scheme:
136     // 3: //authority/path
137     // 4: //authority
138     // 5: authority
139     // 6: path
140     // 7: ?query
141     // 8: query 
142     // 9: #fragment
143     // A: fragment
144 
145     public static final String SLASHDOTDOTSLASH = "^(///.//./)+";
146     public static final String SLASH = "/";
147     public static final String HTTP = "http";
148     public static final String HTTP_PORT = ":80";
149     public static final String HTTPS = "https";
150     public static final String HTTPS_PORT = ":443";
151     public static final String DOT = ".";
152     public static final String EMPTY_STRING = "";
153     public static final String NBSP = "\u00A0";
154     public static final String SPACE = " ";
155     public static final String ESCAPED_SPACE = "%20";
156     public static final String TRAILING_ESCAPED_SPACE = "^(.*)(%20)+$";
157     public static final String PIPE = "|";
158     public static final String PIPE_PATTERN = "//|";
159     public static final String ESCAPED_PIPE = "%7C";
160     public static final String CIRCUMFLEX = "^";
161     public static final String CIRCUMFLEX_PATTERN = "//^";
162     public static final String ESCAPED_CIRCUMFLEX = "%5E";
163     public static final String QUOT = "\"";
164     public static final String ESCAPED_QUOT = "%22";
165     public static final String SQUOT = "'";
166     public static final String ESCAPED_SQUOT = "%27";
167     public static final String APOSTROPH = "`";
168     public static final String ESCAPED_APOSTROPH = "%60";
169     public static final String LSQRBRACKET = "[";
170     public static final String LSQRBRACKET_PATTERN = "//[";
171     public static final String ESCAPED_LSQRBRACKET = "%5B";
172     public static final String RSQRBRACKET = "]";
173     public static final String RSQRBRACKET_PATTERN = "//]";
174     public static final String ESCAPED_RSQRBRACKET = "%5D";
175     public static final String LCURBRACKET = "{";
176     public static final String LCURBRACKET_PATTERN = "//{";
177     public static final String ESCAPED_LCURBRACKET = "%7B";
178     public static final String RCURBRACKET = "}";
179     public static final String RCURBRACKET_PATTERN = "//}";
180     public static final String ESCAPED_RCURBRACKET = "%7D";
181     public static final String BACKSLASH = "//";
182     public static final String BACKSLASH_PATTERN = "////";
183     public static final String ESCAPED_BACKSLASH = "%5C";
184     public static final String STRAY_SPACING = "[\n\r\t]+";
185     public static final String IMPROPERESC_REPLACE = "%25$1";
186     public static final String IMPROPERESC =
187         "%((?:[^//p{XDigit}])|(?:.[^//p{XDigit}])|(?://z))";
188     public static final String COMMERCIAL_AT = "@";
189     public static final char PERCENT_SIGN = '%';
190     public static final char COLON = ':';
191     
192     /***
193      * First percent sign in string followed by two hex chars.
194      */
195     public static final String URI_HEX_ENCODING =
196         "^[^%]*%[//p{XDigit}][//p{XDigit}].*";
197     
198     /***
199      * Authority port number regex.
200      */
201     final static Pattern PORTREGEX = Pattern.compile("(.*:)([0-9]+)$");
202     
203     /***
204      * Characters we'll accept in the domain label part of a URI
205      * authority: ASCII letters-digits-hyphen (LDH) plus underscore,
206      * with single intervening '.' characters.
207      * 
208      * (We accept '_' because DNS servers have tolerated for many
209      * years counter to spec; we also accept dash patterns and ACE
210      * prefixes that will be rejected by IDN-punycoding attempt.)
211      */
212     final static String ACCEPTABLE_ASCII_DOMAIN =
213         "^(?:[a-zA-Z0-9_-]++(?://.)?)++$";
214     
215     /***
216      * Pattern that looks for case of three or more slashes after the 
217      * scheme.  If found, we replace them with two only as mozilla does.
218      */
219     final static Pattern HTTP_SCHEME_SLASHES =
220         Pattern.compile("^(https?://)/+(.*)");
221     
222     /***
223      * Pattern that looks for case of two or more slashes in a path.
224      */
225     final static Pattern MULTIPLE_SLASHES = Pattern.compile("//+");
226     
227     /***
228      * System property key for list of supported schemes.
229      */
230     private static final String SCHEMES_KEY = ".schemes";
231     
232     /***
233      * System property key for list of purposefully-ignored schemes.
234      */
235     private static final String IGNORED_SCHEMES_KEY = ".ignored-schemes";
236 
237     private String[] schemes = null;
238     private String[] ignoredSchemes = null;
239 
240     public static final int IGNORED_SCHEME = 9999999;
241     
242     /***
243      * Protected constructor.
244      */
245     private UURIFactory() {
246         super();
247         String s = System.getProperty(this.getClass().getName() + SCHEMES_KEY);
248         if (s != null && s.length() > 0) {
249             schemes = s.split("[, ]+");
250             Arrays.sort(schemes);
251         }
252         String ignored = System.getProperty(this.getClass().getName() + IGNORED_SCHEMES_KEY);
253         if (ignored != null && ignored.length() > 0) {
254             ignoredSchemes  = ignored.split("[, ]+");
255             Arrays.sort(ignoredSchemes);
256         }
257     }
258     
259     /***
260      * @param uri URI as string.
261      * @return An instance of UURI
262      * @throws URIException
263      */
264     public static UURI getInstance(String uri) throws URIException {
265         return UURIFactory.factory.create(uri);
266     }
267     
268     /***
269      * @param uri URI as string.
270      * @param charset Character encoding of the passed uri string.
271      * @return An instance of UURI
272      * @throws URIException
273      */
274     public static UURI getInstance(String uri, String charset)
275     		throws URIException {
276         return UURIFactory.factory.create(uri, charset);
277     }
278     
279     /***
280      * @param base Base uri to use resolving passed relative uri.
281      * @param relative URI as string.
282      * @return An instance of UURI
283      * @throws URIException
284      */
285     public static UURI getInstance(UURI base, String relative)
286     		throws URIException {
287         return UURIFactory.factory.create(base, relative);
288     }
289     
290     /***
291      * Test of whether passed String has an allowed URI scheme.
292      * First tests if likely scheme suffix.  If so, we then test if its one of
293      * the supported schemes.
294      * @param possibleUrl URL string to examine.
295      * @return True if passed string looks like it could be an URL.
296      */
297     public static boolean hasSupportedScheme(String possibleUrl) {
298         boolean hasScheme = UURI.hasScheme(possibleUrl);
299         if (!hasScheme || UURIFactory.factory.schemes == null) {
300             return hasScheme;
301         }
302         String tmpStr = possibleUrl.substring(0, possibleUrl.indexOf(':'));
303         return Arrays.binarySearch(UURIFactory.factory.schemes, tmpStr) >= 0;
304     }
305 
306     /***
307      * @param uri URI as string.
308      * @return Instance of UURI.
309      * @throws URIException
310      */
311     private UURI create(String uri) throws URIException {
312         return create(uri, UURI.getDefaultProtocolCharset());
313     }
314     
315     /***
316      * @param uri URI as string.
317      * @param charset Original encoding of the string.
318      * @return Instance of UURI.
319      * @throws URIException
320      */
321     private UURI create(String uri, String charset) throws URIException {
322         UURI uuri  = new UURI(fixup(uri, null, charset), true, charset);
323         if (logger.isLoggable(Level.FINE)) {
324             logger.fine("URI " + uri +
325                 " PRODUCT " + uuri.toString() +
326                 " CHARSET " + charset);
327         }
328         return validityCheck(uuri);
329     }
330     
331     /***
332      * @param base UURI to use as a base resolving <code>relative</code>.
333      * @param relative Relative URI.
334      * @return Instance of UURI.
335      * @throws URIException
336      */
337     private UURI create(UURI base, String relative) throws URIException {
338         UURI uuri = new UURI(base, new UURI(fixup(relative, base, base.getProtocolCharset()),
339             true, base.getProtocolCharset()));
340         if (logger.isLoggable(Level.FINE)) {
341             logger.fine(" URI " + relative +
342                 " PRODUCT " + uuri.toString() +
343                 " CHARSET " + base.getProtocolCharset() +
344                 " BASE " + base);
345         }
346         return validityCheck(uuri);
347     }
348 
349     /***
350      * Check the generated UURI.
351      * 
352      * At the least look at length of uuri string.  We were seeing case
353      * where before escaping, string was &lt; MAX_URL_LENGTH but after was
354      * &gt;.  Letting out a too-big message was causing us troubles later
355      * down the processing chain.
356      * @param uuri Created uuri to check.
357      * @return The passed <code>uuri</code> so can easily inline this check.
358      * @throws URIException
359      */
360     protected UURI validityCheck(UURI uuri) throws URIException {
361         if (uuri.getRawURI().length > UURI.MAX_URL_LENGTH) {
362            throw new URIException("Created (escaped) uuri > " +
363               UURI.MAX_URL_LENGTH +": "+uuri.toString());
364         }
365         return uuri;
366     }
367     
368     /***
369      * Do heritrix fix-up on passed uri string.
370      *
371      * Does heritrix escaping; usually escaping done to make our behavior align
372      * with IEs.  This method codifies our experience pulling URIs from the
373      * wilds.  Its does all the escaping we want; its output can always be
374      * assumed to be 'escaped' (though perhaps to a laxer standard than the 
375      * vanilla HttpClient URI class or official specs might suggest). 
376      *
377      * @param uri URI as string.
378      * @param base May be null.
379      * @param e True if the uri is already escaped.
380      * @return A fixed up URI string.
381      * @throws URIException
382      */
383     private String fixup(String uri, final URI base, final String charset)
384     throws URIException {
385         if (uri == null) {
386             throw new NullPointerException();
387         } else if (uri.length() == 0 && base == null) {
388             throw new URIException("URI length is zero (and not relative).");
389         }
390         
391         if (uri.length() > UURI.MAX_URL_LENGTH) {
392             // We check length here and again later after all convertions.
393             throw new URIException("URI length > " + UURI.MAX_URL_LENGTH +
394                 ": " + uri);
395         }
396         
397         // Replace nbsp with normal spaces (so that they get stripped if at
398         // ends, or encoded if in middle)
399         if (uri.indexOf(NBSP) >= 0) {
400             uri = TextUtils.replaceAll(NBSP, uri, SPACE);
401         }
402         
403         // Get rid of any trailing spaces or new-lines. 
404         uri = uri.trim();
405         
406         // IE actually converts backslashes to slashes rather than to %5C.
407         // Since URIs that have backslashes usually work only with IE, we will
408         // convert backslashes to slashes as well.
409         // TODO: Maybe we can first convert backslashes by specs and than by IE
410         // so that we fetch both versions.
411         if (uri.indexOf(BACKSLASH) >= 0) {
412             uri = TextUtils.replaceAll(BACKSLASH_PATTERN, uri, SLASH);
413         }
414         
415         // Remove stray TAB/CR/LF
416         uri = TextUtils.replaceAll(STRAY_SPACING, uri, EMPTY_STRING);
417         
418         // Test for the case of more than two slashes after the http(s) scheme.
419         // Replace with two slashes as mozilla does if found.
420         // See [ 788219 ] URI Syntax Errors stop page parsing.
421         Matcher matcher = HTTP_SCHEME_SLASHES.matcher(uri);
422         if (matcher.matches()) {
423             uri = matcher.group(1) + matcher.group(2);
424         }
425 
426         // now, minimally escape any whitespace
427         uri = escapeWhitespace(uri);
428         
429         // For further processing, get uri elements.  See the RFC2396REGEX
430         // comment above for explaination of group indices used in the below.
431         matcher = RFC2396REGEX.matcher(uri);
432         if (!matcher.matches()) {
433             throw new URIException("Failed parse of " + uri);
434         }
435         String uriScheme = checkUriElementAndLowerCase(matcher.group(2));
436         String uriSchemeSpecificPart = checkUriElement(matcher.group(3));
437         String uriAuthority = checkUriElement(matcher.group(5));
438         String uriPath = checkUriElement(matcher.group(6));
439         String uriQuery = checkUriElement(matcher.group(8));
440         // UNUSED String uriFragment = checkUriElement(matcher.group(10));
441         
442         // If a scheme, is it a supported scheme?
443         if (uriScheme != null && uriScheme.length() > 0 &&
444                 this.schemes != null) {
445             if (!(Arrays.binarySearch(schemes,uriScheme)>=0)) {
446                 // unsupported; see if silently ignored
447                 if((Arrays.binarySearch(ignoredSchemes,uriScheme)>=0)) {
448                     throw new URIException(
449                             IGNORED_SCHEME, "Ignored scheme: " + uriScheme);
450                 } else {
451                     throw new URIException("Unsupported scheme: " + uriScheme);
452                 }
453             }
454         }
455         
456         // Test if relative URI. If so, need a base to resolve against.
457         if (uriScheme == null || uriScheme.length() <= 0) {
458             if (base == null) {
459                 throw new URIException("Relative URI but no base: " + uri);
460             }
461         } else {
462         	checkHttpSchemeSpecificPartSlashPrefix(base, uriScheme,
463         		uriSchemeSpecificPart);
464         }
465         
466         // fixup authority portion: lowercase/IDN-punycode any domain; 
467         // remove stray trailing spaces
468         uriAuthority = fixupAuthority(uriAuthority);
469 
470         // Do some checks if absolute path.
471         if (uriSchemeSpecificPart != null &&
472                 uriSchemeSpecificPart.startsWith(SLASH)) {
473             if (uriPath != null) {
474                 // Eliminate '..' if its first thing in the path.  IE does this.
475                 uriPath = TextUtils.replaceFirst(SLASHDOTDOTSLASH, uriPath,
476                     SLASH);
477             }
478             // Ensure root URLs end with '/': browsers always send "/"
479             // on the request-line, so we should consider "http://host"
480             // to be "http://host/".
481             if (uriPath == null || EMPTY_STRING.equals(uriPath)) {
482                 uriPath = SLASH;
483             }
484         }
485 
486         if (uriAuthority != null) {
487             if (uriScheme != null && uriScheme.length() > 0 &&
488                     uriScheme.equals(HTTP)) {
489                 uriAuthority = checkPort(uriAuthority);
490                 uriAuthority = stripTail(uriAuthority, HTTP_PORT);
491             } else if (uriScheme != null && uriScheme.length() > 0 &&
492                     uriScheme.equals(HTTPS)) {
493                 uriAuthority = checkPort(uriAuthority);
494                 uriAuthority = stripTail(uriAuthority, HTTPS_PORT);
495             }
496             // Strip any prefix dot or tail dots from the authority.
497             uriAuthority = stripTail(uriAuthority, DOT);
498             uriAuthority = stripPrefix(uriAuthority, DOT);
499         } else {
500             // no authority; may be relative. consider stripping scheme
501             // to work-around org.apache.commons.httpclient.URI bug
502             // ( http://issues.apache.org/jira/browse/HTTPCLIENT-587 )
503             if (uriScheme != null && base != null
504                     && uriScheme.equals(base.getScheme())) {
505                 // uriScheme redundant and will only confound httpclient.URI
506                 uriScheme = null; 
507             }
508         }
509         
510         // Ensure minimal escaping. Use of 'lax' URI and URLCodec 
511         // means minimal escaping isn't necessarily complete/consistent.
512         // There is a chance such lax encoding will throw exceptions
513         // later at inconvenient times. 
514         //
515         // One reason for these bad escapings -- though not the only --
516         // is that the page is using an encoding other than the ASCII or the
517         // UTF-8 that is our default URI encoding.  In this case the parent
518         // class is burping on the passed URL encoding.  If the page encoding
519         // was passed into this factory, the encoding seems to be parsed
520         // correctly (See the testEscapedEncoding unit test).
521         //
522         // This fixup may cause us to miss content.  There is the charset case
523         // noted above.  TODO: Look out for cases where we fail other than for
524         // the above given reason which will be fixed when we address
525         // '[ 913687 ] Make extractors interrogate for charset'.
526 
527         uriPath = ensureMinimalEscaping(uriPath, charset);
528         uriQuery = ensureMinimalEscaping(uriQuery, charset,
529             LaxURLCodec.QUERY_SAFE);
530 
531         // Preallocate.  The '1's and '2's in below are space for ':',
532         // '//', etc. URI characters.
533         MutableString s = new MutableString(
534             ((uriScheme != null)? uriScheme.length(): 0)
535             + 1 // ';' 
536             + ((uriAuthority != null)? uriAuthority.length(): 0)
537             + 2 // '//'
538             + ((uriPath != null)? uriPath.length(): 0)
539             + 1 // '?'
540             + ((uriQuery != null)? uriQuery.length(): 0));
541         appendNonNull(s, uriScheme, ":", true);
542         appendNonNull(s, uriAuthority, "//", false);
543         appendNonNull(s, uriPath, "", false);
544         appendNonNull(s, uriQuery, "?", false);
545         return s.toString();
546     }
547     
548     /***
549      * If http(s) scheme, check scheme specific part begins '//'.
550      * @throws URIException 
551      * @see http://www.faqs.org/rfcs/rfc1738.html Section 3.1. Common Internet
552      * Scheme Syntax
553      */
554     protected void checkHttpSchemeSpecificPartSlashPrefix(final URI base,
555     		final String scheme, final String schemeSpecificPart)
556     throws URIException {
557     	if (scheme == null || scheme.length() <= 0) {
558     		return;
559     	}
560     	if (!scheme.equals("http") && !scheme.equals("https")) {
561     		return;
562     	}
563     	if ( schemeSpecificPart == null 
564     	        || !schemeSpecificPart.startsWith("//")) {
565     	    // only acceptable if schemes match
566     	    if (base == null || !scheme.equals(base.getScheme())) {
567     	        throw new URIException(
568     	                "relative URI with scheme only allowed for " +
569     	                "scheme matching base");
570     	    } 
571     	    return; 
572     	}
573     	if (schemeSpecificPart.length() <= 2) {
574     		throw new URIException("http scheme specific part is " +
575         		"too short: " + schemeSpecificPart);
576     	}
577     }
578     
579     /***
580      * Fixup 'authority' portion of URI, by removing any stray 
581      * encoded spaces, lowercasing any domain names, and applying
582      * IDN-punycoding to Unicode domains. 
583      * 
584      * @param uriAuthority the authority string to fix
585      * @return fixed version
586      * @throws URIException
587      */
588     private String fixupAuthority(String uriAuthority) throws URIException {
589         // Lowercase the host part of the uriAuthority; don't destroy any
590         // userinfo capitalizations.  Make sure no illegal characters in
591         // domainlabel substring of the uri authority.
592         if (uriAuthority != null) {
593             // Get rid of any trailing escaped spaces:
594             // http://www.archive.org%20.  Rare but happens.
595             // TODO: reevaluate: do IE or firefox do such mid-URI space-removal?
596             // if not, we shouldn't either. 
597             while(uriAuthority.endsWith(ESCAPED_SPACE)) {
598                 uriAuthority = uriAuthority.substring(0,uriAuthority.length()-3);
599             }
600 
601             // lowercase & IDN-punycode only the domain portion
602             int atIndex = uriAuthority.indexOf(COMMERCIAL_AT);
603             int portColonIndex = uriAuthority.indexOf(COLON,(atIndex<0)?0:atIndex);
604             if(atIndex<0 && portColonIndex<0) {
605                 // most common case: neither userinfo nor port
606                 return fixupDomainlabel(uriAuthority);
607             } else if (atIndex<0 && portColonIndex>-1) {
608                 // next most common: port but no userinfo
609                 String domain = fixupDomainlabel(uriAuthority.substring(0,portColonIndex));
610                 String port = uriAuthority.substring(portColonIndex);
611                 return domain + port;
612             } else if (atIndex>-1 && portColonIndex<0) {
613                 // uncommon: userinfo, no port
614                 String userinfo = uriAuthority.substring(0,atIndex+1);
615                 String domain = fixupDomainlabel(uriAuthority.substring(atIndex+1));
616                 return userinfo + domain;
617             } else {
618                 // uncommon: userinfo, port
619                 String userinfo = uriAuthority.substring(0,atIndex+1);
620                 String domain = fixupDomainlabel(uriAuthority.substring(atIndex+1,portColonIndex));
621                 String port = uriAuthority.substring(portColonIndex);
622                 return userinfo + domain + port;
623             }
624         }
625         return uriAuthority;
626     }
627     
628     /***
629      * Fixup the domain label part of the authority.
630      * 
631      * We're more lax than the spec. in that we allow underscores.
632      * 
633      * @param label Domain label to fix.
634      * @return Return fixed domain label.
635      * @throws URIException
636      */
637     private String fixupDomainlabel(String label)
638     throws URIException {
639         
640         // apply IDN-punycoding, as necessary
641         try {
642             // TODO: optimize: only apply when necessary, or
643             // keep cache of recent encodings
644             label = IDNA.toASCII(label);
645         } catch (IDNAException e) {
646             if(TextUtils.matches(ACCEPTABLE_ASCII_DOMAIN,label)) {
647                 // domain name has ACE prefix, leading/trailing dash, or 
648                 // underscore -- but is still a name we wish to tolerate;
649                 // simply continue
650             } else {
651                 // problematic domain: neither ASCII acceptable characters
652                 // nor IDN-punycodable, so throw exception 
653                 // TODO: change to HeritrixURIException so distinguishable
654                 // from URIExceptions in library code
655                 URIException ue = new URIException(e+" "+label);
656                 ue.initCause(e);
657                 throw ue;
658             }
659         }
660         label = label.toLowerCase();
661         return label;
662     }
663     
664     /***
665      * Ensure that there all characters needing escaping
666      * in the passed-in String are escaped. Stray '%' characters
667      * are *not* escaped, as per browser behavior. 
668      * 
669      * @param u String to escape
670      * @param charset 
671      * @return string with any necessary escaping applied
672      */
673     private String ensureMinimalEscaping(String u, final String charset) {
674         return ensureMinimalEscaping(u, charset, LaxURLCodec.EXPANDED_URI_SAFE);
675     }
676     
677     /***
678      * Ensure that there all characters needing escaping
679      * in the passed-in String are escaped. Stray '%' characters
680      * are *not* escaped, as per browser behavior. 
681      * 
682      * @param u String to escape
683      * @param charset 
684      * @param bitset 
685      * @return string with any necessary escaping applied
686      */
687     private String ensureMinimalEscaping(String u, final String charset,
688             final BitSet bitset) {
689         if (u == null) {
690             return null;
691         }
692         for (int i = 0; i < u.length(); i++) {
693             char c = u.charAt(i);
694             if (!bitset.get(c)) {
695                 try {
696                     u = LaxURLCodec.DEFAULT.encode(bitset, u, charset);
697                 } catch (UnsupportedEncodingException e) {
698                     e.printStackTrace();
699                 }
700                 break;
701             }
702         }
703         return u;
704     }
705 
706     /***
707      * Escape any whitespace found.
708      * 
709      * The parent class takes care of the bulk of escaping.  But if any
710      * instance of escaping is found in the URI, then we ask for parent
711      * to do NO escaping.  Here we escape any whitespace found irrespective
712      * of whether the uri has already been escaped.  We do this for
713      * case where uri has been judged already-escaped only, its been
714      * incompletly done and whitespace remains.  Spaces, etc., in the URI are
715      * a real pain.  Their presence will break log file and ARC parsing.
716      * @param uri URI string to check.
717      * @return uri with spaces escaped if any found.
718      */
719     protected String escapeWhitespace(String uri) {
720         // Just write a new string anyways.  The perl '\s' is not
721         // as inclusive as the Character.isWhitespace so there are
722         // whitespace characters we could miss.  So, rather than
723         // write some awkward regex, just go through the string
724         // a character at a time.  Only create buffer first time
725         // we find a space.
726         MutableString buffer = null;
727         for (int i = 0; i < uri.length(); i++) {
728             char c = uri.charAt(i);
729             if (Character.isWhitespace(c)) {
730                 if (buffer == null) {
731                     buffer = new MutableString(uri.length() +
732                         2 /*If space, two extra characters (at least)*/);
733                     buffer.append(uri.substring(0, i));
734                 }
735                 buffer.append("%");
736                 String hexStr = Integer.toHexString(c);
737                 if ((hexStr.length() % 2) > 0) {
738                     buffer.append("0");
739                 }
740                 buffer.append(hexStr);
741                 
742             } else {
743                 if (buffer != null) {
744                     buffer.append(c);
745                 }
746             }
747         }
748         return (buffer !=  null)? buffer.toString(): uri;
749     }
750 
751     /***
752      * Check port on passed http authority.  Make sure the size is not larger
753      * than allowed: See the 'port' definition on this
754      * page, http://www.kerio.com/manual/wrp/en/418.htm.
755      * Also, we've seen port numbers of '0080' whose leading zeros confuse
756      * the parent class. Strip the leading zeros.
757      *
758      * @param uriAuthority
759      * @return Null or an amended port number.
760      * @throws URIException
761      */
762     private String checkPort(String uriAuthority)
763     throws URIException {
764         Matcher m = PORTREGEX.matcher(uriAuthority);
765         if (m.matches()) {
766             String no = m.group(2);
767             if (no != null && no.length() > 0) {
768                 // First check if the port has leading zeros
769                 // as in '0080'.  Strip them if it has and
770                 // then reconstitute the uriAuthority.  Be careful
771                 // of cases where port is '0' or '000'.
772                 while (no.charAt(0) == '0' && no.length() > 1) {
773                     no = no.substring(1);
774                 }
775                 uriAuthority = m.group(1) + no;
776                 // Now makesure the number is legit.
777                 int portNo = Integer.parseInt(no);
778                 if (portNo <= 0 || portNo > 65535) {
779                     throw new URIException("Port out of bounds: " +
780                         uriAuthority);
781                 }
782             }
783         }
784         return uriAuthority;
785     }
786 
787     /***
788      * @param b Buffer to append to.
789      * @param str String to append if not null.
790      * @param substr Suffix or prefix to use if <code>str</code> is not null.
791      * @param suffix True if <code>substr</code> is a suffix.
792      */
793     private void appendNonNull(MutableString b, String str, String substr,
794             boolean suffix) {
795         if (str != null && str.length() > 0) {
796             if (!suffix) {
797                 b.append(substr);
798             }
799             b.append(str);
800             if (suffix) {
801                 b.append(substr);
802             }
803         }
804     }
805 
806     /***
807      * @param str String to work on.
808      * @param prefix Prefix to strip if present.
809      * @return <code>str</code> w/o <code>prefix</code>.
810      */
811     private String stripPrefix(String str, String prefix) {
812         return str.startsWith(prefix)?
813             str.substring(prefix.length(), str.length()):
814             str;
815     }
816 
817     /***
818      * @param str String to work on.
819      * @param tail Tail to strip if present.
820      * @return <code>str</code> w/o <code>tail</code>.
821      */
822     private static String stripTail(String str, String tail) {
823         return str.endsWith(tail)?
824             str.substring(0, str.length() - tail.length()):
825             str;
826     }
827 
828     /***
829      * @param element to examine.
830      * @return Null if passed null or an empty string otherwise
831      * <code>element</code>.
832      */
833     private String checkUriElement(String element) {
834         return (element == null || element.length() <= 0)? null: element;
835     }
836 
837     /***
838      * @param element to examine and lowercase if non-null.
839      * @return Null if passed null or an empty string otherwise
840      * <code>element</code> lowercased.
841      */
842     private String checkUriElementAndLowerCase(String element) {
843         String tmp = checkUriElement(element);
844         return (tmp != null)? tmp.toLowerCase(): tmp;
845     }
846 }