View Javadoc

1   /* LaxURI
2   *
3   * $Id: LaxURI.java 4646 2006-09-22 17:23:04Z paul_jack $
4   *
5   * Created on Aug 3, 2005
6   *
7   * Copyright (C) 2005 Internet Archive.
8   *
9   * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24  */ 
25  package org.archive.net;
26  
27  import java.util.Arrays;
28  import java.util.BitSet;
29  
30  import org.apache.commons.httpclient.URI;
31  import org.apache.commons.httpclient.URIException;
32  import org.apache.commons.httpclient.util.EncodingUtil;
33  
34  /***
35   * URI subclass which allows partial/inconsistent encoding, matching
36   * the URIs which will be relayed in requests from popular web
37   * browsers (esp. Mozilla Firefox and MS IE).
38   * 
39   * @author gojomo
40   */
41  public class LaxURI extends URI {
42  
43      private static final long serialVersionUID = 5273922211722239537L;
44      
45      final protected static char[] HTTP_SCHEME = {'h','t','t','p'};
46      final protected static char[] HTTPS_SCHEME = {'h','t','t','p','s'};
47      
48      protected static final BitSet lax_rel_segment = new BitSet(256);
49      // Static initializer for lax_rel_segment
50      static {
51          lax_rel_segment.or(rel_segment);
52          lax_rel_segment.set(':'); // allow ':'
53          // TODO: add additional allowances as need is demonstrated
54      }
55  
56      protected static final BitSet lax_abs_path = new BitSet(256);
57      static {
58          lax_abs_path.or(abs_path);
59          lax_abs_path.set('|'); // tests indicate Firefox (1.0.6) doesn't escape.
60      }
61      
62      protected static final BitSet lax_query = new BitSet(256);
63      static {
64          lax_query.or(query);
65          lax_query.set('{'); // tests indicate FF doesn't escape { in query
66          lax_query.set('}'); // tests indicate FF doesn't escape } in query
67          lax_query.set('|'); // tests indicate FF doesn't escape | in query
68          lax_query.set('['); // tests indicate FF doesn't escape [ in query
69          lax_query.set(']'); // tests indicate FF doesn't escape ] in query
70          lax_query.set('^'); // tests indicate FF doesn't escape ^ in query
71      }
72      
73      // passthrough initializers
74      public LaxURI(String uri, boolean escaped, String charset)
75      throws URIException {
76          super(uri,escaped,charset);
77      }
78      public LaxURI(URI base, URI relative) throws URIException {
79          super(base,relative);
80      }
81      public LaxURI(String uri, boolean escaped) throws URIException {
82          super(uri,escaped);
83      }
84      public LaxURI() {
85          super();
86      }
87  
88      // overridden to use this class's static decode()
89      public String getURI() throws URIException {
90          return (_uri == null) ? null : decode(_uri, getProtocolCharset());
91      }
92      
93      // overridden to use this class's static decode()
94      public String getPath() throws URIException {
95          char[] p = getRawPath();
96          return (p == null) ? null : decode(p, getProtocolCharset());
97      }
98  
99      // overridden to use this class's static decode()
100     public String getPathQuery() throws URIException {
101         char[] rawPathQuery = getRawPathQuery();
102         return (rawPathQuery == null) ? null : decode(rawPathQuery,
103                 getProtocolCharset());
104     }
105     // overridden to use this class's static decode()
106     protected static String decode(char[] component, String charset)
107             throws URIException {
108         if (component == null) {
109             throw new IllegalArgumentException(
110                     "Component array of chars may not be null");
111         }
112         return decode(new String(component), charset);
113     }
114 
115     // overridden to use IA's LaxURLCodec, which never throws DecoderException
116     protected static String decode(String component, String charset)
117             throws URIException {
118         if (component == null) {
119             throw new IllegalArgumentException(
120                     "Component array of chars may not be null");
121         }
122         byte[] rawdata = null;
123         //     try {
124         rawdata = LaxURLCodec.decodeUrlLoose(EncodingUtil
125                 .getAsciiBytes(component));
126         //     } catch (DecoderException e) {
127         //         throw new URIException(e.getMessage());
128         //     }
129         return EncodingUtil.getString(rawdata, charset);
130     }
131     
132     // overidden to lax() the acceptable-char BitSet passed in
133     protected boolean validate(char[] component, BitSet generous) {
134         return super.validate(component, lax(generous));
135     }
136 
137     // overidden to lax() the acceptable-char BitSet passed in
138     protected boolean validate(char[] component, int soffset, int eoffset,
139             BitSet generous) {
140         return super.validate(component, soffset, eoffset, lax(generous));
141     }
142     
143     /***
144      * Given a BitSet -- typically one of the URI superclass's
145      * predefined static variables -- possibly replace it with
146      * a more-lax version to better match the character sets
147      * actually left unencoded in web browser requests
148      * 
149      * @param generous original BitSet
150      * @return (possibly more lax) BitSet to use
151      */
152     protected BitSet lax(BitSet generous) {
153         if (generous == rel_segment) {
154             // Swap in more lax allowable set
155             return lax_rel_segment;
156         }
157         if (generous == abs_path) {
158             return lax_abs_path;
159         }
160         if (generous == query) {
161             return lax_query;
162         }
163         // otherwise, leave as is
164         return generous;
165     }
166     
167     /*** 
168      * Coalesce the _host and _authority fields where 
169      * possible.
170      * 
171      * In the web crawl/http domain, most URIs have an 
172      * identical _host and _authority. (There is no port
173      * or user info.) However, the superclass always 
174      * creates two separate char[] instances. 
175      * 
176      * Notably, the lengths of these char[] fields are 
177      * equal if and only if their values are identical.
178      * This method makes use of this fact to reduce the
179      * two instances to one where possible, slimming 
180      * instances.  
181      * 
182      * @see org.apache.commons.httpclient.URI#parseAuthority(java.lang.String, boolean)
183      */
184     protected void parseAuthority(String original, boolean escaped)
185             throws URIException {
186         super.parseAuthority(original, escaped);
187         if (_host != null && _authority != null
188                 && _host.length == _authority.length) {
189             _host = _authority;
190         }
191     }
192     
193     
194     /*** 
195      * Coalesce _scheme to existing instances, where appropriate.
196      * 
197      * In the web-crawl domain, most _schemes are 'http' or 'https',
198      * but the superclass always creates a new char[] instance. For
199      * these two cases, we replace the created instance with a 
200      * long-lived instance from a static field, saving 12-14 bytes
201      * per instance. 
202      * 
203      * @see org.apache.commons.httpclient.URI#setURI()
204      */
205     protected void setURI() {
206         if (_scheme != null) {
207             if (_scheme.length == 4 && Arrays.equals(_scheme, HTTP_SCHEME)) {
208                 _scheme = HTTP_SCHEME;
209             } else if (_scheme.length == 5
210                     && Arrays.equals(_scheme, HTTP_SCHEME)) {
211                 _scheme = HTTPS_SCHEME;
212             }
213         }
214         super.setURI();
215     }
216     
217     /***
218      * IA OVERRIDDEN IN LaxURI TO INCLUDE FIX FOR 
219      * http://issues.apache.org/jira/browse/HTTPCLIENT-588
220      * 
221      * In order to avoid any possilbity of conflict with non-ASCII characters,
222      * Parse a URI reference as a <code>String</code> with the character
223      * encoding of the local system or the document.
224      * <p>
225      * The following line is the regular expression for breaking-down a URI
226      * reference into its components.
227      * <p><blockquote><pre>
228      *   ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
229      *    12            3  4          5       6  7        8 9
230      * </pre></blockquote><p>
231      * For example, matching the above expression to
232      *   http://jakarta.apache.org/ietf/uri/#Related
233      * results in the following subexpression matches:
234      * <p><blockquote><pre>
235      *               $1 = http:
236      *  scheme    =  $2 = http
237      *               $3 = //jakarta.apache.org
238      *  authority =  $4 = jakarta.apache.org
239      *  path      =  $5 = /ietf/uri/
240      *               $6 = <undefined>
241      *  query     =  $7 = <undefined>
242      *               $8 = #Related
243      *  fragment  =  $9 = Related
244      * </pre></blockquote><p>
245      *
246      * @param original the original character sequence
247      * @param escaped <code>true</code> if <code>original</code> is escaped
248      * @throws URIException If an error occurs.
249      */
250     protected void parseUriReference(String original, boolean escaped)
251         throws URIException {
252 
253         // validate and contruct the URI character sequence
254         if (original == null) {
255             throw new URIException("URI-Reference required");
256         }
257 
258         /* @
259          *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
260          */
261         String tmp = original.trim();
262         
263         /*
264          * The length of the string sequence of characters.
265          * It may not be equal to the length of the byte array.
266          */
267         int length = tmp.length();
268 
269         /*
270          * Remove the delimiters like angle brackets around an URI.
271          */
272         if (length > 0) {
273             char[] firstDelimiter = { tmp.charAt(0) };
274             if (validate(firstDelimiter, delims)) {
275                 if (length >= 2) {
276                     char[] lastDelimiter = { tmp.charAt(length - 1) };
277                     if (validate(lastDelimiter, delims)) {
278                         tmp = tmp.substring(1, length - 1);
279                         length = length - 2;
280                     }
281                 }
282             }
283         }
284 
285         /*
286          * The starting index
287          */
288         int from = 0;
289 
290         /*
291          * The test flag whether the URI is started from the path component.
292          */
293         boolean isStartedFromPath = false;
294         int atColon = tmp.indexOf(':');
295         int atSlash = tmp.indexOf('/');
296         if ((atColon <= 0 && !tmp.startsWith("//"))
297             || (atSlash >= 0 && atSlash < atColon)) {
298             isStartedFromPath = true;
299         }
300 
301         /*
302          * <p><blockquote><pre>
303          *     @@@@@@@@
304          *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
305          * </pre></blockquote><p>
306          */
307         int at = indexFirstOf(tmp, isStartedFromPath ? "/?#" : ":/?#", from);
308         if (at == -1) { 
309             at = 0;
310         }
311 
312         /*
313          * Parse the scheme.
314          * <p><blockquote><pre>
315          *  scheme    =  $2 = http
316          *              @
317          *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
318          * </pre></blockquote><p>
319          */
320         if (at > 0 && at < length && tmp.charAt(at) == ':') {
321             char[] target = tmp.substring(0, at).toLowerCase().toCharArray();
322             if (validate(target, scheme)) {
323                 _scheme = target;
324             } else {
325                 throw new URIException("incorrect scheme");
326             }
327             from = ++at;
328         }
329 
330         /*
331          * Parse the authority component.
332          * <p><blockquote><pre>
333          *  authority =  $4 = jakarta.apache.org
334          *                  @@
335          *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
336          * </pre></blockquote><p>
337          */
338         // Reset flags
339         _is_net_path = _is_abs_path = _is_rel_path = _is_hier_part = false;
340         if (0 <= at && at < length && tmp.charAt(at) == '/') {
341             // Set flag
342             _is_hier_part = true;
343             if (at + 2 < length && tmp.charAt(at + 1) == '/' 
344                 && !isStartedFromPath) {
345                 // the temporary index to start the search from
346                 int next = indexFirstOf(tmp, "/?#", at + 2);
347                 if (next == -1) {
348                     next = (tmp.substring(at + 2).length() == 0) ? at + 2 
349                         : tmp.length();
350                 }
351                 parseAuthority(tmp.substring(at + 2, next), escaped);
352                 from = at = next;
353                 // Set flag
354                 _is_net_path = true;
355             }
356             if (from == at) {
357                 // Set flag
358                 _is_abs_path = true;
359             }
360         }
361 
362         /*
363          * Parse the path component.
364          * <p><blockquote><pre>
365          *  path      =  $5 = /ietf/uri/
366          *                                @@@@@@
367          *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
368          * </pre></blockquote><p>
369          */
370         if (from < length) {
371             // rel_path = rel_segment [ abs_path ]
372             int next = indexFirstOf(tmp, "?#", from);
373             if (next == -1) {
374                 next = tmp.length();
375             }
376             if (!_is_abs_path) {
377                 if (!escaped 
378                     && prevalidate(tmp.substring(from, next), disallowed_rel_path) 
379                     || escaped 
380                     && validate(tmp.substring(from, next).toCharArray(), rel_path)) {
381                     // Set flag
382                     _is_rel_path = true;
383                 } else if (!escaped 
384                     && prevalidate(tmp.substring(from, next), disallowed_opaque_part) 
385                     || escaped 
386                     && validate(tmp.substring(from, next).toCharArray(), opaque_part)) {
387                     // Set flag
388                     _is_opaque_part = true;
389                 } else {
390                     // the path component may be empty
391                     _path = null;
392                 }
393             }
394             String s = tmp.substring(from, next);
395             if (escaped) {
396                 setRawPath(s.toCharArray());
397             } else {
398                 setPath(s);
399             }
400             at = next;
401         }
402 
403         // set the charset to do escape encoding
404         String charset = getProtocolCharset();
405 
406         /*
407          * Parse the query component.
408          * <p><blockquote><pre>
409          *  query     =  $7 = <undefined>
410          *                                        @@@@@@@@@
411          *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
412          * </pre></blockquote><p>
413          */
414         if (0 <= at && at + 1 < length && tmp.charAt(at) == '?') {
415             int next = tmp.indexOf('#', at + 1);
416             if (next == -1) {
417                 next = tmp.length();
418             }
419             if (escaped) {
420                 _query = tmp.substring(at + 1, next).toCharArray();
421                 if (!validate(_query, query)) {
422                     throw new URIException("Invalid query");
423                 }
424             } else {
425                 _query = encode(tmp.substring(at + 1, next), allowed_query, charset);
426             }
427             at = next;
428         }
429 
430         /*
431          * Parse the fragment component.
432          * <p><blockquote><pre>
433          *  fragment  =  $9 = Related
434          *                                                   @@@@@@@@
435          *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
436          * </pre></blockquote><p>
437          */
438         if (0 <= at && at + 1 <= length && tmp.charAt(at) == '#') {
439             if (at + 1 == length) { // empty fragment
440                 _fragment = "".toCharArray();
441             } else {
442                 _fragment = (escaped) ? tmp.substring(at + 1).toCharArray() 
443                     : encode(tmp.substring(at + 1), allowed_fragment, charset);
444             }
445         }
446 
447         // set this URI.
448         setURI();
449     }
450     
451 }