1   /* UURIFactoryTest
2    *
3    * $Id: UURIFactoryTest.java 5562 2007-11-16 00:53:10Z Gojomo $
4    *
5    * Created on Apr 2, 2004
6    *
7    * Copyright (C) 2004 Internet Archive.
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  
26  package org.archive.net;
27  
28  import java.util.Iterator;
29  import java.util.TreeMap;
30  
31  import junit.framework.TestCase;
32  
33  import org.apache.commons.httpclient.URIException;
34  
35  /***
36   * Test UURIFactory for proper UURI creation across variety of
37   * important/tricky cases.
38   * 
39   * Be careful writing this file.  Make sure you write it with UTF-8 encoding.
40   *
41   * @author igor stack gojomo
42   */
43  public class UURIFactoryTest extends TestCase {
44  	
45  	public final void testEscaping() throws URIException {
46  		// Note: single quote is not being escaped by URI class.
47  		final String ESCAPED_URISTR = "http://archive.org/" +
48  		    UURIFactory.ESCAPED_SPACE +
49  			UURIFactory.ESCAPED_SPACE +
50  			UURIFactory.ESCAPED_CIRCUMFLEX +
51  			UURIFactory.ESCAPED_QUOT +
52  			UURIFactory.SQUOT +
53  			UURIFactory.ESCAPED_APOSTROPH +
54  			UURIFactory.ESCAPED_LSQRBRACKET +
55  			UURIFactory.ESCAPED_RSQRBRACKET +
56  			UURIFactory.ESCAPED_LCURBRACKET +
57  			UURIFactory.ESCAPED_RCURBRACKET +
58  			UURIFactory.SLASH + "a.gif"; // NBSP and SPACE should be trimmed;
59  		
60  		final String URISTR = "http://archive.org/.././" + "\u00A0" +
61  		    UURIFactory.SPACE + UURIFactory.CIRCUMFLEX +
62  			UURIFactory.QUOT + UURIFactory.SQUOT +
63  			UURIFactory.APOSTROPH + UURIFactory.LSQRBRACKET +
64  			UURIFactory.RSQRBRACKET + UURIFactory.LCURBRACKET +
65  			UURIFactory.RCURBRACKET + UURIFactory.BACKSLASH +
66  			"test/../a.gif" + "\u00A0" + UURIFactory.SPACE;
67  		
68  		UURI uuri = UURIFactory.getInstance(URISTR);
69  		final String uuriStr = uuri.toString();
70  		assertEquals("expected escaping", ESCAPED_URISTR, uuriStr);
71  	}
72  
73      public final void testUnderscoreMakesPortParseFail() throws URIException {
74          UURI uuri = UURIFactory.getInstance("http://one-two_three:8080/index.html");
75          int port = uuri.getPort();
76          assertTrue("Failed find of port " + uuri, port == 8080);
77      }
78      
79      public final void testRelativeURIWithTwoSlashes() throws URIException {
80          UURI base = UURIFactory.getInstance("http://www.archive.org");
81          UURI uuri = UURIFactory.getInstance(base, "one//index.html");
82          assertTrue("Doesn't do right thing with two slashes " + uuri,
83              uuri.toString().equals(
84                  "http://www.archive.org/one//index.html"));
85      }
86      
87      public final void testTrailingEncodedSpace() throws URIException {
88          UURI uuri = UURIFactory.getInstance("http://www.nps-shoes.co.uk%20");
89          assertTrue("Doesn't strip trailing encoded space 1 " + uuri,
90              uuri.toString().equals("http://www.nps-shoes.co.uk/"));
91          uuri = UURIFactory.getInstance("http://www.nps-shoes.co.uk%20%20%20");
92          assertTrue("Doesn't strip trailing encoded space 2 " + uuri,
93              uuri.toString().equals("http://www.nps-shoes.co.uk/"));
94      }
95      
96      public final void testPort0080is80() throws URIException {
97          UURI uuri = UURIFactory.getInstance("http://archive.org:0080");
98          assertTrue("Doesn't strip leading zeros " + uuri,
99              uuri.toString().equals("http://archive.org/"));
100     }
101     
// DISABLING TEST AS PRECURSOR TO ELIMINATION
// the problematic input given -- specifically the "%6s" incomplete uri-escape,
// shouldn't necessarily be rejected as a bad URI. IE and Firefox, at least,
// will attempt to fetch such a URL (getting, in this case against that ad
// server, a bad-request error). Ideally, we'd generate exactly the same
// request against the server as they do. However, with the most recent
// fixup for stray '%' signs, we come close, but not exactly. That's enough
// to cause this test to fail (it's not getting the expected exception) but
// our almost-URI, which might be what was intended, is better than trying
// nothing.
112 //    public final void testBadPath() {
113 //        String message = null;
114 //        try {
115 //            UURIFactory.getInstance("http://ads.as4x.tmcs.net/" +
116 //                "html.ng/site=cs&pagepos=102&page=home&adsize=1x1&context=" +
117 //                "generic&Params.richmedia=yes%26city%3Dseattle%26" +
118 //                "rstid%3D2415%26market_id%3D86%26brand%3Dcitysearch" +
119 //                "%6state%3DWA");
120 //        } catch (URIException e) {
121 //            message = e.getMessage();
122 //        }
123 //        assertNotNull("Didn't get expected exception.", message);
124 //    }   
125     
126     public final void testEscapeEncoding() throws URIException {
127         UURI uuri = UURIFactory.getInstance("http://www.y1y1.com/" +
128             "albums/userpics/11111/normal_%E3%E4%EC%EC%EC.jpg", "windows-1256");
129         uuri.getPath();
130     }   
131     
132     public final void testTooLongAfterEscaping() {
133         StringBuffer buffer = new StringBuffer("http://www.archive.org/a/");
134         // Append bunch of spaces.  When escaped, they'll triple in size.
135         for (int i = 0; i < 1024; i++) {
136         	buffer.append(" ");
137         }
138         buffer.append("/index.html");
139         String message = null;
140         try {
141         	UURIFactory.getInstance(buffer.toString());
142         } catch (URIException e) {
143             message = e.getMessage();
144         }
145         assertTrue("Wrong or no exception: " + message, (message != null) &&
146             message.startsWith("Created (escaped) uuri >"));
147     }
148 	
149 	public final void testFtpUris() throws URIException {
150 		final String FTP = "ftp";
151 		final String AUTHORITY = "pfbuser:pfbuser@mprsrv.agri.gov.cn";
152 		final String PATH = "/clzreceive/";
153 		final String uri = FTP + "://" + AUTHORITY + PATH;
154 		UURI uuri = UURIFactory.getInstance(uri);
155 		assertTrue("Failed to get matching scheme: " + uuri.getScheme(),
156 				(uuri.getScheme()).equals(FTP));
157 		assertTrue("Failed to get matching authority: " +
158 				uuri.getAuthority(), (uuri.getAuthority()).equals(AUTHORITY));
159 		assertTrue("Failed to get matching path: " +
160 				uuri.getPath(), (uuri.getPath()).equals(PATH));       
161 	}
162     
163     public final void testWhitespaceEscaped() throws URIException {
164         // Test that we get all whitespace even if the uri is
165         // already escaped.
166         String uri = "http://archive.org/index%25 .html";
167         String tgtUri = "http://archive.org/index%25%20.html";
168         UURI uuri = UURIFactory.getInstance(uri);
169         assertTrue("Not equal " + uuri.toString(),
170                 uuri.toString().equals(tgtUri));     
171         uri = "http://archive.org/index%25\u001D.html";
172         tgtUri = "http://archive.org/index%25%1D.html".toLowerCase();
173         uuri = UURIFactory.getInstance(uri);
174         assertEquals("whitespace escaping", tgtUri, uuri.toString());
175         uri = "http://gemini.info.usaid.gov/directory/" +
176             "pbResults.cfm?&urlNameLast=Rumplestiltskin";
177         tgtUri = "http://gemini.info.usaid.gov/directory/faxResults.cfm?" +
178             "name=Ebenezer%20+Rumplestiltskin,&location=RRB%20%20%20%205%2E08%2D006";
179         uuri = UURIFactory.getInstance(UURIFactory.getInstance(uri),
180             "faxResults.cfm?name=Ebenezer +Rumplestiltskin,&location=" +
181             "RRB%20%20%20%205%2E08%2D006");
182         assertEquals("whitespace escaping", tgtUri, uuri.toString());
183     }
184     
185 //	public final void testFailedGetPath() throws URIException {
186 //		final String path = "/RealMedia/ads/" +
187 //		"click_lx.ads/%%PAGE%%/%%RAND%%/%%POS%%/%%CAMP%%/empty";
188 //        // decoding in getPath will interpret %CA as 8-bit escaped char,
189 //        // possibly incomplete
190 //		final String uri = "http://ads.nandomedia.com" + path;
191 //		final UURI uuri = UURIFactory.getInstance(uri);
192 //		String foundPath = uuri.getPath();
193 //		assertEquals("unexpected path", path, foundPath);
194 //	}
195     
196     public final void testDnsHost() throws URIException {
197         String uri = "dns://ads.nandomedia.com:81/one.html";
198         UURI uuri = UURIFactory.getInstance(uri);
199         String host = uuri.getReferencedHost();
200         assertTrue("Host is wrong " + host, host.equals("ads.nandomedia.com"));
201         uri = "dns:ads.nandomedia.com";
202         uuri = UURIFactory.getInstance(uri);
203         host = uuri.getReferencedHost();
204         assertTrue("Host is wrong " + host, host.equals("ads.nandomedia.com"));
205         uri = "dns:ads.nandomedia.com?a=b";
206         uuri = UURIFactory.getInstance(uri);
207         host = uuri.getReferencedHost();
208         assertTrue("Host is wrong " + host, host.equals("ads.nandomedia.com"));
209     }
210 	
211 	public final void testPercentEscaping() throws URIException {
212 		final String uri = "http://archive.org/%a%%%%%.html";
213         // tests indicate firefox (1.0.6) does not encode '%' at all
214         final String tgtUri = "http://archive.org/%a%%%%%.html";
215 		UURI uuri = UURIFactory.getInstance(uri);
216 		assertEquals("Not equal",tgtUri, uuri.toString());
217 	}
218     
219 	public final void testRelativeDblPathSlashes() throws URIException {
220 		UURI base = UURIFactory.getInstance("http://www.archive.org/index.html");
221 		UURI uuri = UURIFactory.getInstance(base, "JIGOU//KYC//INDEX.HTM");
222         assertTrue("Double slash not working " + uuri.toString(),
223                 uuri.getPath().equals("/JIGOU//KYC//INDEX.HTM"));
224 	}
225     
226     public final void testRelativeWithScheme() throws URIException {
227         UURI base = UURIFactory.getInstance("http://www.example.com/some/page");
228         UURI uuri = UURIFactory.getInstance(base, "http:boo");
229         assertTrue("Relative with scheme not working " + uuri.toString(),
230                 uuri.toString().equals("http://www.example.com/some/boo"));
231     }
232     
233     public final void testBadBaseResolve() throws URIException {
234         UURI base = UURIFactory.getInstance("http://license.joins.com/board/" +
235             "etc_board_list.asp?board_name=new_main&b_type=&nPage=" +
236             "2&category=G&lic_id=70&site=changeup&g_page=changeup&g_sPage=" +
237             "notice&gate=02");
238         UURIFactory.getInstance(base, "http://www.changeup.com/...</a");
239     }
240     
241     public final void testTilde() throws URIException {
242         noChangeExpected("http://license.joins.com/~igor");
243     }
244     
245     public final void testCurlies() throws URIException {
246         // Firefox allows curlies in the query string portion of a URL only
247         // (converts curlies if they are in the path portion ahead of the
248         // query string).
249         UURI uuri =
250             noChangeExpected("http://license.joins.com/igor?one={curly}");
251         assertEquals(uuri.getQuery(), "one={curly}");
252         assertEquals(UURIFactory.
253                 getInstance("http://license.joins.com/igor{curly}.html").
254                     toString(),
255             "http://license.joins.com/igor%7Bcurly%7D.html");
256         boolean exception = false;
257         try {
258             UURIFactory.getInstance("http://license.{curly}.com/igor.html");
259         } catch (URIException u) {
260             exception = true;
261         }
262         assertTrue("Did not get exception.", exception);
263     }
264     
265     protected UURI noChangeExpected(final String original)
266     throws URIException {
267         UURI uuri = UURIFactory.getInstance(original);
268         assertEquals(original, uuri.toString());
269         return uuri;
270     }
271     
272 	public final void testTrimSpaceNBSP() throws URIException {
273 		final String uri = "   http://archive.org/DIR WITH SPACES/" +
274 		UURIFactory.NBSP + "home.html    " + UURIFactory.NBSP + "   ";
275 		final String tgtUri =
276 			"http://archive.org/DIR%20WITH%20SPACES/%20home.html";
277 		UURI uuri = UURIFactory.getInstance(uri);
278 		assertTrue("Not equal " + uuri.toString(),
279 				uuri.toString().equals(tgtUri));
280 	}
281 	
282 	/***
283 	 * Test space plus encoding ([ 1010966 ] crawl.log has URIs with spaces in them).
284 	 * See <a href="http://sourceforge.net/tracker/index.php?func=detail&aid=1010966&group_id=73833&atid=539099">[ 1010966 ] crawl.log has URIs with spaces in them</a>.
285 	 * @throws URIException
286 	 */
287 	public final void testSpaceDoubleEncoding() throws URIException {
288 		final String uri = "http://www.brook.edu/i.html? %20taxonomy=Politics";
289 		final String encodedUri =
290 			"http://www.brook.edu/i.html?%20%20taxonomy=Politics";
291 		UURI uuri = UURIFactory.getInstance(uri, "ISO-8859-1");
292 		assertTrue("Not equal " + uuri.toString(),
293 				uuri.toString().equals(encodedUri));
294 	}
295 	
296 	/***
297 	 * Test for doubly-encoded sequences.
298 	 * See <a href="https://sourceforge.net/tracker/index.php?func=detail&aid=966219&group_id=73833&atid=539099">[ 966219 ] UURI doubly-encodes %XX sequences</a>.
299 	 * @throws URIException
300 	 */
301 	public final void testDoubleEncoding() throws URIException {
302 		final char ae = '\u00E6';
303 		final String uri = "http://archive.org/DIR WITH SPACES/home" +
304 		    ae + ".html";
305 		final String encodedUri =
306 			"http://archive.org/DIR%20WITH%20SPACES/home%E6.html";
307 		UURI uuri = UURIFactory.getInstance(uri, "ISO-8859-1");
308 		assertEquals("single encoding", encodedUri, uuri.toString());
309 		// Dbl-encodes.
310 		uuri = UURIFactory.getInstance(uuri.toString(), "ISO-8859-1");
311 		uuri = UURIFactory.getInstance(uuri.toString(), "ISO-8859-1");
312 		assertEquals("double encoding", encodedUri, uuri.toString());
313 		// Do default utf-8 test.
314 		uuri = UURIFactory.getInstance(uri);
315 		final String encodedUtf8Uri =
316 			"http://archive.org/DIR%20WITH%20SPACES/home%C3%A6.html";
317 		assertEquals("Not equal utf8", encodedUtf8Uri, uuri.toString());      
318 		// Now dbl-encode.
319 		uuri = UURIFactory.getInstance(uuri.toString());
320 		uuri = UURIFactory.getInstance(uuri.toString());
321 		assertEquals("Not equal (dbl-encoding) utf8", encodedUtf8Uri, uuri.toString());
322 	}
323 	
324 	/***
325 	 * Test for syntax errors stop page parsing.
326 	 * @see <a href="https://sourceforge.net/tracker/?func=detail&aid=788219&group_id=73833&atid=539099">[ 788219 ] URI Syntax Errors stop page parsing</a>
327 	 * @throws URIException
328 	 */
329 	public final void testThreeSlashes() throws URIException {
330 		UURI goodURI = UURIFactory.
331 		getInstance("http://lcweb.loc.gov/rr/goodtwo.html");
332 		String uuri = "http:///lcweb.loc.gov/rr/goodtwo.html";
333 		UURI rewrittenURI = UURIFactory.getInstance(uuri);
334 		assertTrue("Not equal " + goodURI + ", " + uuri,
335 				goodURI.toString().equals(rewrittenURI.toString()));
336 		uuri = "http:////lcweb.loc.gov/rr/goodtwo.html";
337 		rewrittenURI = UURIFactory.getInstance(uuri);
338 		assertTrue("Not equal " + goodURI + ", " + uuri,
339 				goodURI.toString().equals(rewrittenURI.toString()));
340 		// Check https.
341 		goodURI = UURIFactory.
342 		getInstance("https://lcweb.loc.gov/rr/goodtwo.html");
343 		uuri = "https:////lcweb.loc.gov/rr/goodtwo.html";
344 		rewrittenURI = UURIFactory.getInstance(uuri);
345 		assertTrue("Not equal " + goodURI + ", " + uuri,
346 				goodURI.toString().equals(rewrittenURI.toString()));
347 	}
348 	
349 	public final void testNoScheme() {
350 		boolean expectedException = false;
351 		String uuri = "www.loc.gov/rr/european/egw/polishex.html";
352 		try {
353 			UURIFactory.getInstance(uuri);
354 		} catch (URIException e) {
355 			// Expected exception.
356 			expectedException = true;
357 		}
358 		assertTrue("Didn't get expected exception: " + uuri, 
359 				expectedException); 
360 	}
361 	
362 	public final void testRelative() throws URIException {
363 		UURI uuriTgt = UURIFactory.
364 		getInstance("http://archive.org:83/home.html");
365 		UURI uri = UURIFactory.
366 		getInstance("http://archive.org:83/one/two/three.html");
367 		UURI uuri = UURIFactory.
368 		getInstance(uri, "/home.html");
369 		assertTrue("Not equal",
370 				uuriTgt.toString().equals(uuri.toString()));
371 	}
372 	
373 	/***
374 	 * Test that an empty uuri does the right thing -- that we get back the
375 	 * base.
376 	 *
377 	 * @throws URIException
378 	 */
379 	public final void testRelativeEmpty() throws URIException {
380 		UURI uuriTgt = UURIFactory.
381 		getInstance("http://archive.org:83/one/two/three.html");
382 		UURI uri = UURIFactory.
383 		getInstance("http://archive.org:83/one/two/three.html");
384 		UURI uuri = UURIFactory.
385 		getInstance(uri, "");
386 		assertTrue("Empty length don't work",
387 				uuriTgt.toString().equals(uuri.toString()));
388 	}
389 	
390 	public final void testAbsolute() throws URIException {
391 		UURI uuriTgt = UURIFactory.
392 		getInstance("http://archive.org:83/home.html");
393 		UURI uri = UURIFactory.
394 		getInstance("http://archive.org:83/one/two/three.html");
395 		UURI uuri = UURIFactory.
396 		getInstance(uri, "http://archive.org:83/home.html");
397 		assertTrue("Not equal",
398 				uuriTgt.toString().equals(uuri.toString()));
399 	}
400 	
401 	/***
402 	 * Test for [ 962892 ] UURI accepting/creating unUsable URIs (bad hosts).
403 	 * @see <a href="https://sourceforge.net/tracker/?func=detail&atid=539099&aid=962892&group_id=73833">[ 962892 ] UURI accepting/creating unUsable URIs (bad hosts)</a>
404 	 */
405 	public final void testHostWithLessThan() {
406 		checkExceptionOnIllegalDomainlabel("http://www.betamobile.com</A");
407 		checkExceptionOnIllegalDomainlabel(
408 		"http://C|/unzipped/426/spacer.gif");
409 		checkExceptionOnIllegalDomainlabel("http://www.lycos.co.uk\"/l/b/\"");
410 	}    
411 	
412 	/***
413 	 * Test for [ 1012520 ] UURI.length() &gt; 2k.
414 	 * @throws URIException
415 	 * @see <a href="http://sourceforge.net/tracker/index.php?func=detail&aid=1012520&group_id=73833&atid=539099">[ 1012520 ] UURI.length() &gt; 2k</a>
416 	 */
417 	public final void test2kURI() throws URIException {
418 		final StringBuffer buffer = new StringBuffer("http://a.b");
419 		final String subPath = "/123456789";
420 		for (int i = 0; i < 207; i++) {
421 			buffer.append(subPath);
422 		}
423 		// String should be 2080 characters long.  Legal.
424 		UURIFactory.getInstance(buffer.toString());
425 		boolean gotException = false;
426 		// Add ten more characters and make size illegal.
427 		buffer.append(subPath);
428 		try {
429 			UURIFactory.getInstance(buffer.toString()); 
430 		} catch (URIException e) {
431 			gotException = true;
432 		}
433 		assertTrue("No expected exception complaining about long URI",
434 				gotException);
435 	} 
436 	
437 	private void checkExceptionOnIllegalDomainlabel(String uuri) {
438 		boolean expectedException = false;
439         try {
440 			UURIFactory.getInstance(uuri);
441 		} catch (URIException e) {
442 			// Expected exception.
443 			expectedException = true;
444 		}
445 		assertTrue("Didn't get expected exception: " + uuri, 
446 				expectedException); 
447 	}
448 	
449 	/***
450 	 * Test for doing separate DNS lookup for same host
451 	 *
452 	 * @see <a href="https://sourceforge.net/tracker/?func=detail&aid=788277&group_id=73833&atid=539099">[ 788277 ] Doing separate DNS lookup for same host</a>
453 	 * @throws URIException
454 	 */
455 	public final void testHostWithPeriod() throws URIException {
456 		UURI uuri1 = UURIFactory.
457 		getInstance("http://www.loc.gov./index.html");
458 		UURI uuri2 = UURIFactory.
459 		getInstance("http://www.loc.gov/index.html");
460 		assertEquals("Failed equating hosts with dot",
461 				uuri1.getHost(), uuri2.getHost());
462 	}
463 	
464 	/***
465 	 * Test for NPE in java.net.URI.encode
466 	 *
467 	 * @see <a href="https://sourceforge.net/tracker/?func=detail&aid=874220&group_id=73833&atid=539099">[ 874220 ] NPE in java.net.URI.encode</a>
468 	 * @throws URIException
469 	 */
470 	public final void testHostEncodedChars() throws URIException {
471 		String s = "http://g.msn.co.kr/0nwkokr0/00/19??" +
472 		"PS=10274&NC=10009&CE=42&CP=949&HL=" +
473 		"&#65533;&#65533;&#65533;?&#65533;&#65533;";
474 		assertNotNull("Encoded chars " + s, 
475 				UURIFactory.getInstance(s));
476 	}
477 	
478 	/***
479 	 * Test for java.net.URI parses %20 but getHost null
480 	 *
481 	 * See <a href="https://sourceforge.net/tracker/?func=detail&aid=927940&group_id=73833&atid=539099">[ 927940 ] java.net.URI parses %20 but getHost null</a>
482 	 */
483 	public final void testSpaceInHost() {
484 		boolean expectedException = false;
485 		try {
486 			UURIFactory.getInstance(
487 					"http://www.local-regions.odpm%20.gov.uk" +
488 			"/lpsa/challenge/pdf/propect.pdf");
489 		} catch (URIException e) {
490 			expectedException = true;
491 		}
492 		assertTrue("Did not fail with escaped space.", expectedException);
493 		
494 		expectedException = false;
495 		try {
496 			UURIFactory.getInstance(
497 					"http://www.local-regions.odpm .gov.uk" +
498 			"/lpsa/challenge/pdf/propect.pdf");
499 		} catch (URIException e) {
500 			expectedException = true;
501 		}
502 		assertTrue("Did not fail with real space.", expectedException);
503 	}
504 	
505 	/***
506 	 * Test for java.net.URI chokes on hosts_with_underscores.
507 	 *
508 	 * @see  <a href="https://sourceforge.net/tracker/?func=detail&aid=808270&group_id=73833&atid=539099">[ 808270 ] java.net.URI chokes on hosts_with_underscores</a>
509 	 * @throws URIException
510 	 */
511 	public final void testHostWithUnderscores() throws URIException {
512 		UURI uuri = UURIFactory.getInstance(
513 		"http://x_underscore_underscore.2u.com.tw/nonexistent_page.html");
514 		assertEquals("Failed get of host with underscore",
515 				"x_underscore_underscore.2u.com.tw", uuri.getHost());
516 	}
517 	
518 	
519 	/***
520 	 * Two dots for igor.
521 	 */
522 	public final void testTwoDots() {
523 		boolean expectedException = false;
524 		try {
525 			UURIFactory.getInstance(
526 			"http://x_underscore_underscore..2u.com/nonexistent_page.html");
527 		} catch (URIException e) {
528 			expectedException = true;
529 		}
530 		assertTrue("Two dots did not throw exception", expectedException);
531 	}
532 	
533 	/***
534 	 * Test for java.net.URI#getHost fails when leading digit.
535 	 *
536 	 * @see <a href="https://sourceforge.net/tracker/?func=detail&aid=910120&group_id=73833&atid=539099">[ 910120 ] java.net.URI#getHost fails when leading digit.</a>
537 	 * @throws URIException
538 	 */
539 	public final void testHostWithDigit() throws URIException {
540 		UURI uuri = UURIFactory.
541 		getInstance("http://0204chat.2u.com.tw/nonexistent_page.html");
542 		assertEquals("Failed get of host with digit",
543 				"0204chat.2u.com.tw", uuri.getHost());
544 	}
545 	
546 	/***
547 	 * Test for Constraining java URI class.
548 	 *
549 	 * @see <a href="https://sourceforge.net/tracker/?func=detail&aid=949548&group_id=73833&atid=539099">[ 949548 ] Constraining java URI class</a>
550 	 */
551 	public final void testPort() {
552 		checkBadPort("http://www.tyopaikat.com:a/robots.txt");
553 		checkBadPort("http://158.144.21.3:80808/robots.txt");
554 		checkBadPort("http://pdb.rutgers.edu:81.rutgers.edu/robots.txt");
555 		checkBadPort(
556 		    "https://webmail.gse.harvard.edu:9100robots.txt/robots.txt");
557 		checkBadPort(
558 		    "https://webmail.gse.harvard.edu:0/robots.txt/robots.txt");
559 	}
560 	
561 	/***
562 	 * Test bad port throws exception.
563 	 * @param uri URI with bad port to check.
564 	 */
565 	private void checkBadPort(String uri) {
566 		boolean exception = false;
567 		try {
568 			UURIFactory.getInstance(uri);
569 		}
570 		catch (URIException e) {
571 			exception = true;
572 		}
573 		assertTrue("Didn't throw exception: " + uri, exception);
574 	}
575 	
576 	/***
577 	 * Preserve userinfo capitalization.
578 	 * @throws URIException
579 	 */
580 	public final void testUserinfo() throws URIException {
581         final String authority = "stack:StAcK@www.tyopaikat.com";
582         final String uri = "http://" + authority + "/robots.txt";
583 		UURI uuri = UURIFactory.getInstance(uri);
584 		assertEquals("Authority not equal", uuri.getAuthority(),
585             authority);
586         /*
587         String tmp = uuri.toString();
588         assertTrue("URI not equal", tmp.equals(uri));
589         */
590 	}
591 
592 	/***
593 	 * Test user info + port
594 	 * @throws URIException
595 	 */
596 	public final void testUserinfoPlusPort() throws URIException {
597 		final String userInfo = "stack:StAcK";
598         final String authority = "www.tyopaikat.com";
599         final int port = 8080;
600         final String uri = "http://" + userInfo + "@" + authority + ":" + port 
601         	+ "/robots.txt";
602 		UURI uuri = UURIFactory.getInstance(uri);
603 		assertEquals("Host not equal", authority,uuri.getHost());
604 		assertEquals("Userinfo Not equal",userInfo,uuri.getUserinfo());
605 		assertEquals("Port not equal",port,uuri.getPort());
606 		assertEquals("Authority wrong","stack:StAcK@www.tyopaikat.com:8080",
607 				uuri.getAuthority());
608 		assertEquals("AuthorityMinusUserinfo wrong","www.tyopaikat.com:8080",
609 				uuri.getAuthorityMinusUserinfo());
610 		
611 	}
612     
613     public final void testRFC3986RelativeChange() throws URIException {
614         UURI base = UURIFactory.getInstance("http://a/b/c/d;p?q");
615         tryRelative(base, "?y",     "http://a/b/c/d;p?y");
616     }
617         
618     /***
619      * Tests from rfc3986
620      *
621      * <pre>
622      *       "g:h"           =  "g:h"
623      *       "g"             =  "http://a/b/c/g"
624      *       "./g"           =  "http://a/b/c/g"
625      *       "g/"            =  "http://a/b/c/g/"
626      *       "/g"            =  "http://a/g"
627      *       "//g"           =  "http://g"
628      *       "?y"            =  "http://a/b/c/d;p?y"
629      *       "g?y"           =  "http://a/b/c/g?y"
630      *       "#s"            =  "http://a/b/c/d;p?q#s"
631      *       "g#s"           =  "http://a/b/c/g#s"
632      *       "g?y#s"         =  "http://a/b/c/g?y#s"
633      *       ";x"            =  "http://a/b/c/;x"
634      *       "g;x"           =  "http://a/b/c/g;x"
635      *       "g;x?y#s"       =  "http://a/b/c/g;x?y#s"
636      *       ""              =  "http://a/b/c/d;p?q"
637      *       "."             =  "http://a/b/c/"
638      *       "./"            =  "http://a/b/c/"
639      *       ".."            =  "http://a/b/"
640      *       "../"           =  "http://a/b/"
641      *       "../g"          =  "http://a/b/g"
642      *       "../.."         =  "http://a/"
643      *       "../../"        =  "http://a/"
644      *       "../../g"       =  "http://a/g"
645      * </pre>
646      *
647      * @throws URIException
648      */
649     public final void testRFC3986Relative() throws URIException {
650         UURI base = UURIFactory.getInstance("http://a/b/c/d;p?q");
651         tryRelative(base, "g:h",    "g:h");
652         tryRelative(base, "g",      "http://a/b/c/g");
653         tryRelative(base, "./g",    "http://a/b/c/g");
654         tryRelative(base, "g/",     "http://a/b/c/g/");
655         tryRelative(base, "/g",     "http://a/g");
656         tryRelative(base, "//g",    "http://g");
657         tryRelative(base, "?y",     "http://a/b/c/d;p?y");
658         tryRelative(base, "g?y",    "http://a/b/c/g?y");
659         tryRelative(base, "#s",     "http://a/b/c/d;p?q#s");
660         tryRelative(base, "g#s",    "http://a/b/c/g#s");
661         tryRelative(base, "g?y#s",  "http://a/b/c/g?y#s");
662         tryRelative(base, ";x",     "http://a/b/c/;x");
663         tryRelative(base, "g;x",    "http://a/b/c/g;x");
664         tryRelative(base, "g;x?y#s","http://a/b/c/g;x?y#s");
665         tryRelative(base, "",       "http://a/b/c/d;p?q");
666         tryRelative(base, ".",      "http://a/b/c/");
667         tryRelative(base, "./",     "http://a/b/c/");
668         tryRelative(base, "..",     "http://a/b/");
669         tryRelative(base, "../",    "http://a/b/");
670         tryRelative(base, "../g",   "http://a/b/g");
671         tryRelative(base, "../..",  "http://a/");
672         tryRelative(base, "../../", "http://a/");
673         tryRelative(base, "../../g","http://a/g");
674     }
675     
676 	protected void tryRelative(UURI base, String relative, String expected) 
677     throws URIException {
678         UURI uuri = UURIFactory.getInstance(base, relative);
679         assertEquals("Derelativized " + relative + " gave " 
680                 + uuri + " not " + expected,
681                 uuri,UURIFactory.getInstance(expected));
682     }
683 
684     /***
685 	 * Tests from rfc2396 with amendments to accomodate differences
686 	 * intentionally added to make our URI handling like IEs.
687 	 *
688 	 * <pre>
689 	 *       g:h           =  g:h
690 	 *       g             =  http://a/b/c/g
691 	 *       ./g           =  http://a/b/c/g
692 	 *       g/            =  http://a/b/c/g/
693 	 *       /g            =  http://a/g
694 	 *       //g           =  http://g
695 	 *       ?y            =  http://a/b/c/?y
696 	 *       g?y           =  http://a/b/c/g?y
697 	 *       #s            =  (current document)#s
698 	 *       g#s           =  http://a/b/c/g#s
699 	 *       g?y#s         =  http://a/b/c/g?y#s
700 	 *       ;x            =  http://a/b/c/;x
701 	 *       g;x           =  http://a/b/c/g;x
702 	 *       g;x?y#s       =  http://a/b/c/g;x?y#s
703 	 *       .             =  http://a/b/c/
704 	 *       ./            =  http://a/b/c/
705 	 *       ..            =  http://a/b/
706 	 *       ../           =  http://a/b/
707 	 *       ../g          =  http://a/b/g
708 	 *       ../..         =  http://a/
709 	 *       ../../        =  http://a/
710 	 *       ../../g       =  http://a/g
711 	 * </pre>
712 	 *
713 	 * @throws URIException
714 	 */
715 	public final void testRFC2396Relative() throws URIException {
716 		UURI base = UURIFactory.
717 		getInstance("http://a/b/c/d;p?q");
718 		TreeMap<String,String> m = new TreeMap<String,String>();
719 		m.put("..", "http://a/b/");
720 		m.put("../", "http://a/b/");
721 		m.put("../g", "http://a/b/g");
722 		m.put("../..", "http://a/");
723 		m.put("../../", "http://a/");
724 		m.put("../../g", "http://a/g");
725 		m.put("g#s", "http://a/b/c/g#s");
726 		m.put("g?y#s ", "http://a/b/c/g?y#s");
727 		m.put(";x", "http://a/b/c/;x");
728 		m.put("g;x", "http://a/b/c/g;x");
729 		m.put("g;x?y#s", "http://a/b/c/g;x?y#s");
730 		m.put(".", "http://a/b/c/");
731 		m.put("./", "http://a/b/c/");
732 		m.put("g", "http://a/b/c/g");
733 		m.put("./g", "http://a/b/c/g");
734 		m.put("g/", "http://a/b/c/g/");
735 		m.put("/g", "http://a/g");
736 		m.put("//g", "http://g");
737         // CHANGED BY RFC3986
738 		// m.put("?y", "http://a/b/c/?y");
739 		m.put("g?y", "http://a/b/c/g?y");
740 		// EXTRAS beyond the RFC set.
741 		// TODO: That these resolve to a path of /a/g might be wrong.  Perhaps
742 		// it should be '/g'?.
743 		m.put("/../../../../../../../../g", "http://a/g");
744 		m.put("../../../../../../../../g", "http://a/g");
745 		m.put("../G", "http://a/b/G");
746 		for (Iterator i = m.keySet().iterator(); i.hasNext();) {
747 			String key = (String)i.next();
748 			String value = (String)m.get(key);
749 			UURI uuri = UURIFactory.getInstance(base, key);
750 			assertTrue("Unexpected " + key + " " + value + " " + uuri,
751 					uuri.equals(UURIFactory.getInstance(value)));
752 		}
753 	}
754 	
755 	/***
756 	 * A UURI should always be without a 'fragment' segment, which is
757 	 * unused and irrelevant for network fetches. 
758 	 *  
759 	 * See [ 970666 ] #anchor links not trimmed, and thus recrawled 
760 	 * 
761 	 * @throws URIException
762 	 */
763 	public final void testAnchors() throws URIException {
764 		UURI uuri = UURIFactory.
765 		getInstance("http://www.example.com/path?query#anchor");
766 		assertEquals("Not equal", "http://www.example.com/path?query",
767 				uuri.toString());
768 	}
769     
770 
771     /***
772      * Ensure that URI strings beginning with a colon are treated
773      * the same as browsers do (as relative, rather than as absolute
774      * with zero-length scheme). 
775      * 
776      * @throws URIException
777      */
778     public void testStartsWithColon() throws URIException {
779         UURI base = UURIFactory.getInstance("http://www.example.com/path/page");
780         UURI uuri = UURIFactory.getInstance(base,":foo");
781         assertEquals("derelativize starsWithColon",
782                 uuri.getURI(),
783                 "http://www.example.com/path/:foo");
784     }
785     
786     /***
787      * Ensure that stray trailing '%' characters do not prevent
788      * UURI instances from being created, and are reasonably 
789      * escaped when encountered. 
790      *
791      * @throws URIException
792      */
793     public void testTrailingPercents() throws URIException {
794         String plainPath = "http://www.example.com/path%";
795         UURI plainPathUuri = UURIFactory.getInstance(plainPath);
796         assertEquals("plainPath getURI", plainPath, plainPathUuri.getURI());
797         assertEquals("plainPath getEscapedURI", 
798                 "http://www.example.com/path%", // browsers don't escape '%'
799                 plainPathUuri.getEscapedURI());
800         
801         String partiallyEscapedPath = "http://www.example.com/pa%20th%";
802         UURI partiallyEscapedPathUuri = UURIFactory.getInstance(
803                 partiallyEscapedPath);
804 //        assertEquals("partiallyEscapedPath getURI", 
805 //                "http://www.example.com/pa th%", // TODO: is this desirable?
806 ////              partiallyEscapedPath,
807 //                partiallyEscapedPathUuri.getURI());
808         assertEquals("partiallyEscapedPath getEscapedURI", 
809                 "http://www.example.com/pa%20th%",
810                 partiallyEscapedPathUuri.getEscapedURI());
811         
812         String plainQueryString = "http://www.example.com/path?q=foo%";
813         UURI plainQueryStringUuri = UURIFactory.getInstance(
814                 plainQueryString);
815 //        assertEquals("plainQueryString getURI", 
816 //                plainQueryString,
817 //                plainQueryStringUuri.getURI());
818         assertEquals("plainQueryString getEscapedURI", 
819                 "http://www.example.com/path?q=foo%",
820                 plainQueryStringUuri.getEscapedURI());        
821         
822         String partiallyEscapedQueryString = 
823             "http://www.example.com/pa%20th?q=foo%";
824         UURI partiallyEscapedQueryStringUuri = UURIFactory.getInstance(
825                 partiallyEscapedQueryString);
826         assertEquals("partiallyEscapedQueryString getURI", 
827                 "http://www.example.com/pa th?q=foo%",
828                 partiallyEscapedQueryStringUuri.getURI());
829         assertEquals("partiallyEscapedQueryString getEscapedURI", 
830                 "http://www.example.com/pa%20th?q=foo%",
831                 partiallyEscapedQueryStringUuri.getEscapedURI());  
832     }
833     
834     /***
835      * Ensure that stray '%' characters do not prevent
836      * UURI instances from being created, and are reasonably 
837      * escaped when encountered. 
838      *
839      * @throws URIException
840      */
841     public void testStrayPercents() throws URIException {
842         String oneStray = "http://www.example.com/pa%th";
843         UURI oneStrayUuri = UURIFactory.getInstance(oneStray);
844         assertEquals("oneStray getURI", oneStray, oneStrayUuri.getURI());
845         assertEquals("oneStray getEscapedURI", 
846                 "http://www.example.com/pa%th", // browsers don't escape '%'
847                 oneStrayUuri.getEscapedURI());
848         
849         String precededByValidEscape = "http://www.example.com/pa%20th%way";
850         UURI precededByValidEscapeUuri = UURIFactory.getInstance(
851                 precededByValidEscape);
852         assertEquals("precededByValidEscape getURI", 
853                 "http://www.example.com/pa th%way", // getURI interprets escapes
854                 precededByValidEscapeUuri.getURI());
855         assertEquals("precededByValidEscape getEscapedURI", 
856                 "http://www.example.com/pa%20th%way",
857                 precededByValidEscapeUuri.getEscapedURI());
858         
859         String followedByValidEscape = "http://www.example.com/pa%th%20way";
860         UURI followedByValidEscapeUuri = UURIFactory.getInstance(
861                 followedByValidEscape);
862         assertEquals("followedByValidEscape getURI", 
863                 "http://www.example.com/pa%th way", // getURI interprets escapes
864                 followedByValidEscapeUuri.getURI());
865         assertEquals("followedByValidEscape getEscapedURI", 
866                 "http://www.example.com/pa%th%20way",
867                 followedByValidEscapeUuri.getEscapedURI());        
868     }
869     
870     public void testEscapingNotNecessary() throws URIException {
871         String escapesUnnecessary = 
872             "http://www.example.com/misc;reserved:chars@that&don't=need"
873             +"+escaping$even,though!you(might)initially?think#so";
874         // expect everything but the #fragment
875         String expected = escapesUnnecessary.substring(0, escapesUnnecessary
876                 .length() - 3);
877         assertEquals("escapes unnecessary", 
878                 expected, 
879                 UURIFactory.getInstance(escapesUnnecessary).toString());
880     }
881     
882     public void testIdn() throws URIException {
883         // See http://www.josefsson.org/idn.php.
884         String idn1 = new String("http://r??ksm??rg??s.josef??on.org/");
885         String puny1 = "http://xn--rksmrgs-5wao1o.josefsson.org/";
886         assertEquals("encoding of " + idn1, puny1, UURIFactory
887                 .getInstance(idn1).toString());
888         String idn2 = "http://www.p??lse.dk/";
889         String puny2 = "http://www.xn--plse-gra.dk/";
890         assertEquals("encoding of " + idn2, puny2, UURIFactory
891                 .getInstance(idn2).toString());
892     }
893     
894     public void testNewLineInURL() throws URIException {
895     	UURI uuri = UURIFactory.getInstance("http://www.ar\rchive\n." +
896     	    "org/i\n\n\r\rndex.html");
897     	assertEquals("http://www.archive.org/index.html", uuri.toString());
898     }
899     
900     public void testTabsInURL() throws URIException {
901         UURI uuri = UURIFactory.getInstance("http://www.ar\tchive\t." +
902             "org/i\t\r\n\tndex.html");
903         assertEquals("http://www.archive.org/index.html", uuri.toString());
904     }
905     
906     public void testQueryEscaping() throws URIException {
907         UURI uuri = UURIFactory.getInstance(
908             "http://www.yahoo.com/foo?somechars!@$%^&*()_-+={[}]|\'\";:/?.>,<");
909         assertEquals(
910             // tests in FF1.5 indicate it only escapes " < > 
911             "http://www.yahoo.com/foo?somechars!@$%^&*()_-+={[}]|\'%22;:/?.%3E,%3C",
912             uuri.toString());
913     }
914     
915     /***
916      * Check that our 'normalization' does same as Nutch's
917      * Below before-and-afters were taken from the nutch urlnormalizer-basic
918      * TestBasicURLNormalizer class  (December 2006, Nutch 0.9-dev).
919      * @throws URIException
920      */
921     public void testSameAsNutchURLFilterBasic() throws URIException {
922         assertEquals(UURIFactory.getInstance(" http://foo.com/ ").toString(),
923             "http://foo.com/");
924 
925         // check that protocol is lower cased
926         assertEquals(UURIFactory.getInstance("HTTP://foo.com/").toString(),
927             "http://foo.com/");
928         
929         // check that host is lower cased
930         assertEquals(UURIFactory.
931                 getInstance("http://Foo.Com/index.html").toString(),
932             "http://foo.com/index.html");
933         assertEquals(UURIFactory.
934                 getInstance("http://Foo.Com/index.html").toString(),
935             "http://foo.com/index.html");
936 
937         // check that port number is normalized
938         assertEquals(UURIFactory.
939                 getInstance("http://foo.com:80/index.html").toString(),
940             "http://foo.com/index.html");
941         assertEquals(UURIFactory.getInstance("http://foo.com:81/").toString(),
942             "http://foo.com:81/");
943 
944         // check that null path is normalized
945         assertEquals(UURIFactory.getInstance("http://foo.com").toString(),
946             "http://foo.com/");
947 
948         // check that references are removed
949         assertEquals(UURIFactory.
950                 getInstance("http://foo.com/foo.html#ref").toString(),
951             "http://foo.com/foo.html");
952 
953         //     // check that encoding is normalized
954         //     normalizeTest("http://foo.com/%66oo.html", "http://foo.com/foo.html");
955 
956         // check that unnecessary "../" are removed
957         assertEquals(UURIFactory.
958                 getInstance("http://foo.com/aa/../").toString(),
959             "http://foo.com/" );
960         assertEquals(UURIFactory.
961                 getInstance("http://foo.com/aa/bb/../").toString(),
962             "http://foo.com/aa/");
963 
964         /* We fail this one.  Here we produce: 'http://foo.com/'." target="alexandria_uri">http://foo.com/'.
965         assertEquals(UURIFactory.
966                 getInstance("http://foo.com/aa/..").toString(),
967             "http://foo.com/aa/..");
968          */
969         
970         assertEquals(UURIFactory.
971             getInstance("http://foo.com/aa/bb/cc/../../foo.html").toString(),
972                 "http://foo.com/aa/foo.html");
973         assertEquals(UURIFactory.
974             getInstance("http://foo.com/aa/bb/../cc/dd/../ee/foo.html").
975                 toString(),
976                     "http://foo.com/aa/cc/ee/foo.html");
977         assertEquals(UURIFactory.
978             getInstance("http://foo.com/../foo.html").toString(),
979                 "http://foo.com/foo.html" );
980         assertEquals(UURIFactory.
981             getInstance("http://foo.com/../../foo.html").toString(),
982                 "http://foo.com/foo.html" );
983         assertEquals(UURIFactory.
984             getInstance("http://foo.com/../aa/../foo.html").toString(),
985                 "http://foo.com/foo.html" );
986         assertEquals(UURIFactory.
987             getInstance("http://foo.com/aa/../../foo.html").toString(),
988                 "http://foo.com/foo.html" );
989         assertEquals(UURIFactory.
990                 getInstance("http://foo.com/aa/../bb/../foo.html/../../").
991                     toString(),
992             "http://foo.com/" );
993         assertEquals(UURIFactory.getInstance("http://foo.com/../aa/foo.html").
994             toString(), "http://foo.com/aa/foo.html" );
995         assertEquals(UURIFactory.
996                 getInstance("http://foo.com/../aa/../foo.html").toString(),
997             "http://foo.com/foo.html" );
998         assertEquals(UURIFactory.
999                 getInstance("http://foo.com/a..a/foo.html").toString(),
1000             "http://foo.com/a..a/foo.html" );
1001         assertEquals(UURIFactory.
1002                 getInstance("http://foo.com/a..a/../foo.html").toString(),
1003             "http://foo.com/foo.html" );
1004         assertEquals(UURIFactory.
1005             getInstance("http://foo.com/foo.foo/../foo.html").toString(),
1006                  "http://foo.com/foo.html" );
1007     }
1008     
1009     public void testHttpSchemeColonSlash() {
1010     	boolean exception = false;
1011     	try {
1012     		UURIFactory.getInstance("https:/");
1013     	} catch (URIException e) {
1014     		exception = true;
1015     	}
1016     	assertTrue("Didn't throw exception when one expected", exception);
1017     	exception = false;
1018     	try {
1019     		UURIFactory.getInstance("http://");
1020     	} catch (URIException e) {
1021     		exception = true;
1022     	}
1023     	assertTrue("Didn't throw exception when one expected", exception);
1024     }
1025     
1026     public void testNakedHttpsSchemeColon() {
1027         boolean exception = false;
1028         try {
1029             UURIFactory.getInstance("https:");
1030         } catch (URIException e) {
1031             exception = true;
1032         }
1033         assertTrue("Didn't throw exception when one expected", exception);
1034         exception = false;
1035         try {
1036             UURI base = UURIFactory.getInstance("http://www.example.com");
1037             UURIFactory.getInstance(base, "https:");
1038         } catch (URIException e) {
1039             exception = true;
1040         }
1041         assertTrue("Didn't throw exception when one expected", exception);
1042     }
1043     
1044     /***
1045      * Test motivated by [#HER-616] The UURI class may throw 
1046      * NullPointerException in getReferencedHost()
1047      * 
1048      * @throws URIException
1049      */
1050     public void testMissingHttpColon() throws URIException {
1051         String suspectUri = "http//www.test.foo";
1052         UURI base = UURIFactory.getInstance("http://www.example.com");
1053         boolean exceptionThrown = false; 
1054         try {
1055             UURI badUuri = UURIFactory.getInstance(suspectUri);
1056             badUuri.getReferencedHost(); // not reached
1057         } catch (URIException e) {
1058             // should get relative-uri-no-base exception
1059             exceptionThrown = true;
1060         } finally {
1061             assertTrue("expected exception not thrown",exceptionThrown);
1062         }
1063         UURI goodUuri = UURIFactory.getInstance(base,suspectUri);
1064         goodUuri.getReferencedHost();
1065     }
1066     
1067 
1068 }