1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.crawler.url.canonicalize;
24
25 import java.util.regex.Pattern;
26
27
28
29 /***
30 * Strip any 'userinfo' found on http/https URLs.
31 * @author stack
32 * @version $Date: 2006-09-25 20:27:35 +0000 (Mon, 25 Sep 2006) $, $Revision: 4655 $
33 */
34 public class StripUserinfoRule extends BaseRule {
35
36 private static final long serialVersionUID = -4271062607638914996L;
37
38 private static final String DESCRIPTION = "Strip any 'userinfo' found. " +
39 "Use this rule to equate 'http://stack:psswrd@archive.org/index.htm'" +
40 " and 'http://archive.org/index.htm'. The resulting canonicalization" +
41 " returns 'http://archive.org/index.htm'. Removes any userinfo" +
42 " found. Operates on http/https/ftp/ftps schemes only.";
43
44 /***
45 * Strip userinfo.
46 */
47 private static final Pattern REGEX =
48 Pattern.compile("^((?:(?:https?)|(?:ftps?))://)(?:[^/]+@)(.*)$",
49 Pattern.CASE_INSENSITIVE);
50
51 public StripUserinfoRule(String name) {
52 super(name, DESCRIPTION);
53 }
54
55 public String canonicalize(String url, Object context) {
56 return doStripRegexMatch(url, REGEX.matcher(url));
57 }
58 }