1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.crawler.url.canonicalize;
24
25
26
27
28 /***
29 * Strip any trailing question mark.
30 * @author stack
31 * @version $Date: 2006-09-25 20:27:35 +0000 (Mon, 25 Sep 2006) $, $Revision: 4655 $
32 */
33 public class FixupQueryStr
34 extends BaseRule {
35
36 private static final long serialVersionUID = 3169526832544474794L;
37
38 private static final String DESCRIPTION =
39 "Fixup the question mark that leads off the query string. " +
40 "This rule returns 'http://www.archive.org/index.html' if passed" +
41 " 'http://www.archive.org/index.html?'. It will also strip '?&'" +
42 " if '?&' is all that comprises the query string. Also strips" +
43 " extraneous leading '&': Returns 'http://archive.org/index.html?x=y" +
44 " if passed 'http://archive.org/index.html?&x=y." +
45 " Will also strip '&' if last thing in query string." +
46 " Operates on all schemes. This is a good rule to run toward the" +
47 " end of canonicalization processing.";
48
49 public FixupQueryStr(String name) {
50 super(name, DESCRIPTION);
51 }
52
53 public String canonicalize(String url, Object context) {
54 if (url == null || url.length() <= 0) {
55 return url;
56 }
57
58 int index = url.lastIndexOf('?');
59 if (index > 0) {
60 if (index == (url.length() - 1)) {
61
62 url = url.substring(0, url.length() - 1);
63 } else if (url.charAt(index + 1) == '&') {
64
65 if (url.length() == (index + 2)) {
66
67 url = url.substring(0, url.length() - 2);
68 } else {
69
70 url = url.substring(0, index + 1) +
71 url.substring(index + 2);
72 }
73 } else if (url.charAt(url.length() - 1) == '&') {
74
75
76 url = url.substring(0, url.length() - 1);
77 }
78 }
79 return url;
80 }
81 }