1   /* FixupQueryStr
2    * 
3    * Created on Oct 5, 2004
4    *
5    * Copyright (C) 2004 Internet Archive.
6    * 
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    * 
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   * 
14   * Heritrix is distributed in the hope that it will be useful, 
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   * 
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.crawler.url.canonicalize;
24  
25  
26  
27  
28  /***
29   * Strip any trailing question mark.
30   * @author stack
31   * @version $Date: 2006-09-25 20:27:35 +0000 (Mon, 25 Sep 2006) $, $Revision: 4655 $
32   */
33  public class FixupQueryStr
34  extends BaseRule {
35  
36      private static final long serialVersionUID = 3169526832544474794L;
37  
38      private static final String DESCRIPTION =
39          "Fixup the question mark that leads off the query string. " +
40          "This rule returns 'http://www.archive.org/index.html' if passed" +
41          " 'http://www.archive.org/index.html?'.  It will also strip '?&'" +
42          " if '?&' is all that comprises the query string.  Also strips" +
43          " extraneous leading '&': Returns 'http://archive.org/index.html?x=y" +
44          " if passed 'http://archive.org/index.html?&x=y." +
45          " Will also strip '&' if last thing in query string." +
46          " Operates on all schemes.  This is a good rule to run toward the" +
47          " end of canonicalization processing.";
48  
49      public FixupQueryStr(String name) {
50          super(name, DESCRIPTION);
51      }
52  
53      public String canonicalize(String url, Object context) {
54          if (url == null || url.length() <= 0) {
55              return url;
56          }
57          
58          int index = url.lastIndexOf('?');
59          if (index > 0) {
60              if (index == (url.length() - 1)) {
61                  // '?' is last char in url.  Strip it.
62                  url = url.substring(0, url.length() - 1);
63              } else if (url.charAt(index + 1) == '&') {
64                  // Next char is '&'. Strip it.
65                  if (url.length() == (index + 2)) {
66                      // Then url ends with '?&'.  Strip them.
67                      url = url.substring(0, url.length() - 2);
68                  } else {
69                      // The '&' is redundant.  Strip it.
70                      url = url.substring(0, index + 1) +
71                      url.substring(index + 2);
72                  }
73              } else if (url.charAt(url.length() - 1) == '&') {
74                  // If we have a lone '&' on end of query str,
75                  // strip it.
76                  url = url.substring(0, url.length() - 1);
77              }
78          }
79          return url;
80      }
81  }