1   /* HtmlFormCredential
2    *
3    * Created on Apr 7, 2004
4    *
5    * Copyright (C) 2004 Internet Archive.
6    *
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    *
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   *
14   * Heritrix is distributed in the hope that it will be useful,
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   *
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.crawler.datamodel.credential;
24  
25  import java.util.HashMap;
26  import java.util.Iterator;
27  import java.util.Map;
28  import java.util.logging.Logger;
29  
30  import javax.management.Attribute;
31  import javax.management.AttributeNotFoundException;
32  
33  import org.apache.commons.httpclient.HttpClient;
34  import org.apache.commons.httpclient.HttpMethod;
35  import org.apache.commons.httpclient.HttpMethodBase;
36  import org.apache.commons.httpclient.NameValuePair;
37  import org.apache.commons.httpclient.URIException;
38  import org.apache.commons.httpclient.methods.GetMethod;
39  import org.apache.commons.httpclient.methods.PostMethod;
40  import org.archive.crawler.datamodel.CrawlURI;
41  import org.archive.crawler.settings.MapType;
42  import org.archive.crawler.settings.SimpleType;
43  import org.archive.crawler.settings.Type;
44  import org.archive.net.UURI;
45  import org.archive.net.UURIFactory;
46  
47  
48  
49  /***
50   * Credential that holds all needed to do a GET/POST to a HTML form.
51   *
52   * @author stack
53   * @version $Revision: 4668 $, $Date: 2006-09-26 21:49:01 +0000 (Tue, 26 Sep 2006) $
54   */
55  public class HtmlFormCredential extends Credential {
56  
57      private static final long serialVersionUID = -4732570804435453949L;
58  
59      private static final Logger logger =
60          Logger.getLogger(HtmlFormCredential.class.getName());
61  
62      private static final String ATTR_LOGIN_URI = "login-uri";
63      private static final String ATTR_FORM_ITEMS = "form-items";
64      private static final String ATTR_FORM_METHOD = "http-method";
65      private static final String [] METHODS = {"POST", "GET"};
66  
67      /***
68       * Constructor.
69       *
70       * A constructor that takes name of the credential is required by settings
71       * framework.
72       *
73       * @param name Name of this credential.
74       */
75      public HtmlFormCredential(final String name)
76      {
77          super(name, "Credential that has all necessary" +
78              " for running a POST/GET to an HTML login form.");
79  
80          Type t = addElementToDefinition(new SimpleType("login-uri",
81              "Full URI of page that contains the HTML login form we're to" +
82              " apply these credentials too: E.g. http://www.archive.org", ""));
83          t.setOverrideable(false);
84          t.setExpertSetting(true);
85  
86  
87          t = addElementToDefinition(new SimpleType(ATTR_FORM_METHOD,
88              "GET or POST", METHODS[0], METHODS));
89          t.setOverrideable(false);
90          t.setExpertSetting(true);
91  
92          t = addElementToDefinition(new MapType(ATTR_FORM_ITEMS, "Form items.",
93              String.class));
94          t.setOverrideable(false);
95          t.setExpertSetting(true);
96      }
97  
98      /***
99       * @param context CrawlURI context to use.
100      * @return login-uri.
101      * @throws AttributeNotFoundException
102      */
103     public String getLoginUri(final CrawlURI context)
104             throws AttributeNotFoundException {
105         return (String)getAttribute(ATTR_LOGIN_URI, context);
106     }
107 
108     /***
109      * @param context CrawlURI context to use.
110      * @return login-uri.
111      * @throws AttributeNotFoundException
112      */
113     public String getHttpMethod(final CrawlURI context)
114             throws AttributeNotFoundException {
115         return (String)getAttribute(ATTR_FORM_METHOD, context);
116     }
117 
118     /***
119      * @param context CrawlURI context to use.
120      * @return Form inputs as convenient map.  Returns null if no form items.
121      * @throws AttributeNotFoundException
122      */
123     public Map<String,Object> getFormItems(final CrawlURI context)
124             throws AttributeNotFoundException {
125         Map<String,Object> result = null;
126         MapType items = (MapType)getAttribute(ATTR_FORM_ITEMS, context);
127         if (items != null) {
128             for (Iterator i = items.iterator(context); i.hasNext();) {
129                 Attribute a = (Attribute)i.next();
130                 if (result == null) {
131                     result = new HashMap<String,Object>();
132                 }
133                 result.put(a.getName(), a.getValue());
134             }
135         }
136         return result;
137     }
138 
139     public boolean isPrerequisite(final CrawlURI curi) {
140         boolean result = false;
141         String curiStr = curi.getUURI().toString();
142         String loginUri = getPrerequisite(curi);
143         if (loginUri != null) {
144             try {
145                 UURI uuri = UURIFactory.getInstance(curi.getUURI(), loginUri);
146                 if (uuri != null && curiStr != null &&
147                     uuri.toString().equals(curiStr)) {
148                     result = true;
149                     if (!curi.isPrerequisite()) {
150                         curi.setPrerequisite(true);
151                         logger.fine(curi + " is prereq.");
152                     }
153                 }
154             } catch (URIException e) {
155                 logger.severe("Failed to uuri: " + curi + ", " +
156                     e.getMessage());
157             }
158         }
159         return result;
160     }
161 
162     public boolean hasPrerequisite(CrawlURI curi) {
163         return getPrerequisite(curi) != null;
164     }
165 
166     public String getPrerequisite(CrawlURI curi) {
167         String loginUri = null;
168         try {
169             loginUri = getLoginUri(curi);
170         } catch (AttributeNotFoundException e) {
171             logger.severe("Failed to getLoginUri: " + this + ", " + curi + ","
172                 + e.getMessage());
173             // Not much I can do here. What if I fail every time? Then
174             // this prereq. will not ever be processed.  We'll never get on to
175             // this server.
176         }
177         return loginUri;
178     }
179 
180     public String getKey(CrawlURI curi) throws AttributeNotFoundException {
181         return getLoginUri(curi);
182     }
183 
184     public boolean isEveryTime() {
185         // This authentication is one time only.
186         return false;
187     }
188 
189     public boolean populate(CrawlURI curi, HttpClient http, HttpMethod method,
190             String payload) {
191         // http is not used.
192         // payload is not used.
193         boolean result = false;
194         Map formItems = null;
195         try {
196             formItems = getFormItems(curi);
197         }
198         catch (AttributeNotFoundException e1) {
199             logger.severe("Failed get of form items for " + curi);
200         }
201         if (formItems == null || formItems.size() <= 0) {
202             try {
203                 logger.severe("No form items for " + method.getURI());
204             }
205             catch (URIException e) {
206                 logger.severe("No form items and exception getting uri: " +
207                     e.getMessage());
208             }
209             return result;
210         }
211 
212         NameValuePair[] data = new NameValuePair[formItems.size()];
213         int index = 0;
214         String key = null;
215         for (Iterator i = formItems.keySet().iterator(); i.hasNext();) {
216             key = (String)i.next();
217             data[index++] = new NameValuePair(key, (String)formItems.get(key));
218         }
219         if (method instanceof PostMethod) {
220             ((PostMethod)method).setRequestBody(data);
221             result = true;
222         } else if (method instanceof GetMethod) {
223             // Append these values to the query string.
224             // Get current query string, then add data, then get it again
225             // only this time its our data only... then append.
226             HttpMethodBase hmb = (HttpMethodBase)method;
227             String currentQuery = hmb.getQueryString();
228             hmb.setQueryString(data);
229             String newQuery = hmb.getQueryString();
230             hmb.setQueryString(((currentQuery != null)? currentQuery: "") +
231             		"&" + newQuery);
232             result = true;
233         } else {
234             logger.severe("Unknown method type: " + method);
235         }
236         return result;
237     }
238 
239     public boolean isPost(CrawlURI curi) {
240         String method = null;
241         try {
242             method = getHttpMethod(curi);
243         }
244         catch (AttributeNotFoundException e) {
245             logger.severe("Failed to get method for " + curi + ", " + this);
246         }
247         return method != null && method.equalsIgnoreCase("POST");
248     }
249 }