View Javadoc

1   /* HeritrixProtocolSocketFactory
2    * 
3    * Created on Oct 8, 2004
4    *
5    * Copyright (C) 2004 Internet Archive.
6    * 
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    * 
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   * 
14   * Heritrix is distributed in the hope that it will be useful, 
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   * 
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.crawler.fetcher;
24  
25  import java.io.IOException;
26  import java.net.InetAddress;
27  import java.net.InetSocketAddress;
28  import java.net.Socket;
29  import java.net.SocketTimeoutException;
30  import java.net.UnknownHostException;
31  
32  import org.apache.commons.httpclient.ConnectTimeoutException;
33  import org.apache.commons.httpclient.params.HttpConnectionParams;
34  import org.apache.commons.httpclient.protocol.ProtocolSocketFactory;
35  import org.archive.crawler.datamodel.CrawlHost;
36  import org.archive.crawler.datamodel.ServerCache;
37  
38  
39  /***
40   * Version of protocol socket factory that tries to get IP from heritrix IP
41   * cache -- if its been set into the HttpConnectionParameters.
42   * 
43   * Copied the guts of DefaultProtocolSocketFactory.  This factory gets
44   * setup by {@link FetchHTTP}.
45   * 
46   * @author stack
47   * @version $Date: 2006-08-29 22:47:03 +0000 (Tue, 29 Aug 2006) $, $Revision: 4553 $
48   */
49  public class HeritrixProtocolSocketFactory
50  implements ProtocolSocketFactory {
51      /***
52       * Constructor.
53       */
54      public HeritrixProtocolSocketFactory() {
55          super();
56      }
57  
58      /***
59       * @see #createSocket(java.lang.String,int,java.net.InetAddress,int)
60       */
61      public Socket createSocket(
62          String host,
63          int port,
64          InetAddress localAddress,
65          int localPort
66      ) throws IOException, UnknownHostException {
67          return new Socket(host, port, localAddress, localPort);
68      }
69  
70      /***
71       * Attempts to get a new socket connection to the given host within the
72       * given time limit.
73       * <p>
74       * This method employs several techniques to circumvent the limitations
75       * of older JREs that do not support connect timeout. When running in
76       * JRE 1.4 or above reflection is used to call
77       * Socket#connect(SocketAddress endpoint, int timeout) method. When
78       * executing in older JREs a controller thread is executed. The
79       * controller thread attempts to create a new socket within the given
80       * limit of time. If socket constructor does not return until the
81       * timeout expires, the controller terminates and throws an
82       * {@link ConnectTimeoutException}
83       * </p>
84       *
85       * @param host the host name/IP
86       * @param port the port on the host
87       * @param localAddress the local host name/IP to bind the socket to
88       * @param localPort the port on the local machine
89       * @param params {@link HttpConnectionParams Http connection parameters}
90       *
91       * @return Socket a new socket
92       *
93       * @throws IOException if an I/O error occurs while creating the socket
94       * @throws UnknownHostException if the IP address of the host cannot be
95       * @throws IOException if an I/O error occurs while creating the socket
96       * @throws UnknownHostException if the IP address of the host cannot be
97       * determined
98       * @throws ConnectTimeoutException if socket cannot be connected within the
99       *  given time limit
100      *
101      * @since 3.0
102      */
103     public Socket createSocket(
104         final String host,
105         final int port,
106         final InetAddress localAddress,
107         final int localPort,
108         final HttpConnectionParams params)
109     throws IOException, UnknownHostException, ConnectTimeoutException {
110         // Below code is from the DefaultSSLProtocolSocketFactory#createSocket
111         // method only it has workarounds to deal with pre-1.4 JVMs.  I've
112         // cut these out.
113         if (params == null) {
114             throw new IllegalArgumentException("Parameters may not be null");
115         }
116         Socket socket = null;
117         int timeout = params.getConnectionTimeout();
118         if (timeout == 0) {
119             socket = createSocket(host, port, localAddress, localPort);
120         } else {
121             socket = new Socket();
122             ServerCache cache = (ServerCache)params.
123                 getParameter(FetchHTTP.SERVER_CACHE_KEY);
124             InetAddress hostAddress =
125             	(cache != null)? getHostAddress(cache, host): null;
126             InetSocketAddress address = (hostAddress != null)?
127                     new InetSocketAddress(hostAddress, port):
128                     new InetSocketAddress(host, port);
129             socket.bind(new InetSocketAddress(localAddress, localPort));
130             try {
131                 socket.connect(address, timeout);
132             } catch (SocketTimeoutException e) {
133                 // Add timeout info. to the exception.
134                 throw new SocketTimeoutException(e.getMessage() +
135                     ": timeout set at " + Integer.toString(timeout) + "ms.");
136             }
137             assert socket.isConnected(): "Socket not connected " + host;
138         }
139         return socket;
140     }
141     
142     /***
143      * Get host address using first the heritrix cache of addresses, then,
144      * failing that, go to the dnsjava cache.
145      * 
146      * Default access and static so can be used by other classes in this
147      * package.
148      *
149      * @param host Host whose address we're to fetch.
150      * @return an IP address for this host or null if one can't be found
151      * in caches.
152      * @exception IOException If we fail to get host IP from ServerCache.
153      */
154     static InetAddress getHostAddress(final ServerCache cache,
155             final String host) throws IOException {
156         InetAddress result = null;
157         if (cache != null) {
158         	CrawlHost ch = cache.getHostFor(host);
159             if (ch != null) {
160                 result = ch.getIP();
161             }
162         }
163         if (result ==  null) {
164             throw new IOException("Failed to get host " + host +
165                 " address from ServerCache");
166         }
167         return result;
168     }
169 
170     /***
171      * @see ProtocolSocketFactory#createSocket(java.lang.String,int)
172      */
173     public Socket createSocket(String host, int port)
174             throws IOException, UnknownHostException {
175         return new Socket(host, port);
176     }
177 
178     /***
179      * All instances of DefaultProtocolSocketFactory are the same.
180      * @param obj Object to compare.
181      * @return True if equal
182      */
183     public boolean equals(Object obj) {
184         return ((obj != null) &&
185             obj.getClass().equals(HeritrixProtocolSocketFactory.class));
186     }
187 
188     /***
189      * All instances of DefaultProtocolSocketFactory have the same hash code.
190      * @return Hash code for this object.
191      */
192     public int hashCode() {
193         return HeritrixProtocolSocketFactory.class.hashCode();
194     }
195 }