1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.crawler.fetcher;
24
25 import java.io.IOException;
26 import java.net.InetAddress;
27 import java.net.InetSocketAddress;
28 import java.net.Socket;
29 import java.net.SocketTimeoutException;
30 import java.net.UnknownHostException;
31
32 import org.apache.commons.httpclient.ConnectTimeoutException;
33 import org.apache.commons.httpclient.params.HttpConnectionParams;
34 import org.apache.commons.httpclient.protocol.ProtocolSocketFactory;
35 import org.archive.crawler.datamodel.CrawlHost;
36 import org.archive.crawler.datamodel.ServerCache;
37
38
39 /***
40 * Version of protocol socket factory that tries to get IP from heritrix IP
41 * cache -- if its been set into the HttpConnectionParameters.
42 *
43 * Copied the guts of DefaultProtocolSocketFactory. This factory gets
44 * setup by {@link FetchHTTP}.
45 *
46 * @author stack
47 * @version $Date: 2006-08-29 22:47:03 +0000 (Tue, 29 Aug 2006) $, $Revision: 4553 $
48 */
49 public class HeritrixProtocolSocketFactory
50 implements ProtocolSocketFactory {
51 /***
52 * Constructor.
53 */
54 public HeritrixProtocolSocketFactory() {
55 super();
56 }
57
58 /***
59 * @see #createSocket(java.lang.String,int,java.net.InetAddress,int)
60 */
61 public Socket createSocket(
62 String host,
63 int port,
64 InetAddress localAddress,
65 int localPort
66 ) throws IOException, UnknownHostException {
67 return new Socket(host, port, localAddress, localPort);
68 }
69
70 /***
71 * Attempts to get a new socket connection to the given host within the
72 * given time limit.
73 * <p>
74 * This method employs several techniques to circumvent the limitations
75 * of older JREs that do not support connect timeout. When running in
76 * JRE 1.4 or above reflection is used to call
77 * Socket#connect(SocketAddress endpoint, int timeout) method. When
78 * executing in older JREs a controller thread is executed. The
79 * controller thread attempts to create a new socket within the given
80 * limit of time. If socket constructor does not return until the
81 * timeout expires, the controller terminates and throws an
82 * {@link ConnectTimeoutException}
83 * </p>
84 *
85 * @param host the host name/IP
86 * @param port the port on the host
87 * @param localAddress the local host name/IP to bind the socket to
88 * @param localPort the port on the local machine
89 * @param params {@link HttpConnectionParams Http connection parameters}
90 *
91 * @return Socket a new socket
92 *
93 * @throws IOException if an I/O error occurs while creating the socket
94 * @throws UnknownHostException if the IP address of the host cannot be
95 * @throws IOException if an I/O error occurs while creating the socket
96 * @throws UnknownHostException if the IP address of the host cannot be
97 * determined
98 * @throws ConnectTimeoutException if socket cannot be connected within the
99 * given time limit
100 *
101 * @since 3.0
102 */
103 public Socket createSocket(
104 final String host,
105 final int port,
106 final InetAddress localAddress,
107 final int localPort,
108 final HttpConnectionParams params)
109 throws IOException, UnknownHostException, ConnectTimeoutException {
110
111
112
113 if (params == null) {
114 throw new IllegalArgumentException("Parameters may not be null");
115 }
116 Socket socket = null;
117 int timeout = params.getConnectionTimeout();
118 if (timeout == 0) {
119 socket = createSocket(host, port, localAddress, localPort);
120 } else {
121 socket = new Socket();
122 ServerCache cache = (ServerCache)params.
123 getParameter(FetchHTTP.SERVER_CACHE_KEY);
124 InetAddress hostAddress =
125 (cache != null)? getHostAddress(cache, host): null;
126 InetSocketAddress address = (hostAddress != null)?
127 new InetSocketAddress(hostAddress, port):
128 new InetSocketAddress(host, port);
129 socket.bind(new InetSocketAddress(localAddress, localPort));
130 try {
131 socket.connect(address, timeout);
132 } catch (SocketTimeoutException e) {
133
134 throw new SocketTimeoutException(e.getMessage() +
135 ": timeout set at " + Integer.toString(timeout) + "ms.");
136 }
137 assert socket.isConnected(): "Socket not connected " + host;
138 }
139 return socket;
140 }
141
142 /***
143 * Get host address using first the heritrix cache of addresses, then,
144 * failing that, go to the dnsjava cache.
145 *
146 * Default access and static so can be used by other classes in this
147 * package.
148 *
149 * @param host Host whose address we're to fetch.
150 * @return an IP address for this host or null if one can't be found
151 * in caches.
152 * @exception IOException If we fail to get host IP from ServerCache.
153 */
154 static InetAddress getHostAddress(final ServerCache cache,
155 final String host) throws IOException {
156 InetAddress result = null;
157 if (cache != null) {
158 CrawlHost ch = cache.getHostFor(host);
159 if (ch != null) {
160 result = ch.getIP();
161 }
162 }
163 if (result == null) {
164 throw new IOException("Failed to get host " + host +
165 " address from ServerCache");
166 }
167 return result;
168 }
169
170 /***
171 * @see ProtocolSocketFactory#createSocket(java.lang.String,int)
172 */
173 public Socket createSocket(String host, int port)
174 throws IOException, UnknownHostException {
175 return new Socket(host, port);
176 }
177
178 /***
179 * All instances of DefaultProtocolSocketFactory are the same.
180 * @param obj Object to compare.
181 * @return True if equal
182 */
183 public boolean equals(Object obj) {
184 return ((obj != null) &&
185 obj.getClass().equals(HeritrixProtocolSocketFactory.class));
186 }
187
188 /***
189 * All instances of DefaultProtocolSocketFactory have the same hash code.
190 * @return Hash code for this object.
191 */
192 public int hashCode() {
193 return HeritrixProtocolSocketFactory.class.hashCode();
194 }
195 }