package org.archive.crawler.fetcher;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.security.MessageDigest;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlHost;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.settings.SimpleType;
import org.archive.util.ArchiveUtils;
import org.archive.util.HttpRecorder;
import org.archive.util.InetAddressUtil;
import org.xbill.DNS.ARecord;
import org.xbill.DNS.DClass;
import org.xbill.DNS.Lookup;
import org.xbill.DNS.Record;
import org.xbill.DNS.ResolverConfig;
import org.xbill.DNS.TextParseException;
import org.xbill.DNS.Type;

/**
 * Processor to resolve 'dns:' URIs.
 *
 * TODO: Refactor to use org.archive.util.DNSJavaUtils.
 *
 * @author multiple
 */
public class FetchDNS extends Processor
implements CoreAttributeConstants, FetchStatusCodes {
    private static final long serialVersionUID = 4686199203459704426L;

    private Logger logger = Logger.getLogger(this.getClass().getName());

    // Defaults for dnsjava lookups: Internet class, A (address) records.
    private short classType = DClass.IN;
    private short typeType = Type.A;
    protected InetAddress serverInetAddr = null;

    private static final String ATTR_ACCEPT_NON_DNS_RESOLVES =
        "accept-non-dns-resolves";
    private static final Boolean DEFAULT_ACCEPT_NON_DNS_RESOLVES =
        Boolean.FALSE;
    // TTL to use for IPs obtained outside of DNS proper: 6 hours, in seconds.
    private static final long DEFAULT_TTL_FOR_NON_DNS_RESOLVES
        = 6 * 60 * 60;

    private byte [] reusableBuffer = new byte[1024];
    /**
     * Create a new instance of FetchDNS.
     *
     * @param name the name of this attribute.
     */
    public FetchDNS(String name) {
        super(name, "DNS Fetcher. Handles DNS lookups.");
        org.archive.crawler.settings.Type e =
            addElementToDefinition(new SimpleType(ATTR_ACCEPT_NON_DNS_RESOLVES,
                "If a DNS lookup fails, whether or not to fallback to " +
                "InetAddress resolution, which may use local 'hosts' files " +
                "or other mechanisms.", DEFAULT_ACCEPT_NON_DNS_RESOLVES));
        e.setExpertSetting(true);
        e = addElementToDefinition(new SimpleType(FetchHTTP.ATTR_DIGEST_CONTENT,
            "Whether or not to perform an on-the-fly digest hash of" +
            " retrieved content-bodies.",
            FetchHTTP.DEFAULT_DIGEST_CONTENT));
        e.setExpertSetting(true);
        e = addElementToDefinition(new SimpleType(
            FetchHTTP.ATTR_DIGEST_ALGORITHM, "Which algorithm (for example " +
                "MD5 or SHA-1) to use to perform an on-the-fly digest" +
                " hash of retrieved content-bodies.",
            FetchHTTP.DEFAULT_DIGEST_ALGORITHM,
            FetchHTTP.DIGEST_ALGORITHMS));
        e.setExpertSetting(true);
    }

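    /**
     * Resolve a 'dns:' CrawlURI. Looks up A records via dnsjava and, if that
     * fails and 'accept-non-dns-resolves' is enabled, falls back to
     * InetAddress resolution. Sets fetch status and timing on the CrawlURI.
     *
     * @param curi the 'dns:' CrawlURI to process.
     */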
    protected void innerProcess(CrawlURI curi) {
        if (!curi.getUURI().getScheme().equals("dns")) {
            // This processor only handles 'dns:' URIs.
            return;
        }
        Record[] rrecordSet = null;
        String dnsName = null;
        try {
            dnsName = curi.getUURI().getReferencedHost();
        } catch (URIException e) {
            logger.log(Level.SEVERE, "Failed parse of dns record " + curi, e);
        }

        if (dnsName == null) {
            curi.setFetchStatus(S_UNFETCHABLE_URI);
            return;
        }

        // Use the controller's server cache when available; otherwise fall
        // back to a standalone CrawlHost.
        CrawlHost targetHost = null;
        if (getController() != null &&
                getController().getServerCache() != null) {
            targetHost = getController().getServerCache().getHostFor(dnsName);
        } else {
            targetHost = new CrawlHost(dnsName);
        }
        if (isQuadAddress(curi, dnsName, targetHost)) {
            // The 'hostname' was already an IP address; nothing to look up.
            return;
        }

        // Do the DNS lookup and store the results back into the CrawlURI.
        curi.putLong(A_FETCH_BEGAN_TIME, System.currentTimeMillis());
        try {
            rrecordSet = (new Lookup(dnsName, typeType, classType)).run();
        } catch (TextParseException e) {
            rrecordSet = null;
        }
        curi.setContentType("text/dns");
        if (rrecordSet != null) {
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("Found recordset for " + dnsName);
            }
            storeDNSRecord(curi, dnsName, targetHost, rrecordSet);
        } else {
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("Failed find of recordset for " + dnsName);
            }
            if (((Boolean)getUncheckedAttribute(null,
                    ATTR_ACCEPT_NON_DNS_RESOLVES)).booleanValue()) {
                // Fall back to a lookup that bypasses dnsjava.
                InetAddress address = null;
                try {
                    address = InetAddress.getByName(dnsName);
                } catch (UnknownHostException e1) {
                    address = null;
                }
                if (address != null) {
                    targetHost.setIP(address, DEFAULT_TTL_FOR_NON_DNS_RESOLVES);
                    curi.setFetchStatus(S_GETBYNAME_SUCCESS);
                    if (logger.isLoggable(Level.FINE)) {
                        logger.fine("Found address for " + dnsName +
                            " using native dns.");
                    }
                } else {
                    if (logger.isLoggable(Level.FINE)) {
                        logger.fine("Failed find of address for " + dnsName +
                            " using native dns.");
                    }
                    setUnresolvable(curi, targetHost);
                }
            } else {
                setUnresolvable(curi, targetHost);
            }
        }
        curi.putLong(A_FETCH_COMPLETED_TIME, System.currentTimeMillis());
    }

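    /**
     * Record a successful lookup: set the host's IP and TTL from the first
     * A record and write the full record set via this CrawlURI's recorder.
     *
     * @param curi the 'dns:' CrawlURI being processed.
     * @param dnsName the hostname that was looked up.
     * @param targetHost CrawlHost to receive the resolved IP and TTL.
     * @param rrecordSet records returned by the lookup; must contain an
     * A record.
     */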
    protected void storeDNSRecord(final CrawlURI curi, final String dnsName,
            final CrawlHost targetHost, final Record[] rrecordSet) {
        // Get TTL and IP info from the first A record (there may be more
        // than one), then update the target host.
        ARecord arecord = getFirstARecord(rrecordSet);
        if (arecord == null) {
            throw new NullPointerException("Got null arecord for " +
                dnsName);
        }
        targetHost.setIP(arecord.getAddress(), arecord.getTTL());
        try {
            recordDNS(curi, rrecordSet);
            curi.setFetchStatus(S_DNS_SUCCESS);
            curi.putString(A_DNS_SERVER_IP_LABEL,
                ResolverConfig.getCurrentConfig().server());
        } catch (IOException e) {
            logger.log(Level.SEVERE, "Failed store of DNS Record for " +
                curi.toString(), e);
            setUnresolvable(curi, targetHost);
        }
    }

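    /**
     * Check whether the 'hostname' is already a dotted-quad IPv4 address.
     * If so, no lookup is needed: the IP is set directly on the host with a
     * never-expires TTL.
     *
     * @param curi the 'dns:' CrawlURI being processed.
     * @param dnsName hostname extracted from the URI.
     * @param targetHost CrawlHost to receive the IP if dnsName is an address.
     * @return true if dnsName was a dotted-quad address and has been handled.
     */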
    protected boolean isQuadAddress(final CrawlURI curi, final String dnsName,
            final CrawlHost targetHost) {
        boolean result = false;
        Matcher matcher = InetAddressUtil.IPV4_QUADS.matcher(dnsName);
        // If not a dotted-quad IPv4 address, there is nothing to do here.
        if (matcher == null || !matcher.matches()) {
            return result;
        }

        result = true;
        // The 'hostname' is already an IP address; such a DNS CrawlURI
        // should not normally have been created in the first place.
        if (logger.isLoggable(Level.WARNING)) {
            logger.warning("Unnecessary DNS CrawlURI created: " + curi);
        }
        try {
            targetHost.setIP(InetAddress.getByAddress(dnsName, new byte[] {
                    (byte) Integer.parseInt(matcher.group(1)),
                    (byte) Integer.parseInt(matcher.group(2)),
                    (byte) Integer.parseInt(matcher.group(3)),
                    (byte) Integer.parseInt(matcher.group(4)) }),
                CrawlHost.IP_NEVER_EXPIRES);
            curi.setFetchStatus(S_DNS_SUCCESS);
        } catch (UnknownHostException e) {
            logger.log(Level.SEVERE, "Should never be " + e.getMessage(), e);
            setUnresolvable(curi, targetHost);
        }
        return result;
    }

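    /**
     * Write the DNS response through this CrawlURI's HttpRecorder so it is
     * captured like other fetched content, optionally digesting the bytes
     * on the fly.
     *
     * @param curi the 'dns:' CrawlURI being processed.
     * @param rrecordSet records to serialize and record.
     * @throws IOException on failure writing or reading the recorded bytes.
     */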
    protected void recordDNS(final CrawlURI curi, final Record[] rrecordSet)
            throws IOException {
        final byte[] dnsRecord =
            getDNSRecord(curi.getLong(A_FETCH_BEGAN_TIME), rrecordSet);
        HttpRecorder rec = HttpRecorder.getHttpRecorder();

        // Shall we digest the content as it is recorded?
        boolean digestContent = ((Boolean)getUncheckedAttribute(curi,
            FetchHTTP.ATTR_DIGEST_CONTENT)).booleanValue();
        String algorithm = null;
        if (digestContent) {
            algorithm = ((String)getUncheckedAttribute(curi,
                FetchHTTP.ATTR_DIGEST_ALGORITHM));
            rec.getRecordedInput().setDigest(algorithm);
        } else {
            // Clear any digest left over from a previous use of the recorder.
            rec.getRecordedInput().setDigest((MessageDigest)null);
        }

        curi.setHttpRecorder(rec);
        InputStream is = curi.getHttpRecorder().inputWrap(
            new ByteArrayInputStream(dnsRecord));
        if (digestContent) {
            rec.getRecordedInput().startDigest();
        }

        // Reading through the wrapped stream pulls the bytes through the
        // recorder so they get recorded (and digested, if enabled).
        try {
            while (is.read(this.reusableBuffer) != -1) {
                continue;
            }
        } finally {
            is.close();
            rec.closeRecorders();
        }
        curi.setContentSize(dnsRecord.length);
        if (digestContent) {
            curi.setContentDigest(algorithm,
                rec.getRecordedInput().getDigestValue());
        }
    }

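    /**
     * Serialize the record set as a 14-digit fetch timestamp followed by one
     * record per line.
     *
     * @param fetchStart time the lookup began, in milliseconds.
     * @param rrecordSet records to serialize; may be null.
     * @return the serialized record set as bytes.
     * @throws IOException on failure writing to the in-memory buffer.
     */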
    protected byte [] getDNSRecord(final long fetchStart,
            final Record[] rrecordSet)
            throws IOException {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        // Start the record with a 14-digit timestamp of when the fetch began.
        byte[] fetchDate = ArchiveUtils.get14DigitDate(fetchStart).getBytes();
        baos.write(fetchDate);
        baos.write("\n".getBytes());
        // Then append each record on its own line.
        if (rrecordSet != null) {
            for (int i = 0; i < rrecordSet.length; i++) {
                baos.write(rrecordSet[i].toString().getBytes());
                baos.write("\n".getBytes());
            }
        }
        return baos.toByteArray();
    }

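    /**
     * Mark the host as having no known IP and the CrawlURI as unresolvable.
     *
     * @param curi the 'dns:' CrawlURI being processed.
     * @param host host whose IP is cleared.
     */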
    protected void setUnresolvable(CrawlURI curi, CrawlHost host) {
        host.setIP(null, 0);
        curi.setFetchStatus(S_DOMAIN_UNRESOLVABLE);
    }

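    /**
     * Find the first A record in the given record set.
     *
     * @param rrecordSet records to scan; may be null or empty.
     * @return the first A record found, or null if there is none.
     */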
    protected ARecord getFirstARecord(Record[] rrecordSet) {
        ARecord arecord = null;
        if (rrecordSet == null || rrecordSet.length == 0) {
            if (logger.isLoggable(Level.FINEST)) {
                logger.finest("rrecordSet is null or zero length: " +
                    rrecordSet);
            }
            return arecord;
        }
        for (int i = 0; i < rrecordSet.length; i++) {
            if (rrecordSet[i].getType() != Type.A) {
                if (logger.isLoggable(Level.FINEST)) {
                    logger.finest("Record " + Integer.toString(i) +
                        " is not A type but " + rrecordSet[i].getType());
                }
                continue;
            }
            arecord = (ARecord) rrecordSet[i];
            break;
        }
        return arecord;
    }
}