/* Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * FetchDNS
 * Created on Jun 5, 2003
 *
 * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/java/org/archive/crawler/fetcher/FetchDNS.java,v 1.29.4.1 2007/01/13 01:31:17 stack-sf Exp $
 */
package org.archive.crawler.fetcher;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.security.MessageDigest;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlHost;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.settings.SimpleType;
import org.archive.util.ArchiveUtils;
import org.archive.util.HttpRecorder;
import org.archive.util.InetAddressUtil;
import org.xbill.DNS.ARecord;
import org.xbill.DNS.DClass;
import org.xbill.DNS.Lookup;
import org.xbill.DNS.Record;
import org.xbill.DNS.ResolverConfig;
import org.xbill.DNS.TextParseException;
import org.xbill.DNS.Type;

/**
 * Processor to resolve 'dns:' URIs.
 *
 * TODO: Refactor to use org.archive.util.DNSJavaUtils.
 *
 * @author multiple
 */
public class FetchDNS extends Processor
implements CoreAttributeConstants, FetchStatusCodes {
    private static final long serialVersionUID = 4686199203459704426L;

    private Logger logger = Logger.getLogger(this.getClass().getName());

    // Lookup defaults: Internet class, A (address) records.
    private short classType = DClass.IN;
    private short typeType = Type.A;
    protected InetAddress serverInetAddr = null;

    private static final String ATTR_ACCEPT_NON_DNS_RESOLVES =
        "accept-non-dns-resolves";
    private static final Boolean DEFAULT_ACCEPT_NON_DNS_RESOLVES =
        Boolean.FALSE;
    private static final long DEFAULT_TTL_FOR_NON_DNS_RESOLVES
        = 6 * 60 * 60; // 6 hrs

    // Scratch buffer used to drain the recorded stream in recordDNS.
    private byte [] reusableBuffer = new byte[1024];

    /**
     * Create a new instance of FetchDNS.
     *
     * @param name the name of this processor.
     */
    public FetchDNS(String name) {
        super(name, "DNS Fetcher. Handles DNS lookups.");
        org.archive.crawler.settings.Type e =
            addElementToDefinition(new SimpleType(ATTR_ACCEPT_NON_DNS_RESOLVES,
                "If a DNS lookup fails, whether or not to fall back to " +
                "InetAddress resolution, which may use local 'hosts' files " +
                "or other mechanisms.", DEFAULT_ACCEPT_NON_DNS_RESOLVES));
        e.setExpertSetting(true);
        e = addElementToDefinition(new SimpleType(FetchHTTP.ATTR_DIGEST_CONTENT,
                "Whether or not to perform an on-the-fly digest hash of" +
                " retrieved content-bodies.",
                FetchHTTP.DEFAULT_DIGEST_CONTENT));
        e.setExpertSetting(true);
        e = addElementToDefinition(new SimpleType(
            FetchHTTP.ATTR_DIGEST_ALGORITHM, "Which algorithm (for example " +
                "MD5 or SHA-1) to use to perform an on-the-fly digest" +
                " hash of retrieved content-bodies.",
                FetchHTTP.DEFAULT_DIGEST_ALGORITHM,
                FetchHTTP.DIGEST_ALGORITHMS));
        e.setExpertSetting(true);
    }

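    /**
     * Resolve the 'dns:' CrawlURI passed in, recording the result.
     * Tries a dnsjava lookup first; on failure, and if
     * ATTR_ACCEPT_NON_DNS_RESOLVES is enabled, falls back to
     * InetAddress#getByName. Literal IPv4 addresses are handled
     * without any lookup at all.
     *
     * @param curi the 'dns:' CrawlURI to resolve.
     */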
    protected void innerProcess(CrawlURI curi) {
        if (!curi.getUURI().getScheme().equals("dns")) {
            // Only handles dns
            return;
        }
        Record[] rrecordSet = null; // Retrieved dns records
        String dnsName = null;
        try {
            dnsName = curi.getUURI().getReferencedHost();
        } catch (URIException e) {
            logger.log(Level.SEVERE, "Failed parse of dns record " + curi, e);
        }

        if (dnsName == null) {
            curi.setFetchStatus(S_UNFETCHABLE_URI);
            return;
        }

        // Make sure we're in "normal operating mode", i.e. a cache +
        // controller exist to assist us.
        CrawlHost targetHost = null;
        if (getController() != null &&
                getController().getServerCache() != null) {
            targetHost = getController().getServerCache().getHostFor(dnsName);
        } else {
            // Standalone operation (mostly for test cases/potential other uses)
            targetHost = new CrawlHost(dnsName);
        }
        if (isQuadAddress(curi, dnsName, targetHost)) {
            // We're done processing.
            return;
        }

        // Do actual DNS lookup.
        curi.putLong(A_FETCH_BEGAN_TIME, System.currentTimeMillis());

        // Try to get the records for this host (assume domain name)
        // TODO: Bug #935119 concerns potential hang here
        try {
            rrecordSet = (new Lookup(dnsName, typeType, classType)).run();
        } catch (TextParseException e) {
            rrecordSet = null;
        }
        curi.setContentType("text/dns");
        if (rrecordSet != null) {
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("Found recordset for " + dnsName);
            }
            storeDNSRecord(curi, dnsName, targetHost, rrecordSet);
        } else {
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("Failed find of recordset for " + dnsName);
            }
            if (((Boolean)getUncheckedAttribute(null,
                    ATTR_ACCEPT_NON_DNS_RESOLVES)).booleanValue()) {
                // Do lookup that bypasses dnsjava.
                InetAddress address = null;
                try {
                    address = InetAddress.getByName(dnsName);
                } catch (UnknownHostException e1) {
                    address = null;
                }
                if (address != null) {
                    targetHost.setIP(address, DEFAULT_TTL_FOR_NON_DNS_RESOLVES);
                    curi.setFetchStatus(S_GETBYNAME_SUCCESS);
                    if (logger.isLoggable(Level.FINE)) {
                        logger.fine("Found address for " + dnsName +
                            " using native dns.");
                    }
                } else {
                    if (logger.isLoggable(Level.FINE)) {
                        logger.fine("Failed find of address for " + dnsName +
                            " using native dns.");
                    }
                    setUnresolvable(curi, targetHost);
                }
            } else {
                setUnresolvable(curi, targetHost);
            }
        }
        curi.putLong(A_FETCH_COMPLETED_TIME, System.currentTimeMillis());
    }

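    /**
     * Store the result of a successful dnsjava lookup: update the
     * target host's IP and TTL from the first A record and write the
     * full record set via the HttpRecorder.
     *
     * @param curi the 'dns:' CrawlURI being processed.
     * @param dnsName the hostname that was looked up.
     * @param targetHost the CrawlHost to update with IP and TTL.
     * @param rrecordSet the records returned by the lookup.
     */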
    protected void storeDNSRecord(final CrawlURI curi, final String dnsName,
            final CrawlHost targetHost, final Record[] rrecordSet) {
        // Get TTL and IP info from the first A record (there may be
        // multiple, e.g. www.washington.edu) then update the CrawlHost.
        ARecord arecord = getFirstARecord(rrecordSet);
        if (arecord == null) {
            throw new NullPointerException("Got null arecord for " +
                dnsName);
        }
        targetHost.setIP(arecord.getAddress(), arecord.getTTL());
        try {
            recordDNS(curi, rrecordSet);
            curi.setFetchStatus(S_DNS_SUCCESS);
            curi.putString(A_DNS_SERVER_IP_LABEL,
                ResolverConfig.getCurrentConfig().server());
        } catch (IOException e) {
            logger.log(Level.SEVERE, "Failed store of DNS Record for " +
                curi.toString(), e);
            setUnresolvable(curi, targetHost);
        }
    }

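    /**
     * Check whether the passed name is already a dotted-quad IPv4
     * address; if it is, set the host's IP directly (never expiring)
     * and skip the DNS lookup entirely.
     *
     * @param curi the 'dns:' CrawlURI being processed.
     * @param dnsName the hostname taken from the URI.
     * @param targetHost the CrawlHost to update.
     * @return true if dnsName was a literal IPv4 address and has been
     * handled here.
     */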
    protected boolean isQuadAddress(final CrawlURI curi, final String dnsName,
            final CrawlHost targetHost) {
        boolean result = false;
        Matcher matcher = InetAddressUtil.IPV4_QUADS.matcher(dnsName);
        // If it's an IP address, there is no need to do a lookup.
        if (matcher == null || !matcher.matches()) {
            return result;
        }

        result = true;
        // Ideally this branch would never be reached: no CrawlURI
        // would be created for numerical IPs.
        if (logger.isLoggable(Level.WARNING)) {
            logger.warning("Unnecessary DNS CrawlURI created: " + curi);
        }
        try {
            targetHost.setIP(InetAddress.getByAddress(dnsName, new byte[] {
                    (byte) Integer.parseInt(matcher.group(1)),
                    (byte) Integer.parseInt(matcher.group(2)),
                    (byte) Integer.parseInt(matcher.group(3)),
                    (byte) Integer.parseInt(matcher.group(4)) }),
                    CrawlHost.IP_NEVER_EXPIRES); // Never expire numeric IPs
            curi.setFetchStatus(S_DNS_SUCCESS);
        } catch (UnknownHostException e) {
            logger.log(Level.SEVERE, "Should never be " + e.getMessage(), e);
            setUnresolvable(curi, targetHost);
        }
        return result;
    }

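    /**
     * Render the record set as a 'text/dns' body and replay it through
     * an HttpRecorder so it is written to scratch space like any other
     * fetched content, optionally digesting it on the fly.
     *
     * @param curi the 'dns:' CrawlURI being processed.
     * @param rrecordSet the records to render and record.
     * @throws IOException on failure writing or replaying the body.
     */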
    protected void recordDNS(final CrawlURI curi, final Record[] rrecordSet)
    throws IOException {
        final byte[] dnsRecord =
            getDNSRecord(curi.getLong(A_FETCH_BEGAN_TIME), rrecordSet);
        HttpRecorder rec = HttpRecorder.getHttpRecorder();

        // Shall we get a digest on the content downloaded?
        boolean digestContent = ((Boolean)getUncheckedAttribute(curi,
                FetchHTTP.ATTR_DIGEST_CONTENT)).booleanValue();
        String algorithm = null;
        if (digestContent) {
            algorithm = ((String)getUncheckedAttribute(curi,
                FetchHTTP.ATTR_DIGEST_ALGORITHM));
            rec.getRecordedInput().setDigest(algorithm);
        } else {
            // Clear any digest left over from a previous use.
            rec.getRecordedInput().setDigest((MessageDigest)null);
        }

        curi.setHttpRecorder(rec);
        InputStream is = curi.getHttpRecorder().inputWrap(
                new ByteArrayInputStream(dnsRecord));
        if (digestContent) {
            rec.getRecordedInput().startDigest();
        }
        // Reading from the wrapped stream, behind the scenes, will write
        // files into scratch space.
        try {
            while (is.read(this.reusableBuffer) != -1) {
                continue;
            }
        } finally {
            is.close();
            rec.closeRecorders();
        }
        curi.setContentSize(dnsRecord.length);
        if (digestContent) {
            curi.setContentDigest(algorithm,
                rec.getRecordedInput().getDigestValue());
        }
    }

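    /**
     * Serialize the fetch result: a 14-digit timestamp (per RFC 2540)
     * on the first line, then one resource record per line.
     *
     * @param fetchStart time in milliseconds when the fetch began.
     * @param rrecordSet the records to serialize; may be null.
     * @return the serialized body as bytes.
     * @throws IOException on failure writing to the byte stream.
     */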
    protected byte [] getDNSRecord(final long fetchStart,
            final Record[] rrecordSet)
    throws IOException {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        // Start the record with a 14-digit date per RFC 2540.
        byte[] fetchDate = ArchiveUtils.get14DigitDate(fetchStart).getBytes();
        baos.write(fetchDate);
        // Don't forget the newline.
        baos.write("\n".getBytes());
        if (rrecordSet != null) {
            for (int i = 0; i < rrecordSet.length; i++) {
                byte[] record = rrecordSet[i].toString().getBytes();
                baos.write(record);
                // Add the newline between records back in.
                baos.write("\n".getBytes());
            }
        }
        return baos.toByteArray();
    }

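    /**
     * Mark the host as unresolvable: clear its IP and flag the
     * CrawlURI with S_DOMAIN_UNRESOLVABLE.
     *
     * @param curi the 'dns:' CrawlURI being processed.
     * @param host the CrawlHost that could not be resolved.
     */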
    protected void setUnresolvable(CrawlURI curi, CrawlHost host) {
        host.setIP(null, 0);
        curi.setFetchStatus(S_DOMAIN_UNRESOLVABLE);
    }

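    /**
     * Scan a record set for the first A (address) record.
     *
     * @param rrecordSet the records returned by the lookup; may be null.
     * @return the first A record found, or null if there is none.
     */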
    protected ARecord getFirstARecord(Record[] rrecordSet) {
        ARecord arecord = null;
        if (rrecordSet == null || rrecordSet.length == 0) {
            if (logger.isLoggable(Level.FINEST)) {
                logger.finest("rrecordSet is null or zero length: " +
                    rrecordSet);
            }
            return arecord;
        }
        for (int i = 0; i < rrecordSet.length; i++) {
            if (rrecordSet[i].getType() != Type.A) {
                if (logger.isLoggable(Level.FINEST)) {
                    logger.finest("Record " + Integer.toString(i) +
                        " is not A type but " + rrecordSet[i].getType());
                }
                continue;
            }
            arecord = (ARecord) rrecordSet[i];
            break;
        }
        return arecord;
    }
}