1   /* FetchFTP.java
2    *
3    * $Id: FetchFTP.java 5407 2007-08-16 17:51:21Z gojomo $
4    *
5    * Created on Jun 5, 2003
6    *
7    * Copyright (C) 2003 Internet Archive.
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.crawler.fetcher;
26  
27  
28  import java.io.IOException;
29  import java.io.UnsupportedEncodingException;
30  import java.net.Socket;
31  import java.net.URLEncoder;
32  import java.util.logging.Level;
33  import java.util.logging.Logger;
34  import java.util.regex.Matcher;
35  import java.util.regex.Pattern;
36  
37  import javax.management.AttributeNotFoundException;
38  
39  import org.apache.commons.httpclient.URIException;
40  import org.apache.commons.net.ftp.FTPCommand;
41  import org.archive.crawler.datamodel.CrawlURI;
42  import org.archive.crawler.datamodel.CoreAttributeConstants;
43  import org.archive.crawler.datamodel.FetchStatusCodes;
44  import org.archive.crawler.extractor.Link;
45  import static org.archive.crawler.extractor.Link.NAVLINK_HOP;
46  import static org.archive.crawler.extractor.Link.NAVLINK_MISC;
47  import org.archive.crawler.framework.Processor;
48  import org.archive.crawler.settings.SimpleType;
49  import org.archive.io.RecordingInputStream;
50  import org.archive.io.ReplayCharSequence;
51  import org.archive.net.ClientFTP;
52  import org.archive.net.FTPException;
53  import org.archive.net.UURI;
54  import org.archive.net.UURIFactory;
55  import org.archive.util.ArchiveUtils;
56  import org.archive.util.HttpRecorder;
57  
58  
59  /***
60   * Fetches documents and directory listings using FTP.  This class will also
61   * try to extract FTP "links" from directory listings.  For this class to
62   * archive a directory listing, the remote FTP server must support the NLIST
63   * command.  Most modern FTP servers should.
64   * 
65   * @author pjack
66   *
67   */
68  public class FetchFTP extends Processor implements CoreAttributeConstants {
69  
70      
71      /*** Serialization ID; robust against trivial API changes. */
72      private static final long serialVersionUID =
73       ArchiveUtils.classnameBasedUID(FetchFTP.class,1);
74  
75      /*** Logger for this class. */
76      private static Logger logger = Logger.getLogger(FetchFTP.class.getName());
77  
78      /*** Pattern for matching directory entries. */
79      private static Pattern DIR = 
80       Pattern.compile("(.+)$", Pattern.MULTILINE);
81  
82      
83      /*** The name for the <code>username</code> attribute. */
84      final public static String ATTR_USERNAME = "username";
85     
86      /*** The description for the <code>username</code> attribute. */
87      final private static String DESC_USERNAME = "The username to send to " +
88       "FTP servers.  By convention, the default value of \"anonymous\" is " +
89       "used for publicly available FTP sites.";
90      
91      /*** The default value for the <code>username</code> attribute. */
92      final private static String DEFAULT_USERNAME = "anonymous";
93  
94  
95      /*** The name for the <code>password</code> attribute. */
96      final public static String ATTR_PASSWORD = "password";
97     
98      /*** The description for the <code>password</code> attribute. */
99      final private static String DESC_PASSWORD = "The password to send to " +
100     "FTP servers.  By convention, anonymous users send their email address " +
101     "in this field.";
102     
103     /*** The default value for the <code>password</code> attribute. */
104     final private static String DEFAULT_PASSWORD = "";
105 
106     
107     /*** The name for the <code>extract-from-dirs</code> attribute. */
108     final private static String ATTR_EXTRACT = "extract-from-dirs";
109     
110     /*** The description for the <code>extract-from-dirs</code> attribute. */
111     final private static String DESC_EXTRACT = "Set to true to extract "
112      + "further URIs from FTP directories.  Default is true.";
113     
114     /*** The default value for the <code>extract-from-dirs</code> attribute. */
115     final private static boolean DEFAULT_EXTRACT = true;
116 
117     
118     /*** The name for the <code>extract-parent</code> attribute. */
119     final private static String ATTR_EXTRACT_PARENT = "extract_parent";
120     
121     /*** The description for the <code>extract-parent</code> attribute. */
122     final private static String DESC_EXTRACT_PARENT = "Set to true to extract "
123      + "the parent URI from all FTP URIs.  Default is true.";
124     
125     /*** The default value for the <code>extract-parent</code> attribute. */
126     final private static boolean DEFAULT_EXTRACT_PARENT = true;
127     
128     
129     /*** The name for the <code>max-length-bytes</code> attribute. */
130     final public static String ATTR_MAX_LENGTH = "max-length-bytes";
131     
132     /*** The description for the <code>max-length-bytes</code> attribute. */
133     final private static String DESC_MAX_LENGTH = 
134         "Maximum length in bytes to fetch.\n" +
135         "Fetch is truncated at this length. A value of 0 means no limit.";
136     
137     /*** The default value for the <code>max-length-bytes</code> attribute. */
138     final private static long DEFAULT_MAX_LENGTH = 0;
139 
140     
141     /*** The name for the <code>fetch-bandwidth</code> attribute. */
142     final public static String ATTR_BANDWIDTH = "fetch-bandwidth";
143     
144     /*** The description for the <code>fetch-bandwidth</code> attribute. */
145     final private static String DESC_BANDWIDTH = "";
146     
147     /*** The default value for the <code>fetch-bandwidth</code> attribute. */
148     final private static int DEFAULT_BANDWIDTH = 0;
149     
150     
151     /*** The name for the <code>timeout-seconds</code> attribute. */
152     final public static String ATTR_TIMEOUT = "timeout-seconds";
153     
154     /*** The description for the <code>timeout-seconds</code> attribute. */
155     final private static String DESC_TIMEOUT = "If the fetch is not "
156      + "completed in this number of seconds, give up (and retry later).";
157     
158     /*** The default value for the <code>timeout-seconds</code> attribute. */
159     final private static int DEFAULT_TIMEOUT = 1200;
160     
161 
162     /***
163      * Constructs a new <code>FetchFTP</code>.
164      * 
165      * @param name  the name of this processor
166      */
167     public FetchFTP(String name) {
168         super(name, "FTP Fetcher.");
169         add(ATTR_USERNAME, DESC_USERNAME, DEFAULT_USERNAME);
170         add(ATTR_PASSWORD, DESC_PASSWORD, DEFAULT_PASSWORD);
171         add(ATTR_EXTRACT, DESC_EXTRACT, DEFAULT_EXTRACT);
172         add(ATTR_EXTRACT_PARENT, DESC_EXTRACT_PARENT, DEFAULT_EXTRACT_PARENT);
173         add(ATTR_MAX_LENGTH, DESC_MAX_LENGTH, DEFAULT_MAX_LENGTH);
174         add(ATTR_BANDWIDTH, DESC_BANDWIDTH, DEFAULT_BANDWIDTH);
175         add(ATTR_TIMEOUT, DESC_TIMEOUT, DEFAULT_TIMEOUT);
176     }
177 
178     
179     /***
180      * Convenience method for adding an attribute.
181      * 
182      * @param name   The name of the attribute
183      * @param desc   The description of the attribute
184      * @param def    The default value for the attribute
185      */
186     private void add(String name, String desc, Object def) {
187         SimpleType st = new SimpleType(name, desc, def);
188         addElementToDefinition(st);
189     }
190     
191     
192     /***
193      * Convenience method for extracting an attribute.
194      * If a value for the specified name cannot be found,
195      * a warning is written to the log and the specified
196      * default value is returned instead.
197      * 
198      * @param context  The context for the attribute fetch
199      * @param name     The name of the attribute to fetch
200      * @param def      The value to return if the attribute isn't found
201      * @return         The value of that attribute
202      */
203     private Object get(Object context, String name, Object def) {
204         try {
205             return getAttribute(context, name);
206         } catch (AttributeNotFoundException e) {
207             logger.warning("Attribute not found (using default): " + name);
208             return def;
209         }
210     }
211     
212 
213     /***
214      * Processes the given URI.  If the given URI is not an FTP URI, then
215      * this method does nothing.  Otherwise an attempt is made to connect
216      * to the FTP server.
217      * 
218      * <p>If the connection is successful, an attempt will be made to CD to 
219      * the path specified in the URI.  If the remote CD command succeeds, 
220      * then it is assumed that the URI represents a directory.  If the
221      * CD command fails, then it is assumed that the URI represents
222      * a file.
223      * 
224      * <p>For directories, the directory listing will be fetched using
225      * the FTP LIST command, and saved to the HttpRecorder.  If the
226      * <code>extract.from.dirs</code> attribute is set to true, then
227      * the files in the fetched list will be added to the curi as
228      * extracted FTP links.  (It was easier to do that here, rather
229      * than writing a separate FTPExtractor.)
230      * 
231      * <p>For files, the file will be fetched using the FTP RETR
232      * command, and saved to the HttpRecorder.
233      * 
234      * <p>All file transfers (including directory listings) occur using
235      * Binary mode transfer.  Also, the local passive transfer mode
236      * is always used, to play well with firewalls.
237      * 
238      * @param curi  the curi to process
239      * @throws InterruptedException  if the thread is interrupted during
240      *   processing
241      */
242     public void innerProcess(CrawlURI curi) throws InterruptedException {
243         if (!curi.getUURI().getScheme().equals("ftp")) {
244             return;
245         }
246         
247         curi.putLong(A_FETCH_BEGAN_TIME, System.currentTimeMillis());
248         HttpRecorder recorder = HttpRecorder.getHttpRecorder();
249         ClientFTP client = new ClientFTP();
250         
251         try {
252             fetch(curi, client, recorder);
253         } catch (FTPException e) {
254             logger.log(Level.SEVERE, "FTP server reported problem.", e);
255             curi.setFetchStatus(e.getReplyCode());
256         } catch (IOException e) {
257             logger.log(Level.SEVERE, "IO Error during FTP fetch.", e);
258             curi.setFetchStatus(FetchStatusCodes.S_CONNECT_LOST);
259         } finally {
260             disconnect(client);
261             curi.setContentSize(recorder.getRecordedInput().getSize());
262             curi.putLong(A_FETCH_COMPLETED_TIME, System.currentTimeMillis());
263         }
264     }
265 
266 
267     /***
268      * Fetches a document from an FTP server.
269      * 
270      * @param curi      the URI of the document to fetch
271      * @param client    the FTPClient to use for the fetch
272      * @param recorder  the recorder to preserve the document in
273      * @throws IOException  if a network or protocol error occurs
274      * @throws InterruptedException  if the thread is interrupted
275      */
276     private void fetch(CrawlURI curi, ClientFTP client, HttpRecorder recorder) 
277     throws IOException, InterruptedException {
278         // Connect to the FTP server.
279         UURI uuri = curi.getUURI();
280         int port = uuri.getPort();
281         if (port == -1) {
282             port = 21;
283         }
284         client.connectStrict(uuri.getHost(), port);
285         
286         // Authenticate.
287         String[] auth = getAuth(curi);
288         client.loginStrict(auth[0], auth[1]);
289         
290         // The given resource may or may not be a directory.
291         // To figure out which is which, execute a CD command to
292         // the UURI's path.  If CD works, it's a directory.
293         boolean dir = client.changeWorkingDirectory(uuri.getPath());
294         if (dir) {
295             curi.setContentType("text/plain");
296         }
297         
298         // TODO: A future version of this class could use the system string to
299         // set up custom directory parsing if the FTP server doesn't support 
300         // the nlist command.
301         if (logger.isLoggable(Level.FINE)) {
302             String system = client.getSystemName();
303             logger.fine(system);
304         }
305         
306         // Get a data socket.  This will either be the result of a NLIST
307         // command for a directory, or a RETR command for a file.
308         int command = dir ? FTPCommand.NLST : FTPCommand.RETR;
309         String path = dir ? "." : uuri.getPath();
310         client.enterLocalPassiveMode();
311         client.setBinary();
312         Socket socket = client.openDataConnection(command, path);
313         curi.setFetchStatus(client.getReplyCode());
314 
315         // Save the streams in the CURI, where downstream processors
316         // expect to find them.
317         try {
318             saveToRecorder(curi, socket, recorder);
319         } finally {
320             recorder.close();
321             close(socket);
322         }
323 
324         curi.setFetchStatus(200);
325         if (dir) {
326             extract(curi, recorder);
327         }
328         addParent(curi);
329     }
330     
331     
332     /***
333      * Saves the given socket to the given recorder.
334      * 
335      * @param curi      the curi that owns the recorder
336      * @param socket    the socket whose streams to save
337      * @param recorder  the recorder to save them to
338      * @throws IOException  if a network or file error occurs
339      * @throws InterruptedException  if the thread is interrupted
340      */
341     private void saveToRecorder(CrawlURI curi,
342             Socket socket, HttpRecorder recorder) 
343     throws IOException, InterruptedException {
344         curi.setHttpRecorder(recorder);
345         recorder.markContentBegin();
346         recorder.inputWrap(socket.getInputStream());
347         recorder.outputWrap(socket.getOutputStream());
348 
349         // Read the remote file/dir listing in its entirety.
350         long softMax = 0;
351         long hardMax = getMaxLength(curi);
352         long timeout = (long)getTimeout(curi) * 1000;
353         int maxRate = getFetchBandwidth(curi);
354         RecordingInputStream input = recorder.getRecordedInput();
355         input.setLimits(hardMax, timeout, maxRate); 
356         input.readFullyOrUntil(softMax);
357     }
358     
359     
360     /***
361      * Extract FTP links in a directory listing.
362      * The listing must already be saved to the given recorder.
363      * 
364      * @param curi      The curi to save extracted links to
365      * @param recorder  The recorder containing the directory listing
366      */
367     private void extract(CrawlURI curi, HttpRecorder recorder) {
368         if (!getExtractFromDirs(curi)) {
369             return;
370         }
371         
372         ReplayCharSequence seq = null;
373         try {
374             seq = recorder.getReplayCharSequence();
375             extract(curi, seq);
376         } catch (IOException e) {
377             logger.log(Level.SEVERE, "IO error during extraction.", e);
378         } catch (RuntimeException e) {
379             logger.log(Level.SEVERE, "IO error during extraction.", e);
380         } finally {
381             close(seq);
382         }
383     }
384     
385     
386     /***
387      * Extracts FTP links in a directory listing.
388      * 
389      * @param curi  The curi to save extracted links to
390      * @param dir   The directory listing to extract links from
391      * @throws URIException  if an extracted link is invalid
392      */
393     private void extract(CrawlURI curi, ReplayCharSequence dir) {
394         logger.log(Level.FINEST, "Extracting URIs from FTP directory.");
395         Matcher matcher = DIR.matcher(dir);
396         while (matcher.find()) {
397             String file = matcher.group(1);
398             addExtracted(curi, file);
399         }
400     }
401 
402 
403     /***
404      * Adds an extracted filename to the curi.  A new URI will be formed
405      * by taking the given curi (which should represent the directory the
406      * file lives in) and appending the file.
407      * 
408      * @param curi  the curi to store the discovered link in
409      * @param file  the filename of the discovered link
410      */
411     private void addExtracted(CrawlURI curi, String file) {
412         try {
413             file = URLEncoder.encode(file, "UTF-8");
414         } catch (UnsupportedEncodingException e) {
415             throw new AssertionError(e);
416         }
417         if (logger.isLoggable(Level.FINEST)) {
418             logger.log(Level.FINEST, "Found " + file);
419         }
420         String base = curi.toString();
421         if (base.endsWith("/")) {
422             base = base.substring(0, base.length() - 1);
423         }
424         try {
425             UURI n = UURIFactory.getInstance(base + "/" + file);
426             Link link = new Link(curi.getUURI(), n, NAVLINK_MISC, NAVLINK_HOP);
427             curi.addOutLink(link);
428         } catch (URIException e) {
429             logger.log(Level.WARNING, "URI error during extraction.", e);            
430         }
431     }
432     
433 
434     /***
435      * Extracts the parent URI from the given curi, then adds that parent
436      * URI as a discovered link to the curi. 
437      * 
438      * <p>If the <code>extract-parent</code> attribute is false, then this
439      * method does nothing.  Also, if the path of the given curi is 
440      * <code>/</code>, then this method does nothing.
441      * 
442      * <p>Otherwise the parent is determined by eliminated the lowest part
443      * of the URI's path.  Eg, the parent of <code>ftp://foo.com/one/two</code>
444      * is <code>ftp://foo.com/one</code>.
445      * 
446      * @param curi  the curi whose parent to add
447      */
448     private void addParent(CrawlURI curi) {
449         if (!getExtractParent(curi)) {
450             return;
451         }
452         UURI uuri = curi.getUURI();
453         try {
454             if (uuri.getPath().equals("/")) {
455                 // There's no parent to add.
456                 return;
457             }
458             String scheme = uuri.getScheme();
459             String auth = uuri.getEscapedAuthority();
460             String path = uuri.getEscapedCurrentHierPath();
461             UURI parent = UURIFactory.getInstance(scheme + "://" + auth + path);
462 
463             Link link = new Link(uuri, parent, NAVLINK_MISC, NAVLINK_HOP);
464             curi.addOutLink(link);
465         } catch (URIException e) {
466             logger.log(Level.WARNING, "URI error during extraction.", e);
467         }
468     }
469     
470     
471     /***
472      * Returns the <code>extract.from.dirs</code> attribute for this
473      * <code>FetchFTP</code> and the given curi.
474      * 
475      * @param curi  the curi whose attribute to return
476      * @return  that curi's <code>extract.from.dirs</code>
477      */
478     public boolean getExtractFromDirs(CrawlURI curi) {
479         return (Boolean)get(curi, ATTR_EXTRACT, DEFAULT_EXTRACT);
480     }
481     
482     
483     /***
484      * Returns the <code>extract.parent</code> attribute for this
485      * <code>FetchFTP</code> and the given curi.
486      * 
487      * @param curi  the curi whose attribute to return
488      * @return  that curi's <code>extract-parent</code>
489      */
490     public boolean getExtractParent(CrawlURI curi) {
491         return (Boolean)get(curi, ATTR_EXTRACT_PARENT, DEFAULT_EXTRACT_PARENT);
492     }
493 
494 
495     /***
496      * Returns the <code>timeout-seconds</code> attribute for this
497      * <code>FetchFTP</code> and the given curi.
498      * 
499      * @param curi   the curi whose attribute to return
500      * @return   that curi's <code>timeout-seconds</code>
501      */
502     public int getTimeout(CrawlURI curi) {
503         return (Integer)get(curi, ATTR_TIMEOUT, DEFAULT_TIMEOUT);
504     }
505 
506 
507     /***
508      * Returns the <code>max-length-bytes</code> attribute for this
509      * <code>FetchFTP</code> and the given curi.
510      * 
511      * @param curi  the curi whose attribute to return
512      * @return  that curi's <code>max-length-bytes</code>
513      */
514     public long getMaxLength(CrawlURI curi) {
515         return (Long)get(curi, ATTR_MAX_LENGTH, DEFAULT_MAX_LENGTH);
516     }
517 
518 
519     /***
520      * Returns the <code>fetch-bandwidth</code> attribute for this
521      * <code>FetchFTP</code> and the given curi.
522      * 
523      * @param curi  the curi whose attribute to return
524      * @return  that curi's <code>fetch-bandwidth</code>
525      */
526     public int getFetchBandwidth(CrawlURI curi) {
527         return (Integer)get(curi, ATTR_BANDWIDTH, DEFAULT_BANDWIDTH);
528     }
529 
530 
531     /***
532      * Returns the username and password for the given URI.  This method
533      * always returns an array of length 2.  The first element in the returned
534      * array is the username for the URI, and the second element is the
535      * password.
536      * 
537      * <p>If the URI itself contains the username and password (i.e., it looks
538      * like <code>ftp://username:password@host/path</code>) then that username
539      * and password are returned.
540      * 
541      * <p>Otherwise the settings system is probed for the <code>username</code>
542      * and <code>password</code> attributes for this <code>FTPFetch</code>
543      * and the given <code>curi</code> context.  The values of those 
544      * attributes are then returned.
545      * 
546      * @param curi  the curi whose username and password to return
547      * @return  an array containing the username and password
548      */
549     private String[] getAuth(CrawlURI curi) {
550         String[] result = new String[2];
551         UURI uuri = curi.getUURI();
552         String userinfo;
553         try {
554             userinfo = uuri.getUserinfo();
555         } catch (URIException e) {
556             assert false;
557             logger.finest("getUserinfo raised URIException.");
558             userinfo = null;
559         }
560         if (userinfo != null) {
561             int p = userinfo.indexOf(':');
562             if (p > 0) {
563                 result[0] = userinfo.substring(0,p);
564                 result[1] = userinfo.substring(p + 1);
565                 return result;
566             }
567         }
568         result[0] = (String)get(curi, ATTR_USERNAME, DEFAULT_USERNAME);
569         result[1] = (String)get(curi, ATTR_PASSWORD, DEFAULT_PASSWORD);
570         return result;
571     }
572     
573     
574     /***
575      * Determines the password for the given URI.  If the URI itself contains
576      * a password, then that password is returned.  Otherwise the settings
577      * system is probed for the <code>password</code> attribute, and the value
578      * for that attribute is returned.
579      * 
580      * @param curi  the curi whose password to return
581      * @return  that password
582      */
583     public String determinePassword(CrawlURI curi) {
584         return (String)get(curi, ATTR_PASSWORD, DEFAULT_PASSWORD);
585     }
586 
587 
588     /***
589      * Quietly closes the given socket.
590      * 
591      * @param socket  the socket to close
592      */
593     private static void close(Socket socket) {
594         try {
595             socket.close();
596         } catch (IOException e) {
597             logger.log(Level.WARNING, "IO error closing socket.", e);
598         }
599     }
600 
601 
602     /***
603      * Quietly closes the given sequence.
604      * If an IOException is raised, this method logs it as a warning.
605      * 
606      * @param seq  the sequence to close
607      */
608     private static void close(ReplayCharSequence seq) {
609         if (seq == null) {
610             return;
611         }
612         try {
613             seq.close();
614         } catch (IOException e) {
615             logger.log(Level.WARNING, "IO error closing ReplayCharSequence.", 
616              e);
617         }
618     }
619 
620     
621     /***
622      * Quietly disconnects from the given FTP client.
623      * If an IOException is raised, this method logs it as a warning.
624      * 
625      * @param client  the client to disconnect
626      */
627     private static void disconnect(ClientFTP client) {
628         if (client.isConnected()) try {
629             client.disconnect();
630         } catch (IOException e) {
631             if (logger.isLoggable(Level.WARNING)) {
632                 logger.warning("Could not disconnect from FTP client: " 
633                  + e.getMessage());
634             }
635         }        
636     }
637 
638 
639 }