1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler.fetcher;
26
27
28 import java.io.IOException;
29 import java.io.UnsupportedEncodingException;
30 import java.net.Socket;
31 import java.net.URLEncoder;
32 import java.util.logging.Level;
33 import java.util.logging.Logger;
34 import java.util.regex.Matcher;
35 import java.util.regex.Pattern;
36
37 import javax.management.AttributeNotFoundException;
38
39 import org.apache.commons.httpclient.URIException;
40 import org.apache.commons.net.ftp.FTPCommand;
41 import org.archive.crawler.datamodel.CrawlURI;
42 import org.archive.crawler.datamodel.CoreAttributeConstants;
43 import org.archive.crawler.datamodel.FetchStatusCodes;
44 import org.archive.crawler.extractor.Link;
45 import static org.archive.crawler.extractor.Link.NAVLINK_HOP;
46 import static org.archive.crawler.extractor.Link.NAVLINK_MISC;
47 import org.archive.crawler.framework.Processor;
48 import org.archive.crawler.settings.SimpleType;
49 import org.archive.io.RecordingInputStream;
50 import org.archive.io.ReplayCharSequence;
51 import org.archive.net.ClientFTP;
52 import org.archive.net.FTPException;
53 import org.archive.net.UURI;
54 import org.archive.net.UURIFactory;
55 import org.archive.util.ArchiveUtils;
56 import org.archive.util.HttpRecorder;
57
58
59 /***
60 * Fetches documents and directory listings using FTP. This class will also
61 * try to extract FTP "links" from directory listings. For this class to
62 * archive a directory listing, the remote FTP server must support the NLIST
63 * command. Most modern FTP servers should.
64 *
65 * @author pjack
66 *
67 */
68 public class FetchFTP extends Processor implements CoreAttributeConstants {
69
70
71 /*** Serialization ID; robust against trivial API changes. */
72 private static final long serialVersionUID =
73 ArchiveUtils.classnameBasedUID(FetchFTP.class,1);
74
75 /*** Logger for this class. */
76 private static Logger logger = Logger.getLogger(FetchFTP.class.getName());
77
78 /*** Pattern for matching directory entries. */
79 private static Pattern DIR =
80 Pattern.compile("(.+)$", Pattern.MULTILINE);
81
82
83 /*** The name for the <code>username</code> attribute. */
84 final public static String ATTR_USERNAME = "username";
85
86 /*** The description for the <code>username</code> attribute. */
87 final private static String DESC_USERNAME = "The username to send to " +
88 "FTP servers. By convention, the default value of \"anonymous\" is " +
89 "used for publicly available FTP sites.";
90
91 /*** The default value for the <code>username</code> attribute. */
92 final private static String DEFAULT_USERNAME = "anonymous";
93
94
95 /*** The name for the <code>password</code> attribute. */
96 final public static String ATTR_PASSWORD = "password";
97
98 /*** The description for the <code>password</code> attribute. */
99 final private static String DESC_PASSWORD = "The password to send to " +
100 "FTP servers. By convention, anonymous users send their email address " +
101 "in this field.";
102
103 /*** The default value for the <code>password</code> attribute. */
104 final private static String DEFAULT_PASSWORD = "";
105
106
107 /*** The name for the <code>extract-from-dirs</code> attribute. */
108 final private static String ATTR_EXTRACT = "extract-from-dirs";
109
110 /*** The description for the <code>extract-from-dirs</code> attribute. */
111 final private static String DESC_EXTRACT = "Set to true to extract "
112 + "further URIs from FTP directories. Default is true.";
113
114 /*** The default value for the <code>extract-from-dirs</code> attribute. */
115 final private static boolean DEFAULT_EXTRACT = true;
116
117
118 /*** The name for the <code>extract-parent</code> attribute. */
119 final private static String ATTR_EXTRACT_PARENT = "extract_parent";
120
121 /*** The description for the <code>extract-parent</code> attribute. */
122 final private static String DESC_EXTRACT_PARENT = "Set to true to extract "
123 + "the parent URI from all FTP URIs. Default is true.";
124
125 /*** The default value for the <code>extract-parent</code> attribute. */
126 final private static boolean DEFAULT_EXTRACT_PARENT = true;
127
128
129 /*** The name for the <code>max-length-bytes</code> attribute. */
130 final public static String ATTR_MAX_LENGTH = "max-length-bytes";
131
132 /*** The description for the <code>max-length-bytes</code> attribute. */
133 final private static String DESC_MAX_LENGTH =
134 "Maximum length in bytes to fetch.\n" +
135 "Fetch is truncated at this length. A value of 0 means no limit.";
136
137 /*** The default value for the <code>max-length-bytes</code> attribute. */
138 final private static long DEFAULT_MAX_LENGTH = 0;
139
140
141 /*** The name for the <code>fetch-bandwidth</code> attribute. */
142 final public static String ATTR_BANDWIDTH = "fetch-bandwidth";
143
144 /*** The description for the <code>fetch-bandwidth</code> attribute. */
145 final private static String DESC_BANDWIDTH = "";
146
147 /*** The default value for the <code>fetch-bandwidth</code> attribute. */
148 final private static int DEFAULT_BANDWIDTH = 0;
149
150
151 /*** The name for the <code>timeout-seconds</code> attribute. */
152 final public static String ATTR_TIMEOUT = "timeout-seconds";
153
154 /*** The description for the <code>timeout-seconds</code> attribute. */
155 final private static String DESC_TIMEOUT = "If the fetch is not "
156 + "completed in this number of seconds, give up (and retry later).";
157
158 /*** The default value for the <code>timeout-seconds</code> attribute. */
159 final private static int DEFAULT_TIMEOUT = 1200;
160
161
162 /***
163 * Constructs a new <code>FetchFTP</code>.
164 *
165 * @param name the name of this processor
166 */
167 public FetchFTP(String name) {
168 super(name, "FTP Fetcher.");
169 add(ATTR_USERNAME, DESC_USERNAME, DEFAULT_USERNAME);
170 add(ATTR_PASSWORD, DESC_PASSWORD, DEFAULT_PASSWORD);
171 add(ATTR_EXTRACT, DESC_EXTRACT, DEFAULT_EXTRACT);
172 add(ATTR_EXTRACT_PARENT, DESC_EXTRACT_PARENT, DEFAULT_EXTRACT_PARENT);
173 add(ATTR_MAX_LENGTH, DESC_MAX_LENGTH, DEFAULT_MAX_LENGTH);
174 add(ATTR_BANDWIDTH, DESC_BANDWIDTH, DEFAULT_BANDWIDTH);
175 add(ATTR_TIMEOUT, DESC_TIMEOUT, DEFAULT_TIMEOUT);
176 }
177
178
179 /***
180 * Convenience method for adding an attribute.
181 *
182 * @param name The name of the attribute
183 * @param desc The description of the attribute
184 * @param def The default value for the attribute
185 */
186 private void add(String name, String desc, Object def) {
187 SimpleType st = new SimpleType(name, desc, def);
188 addElementToDefinition(st);
189 }
190
191
192 /***
193 * Convenience method for extracting an attribute.
194 * If a value for the specified name cannot be found,
195 * a warning is written to the log and the specified
196 * default value is returned instead.
197 *
198 * @param context The context for the attribute fetch
199 * @param name The name of the attribute to fetch
200 * @param def The value to return if the attribute isn't found
201 * @return The value of that attribute
202 */
203 private Object get(Object context, String name, Object def) {
204 try {
205 return getAttribute(context, name);
206 } catch (AttributeNotFoundException e) {
207 logger.warning("Attribute not found (using default): " + name);
208 return def;
209 }
210 }
211
212
213 /***
214 * Processes the given URI. If the given URI is not an FTP URI, then
215 * this method does nothing. Otherwise an attempt is made to connect
216 * to the FTP server.
217 *
218 * <p>If the connection is successful, an attempt will be made to CD to
219 * the path specified in the URI. If the remote CD command succeeds,
220 * then it is assumed that the URI represents a directory. If the
221 * CD command fails, then it is assumed that the URI represents
222 * a file.
223 *
224 * <p>For directories, the directory listing will be fetched using
225 * the FTP LIST command, and saved to the HttpRecorder. If the
226 * <code>extract.from.dirs</code> attribute is set to true, then
227 * the files in the fetched list will be added to the curi as
228 * extracted FTP links. (It was easier to do that here, rather
229 * than writing a separate FTPExtractor.)
230 *
231 * <p>For files, the file will be fetched using the FTP RETR
232 * command, and saved to the HttpRecorder.
233 *
234 * <p>All file transfers (including directory listings) occur using
235 * Binary mode transfer. Also, the local passive transfer mode
236 * is always used, to play well with firewalls.
237 *
238 * @param curi the curi to process
239 * @throws InterruptedException if the thread is interrupted during
240 * processing
241 */
242 public void innerProcess(CrawlURI curi) throws InterruptedException {
243 if (!curi.getUURI().getScheme().equals("ftp")) {
244 return;
245 }
246
247 curi.putLong(A_FETCH_BEGAN_TIME, System.currentTimeMillis());
248 HttpRecorder recorder = HttpRecorder.getHttpRecorder();
249 ClientFTP client = new ClientFTP();
250
251 try {
252 fetch(curi, client, recorder);
253 } catch (FTPException e) {
254 logger.log(Level.SEVERE, "FTP server reported problem.", e);
255 curi.setFetchStatus(e.getReplyCode());
256 } catch (IOException e) {
257 logger.log(Level.SEVERE, "IO Error during FTP fetch.", e);
258 curi.setFetchStatus(FetchStatusCodes.S_CONNECT_LOST);
259 } finally {
260 disconnect(client);
261 curi.setContentSize(recorder.getRecordedInput().getSize());
262 curi.putLong(A_FETCH_COMPLETED_TIME, System.currentTimeMillis());
263 }
264 }
265
266
267 /***
268 * Fetches a document from an FTP server.
269 *
270 * @param curi the URI of the document to fetch
271 * @param client the FTPClient to use for the fetch
272 * @param recorder the recorder to preserve the document in
273 * @throws IOException if a network or protocol error occurs
274 * @throws InterruptedException if the thread is interrupted
275 */
276 private void fetch(CrawlURI curi, ClientFTP client, HttpRecorder recorder)
277 throws IOException, InterruptedException {
278
279 UURI uuri = curi.getUURI();
280 int port = uuri.getPort();
281 if (port == -1) {
282 port = 21;
283 }
284 client.connectStrict(uuri.getHost(), port);
285
286
287 String[] auth = getAuth(curi);
288 client.loginStrict(auth[0], auth[1]);
289
290
291
292
293 boolean dir = client.changeWorkingDirectory(uuri.getPath());
294 if (dir) {
295 curi.setContentType("text/plain");
296 }
297
298
299
300
301 if (logger.isLoggable(Level.FINE)) {
302 String system = client.getSystemName();
303 logger.fine(system);
304 }
305
306
307
308 int command = dir ? FTPCommand.NLST : FTPCommand.RETR;
309 String path = dir ? "." : uuri.getPath();
310 client.enterLocalPassiveMode();
311 client.setBinary();
312 Socket socket = client.openDataConnection(command, path);
313 curi.setFetchStatus(client.getReplyCode());
314
315
316
317 try {
318 saveToRecorder(curi, socket, recorder);
319 } finally {
320 recorder.close();
321 close(socket);
322 }
323
324 curi.setFetchStatus(200);
325 if (dir) {
326 extract(curi, recorder);
327 }
328 addParent(curi);
329 }
330
331
332 /***
333 * Saves the given socket to the given recorder.
334 *
335 * @param curi the curi that owns the recorder
336 * @param socket the socket whose streams to save
337 * @param recorder the recorder to save them to
338 * @throws IOException if a network or file error occurs
339 * @throws InterruptedException if the thread is interrupted
340 */
341 private void saveToRecorder(CrawlURI curi,
342 Socket socket, HttpRecorder recorder)
343 throws IOException, InterruptedException {
344 curi.setHttpRecorder(recorder);
345 recorder.markContentBegin();
346 recorder.inputWrap(socket.getInputStream());
347 recorder.outputWrap(socket.getOutputStream());
348
349
350 long softMax = 0;
351 long hardMax = getMaxLength(curi);
352 long timeout = (long)getTimeout(curi) * 1000;
353 int maxRate = getFetchBandwidth(curi);
354 RecordingInputStream input = recorder.getRecordedInput();
355 input.setLimits(hardMax, timeout, maxRate);
356 input.readFullyOrUntil(softMax);
357 }
358
359
360 /***
361 * Extract FTP links in a directory listing.
362 * The listing must already be saved to the given recorder.
363 *
364 * @param curi The curi to save extracted links to
365 * @param recorder The recorder containing the directory listing
366 */
367 private void extract(CrawlURI curi, HttpRecorder recorder) {
368 if (!getExtractFromDirs(curi)) {
369 return;
370 }
371
372 ReplayCharSequence seq = null;
373 try {
374 seq = recorder.getReplayCharSequence();
375 extract(curi, seq);
376 } catch (IOException e) {
377 logger.log(Level.SEVERE, "IO error during extraction.", e);
378 } catch (RuntimeException e) {
379 logger.log(Level.SEVERE, "IO error during extraction.", e);
380 } finally {
381 close(seq);
382 }
383 }
384
385
386 /***
387 * Extracts FTP links in a directory listing.
388 *
389 * @param curi The curi to save extracted links to
390 * @param dir The directory listing to extract links from
391 * @throws URIException if an extracted link is invalid
392 */
393 private void extract(CrawlURI curi, ReplayCharSequence dir) {
394 logger.log(Level.FINEST, "Extracting URIs from FTP directory.");
395 Matcher matcher = DIR.matcher(dir);
396 while (matcher.find()) {
397 String file = matcher.group(1);
398 addExtracted(curi, file);
399 }
400 }
401
402
403 /***
404 * Adds an extracted filename to the curi. A new URI will be formed
405 * by taking the given curi (which should represent the directory the
406 * file lives in) and appending the file.
407 *
408 * @param curi the curi to store the discovered link in
409 * @param file the filename of the discovered link
410 */
411 private void addExtracted(CrawlURI curi, String file) {
412 try {
413 file = URLEncoder.encode(file, "UTF-8");
414 } catch (UnsupportedEncodingException e) {
415 throw new AssertionError(e);
416 }
417 if (logger.isLoggable(Level.FINEST)) {
418 logger.log(Level.FINEST, "Found " + file);
419 }
420 String base = curi.toString();
421 if (base.endsWith("/")) {
422 base = base.substring(0, base.length() - 1);
423 }
424 try {
425 UURI n = UURIFactory.getInstance(base + "/" + file);
426 Link link = new Link(curi.getUURI(), n, NAVLINK_MISC, NAVLINK_HOP);
427 curi.addOutLink(link);
428 } catch (URIException e) {
429 logger.log(Level.WARNING, "URI error during extraction.", e);
430 }
431 }
432
433
434 /***
435 * Extracts the parent URI from the given curi, then adds that parent
436 * URI as a discovered link to the curi.
437 *
438 * <p>If the <code>extract-parent</code> attribute is false, then this
439 * method does nothing. Also, if the path of the given curi is
440 * <code>/</code>, then this method does nothing.
441 *
442 * <p>Otherwise the parent is determined by eliminated the lowest part
443 * of the URI's path. Eg, the parent of <code>ftp://foo.com/one/two</code>
444 * is <code>ftp://foo.com/one</code>.
445 *
446 * @param curi the curi whose parent to add
447 */
448 private void addParent(CrawlURI curi) {
449 if (!getExtractParent(curi)) {
450 return;
451 }
452 UURI uuri = curi.getUURI();
453 try {
454 if (uuri.getPath().equals("/")) {
455
456 return;
457 }
458 String scheme = uuri.getScheme();
459 String auth = uuri.getEscapedAuthority();
460 String path = uuri.getEscapedCurrentHierPath();
461 UURI parent = UURIFactory.getInstance(scheme + "://" + auth + path);
462
463 Link link = new Link(uuri, parent, NAVLINK_MISC, NAVLINK_HOP);
464 curi.addOutLink(link);
465 } catch (URIException e) {
466 logger.log(Level.WARNING, "URI error during extraction.", e);
467 }
468 }
469
470
471 /***
472 * Returns the <code>extract.from.dirs</code> attribute for this
473 * <code>FetchFTP</code> and the given curi.
474 *
475 * @param curi the curi whose attribute to return
476 * @return that curi's <code>extract.from.dirs</code>
477 */
478 public boolean getExtractFromDirs(CrawlURI curi) {
479 return (Boolean)get(curi, ATTR_EXTRACT, DEFAULT_EXTRACT);
480 }
481
482
483 /***
484 * Returns the <code>extract.parent</code> attribute for this
485 * <code>FetchFTP</code> and the given curi.
486 *
487 * @param curi the curi whose attribute to return
488 * @return that curi's <code>extract-parent</code>
489 */
490 public boolean getExtractParent(CrawlURI curi) {
491 return (Boolean)get(curi, ATTR_EXTRACT_PARENT, DEFAULT_EXTRACT_PARENT);
492 }
493
494
495 /***
496 * Returns the <code>timeout-seconds</code> attribute for this
497 * <code>FetchFTP</code> and the given curi.
498 *
499 * @param curi the curi whose attribute to return
500 * @return that curi's <code>timeout-seconds</code>
501 */
502 public int getTimeout(CrawlURI curi) {
503 return (Integer)get(curi, ATTR_TIMEOUT, DEFAULT_TIMEOUT);
504 }
505
506
507 /***
508 * Returns the <code>max-length-bytes</code> attribute for this
509 * <code>FetchFTP</code> and the given curi.
510 *
511 * @param curi the curi whose attribute to return
512 * @return that curi's <code>max-length-bytes</code>
513 */
514 public long getMaxLength(CrawlURI curi) {
515 return (Long)get(curi, ATTR_MAX_LENGTH, DEFAULT_MAX_LENGTH);
516 }
517
518
519 /***
520 * Returns the <code>fetch-bandwidth</code> attribute for this
521 * <code>FetchFTP</code> and the given curi.
522 *
523 * @param curi the curi whose attribute to return
524 * @return that curi's <code>fetch-bandwidth</code>
525 */
526 public int getFetchBandwidth(CrawlURI curi) {
527 return (Integer)get(curi, ATTR_BANDWIDTH, DEFAULT_BANDWIDTH);
528 }
529
530
531 /***
532 * Returns the username and password for the given URI. This method
533 * always returns an array of length 2. The first element in the returned
534 * array is the username for the URI, and the second element is the
535 * password.
536 *
537 * <p>If the URI itself contains the username and password (i.e., it looks
538 * like <code>ftp://username:password@host/path</code>) then that username
539 * and password are returned.
540 *
541 * <p>Otherwise the settings system is probed for the <code>username</code>
542 * and <code>password</code> attributes for this <code>FTPFetch</code>
543 * and the given <code>curi</code> context. The values of those
544 * attributes are then returned.
545 *
546 * @param curi the curi whose username and password to return
547 * @return an array containing the username and password
548 */
549 private String[] getAuth(CrawlURI curi) {
550 String[] result = new String[2];
551 UURI uuri = curi.getUURI();
552 String userinfo;
553 try {
554 userinfo = uuri.getUserinfo();
555 } catch (URIException e) {
556 assert false;
557 logger.finest("getUserinfo raised URIException.");
558 userinfo = null;
559 }
560 if (userinfo != null) {
561 int p = userinfo.indexOf(':');
562 if (p > 0) {
563 result[0] = userinfo.substring(0,p);
564 result[1] = userinfo.substring(p + 1);
565 return result;
566 }
567 }
568 result[0] = (String)get(curi, ATTR_USERNAME, DEFAULT_USERNAME);
569 result[1] = (String)get(curi, ATTR_PASSWORD, DEFAULT_PASSWORD);
570 return result;
571 }
572
573
574 /***
575 * Determines the password for the given URI. If the URI itself contains
576 * a password, then that password is returned. Otherwise the settings
577 * system is probed for the <code>password</code> attribute, and the value
578 * for that attribute is returned.
579 *
580 * @param curi the curi whose password to return
581 * @return that password
582 */
583 public String determinePassword(CrawlURI curi) {
584 return (String)get(curi, ATTR_PASSWORD, DEFAULT_PASSWORD);
585 }
586
587
588 /***
589 * Quietly closes the given socket.
590 *
591 * @param socket the socket to close
592 */
593 private static void close(Socket socket) {
594 try {
595 socket.close();
596 } catch (IOException e) {
597 logger.log(Level.WARNING, "IO error closing socket.", e);
598 }
599 }
600
601
602 /***
603 * Quietly closes the given sequence.
604 * If an IOException is raised, this method logs it as a warning.
605 *
606 * @param seq the sequence to close
607 */
608 private static void close(ReplayCharSequence seq) {
609 if (seq == null) {
610 return;
611 }
612 try {
613 seq.close();
614 } catch (IOException e) {
615 logger.log(Level.WARNING, "IO error closing ReplayCharSequence.",
616 e);
617 }
618 }
619
620
621 /***
622 * Quietly disconnects from the given FTP client.
623 * If an IOException is raised, this method logs it as a warning.
624 *
625 * @param client the client to disconnect
626 */
627 private static void disconnect(ClientFTP client) {
628 if (client.isConnected()) try {
629 client.disconnect();
630 } catch (IOException e) {
631 if (logger.isLoggable(Level.WARNING)) {
632 logger.warning("Could not disconnect from FTP client: "
633 + e.getMessage());
634 }
635 }
636 }
637
638
639 }