1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.extractor;
26
27 import java.io.InputStream;
28 import java.nio.charset.Charset;
29 import java.util.Iterator;
30
31 import org.archive.crawler.extractor.Link;
32 import org.archive.net.UURI;
33
34 /***
35 * LinkExtractor is a general interface for classes which, when given an
36 * InputStream and Charset, can scan for Links and return them via
37 * an Iterator interface.
38 *
39 * Implementors may in fact complete all extraction on the first
40 * hasNext(), then trickle Links out from an internal collection,
41 * depending on whether the link-extraction technique used is amenable
42 * to incremental scanning.
43 *
44 * ROUGH DRAFT IN PROGRESS / incomplete... untested...
45 *
46 * @author gojomo
47 */
48 public interface LinkExtractor extends Iterator {
49 /***
50 * Setup the LinkExtractor to operate on the given stream and charset,
51 * considering the given contextURI as the initial 'base' URI for
52 * resolving relative URIs.
53 *
54 * May be called to 'reset' a LinkExtractor to start with new input.
55 *
56 * @param source source URI
57 * @param base base URI (usually the source URI) for URI derelativizing
58 * @param content input stream of content to scan for links
59 * @param charset Charset to consult to decode stream to characters
60 * @param listener ExtractErrorListener to notify, rather than raising
61 * exception through extraction loop
62 */
63 public void setup(UURI source, UURI base, InputStream content,
64 Charset charset, ExtractErrorListener listener);
65
66 /***
67 * Convenience version of above for common case where source and base are
68 * same.
69 *
70 * @param sourceandbase URI to use as source and base for derelativizing
71 * @param content input stream of content to scan for links
72 * @param charset Charset to consult to decode stream to characters
73 * @param listener ExtractErrorListener to notify, rather than raising
74 * exception through extraction loop
75 */
76 public void setup(UURI sourceandbase, InputStream content,
77 Charset charset, ExtractErrorListener listener);
78
79 /***
80 * Alternative to Iterator.next() which returns type Link.
81 * @return a discovered Link
82 */
83 public Link nextLink();
84
85 /***
86 * Discard all state and release any used resources.
87 */
88 public void reset();
89 }