1   /* LinkExtractor
2   *
3   * $Id: LinkExtractor.java 3704 2005-07-18 17:30:21Z stack-sf $
4   *
5   * Created on Mar 16, 2005
6   *
7   * Copyright (C) 2005 Internet Archive.
8   *
9   * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24  */
25  package org.archive.extractor;
26  
27  import java.io.InputStream;
28  import java.nio.charset.Charset;
29  import java.util.Iterator;
30  
31  import org.archive.crawler.extractor.Link;
32  import org.archive.net.UURI;
33  
34  /***
35   * LinkExtractor is a general interface for classes which, when given an
36   * InputStream and Charset, can scan for Links and return them via
37   * an Iterator interface.
38   *
39   * Implementors may in fact complete all extraction on the first
40   * hasNext(), then trickle Links out from an internal collection,
41   * depending on whether the link-extraction technique used is amenable
42   * to incremental scanning.
43   *
44   * ROUGH DRAFT IN PROGRESS / incomplete... untested...
45   * 
46   * @author gojomo
47   */
48  public interface LinkExtractor extends Iterator {
49      /***
50       * Setup the LinkExtractor to operate on the given stream and charset,
51       * considering the given contextURI as the initial 'base' URI for
52       * resolving relative URIs.
53       *
54       * May be called to 'reset' a LinkExtractor to start with new input.
55       *
56       * @param source source URI 
57       * @param base base URI (usually the source URI) for URI derelativizing
58       * @param content input stream of content to scan for links
59       * @param charset Charset to consult to decode stream to characters
60       * @param listener ExtractErrorListener to notify, rather than raising
61       *   exception through extraction loop
62       */
63      public void setup(UURI source, UURI base, InputStream content,
64              Charset charset, ExtractErrorListener listener);
65      
66      /***
67       * Convenience version of above for common case where source and base are 
68       * same. 
69       * 
70       * @param sourceandbase  URI to use as source and base for derelativizing
71       * @param content input stream of content to scan for links
72       * @param charset Charset to consult to decode stream to characters
73       * @param listener ExtractErrorListener to notify, rather than raising
74       *   exception through extraction loop
75       */
76      public void setup(UURI sourceandbase, InputStream content,
77              Charset charset, ExtractErrorListener listener);
78      
79      /***
80       * Alternative to Iterator.next() which returns type Link.
81       * @return a discovered Link
82       */
83      public Link nextLink();
84  
85      /***
86       * Discard all state and release any used resources.
87       */
88      public void reset();
89  }