1   /* CharSequenceLinkExtractor
2   *
3   * $Id: CharSequenceLinkExtractor.java 4646 2006-09-22 17:23:04Z paul_jack $
4   *
5   * Created on Mar 17, 2005
6   *
7   * Copyright (C) 2005 Internet Archive.
8   *
9   * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24  */
25  package org.archive.extractor;
26  
27  import java.io.InputStream;
28  import java.nio.charset.Charset;
29  import java.util.LinkedList;
30  import java.util.List;
31  import java.util.NoSuchElementException;
32  
33  import org.archive.crawler.extractor.Link;
34  import org.archive.net.UURI;
35  
36  /***
37   * Abstract superclass providing utility methods for LinkExtractors which
38   * would prefer to work on a CharSequence rather than a stream.
39   *
40   * ROUGH DRAFT IN PROGRESS / incomplete... untested... 
41   * 
42   * @author gojomo
43   */
44  public abstract class CharSequenceLinkExtractor implements LinkExtractor {
45  
46      protected UURI source;
47      protected UURI base;
48      protected ExtractErrorListener extractErrorListener;
49  
50      protected CharSequence sourceContent;
51      protected LinkedList<Link> next;
52  
53      public void setup(UURI source, UURI base, InputStream content,
54              Charset charset, ExtractErrorListener listener) {
55          setup(source, base, charSequenceFrom(content,charset), listener);
56      }
57  
58      /***
59       * @param source
60       * @param base
61       * @param content
62       * @param listener
63       */
64      public void setup(UURI source, UURI base, CharSequence content,
65              ExtractErrorListener listener) {
66          this.source = source;
67          this.base = base;
68          this.extractErrorListener = listener;
69          this.sourceContent = content;
70          this.next = new LinkedList<Link>();
71      }
72  
73  
74      /***
75       * Convenience method for when source and base are same.
76       *
77       * @param sourceandbase
78       * @param content
79       * @param listener
80       */
81      public void setup(UURI sourceandbase, CharSequence content,
82              ExtractErrorListener listener) {
83          setup(sourceandbase, sourceandbase, content, listener);
84      }
85  
86      /* (non-Javadoc)
87       * @see org.archive.extractor.LinkExtractor#setup(org.archive.crawler.datamodel.UURI, java.io.InputStream, java.nio.charset.Charset)
88       */
89      public void setup(UURI sourceandbase, InputStream content, Charset charset,
90              ExtractErrorListener listener) {
91          setup(sourceandbase,sourceandbase,content,charset,listener);
92      }
93  
94      /* (non-Javadoc)
95       * @see org.archive.extractor.LinkExtractor#nextLink()
96       */
97      public Link nextLink() {
98          if(!hasNext()) {
99              throw new NoSuchElementException();
100         }
101         // next will have been filled with at least one item
102         return (Link) next.removeFirst();
103     }
104 
105     /***
106      * Discard all state. Another setup() is required to use again.
107      */
108     public void reset() {
109         base = null;
110         source = null;
111         sourceContent = null; // TODO: discard other resources
112     }
113 
114     /* (non-Javadoc)
115      * @see java.util.Iterator#hasNext()
116      */
117     public boolean hasNext() {
118         if (!next.isEmpty()) {
119             return true;
120         }
121         return findNextLink();
122     }
123 
124     /***
125      * Scan to the next link(s), if any, loading it into the next buffer.
126      *
127      * @return true if any links are found/available, false otherwise
128      */
129     abstract protected boolean findNextLink();
130 
131     /* (non-Javadoc)
132      * @see java.util.Iterator#next()
133      */
134     public Object next() {
135         return nextLink();
136     }
137 
138     /* (non-Javadoc)
139      * @see java.util.Iterator#remove()
140      */
141     public void remove() {
142         throw new UnsupportedOperationException();
143     }
144 
145     /***
146      * @param content
147      * @param charset
148      * @return CharSequence obtained from stream in given charset
149      */
150     protected CharSequence charSequenceFrom(InputStream content, Charset charset) {
151         // See if content InputStream can provide
152         if(content instanceof CharSequenceProvider) {
153             return ((CharSequenceProvider)content).getCharSequence();
154         }
155         // otherwise, create one
156         return createCharSequenceFrom(content, charset);
157     }
158 
159     /***
160      * @param content
161      * @param charset
162      * @return CharSequence built over given stream in given charset
163      */
164     protected CharSequence createCharSequenceFrom(InputStream content, Charset charset) {
165         // TODO: implement
166         return null;
167         // TODO: consider cleanup in reset()
168     }
169 
170     /***
171      * Convenience method to do default extraction.
172      *
173      * @param content
174      * @param source
175      * @param base
176      * @param collector
177      * @param extractErrorListener
178      */
179     public static void extract(CharSequence content, UURI source, UURI base,
180             List<Link> collector, ExtractErrorListener extractErrorListener) {
181         // TODO: arrange for inheritance of prefs... eg when HTML includes JS
182         // includes HTML, have inner HTML follow robots, etc from outer
183         CharSequenceLinkExtractor extractor = newDefaultInstance();
184         extractor.setup(source, base, content, extractErrorListener);
185         while (extractor.hasNext()) {
186             collector.add(extractor.nextLink());
187         }
188         extractor.reset();
189     }
190 
191     protected static CharSequenceLinkExtractor newDefaultInstance() {
192         // override in subclasses
193         return null;
194     }
195 }