1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.extractor;
26
27 import java.io.InputStream;
28 import java.nio.charset.Charset;
29 import java.util.LinkedList;
30 import java.util.List;
31 import java.util.NoSuchElementException;
32
33 import org.archive.crawler.extractor.Link;
34 import org.archive.net.UURI;
35
36 /***
37 * Abstract superclass providing utility methods for LinkExtractors which
38 * would prefer to work on a CharSequence rather than a stream.
39 *
40 * ROUGH DRAFT IN PROGRESS / incomplete... untested...
41 *
42 * @author gojomo
43 */
44 public abstract class CharSequenceLinkExtractor implements LinkExtractor {
45
46 protected UURI source;
47 protected UURI base;
48 protected ExtractErrorListener extractErrorListener;
49
50 protected CharSequence sourceContent;
51 protected LinkedList<Link> next;
52
53 public void setup(UURI source, UURI base, InputStream content,
54 Charset charset, ExtractErrorListener listener) {
55 setup(source, base, charSequenceFrom(content,charset), listener);
56 }
57
58 /***
59 * @param source
60 * @param base
61 * @param content
62 * @param listener
63 */
64 public void setup(UURI source, UURI base, CharSequence content,
65 ExtractErrorListener listener) {
66 this.source = source;
67 this.base = base;
68 this.extractErrorListener = listener;
69 this.sourceContent = content;
70 this.next = new LinkedList<Link>();
71 }
72
73
74 /***
75 * Convenience method for when source and base are same.
76 *
77 * @param sourceandbase
78 * @param content
79 * @param listener
80 */
81 public void setup(UURI sourceandbase, CharSequence content,
82 ExtractErrorListener listener) {
83 setup(sourceandbase, sourceandbase, content, listener);
84 }
85
86
87
88
89 public void setup(UURI sourceandbase, InputStream content, Charset charset,
90 ExtractErrorListener listener) {
91 setup(sourceandbase,sourceandbase,content,charset,listener);
92 }
93
94
95
96
97 public Link nextLink() {
98 if(!hasNext()) {
99 throw new NoSuchElementException();
100 }
101
102 return (Link) next.removeFirst();
103 }
104
105 /***
106 * Discard all state. Another setup() is required to use again.
107 */
108 public void reset() {
109 base = null;
110 source = null;
111 sourceContent = null;
112 }
113
114
115
116
117 public boolean hasNext() {
118 if (!next.isEmpty()) {
119 return true;
120 }
121 return findNextLink();
122 }
123
124 /***
125 * Scan to the next link(s), if any, loading it into the next buffer.
126 *
127 * @return true if any links are found/available, false otherwise
128 */
129 abstract protected boolean findNextLink();
130
131
132
133
134 public Object next() {
135 return nextLink();
136 }
137
138
139
140
141 public void remove() {
142 throw new UnsupportedOperationException();
143 }
144
145 /***
146 * @param content
147 * @param charset
148 * @return CharSequence obtained from stream in given charset
149 */
150 protected CharSequence charSequenceFrom(InputStream content, Charset charset) {
151
152 if(content instanceof CharSequenceProvider) {
153 return ((CharSequenceProvider)content).getCharSequence();
154 }
155
156 return createCharSequenceFrom(content, charset);
157 }
158
159 /***
160 * @param content
161 * @param charset
162 * @return CharSequence built over given stream in given charset
163 */
164 protected CharSequence createCharSequenceFrom(InputStream content, Charset charset) {
165
166 return null;
167
168 }
169
170 /***
171 * Convenience method to do default extraction.
172 *
173 * @param content
174 * @param source
175 * @param base
176 * @param collector
177 * @param extractErrorListener
178 */
179 public static void extract(CharSequence content, UURI source, UURI base,
180 List<Link> collector, ExtractErrorListener extractErrorListener) {
181
182
183 CharSequenceLinkExtractor extractor = newDefaultInstance();
184 extractor.setup(source, base, content, extractErrorListener);
185 while (extractor.hasNext()) {
186 collector.add(extractor.nextLink());
187 }
188 extractor.reset();
189 }
190
191 protected static CharSequenceLinkExtractor newDefaultInstance() {
192
193 return null;
194 }
195 }