1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.io.warc;
24
25 import java.io.File;
26 import java.io.FileInputStream;
27 import java.io.IOException;
28 import java.io.InputStream;
29 import java.net.MalformedURLException;
30 import java.net.URL;
31 import java.util.Iterator;
32
33 import org.archive.io.ArchiveReader;
34 import org.archive.io.ArchiveReaderFactory;
35 import org.archive.io.ArchiveRecord;
36 import org.archive.io.GzippedInputStream;
37 import org.archive.io.warc.WARCConstants;
38 import org.archive.util.FileUtils;
39
40 /***
41 * Factory for WARC Readers.
42 * Figures whether to give out a compressed file Reader or an uncompressed
43 * Reader.
44 * @author stack
45 * @version $Date: 2006-08-23 17:59:04 -0700 (Wed, 23 Aug 2006) $ $Version$
46 */
47 public class WARCReaderFactory extends ArchiveReaderFactory
48 implements WARCConstants {
49 private static final WARCReaderFactory factory = new WARCReaderFactory();
50
51 /***
52 * Shutdown any access to default constructor.
53 * This factory is Singleton.
54 */
55 private WARCReaderFactory() {
56 super();
57 }
58
59 public static WARCReader get(String arcFileOrUrl)
60 throws MalformedURLException, IOException {
61 return (WARCReader)WARCReaderFactory.factory.
62 getArchiveReader(arcFileOrUrl);
63 }
64
65 public static WARCReader get(final File f) throws IOException {
66 return (WARCReader)WARCReaderFactory.factory.getArchiveReader(f);
67 }
68
69 /***
70 * @param f An arcfile to read.
71 * @param offset Have returned Reader set to start reading at this offset.
72 * @return A WARCReader.
73 * @throws IOException
74 */
75 public static WARCReader get(final File f, final long offset)
76 throws IOException {
77 return (WARCReader)WARCReaderFactory.factory.
78 getArchiveReader(f, offset);
79 }
80
81 protected ArchiveReader getArchiveReader(final File f, final long offset)
82 throws IOException {
83 boolean compressed = testCompressedWARCFile(f);
84 if (!compressed) {
85 if (!FileUtils.isReadableWithExtensionAndMagic(f,
86 DOT_WARC_FILE_EXTENSION, WARC_MAGIC)) {
87 throw new IOException(f.getAbsolutePath()
88 + " is not a WARC file.");
89 }
90 }
91 return (WARCReader)(compressed?
92 WARCReaderFactory.factory.new CompressedWARCReader(f, offset):
93 WARCReaderFactory.factory.new UncompressedWARCReader(f, offset));
94 }
95
96 public static ArchiveReader get(final String s, final InputStream is,
97 final boolean atFirstRecord)
98 throws IOException {
99 return WARCReaderFactory.factory.getArchiveReader(s, is,
100 atFirstRecord);
101 }
102
103 protected ArchiveReader getArchiveReader(final String f,
104 final InputStream is, final boolean atFirstRecord)
105 throws IOException {
106
107
108 return new CompressedWARCReader(f, is, atFirstRecord);
109 }
110
111 public static WARCReader get(final URL arcUrl, final long offset)
112 throws IOException {
113 return (WARCReader)WARCReaderFactory.factory.getArchiveReader(arcUrl,
114 offset);
115 }
116
117 /***
118 * Get an ARCReader.
119 * Pulls the ARC local into whereever the System Property
120 * <code>java.io.tmpdir</code> points. It then hands back an ARCReader that
121 * points at this local copy. A close on this ARCReader instance will
122 * remove the local copy.
123 * @param arcUrl An URL that points at an ARC.
124 * @return An ARCReader.
125 * @throws IOException
126 */
127 public static WARCReader get(final URL arcUrl)
128 throws IOException {
129 return (WARCReader)WARCReaderFactory.factory.getArchiveReader(arcUrl);
130 }
131
132 /***
133 * Check file is compressed WARC.
134 *
135 * @param f File to test.
136 *
137 * @return True if this is compressed WARC (TODO: Just tests if file is
138 * GZIP'd file (It begins w/ GZIP MAGIC)).
139 *
140 * @exception IOException If file does not exist or is not unreadable.
141 */
142 public static boolean testCompressedWARCFile(final File f)
143 throws IOException {
144 FileUtils.isReadable(f);
145 boolean compressed = false;
146 final InputStream is = new FileInputStream(f);
147 try {
148 compressed = GzippedInputStream.isCompressedStream(is);
149 } finally {
150 is.close();
151 }
152 return compressed;
153 }
154
155 /***
156 * Uncompressed WARC file reader.
157 * @author stack
158 */
159 private class UncompressedWARCReader extends WARCReader {
160 /***
161 * Constructor.
162 * @param f Uncompressed arcfile to read.
163 * @throws IOException
164 */
165 public UncompressedWARCReader(final File f)
166 throws IOException {
167 this(f, 0);
168 }
169
170 /***
171 * Constructor.
172 *
173 * @param f Uncompressed file to read.
174 * @param offset Offset at which to position Reader.
175 * @throws IOException
176 */
177 public UncompressedWARCReader(final File f, final long offset)
178 throws IOException {
179
180 setIn(getInputStream(f, offset));
181 initialize(f.getAbsolutePath());
182 }
183
184 /***
185 * Constructor.
186 *
187 * @param f Uncompressed file to read.
188 * @param is InputStream.
189 */
190 public UncompressedWARCReader(final String f, final InputStream is) {
191
192
193 setIn(is);
194 initialize(f);
195 }
196 }
197
198 /***
199 * Compressed WARC file reader.
200 *
201 * @author stack
202 */
203 private class CompressedWARCReader extends WARCReader {
204 /***
205 * Constructor.
206 *
207 * @param f Compressed file to read.
208 * @throws IOException
209 */
210 public CompressedWARCReader(final File f) throws IOException {
211 this(f, 0);
212 }
213
214 /***
215 * Constructor.
216 *
217 * @param f Compressed arcfile to read.
218 * @param offset Position at where to start reading file.
219 * @throws IOException
220 */
221 public CompressedWARCReader(final File f, final long offset)
222 throws IOException {
223
224 setIn(new GzippedInputStream(getInputStream(f, offset)));
225 setCompressed((offset == 0));
226 initialize(f.getAbsolutePath());
227 }
228
229 /***
230 * Constructor.
231 *
232 * @param f Compressed arcfile.
233 * @param is InputStream to use.
234 * @param atFirstRecord
235 * @throws IOException
236 */
237 public CompressedWARCReader(final String f, final InputStream is,
238 final boolean atFirstRecord)
239 throws IOException {
240
241
242 setIn(new GzippedInputStream(is));
243 setCompressed(true);
244 initialize(f);
245
246 }
247
248 /***
249 * Get record at passed <code>offset</code>.
250 *
251 * @param offset Byte index into file at which a record starts.
252 * @return A WARCRecord reference.
253 * @throws IOException
254 */
255 public WARCRecord get(long offset) throws IOException {
256 cleanupCurrentRecord();
257 ((GzippedInputStream)getIn()).gzipMemberSeek(offset);
258 return (WARCRecord) createArchiveRecord(getIn(), offset);
259 }
260
261 public Iterator<ArchiveRecord> iterator() {
262 /***
263 * Override ArchiveRecordIterator so can base returned iterator on
264 * GzippedInputStream iterator.
265 */
266 return new ArchiveRecordIterator() {
267 private GzippedInputStream gis =
268 (GzippedInputStream)getInputStream();
269
270 private Iterator gzipIterator = this.gis.iterator();
271
272 protected boolean innerHasNext() {
273 return this.gzipIterator.hasNext();
274 }
275
276 protected ArchiveRecord innerNext() throws IOException {
277
278
279 long p = this.gis.position();
280 InputStream is = (InputStream) this.gzipIterator.next();
281 return createArchiveRecord(is, p);
282 }
283 };
284 }
285
286 protected void gotoEOR(ArchiveRecord rec) throws IOException {
287
288 }
289 }
290
291 public static boolean isWARCSuffix(final String f) {
292 return (f == null)?
293 false:
294 (f.toLowerCase().endsWith(DOT_COMPRESSED_WARC_FILE_EXTENSION))?
295 true:
296 (f.toLowerCase().endsWith(DOT_WARC_FILE_EXTENSION))?
297 true: false;
298 }
299 }