View Javadoc

1   /* $Id: WARCReaderFactory.java 4533 2006-08-24 00:59:04Z stack-sf $
2    *
3    * Created Aug 22, 2006
4    *
5    * Copyright (C) 2006 Internet Archive.
6    *
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    *
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   *
14   * Heritrix is distributed in the hope that it will be useful,
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   *
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.io.warc;
24  
25  import java.io.File;
26  import java.io.FileInputStream;
27  import java.io.IOException;
28  import java.io.InputStream;
29  import java.net.MalformedURLException;
30  import java.net.URL;
31  import java.util.Iterator;
32  
33  import org.archive.io.ArchiveReader;
34  import org.archive.io.ArchiveReaderFactory;
35  import org.archive.io.ArchiveRecord;
36  import org.archive.io.GzippedInputStream;
37  import org.archive.io.warc.WARCConstants;
38  import org.archive.util.FileUtils;
39  
40  /***
41   * Factory for WARC Readers.
42   * Figures whether to give out a compressed file Reader or an uncompressed
43   * Reader.
44   * @author stack
45   * @version $Date: 2006-08-23 17:59:04 -0700 (Wed, 23 Aug 2006) $ $Version$
46   */
47  public class WARCReaderFactory extends ArchiveReaderFactory
48  implements WARCConstants {
49      private static final WARCReaderFactory factory = new WARCReaderFactory();
50  
51      /***
52       * Shutdown any access to default constructor.
53       * This factory is Singleton.
54       */
55      private WARCReaderFactory() {
56          super();
57      }
58      
59      public static WARCReader get(String arcFileOrUrl)
60      throws MalformedURLException, IOException {
61      	return (WARCReader)WARCReaderFactory.factory.
62      		getArchiveReader(arcFileOrUrl);
63      }
64      
65      public static WARCReader get(final File f) throws IOException {
66      	return (WARCReader)WARCReaderFactory.factory.getArchiveReader(f);
67      }
68      
69      /***
70       * @param f An arcfile to read.
71       * @param offset Have returned Reader set to start reading at this offset.
72       * @return A WARCReader.
73       * @throws IOException 
74       */
75      public static WARCReader get(final File f, final long offset)
76      throws IOException {
77      	return (WARCReader)WARCReaderFactory.factory.
78      		getArchiveReader(f, offset);
79      }
80      
81      protected ArchiveReader getArchiveReader(final File f, final long offset)
82      throws IOException {
83  		boolean compressed = testCompressedWARCFile(f);
84  		if (!compressed) {
85  			if (!FileUtils.isReadableWithExtensionAndMagic(f,
86  					DOT_WARC_FILE_EXTENSION, WARC_MAGIC)) {
87  				throw new IOException(f.getAbsolutePath()
88  						+ " is not a WARC file.");
89  			}
90  		}
91  		return (WARCReader)(compressed?
92  			WARCReaderFactory.factory.new CompressedWARCReader(f, offset):
93  			WARCReaderFactory.factory.new UncompressedWARCReader(f, offset));
94  	}
95      
96      public static ArchiveReader get(final String s, final InputStream is,
97              final boolean atFirstRecord)
98      throws IOException {
99          return WARCReaderFactory.factory.getArchiveReader(s, is,
100             atFirstRecord);
101     }
102     
103     protected ArchiveReader getArchiveReader(final String f,
104 			final InputStream is, final boolean atFirstRecord)
105 			throws IOException {
106 		// For now, assume stream is compressed. Later add test of input
107 		// stream or handle exception thrown when figure not compressed stream.
108 		return new CompressedWARCReader(f, is, atFirstRecord);
109 	}
110     
111     public static WARCReader get(final URL arcUrl, final long offset)
112     throws IOException {
113         return (WARCReader)WARCReaderFactory.factory.getArchiveReader(arcUrl,
114             offset);
115     }
116     
117     /***
118      * Get an ARCReader.
119      * Pulls the ARC local into whereever the System Property
120      * <code>java.io.tmpdir</code> points. It then hands back an ARCReader that
121      * points at this local copy.  A close on this ARCReader instance will
122      * remove the local copy.
123      * @param arcUrl An URL that points at an ARC.
124      * @return An ARCReader.
125      * @throws IOException 
126      */
127     public static WARCReader get(final URL arcUrl)
128     throws IOException {
129         return (WARCReader)WARCReaderFactory.factory.getArchiveReader(arcUrl);
130     }
131     
132     /***
133      * Check file is compressed WARC.
134      *
135      * @param f File to test.
136      *
137      * @return True if this is compressed WARC (TODO: Just tests if file is
138      * GZIP'd file (It begins w/ GZIP MAGIC)).
139      *
140      * @exception IOException If file does not exist or is not unreadable.
141      */
142     public static boolean testCompressedWARCFile(final File f)
143     throws IOException {
144         FileUtils.isReadable(f);
145         boolean compressed = false;
146         final InputStream is = new FileInputStream(f);
147         try {
148             compressed = GzippedInputStream.isCompressedStream(is);
149         } finally {
150             is.close();
151         }
152         return compressed;
153     }
154 
155     /***
156      * Uncompressed WARC file reader.
157      * @author stack
158      */
159     private class UncompressedWARCReader extends WARCReader {
160         /***
161          * Constructor.
162          * @param f Uncompressed arcfile to read.
163          * @throws IOException
164          */
165         public UncompressedWARCReader(final File f)
166         throws IOException {
167             this(f, 0);
168         }
169 
170         /***
171          * Constructor.
172          * 
173          * @param f Uncompressed file to read.
174          * @param offset Offset at which to position Reader.
175          * @throws IOException
176          */
177         public UncompressedWARCReader(final File f, final long offset)
178         throws IOException {
179             // File has been tested for existence by time it has come to here.
180             setIn(getInputStream(f, offset));
181             initialize(f.getAbsolutePath());
182         }
183         
184         /***
185          * Constructor.
186          * 
187          * @param f Uncompressed file to read.
188          * @param is InputStream.
189          */
190         public UncompressedWARCReader(final String f, final InputStream is) {
191             // Arc file has been tested for existence by time it has come
192             // to here.
193             setIn(is);
194             initialize(f);
195         }
196     }
197     
198     /***
199      * Compressed WARC file reader.
200      * 
201      * @author stack
202      */
203     private class CompressedWARCReader extends WARCReader {
204         /***
205          * Constructor.
206          * 
207          * @param f Compressed file to read.
208          * @throws IOException
209          */
210         public CompressedWARCReader(final File f) throws IOException {
211             this(f, 0);
212         }
213 
214         /***
215          * Constructor.
216          * 
217          * @param f Compressed arcfile to read.
218          * @param offset Position at where to start reading file.
219          * @throws IOException
220          */
221         public CompressedWARCReader(final File f, final long offset)
222                 throws IOException {
223             // File has been tested for existence by time it has come to here.
224             setIn(new GzippedInputStream(getInputStream(f, offset)));
225             setCompressed((offset == 0));
226             initialize(f.getAbsolutePath());
227         }
228         
229         /***
230          * Constructor.
231          * 
232          * @param f Compressed arcfile.
233          * @param is InputStream to use.
234          * @param atFirstRecord
235          * @throws IOException
236          */
237         public CompressedWARCReader(final String f, final InputStream is,
238             final boolean atFirstRecord)
239         throws IOException {
240             // Arc file has been tested for existence by time it has come
241             // to here.
242             setIn(new GzippedInputStream(is));
243             setCompressed(true);
244             initialize(f);
245             // TODO: Ignore atFirstRecord. Probably doesn't apply in WARC world.
246         }
247         
248         /***
249          * Get record at passed <code>offset</code>.
250          * 
251          * @param offset Byte index into file at which a record starts.
252          * @return A WARCRecord reference.
253          * @throws IOException
254          */
255         public WARCRecord get(long offset) throws IOException {
256             cleanupCurrentRecord();
257             ((GzippedInputStream)getIn()).gzipMemberSeek(offset);
258             return (WARCRecord) createArchiveRecord(getIn(), offset);
259         }
260         
261         public Iterator<ArchiveRecord> iterator() {
262             /***
263              * Override ArchiveRecordIterator so can base returned iterator on
264              * GzippedInputStream iterator.
265              */
266             return new ArchiveRecordIterator() {
267                 private GzippedInputStream gis =
268                     (GzippedInputStream)getInputStream();
269 
270                 private Iterator gzipIterator = this.gis.iterator();
271 
272                 protected boolean innerHasNext() {
273                     return this.gzipIterator.hasNext();
274                 }
275 
276                 protected ArchiveRecord innerNext() throws IOException {
277                     // Get the positoin before gzipIterator.next moves
278                     // it on past the gzip header.
279                     long p = this.gis.position();
280                     InputStream is = (InputStream) this.gzipIterator.next();
281                     return createArchiveRecord(is, p);
282                 }
283             };
284         }
285         
286         protected void gotoEOR(ArchiveRecord rec) throws IOException {
287         	// TODO
288         }
289     }
290     
291     public static boolean isWARCSuffix(final String f) {
292     	return (f == null)?
293     		false:
294     		(f.toLowerCase().endsWith(DOT_COMPRESSED_WARC_FILE_EXTENSION))?
295     		    true:
296     			(f.toLowerCase().endsWith(DOT_WARC_FILE_EXTENSION))?
297     			true: false;
298     }
299 }