1   /*
2    * ExperimentalWARCWriterTest
3    *
4    * $Id: ExperimentalWARCWriterTest.java 4554 2006-08-30 02:35:48Z stack-sf $
5    *
6    * Created on July 27th, 2006
7    *
8    * Copyright (C) 2006 Internet Archive.
9    *
10   * This file is part of the Heritrix web crawler (crawler.archive.org).
11   *
12   * Heritrix is free software; you can redistribute it and/or modify
13   * it under the terms of the GNU Lesser Public License as published by
14   * the Free Software Foundation; either version 2.1 of the License, or
15   * any later version.
16   *
17   * Heritrix is distributed in the hope that it will be useful,
18   * but WITHOUT ANY WARRANTY; without even the implied warranty of
19   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20   * GNU Lesser Public License for more details.
21   *
22   * You should have received a copy of the GNU Lesser Public License
23   * along with Heritrix; if not, write to the Free Software
24   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
25   */
26  package org.archive.io.warc;
27  
28  import java.io.ByteArrayInputStream;
29  import java.io.ByteArrayOutputStream;
30  import java.io.File;
31  import java.io.FileNotFoundException;
32  import java.io.IOException;
33  import java.net.URI;
34  import java.net.URISyntaxException;
35  import java.util.Arrays;
36  import java.util.Iterator;
37  import java.util.List;
38  import java.util.concurrent.atomic.AtomicInteger;
39  
40  import org.archive.io.ArchiveRecord;
41  import org.archive.io.ArchiveRecordHeader;
42  import org.archive.io.UTF8Bytes;
43  import org.archive.io.WriterPoolMember;
44  import org.archive.io.warc.WARCConstants;
45  import org.archive.uid.GeneratorFactory;
46  import org.archive.util.ArchiveUtils;
47  import org.archive.util.TmpDirTestCase;
48  import org.archive.util.anvl.ANVLRecord;
49  
50  /***
51   * Test Writer and Reader.
52   * @author stack
53   * @version $Date: 2006-08-29 19:35:48 -0700 (Tue, 29 Aug 2006) $ $Version$
54   */
55  public class WARCWriterTest
56  extends TmpDirTestCase implements WARCConstants {
57      private static final AtomicInteger SERIAL_NO = new AtomicInteger();
58      
59      /***
60       * Prefix to use for ARC files made by JUNIT.
61       */
62      private static final String PREFIX = "IAH";
63      
64      private static final String SOME_URL = "http://www.archive.org/test/";
65      
66      public void testCheckHeaderLineValue() throws Exception {
67          WARCWriter writer = new WARCWriter();
68          writer.checkHeaderValue("one");
69          IOException exception = null;
70          try {
71              writer.checkHeaderValue("with space");
72          } catch(IOException e) {
73              exception = e;
74          }
75         assertNotNull(exception);
76         exception = null;
77         try {
78             writer.checkHeaderValue("with\0x0000controlcharacter");
79         } catch(IOException e) {
80             exception = e;
81         }
82        assertNotNull(exception);
83      }
84  
85      public void testMimetypes() throws IOException {
86          WARCWriter writer = new WARCWriter();
87          writer.checkHeaderLineMimetypeParameter("text/xml");
88          writer.checkHeaderLineMimetypeParameter("text/xml+rdf");
89          assertEquals(writer.checkHeaderLineMimetypeParameter(
90          	"text/plain; charset=SHIFT-JIS"), "text/plain; charset=SHIFT-JIS");
91          assertEquals(writer.checkHeaderLineMimetypeParameter(
92      		"multipart/mixed; \r\n        boundary=\"simple boundary\""),
93              "multipart/mixed; boundary=\"simple boundary\"");
94      }
95      
96      public void testWriteRecord() throws IOException {
97      	File [] files = {getTmpDir()};
98          
99      	// Write uncompressed.
100         WARCWriter writer =
101         	new WARCWriter(SERIAL_NO, Arrays.asList(files),
102         			this.getClass().getName(), "suffix", false, -1, null);
103         writeFile(writer);
104         
105         // Write compressed.
106         writer = new WARCWriter(SERIAL_NO, Arrays.asList(files),
107         		this.getClass().getName(), "suffix", true, -1, null);
108         writeFile(writer);
109     }
110     
111     private void writeFile(final WARCWriter writer)
112     throws IOException {
113         try {
114             writeWarcinfoRecord(writer);
115             writeBasicRecords(writer);
116         } finally {
117             writer.close();
118             writer.getFile().delete();
119         }
120     }
121     
122     private void writeWarcinfoRecord(WARCWriter writer)
123     throws IOException {
124     	ANVLRecord meta = new ANVLRecord();
125     	meta.addLabelValue("size", "1G");
126     	meta.addLabelValue("operator", "igor");
127     	byte [] bytes = meta.getUTF8Bytes();
128     	writer.writeWarcinfoRecord(ANVLRecord.MIMETYPE, null,
129     		new ByteArrayInputStream(bytes), bytes.length);
130 	}
131 
132 	protected void writeBasicRecords(final WARCWriter writer)
133     throws IOException {
134     	ANVLRecord headerFields = new ANVLRecord();
135     	headerFields.addLabelValue("x", "y");
136     	headerFields.addLabelValue("a", "b");
137     	
138     	URI rid = null;
139     	try {
140     		rid = GeneratorFactory.getFactory().
141     			getQualifiedRecordID(TYPE, METADATA);
142     	} catch (URISyntaxException e) {
143     		// Convert to IOE so can let it out.
144     		throw new IOException(e.getMessage());
145     	}
146     	final String content = "Any old content.";
147     	for (int i = 0; i < 10; i++) {
148     		String body = i + ". " + content;
149     		byte [] bodyBytes = body.getBytes(UTF8Bytes.UTF8);
150     		writer.writeRecord(METADATA, "http://www.archive.org/",
151     			ArchiveUtils.get14DigitDate(), "no/type",
152     			rid, headerFields, new ByteArrayInputStream(bodyBytes),
153     			(long)bodyBytes.length, true);
154     	}
155     }
156 
157     /***
158      * @return Generic HTML Content.
159      */
160     protected static String getContent() {
161         return getContent(null);
162     }
163     
164     /***
165      * @return Generic HTML Content with mention of passed <code>indexStr</code>
166      * in title and body.
167      */
168     protected static String getContent(String indexStr) {
169         String page = (indexStr != null)? "Page #" + indexStr: "Some Page";
170         return "HTTP/1.1 200 OK\r\n" +
171         "Content-Type: text/html\r\n\r\n" +
172         "<html><head><title>" + page +
173         "</title></head>" +
174         "<body>" + page +
175         "</body></html>";
176     }
177 
178     /***
179      * Write random HTML Record.
180      * @param w Where to write.
181      * @param index An index to put into content.
182      * @return Length of record written.
183      * @throws IOException
184      */
185     protected int writeRandomHTTPRecord(WARCWriter w, int index)
186     throws IOException {
187         ByteArrayOutputStream baos = new ByteArrayOutputStream();
188         String indexStr = Integer.toString(index);
189         byte[] record = (getContent(indexStr)).getBytes();
190         int recordLength = record.length;
191         baos.write(record);
192         // Add named fields for ip, checksum, and relate the metadata
193         // and request to the resource field.
194         ANVLRecord r = new ANVLRecord(1);
195         r.addLabelValue(NAMED_FIELD_IP_LABEL, "127.0.0.1");
196         w.writeResourceRecord(
197             "http://www.one.net/id=" + indexStr,
198             ArchiveUtils.get14DigitDate(),
199             "text/html; charset=UTF-8",
200             r,
201             new ByteArrayInputStream(baos.toByteArray()),
202             recordLength);
203         return recordLength;
204     }
205 
206     /***
207      * Fill a WARC with HTML Records.
208      * @param baseName WARC basename.
209      * @param compress Whether to compress or not.
210      * @param maxSize Maximum WARC size.
211      * @param recordCount How many records.
212      * @return The written file.
213      * @throws IOException
214      */
215     private File writeRecords(String baseName, boolean compress,
216         int maxSize, int recordCount)
217     throws IOException {
218         cleanUpOldFiles(baseName);
219         File [] files = {getTmpDir()};
220         WARCWriter w = new WARCWriter(SERIAL_NO,
221             Arrays.asList(files), baseName + '-' + PREFIX, "", compress,
222             maxSize, null);
223         assertNotNull(w);
224         for (int i = 0; i < recordCount; i++) {
225             writeRandomHTTPRecord(w, i);
226         }
227         w.close();
228         assertTrue("Doesn't exist: " +  w.getFile().getAbsolutePath(), 
229             w.getFile().exists());
230         return w.getFile();
231     }
232 
233     /***
234      * Run validation of passed file.
235      * @param f File to validate.
236      * @param recordCount Expected count of records.
237      * @throws FileNotFoundException
238      * @throws IOException
239      */
240     private void validate(File f, int recordCount)
241     throws FileNotFoundException, IOException {
242         WARCReader reader = WARCReaderFactory.get(f);
243         assertNotNull(reader);
244         List headers = null;
245         if (recordCount == -1) {
246             headers = reader.validate();
247         } else {
248             headers = reader.validate(recordCount);
249         }
250         reader.close();
251         
252         // Now, run through each of the records doing absolute get going from
253         // the end to start.  Reopen the arc so no context between this test
254         // and the previous.
255         reader = WARCReaderFactory.get(f);
256         for (int i = headers.size() - 1; i >= 0; i--) {
257             ArchiveRecordHeader h = (ArchiveRecordHeader)headers.get(i);
258             ArchiveRecord r = reader.get(h.getOffset());
259             String mimeType = r.getHeader().getMimetype();
260             assertTrue("Record is bogus, bad mimetype "+mimeType,
261                 mimeType != null && mimeType.length() > 0);
262         }
263         reader.close();
264         
265         assertTrue("Metadatas not equal", headers.size() == recordCount);
266         for (Iterator i = headers.iterator(); i.hasNext();) {
267             ArchiveRecordHeader r = (ArchiveRecordHeader)i.next();
268             assertTrue("Record is empty", r.getLength() > 0);
269         }
270     }
271 
272     public void testWriteRecords() throws IOException {
273         final int recordCount = 2;
274         File f = writeRecords("writeRecord", false, DEFAULT_MAX_WARC_FILE_SIZE,
275             recordCount);
276      	validate(f, recordCount  + 1); // Header record.
277     }
278 
279     public void testRandomAccess() throws IOException {
280         final int recordCount = 3;
281         File f = writeRecords("writeRecord", true, DEFAULT_MAX_WARC_FILE_SIZE,
282             recordCount);
283         WARCReader reader = WARCReaderFactory.get(f);
284         // Get to second record.  Get its offset for later use.
285         boolean readFirst = false;
286         String url = null;
287         long offset = -1;
288         long totalRecords = 0;
289         boolean readSecond = false;
290         for (final Iterator i = reader.iterator(); i.hasNext();
291                 totalRecords++) {
292             WARCRecord ar = (WARCRecord)i.next();
293             if (!readFirst) {
294                 readFirst = true;
295                 continue;
296             }
297             if (!readSecond) {
298                 url = ar.getHeader().getUrl();
299                 offset = ar.getHeader().getOffset();
300                 readSecond = true;
301             }
302         }
303         
304         reader = WARCReaderFactory.get(f, offset);
305         ArchiveRecord ar = reader.get();
306         assertEquals(ar.getHeader().getUrl(), url);
307         ar.close();
308         
309         // Get reader again.  See how iterator works with offset
310         reader = WARCReaderFactory.get(f, offset);
311         int count = 0;
312         for (final Iterator i = reader.iterator(); i.hasNext(); i.next()) {
313             count++;
314         }
315         reader.close();
316         assertEquals(totalRecords - 1, count);
317     }
318     
319     public void testWriteRecordCompressed() throws IOException {
320         final int recordCount = 2;
321         File arcFile = writeRecords("writeRecordCompressed", true,
322             DEFAULT_MAX_WARC_FILE_SIZE, recordCount);
323         validate(arcFile, recordCount + 1 /*Header record*/);
324     }
325     
326     protected WARCWriter createWARCWriter(String NAME,
327             boolean compress) {
328         File [] files = {getTmpDir()};
329         return new WARCWriter(SERIAL_NO,
330         	Arrays.asList(files), NAME, "",
331             compress, DEFAULT_MAX_WARC_FILE_SIZE, null);
332     }
333     
334     protected static ByteArrayOutputStream getBaos(String str)
335     throws IOException {
336         ByteArrayOutputStream baos = new ByteArrayOutputStream();
337         baos.write(str.getBytes());
338         return baos;
339     }
340     
341     protected static void writeRecord(WARCWriter w, String url,
342         String mimetype, int len, ByteArrayOutputStream baos)
343     throws IOException {
344         w.writeResourceRecord(url,
345             ArchiveUtils.get14DigitDate(),
346             mimetype,
347             null,
348             new ByteArrayInputStream(baos.toByteArray()),
349             len);
350     }
351     
352     protected int iterateRecords(WARCReader r)
353     throws IOException {
354         int count = 0;
355         for (Iterator<ArchiveRecord> i = r.iterator(); i.hasNext();) {
356             ArchiveRecord ar = i.next();
357             ar.close();
358             if (count != 0) {
359                 assertTrue("Unexpected URL " + ar.getHeader().getUrl(),
360                     ar.getHeader().getUrl().equals(SOME_URL));
361             }
362             count++;
363         }
364         return count;
365     }
366     
367     protected WARCWriter createWithOneRecord(String name,
368         boolean compressed)
369     throws IOException {
370         WARCWriter writer = createWARCWriter(name, compressed);
371         String content = getContent();
372         writeRecord(writer, SOME_URL, "text/html",
373             content.length(), getBaos(content));
374         return writer;
375     }
376     
377     public void testSpaceInURL() {
378         String eMessage = null;
379         try {
380             holeyUrl("testSpaceInURL-" + PREFIX, false, " ");
381         } catch (IOException e) {
382             eMessage = e.getMessage();
383         }
384         assertTrue("Didn't get expected exception: " + eMessage,
385             eMessage.startsWith("Contains disallowed"));
386     }
387 
388     public void testTabInURL() {
389         String eMessage = null;
390         try {
391             holeyUrl("testTabInURL-" + PREFIX, false, "\t");
392         } catch (IOException e) {
393             eMessage = e.getMessage();
394         }
395         assertTrue("Didn't get expected exception: " + eMessage,
396             eMessage.startsWith("Contains illegal"));
397     }
398     
399     protected void holeyUrl(String name, boolean compress, String urlInsert)
400     throws IOException {
401         WARCWriter writer = createWithOneRecord(name, compress);
402         // Add some bytes on the end to mess up the record.
403         String content = getContent();
404         ByteArrayOutputStream baos = getBaos(content);
405         writeRecord(writer, SOME_URL + urlInsert + "/index.html", "text/html",
406             content.length(), baos);
407         writer.close();
408     }
409     
410     /***
411      * Write an arc file for other tests to use.
412      * @param arcdir Directory to write to.
413      * @param compress True if file should be compressed.
414      * @return ARC written.
415      * @throws IOException 
416      */
417     public static File createWARCFile(File arcdir, boolean compress)
418     throws IOException {
419         File [] files = {arcdir};
420         WARCWriter writer =
421             new WARCWriter(SERIAL_NO, Arrays.asList(files),
422             "test", "", compress, DEFAULT_MAX_WARC_FILE_SIZE, null);
423         String content = getContent();
424         writeRecord(writer, SOME_URL, "text/html", content.length(),
425             getBaos(content));
426         writer.close();
427         return writer.getFile();
428     }
429     
430 //    public void testSpeed() throws IOException {
431 //        ARCWriter writer = createArcWithOneRecord("speed", true);
432 //        // Add a record with a length that is too long.
433 //        String content = getContent();
434 //        final int count = 100000;
435 //        logger.info("Starting speed write of " + count + " records.");
436 //        for (int i = 0; i < count; i++) {
437 //            writeRecord(writer, SOME_URL, "text/html", content.length(),
438 //                    getBaos(content));
439 //        }
440 //        writer.close();
441 //        logger.info("Finished speed write test.");
442 //    }
443     
444     public void testArcRecordOffsetReads() throws Exception {
445     	// Get an ARC with one record.
446 		WriterPoolMember w =
447 			createWithOneRecord("testArcRecordInBufferStream", true);
448 		w.close();
449 		// Get reader on said ARC.
450 		WARCReader r = WARCReaderFactory.get(w.getFile());
451 		final Iterator<ArchiveRecord> i = r.iterator();
452 		// Skip first ARC meta record.
453 		ArchiveRecord ar = i.next();
454 		i.hasNext();
455 		// Now we're at first and only record in ARC.
456 		ar = (WARCRecord) i.next();
457 		// Now try getting some random set of bytes out of it 
458 		// at an odd offset (used to fail because we were
459 		// doing bad math to find where in buffer to read).
460 		final byte[] buffer = new byte[17];
461 		final int maxRead = 4;
462 		int totalRead = 0;
463 		while (totalRead < maxRead) {
464 			totalRead = totalRead
465 			    + ar.read(buffer, 13 + totalRead, maxRead - totalRead);
466 			assertTrue(totalRead > 0);
467 		}
468 	}
469 }