1   /* ARCWriterTest
2    *
3    * $Id: ARCWriterTest.java 5478 2007-09-19 01:37:07Z gojomo $
4    *
5    * Created on Dec 31, 2003.
6    *
7    * Copyright (C) 2003 Internet Archive.
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.io.arc;
26  
27  import java.io.ByteArrayInputStream;
28  import java.io.ByteArrayOutputStream;
29  import java.io.File;
30  import java.io.FileNotFoundException;
31  import java.io.IOException;
32  import java.io.OutputStream;
33  import java.io.PrintStream;
34  import java.util.Arrays;
35  import java.util.Date;
36  import java.util.Iterator;
37  import java.util.List;
38  import java.util.concurrent.atomic.AtomicInteger;
39  
40  import org.apache.commons.io.IOUtils;
41  import org.apache.commons.io.input.NullInputStream;
42  import org.apache.commons.io.output.NullOutputStream;
43  import org.archive.io.ArchiveRecord;
44  import org.archive.io.ReplayInputStream;
45  import org.archive.io.WriterPoolMember;
46  import org.archive.util.ArchiveUtils;
47  import org.archive.util.FileUtils;
48  import org.archive.util.TmpDirTestCase;
49  
50  
51  /***
52   * Test ARCWriter class.
53   *
54   * This code exercises ARCWriter AND ARCReader.  First it writes ARCs w/
55   * ARCWriter.  Then it validates what was written w/ ARCReader.
56   *
57   * @author stack
58   */
59  public class ARCWriterTest
60  extends TmpDirTestCase implements ARCConstants {
61      /*** Utility class for writing bad ARCs (with trailing junk)
62        */
63      public class CorruptibleARCWriter extends ARCWriter {
64          byte[] endJunk = null;
65          public CorruptibleARCWriter(AtomicInteger serial_no, List<File> name, String name2, boolean compress, long default_max_arc_file_size) {
66              super(serial_no,name,name2,compress,default_max_arc_file_size);
67          }    
68          @Override
69          protected void postWriteRecordTasks() throws IOException {
70              if(endJunk!=null) {
71                  this.write(endJunk);
72              }
73              super.postWriteRecordTasks();
74          }
75          public void setEndJunk(byte[] b) throws IOException {
76              this.endJunk = b;
77          }
78      }
79  
    /**
     * Suffix appended (after a test-specific base name and a dash) to the
     * names of ARC files made by JUnit.
     */
83      private static final String SUFFIX =
84          /* TODO DEFAULT_ARC_FILE_PREFIX*/ "JUNIT";
85      
86      private static final String SOME_URL = "http://www.archive.org/test/";
87  
88      
89      private static final AtomicInteger SERIAL_NO = new AtomicInteger();
90  
91      /*
92       * @see TestCase#setUp()
93       */
94      protected void setUp() throws Exception {
95          super.setUp();
96      }
97  
98      /*
99       * @see TestCase#tearDown()
100      */
101     protected void tearDown() throws Exception {
102         super.tearDown();
103     }
104     
105     protected static String getContent() {
106         return getContent(null);
107     }
108     
109     protected static String getContent(String indexStr) {
110         String page = (indexStr != null)? "Page #" + indexStr: "Some Page";
111         return "HTTP/1.1 200 OK\r\n" +
112         "Content-Type: text/html\r\n\r\n" +
113         "<html><head><title>" + page +
114         "</title></head>" +
115         "<body>" + page +
116         "</body></html>";
117     }
118 
119     protected int writeRandomHTTPRecord(ARCWriter arcWriter, int index)
120     throws IOException {
121         String indexStr = Integer.toString(index);
122         ByteArrayOutputStream baos = new ByteArrayOutputStream();
123         // Start the record with an arbitrary 14-digit date per RFC2540
124         String now = ArchiveUtils.get14DigitDate();
125         int recordLength = 0;
126         byte[] record = (getContent(indexStr)).getBytes();
127         recordLength += record.length;
128         baos.write(record);
129         // Add the newline between records back in
130         baos.write("\n".getBytes());
131         recordLength += 1;
132         arcWriter.write("http://www.one.net/id=" + indexStr, "text/html",
133             "0.1.2.3", Long.parseLong(now), recordLength, baos);
134         return recordLength;
135     }
136 
137     private File writeRecords(String baseName, boolean compress,
138         long maxSize, int recordCount)
139     throws IOException {
140         cleanUpOldFiles(baseName);
141         File [] files = {getTmpDir()};
142         ARCWriter arcWriter = new ARCWriter(SERIAL_NO, Arrays.asList(files),
143             baseName + '-' + SUFFIX, compress, maxSize);
144         assertNotNull(arcWriter);
145         for (int i = 0; i < recordCount; i++) {
146             writeRandomHTTPRecord(arcWriter, i);
147         }
148         arcWriter.close();
149         assertTrue("Doesn't exist: " +
150                 arcWriter.getFile().getAbsolutePath(), 
151             arcWriter.getFile().exists());
152         return arcWriter.getFile();
153     }
154 
155     private void validate(File arcFile, int recordCount)
156     throws FileNotFoundException, IOException {
157         ARCReader reader = ARCReaderFactory.get(arcFile);
158         assertNotNull(reader);
159         List metaDatas = null;
160         if (recordCount == -1) {
161             metaDatas = reader.validate();
162         } else {
163             metaDatas = reader.validate(recordCount);
164         }
165         reader.close();
166         // Now, run through each of the records doing absolute get going from
167         // the end to start.  Reopen the arc so no context between this test
168         // and the previous.
169         reader = ARCReaderFactory.get(arcFile);
170         for (int i = metaDatas.size() - 1; i >= 0; i--) {
171             ARCRecordMetaData meta = (ARCRecordMetaData)metaDatas.get(i);
172             ArchiveRecord r = reader.get(meta.getOffset());
173             String mimeType = r.getHeader().getMimetype();
174             assertTrue("Record is bogus",
175                 mimeType != null && mimeType.length() > 0);
176         }
177         reader.close();
178         assertTrue("Metadatas not equal", metaDatas.size() == recordCount);
179         for (Iterator i = metaDatas.iterator(); i.hasNext();) {
180                 ARCRecordMetaData r = (ARCRecordMetaData)i.next();
181                 assertTrue("Record is empty", r.getLength() > 0);
182         }
183     }
184 
185     public void testCheckARCFileSize()
186     throws IOException {
187         runCheckARCFileSizeTest("checkARCFileSize", false);
188     }
189 
190     public void testCheckARCFileSizeCompressed()
191     throws IOException {
192         runCheckARCFileSizeTest("checkARCFileSize", true);
193     }
194 
195     public void testWriteRecord() throws IOException {
196         final int recordCount = 2;
197         File arcFile = writeRecords("writeRecord", false,
198                 DEFAULT_MAX_ARC_FILE_SIZE, recordCount);
199         validate(arcFile, recordCount  + 1); // Header record.
200     }
201     
202     public void testRandomAccess() throws IOException {
203         final int recordCount = 3;
204         File arcFile = writeRecords("writeRecord", true,
205             DEFAULT_MAX_ARC_FILE_SIZE, recordCount);
206         ARCReader reader = ARCReaderFactory.get(arcFile);
207         // Get to second record.  Get its offset for later use.
208         boolean readFirst = false;
209         String url = null;
210         long offset = -1;
211         long totalRecords = 0;
212         boolean readSecond = false;
213         for (final Iterator i = reader.iterator(); i.hasNext(); totalRecords++) {
214             ARCRecord ar = (ARCRecord)i.next();
215             if (!readFirst) {
216                 readFirst = true;
217                 continue;
218             }
219             if (!readSecond) {
220                 url = ar.getMetaData().getUrl();
221                 offset = ar.getMetaData().getOffset();
222                 readSecond = true;
223             }
224         }
225         
226         reader = ARCReaderFactory.get(arcFile, offset);
227         ArchiveRecord ar = reader.get();
228         assertEquals(ar.getHeader().getUrl(), url);
229         ar.close();
230         
231         // Get reader again.  See how iterator works with offset
232         reader = ARCReaderFactory.get(arcFile, offset);
233         int count = 0;
234         for (final Iterator i = reader.iterator(); i.hasNext(); i.next()) {
235             count++;
236         }
237         reader.close();
238         assertEquals(totalRecords - 1, count);
239     }
240 
241     public void testWriteRecordCompressed() throws IOException {
242         final int recordCount = 2;
243         File arcFile = writeRecords("writeRecordCompressed", true,
244                 DEFAULT_MAX_ARC_FILE_SIZE, recordCount);
245         validate(arcFile, recordCount + 1 /*Header record*/);
246     }
247     
248     public void testWriteGiantRecord() throws IOException {
249         File [] files = {getTmpDir()};
250         PrintStream dummyStream = new PrintStream(new NullOutputStream());
251         ARCWriter arcWriter = new ARCWriter(SERIAL_NO, dummyStream,
252                 new File("dummy"),
253                 false, null, null);
254         assertNotNull(arcWriter);
255 
256         // Start the record with an arbitrary 14-digit date per RFC2540
257         long now = System.currentTimeMillis();
258         long recordLength = org.apache.commons.io.FileUtils.ONE_GB * 3;
259        
260         arcWriter.write("dummy:uri", "application/octet-stream",
261             "0.1.2.3", now, recordLength, new NullInputStream(recordLength));
262         arcWriter.close();
263         }
264     
265     private void runCheckARCFileSizeTest(String baseName, boolean compress)
266     throws FileNotFoundException, IOException  {
267         writeRecords(baseName, compress, 1024, 15);
268         // Now validate all files just created.
269         File [] files = FileUtils.getFilesWithPrefix(getTmpDir(), SUFFIX);
270         for (int i = 0; i < files.length; i++) {
271             validate(files[i], -1);
272         }
273     }
274     
275     protected CorruptibleARCWriter createARCWriter(String NAME, boolean compress) {
276         File [] files = {getTmpDir()};
277         return new CorruptibleARCWriter(SERIAL_NO, Arrays.asList(files), NAME,
278             compress, DEFAULT_MAX_ARC_FILE_SIZE);
279     }
280     
281     protected static ByteArrayInputStream getBais(String str)
282     throws IOException {
283         return new ByteArrayInputStream(str.getBytes());
284     }
285     
286     /***
287      * Writes a record, suppressing normal length-checks (so that 
288      * intentionally malformed records may be written). 
289      */
290     protected static void writeRecord(ARCWriter writer, String url,
291         String type, int len, ByteArrayInputStream bais)
292     throws IOException {
293         writer.write(url, type, "192.168.1.1", (new Date()).getTime(), len,
294             bais, false);
295     }
296     
297     protected int iterateRecords(ARCReader r)
298     throws IOException {
299         int count = 0;
300         for (Iterator i = r.iterator(); i.hasNext();) {
301             ARCRecord rec = (ARCRecord)i.next();
302             rec.close();
303             if (count != 0) {
304                 assertTrue("Unexpected URL " + rec.getMetaData().getUrl(),
305                     rec.getMetaData().getUrl().equals(SOME_URL));
306             }
307             count++;
308         }
309         return count;
310     }
311     
312     protected CorruptibleARCWriter createArcWithOneRecord(String name,
313         boolean compressed)
314     throws IOException {
315     	CorruptibleARCWriter writer = createARCWriter(name, compressed);
316         String content = getContent();
317         writeRecord(writer, SOME_URL, "text/html",
318             content.length(), getBais(content));
319         return writer;
320     }
321     
322     public void testSpaceInURL() {
323         String eMessage = null;
324         try {
325             holeyUrl("testSpaceInURL-" + SUFFIX, false, " ");
326         } catch (IOException e) {
327             eMessage = e.getMessage();
328         }
329         assertTrue("Didn't get expected exception: " + eMessage,
330             eMessage.startsWith("Metadata line doesn't match"));
331     }
332 
333     public void testTabInURL() {        
334         String eMessage = null;
335         try {
336             holeyUrl("testTabInURL-" + SUFFIX, false, "\t");
337         } catch (IOException e) {
338             eMessage = e.getMessage();
339         }
340         assertTrue("Didn't get expected exception: " + eMessage,
341             eMessage.startsWith("Metadata line doesn't match"));
342     }
343     
344     protected void holeyUrl(String name, boolean compress, String urlInsert)
345     throws IOException {
346     	ARCWriter writer = createArcWithOneRecord(name, compress);
347         // Add some bytes on the end to mess up the record.
348         String content = getContent();
349         writeRecord(writer, SOME_URL + urlInsert + "/index.html", "text/html",
350             content.length(), getBais(content));
351         writer.close();
352     }
353     
354 // If uncompressed, length has to be right or parse will fail.
355 //
356 //    public void testLengthTooShort() throws IOException {
357 //        lengthTooShort("testLengthTooShort-" + PREFIX, false);
358 //    }
359     
360     public void testLengthTooShortCompressed() throws IOException {
361         lengthTooShort("testLengthTooShortCompressed-" + SUFFIX, true, false);
362     }
363     
364     public void testLengthTooShortCompressedStrict()
365     throws IOException {      
366         String eMessage = null;
367         try {
368             lengthTooShort("testLengthTooShortCompressedStrict-" + SUFFIX,
369                 true, true);
370         } catch (RuntimeException e) {
371             eMessage = e.getMessage();
372         }
373         assertTrue("Didn't get expected exception: " + eMessage,
374             eMessage.startsWith("java.io.IOException: Record ENDING at"));
375     }
376      
377     protected void lengthTooShort(String name, boolean compress, boolean strict)
378     throws IOException {
379     	CorruptibleARCWriter writer = createArcWithOneRecord(name, compress);
380         // Add some bytes on the end to mess up the record.
381         String content = getContent();
382         ByteArrayInputStream bais = getBais(content+"SOME TRAILING BYTES");
383         writeRecord(writer, SOME_URL, "text/html",
384             content.length(), bais);
385         writer.setEndJunk("SOME TRAILING BYTES".getBytes());
386         writeRecord(writer, SOME_URL, "text/html",
387             content.length(), getBais(content));
388         writer.close();
389         
390         // Catch System.err into a byte stream.
391         ByteArrayOutputStream os = new ByteArrayOutputStream();
392         System.setErr(new PrintStream(os));
393         
394         ARCReader r = ARCReaderFactory.get(writer.getFile());
395         r.setStrict(strict);
396         int count = iterateRecords(r);
397         assertTrue("Count wrong " + count, count == 4);
398 
399         // Make sure we get the warning string which complains about the
400         // trailing bytes.
401         String err = os.toString();
402         assertTrue("No message " + err, err.startsWith("WARNING") &&
403             (err.indexOf("Record ENDING at") > 0));
404     }
405     
406 //  If uncompressed, length has to be right or parse will fail.
407 //
408 //    public void testLengthTooLong()
409 //    throws IOException {
410 //        lengthTooLong("testLengthTooLongCompressed-" + PREFIX,
411 //            false, false);
412 //    }
413     
414     public void testLengthTooLongCompressed()
415     throws IOException {
416         lengthTooLong("testLengthTooLongCompressed-" + SUFFIX,
417             true, false);
418     }
419     
420     public void testLengthTooLongCompressedStrict() {
421         String eMessage = null;
422         try {
423             lengthTooLong("testLengthTooLongCompressed-" + SUFFIX,
424                 true, true);
425         } catch (IOException e) {
426             eMessage = e.getMessage();
427         }
428         assertTrue("Didn't get expected exception: " + eMessage,
429             eMessage.startsWith("Premature EOF before end-of-record"));
430     }
431     
432     protected void lengthTooLong(String name, boolean compress,
433             boolean strict)
434     throws IOException {
435     	ARCWriter writer = createArcWithOneRecord(name, compress);
436         // Add a record with a length that is too long.
437         String content = getContent();
438         writeRecord(writer, SOME_URL, "text/html",
439             content.length() + 10, getBais(content));
440         writeRecord(writer, SOME_URL, "text/html",
441             content.length(), getBais(content));
442         writer.close();
443         
444         // Catch System.err.
445         ByteArrayOutputStream os = new ByteArrayOutputStream();
446         System.setErr(new PrintStream(os));
447         
448         ARCReader r = ARCReaderFactory.get(writer.getFile());
449         r.setStrict(strict);
450         int count = iterateRecords(r);
451         assertTrue("Count wrong " + count, count == 4);
452         
453         // Make sure we get the warning string which complains about the
454         // trailing bytes.
455         String err = os.toString();
456         assertTrue("No message " + err, 
457             err.startsWith("WARNING Premature EOF before end-of-record"));
458     }
459     
460     public void testGapError() throws IOException {
461     	ARCWriter writer = createArcWithOneRecord("testGapError", true);
462         String content = getContent();
463         // Make a 'weird' RIS that returns bad 'remaining' length
464         // awhen remaining should be 0
465         ReplayInputStream ris = new ReplayInputStream(content.getBytes(),
466                 content.length(), null) {
467             public long remaining() {
468                 return (super.remaining()==0) ? -1 : super.remaining();
469             }
470         };
471         String message = null;
472         try {
473         writer.write(SOME_URL, "text/html", "192.168.1.1",
474             (new Date()).getTime(), content.length(), ris);
475         } catch (IOException e) {
476             message = e.getMessage();
477         } finally {
478             IOUtils.closeQuietly(ris);
479         }
480         writer.close();
481         assertTrue("No gap when should be",
482             message != null &&
483             message.indexOf("Gap between expected and actual") >= 0);
484     }
485     
486     /***
487      * Write an arc file for other tests to use.
488      * @param arcdir Directory to write to.
489      * @param compress True if file should be compressed.
490      * @return ARC written.
491      * @throws IOException 
492      */
493     public static File createARCFile(File arcdir, boolean compress)
494     throws IOException {
495         File [] files = {arcdir};
496         ARCWriter writer = new ARCWriter(SERIAL_NO, Arrays.asList(files),
497             "test", compress, DEFAULT_MAX_ARC_FILE_SIZE);
498         String content = getContent();
499         writeRecord(writer, SOME_URL, "text/html", content.length(),
500             getBais(content));
501         writer.close();
502         return writer.getFile();
503     }
504     
505 //    public void testSpeed() throws IOException {
506 //        ARCWriter writer = createArcWithOneRecord("speed", true);
507 //        // Add a record with a length that is too long.
508 //        String content = getContent();
509 //        final int count = 100000;
510 //        logger.info("Starting speed write of " + count + " records.");
511 //        for (int i = 0; i < count; i++) {
512 //            writeRecord(writer, SOME_URL, "text/html", content.length(),
513 //                    getBaos(content));
514 //        }
515 //        writer.close();
516 //        logger.info("Finished speed write test.");
517 //    }
518     
519     
520     public void testValidateMetaLine() throws Exception {
521         final String line = "http://www.aandw.net/images/walden2.png " +
522             "128.197.34.86 20060111174224 image/png 2160";
523         ARCWriter w = createARCWriter("testValidateMetaLine", true);
524         try {
525             w.validateMetaLine(line);
526             w.validateMetaLine(line + LINE_SEPARATOR);
527             w.validateMetaLine(line + "//r//n");
528         } finally {
529             w.close();
530         }
531     }
532     
533     public void testArcRecordOffsetReads() throws Exception {
534     	// Get an ARC with one record.
535 		WriterPoolMember w =
536 			createArcWithOneRecord("testArcRecordInBufferStream", true);
537 		w.close();
538 		// Get reader on said ARC.
539 		ARCReader r = ARCReaderFactory.get(w.getFile());
540 		final Iterator i = r.iterator();
541 		// Skip first ARC meta record.
542 		ARCRecord ar = (ARCRecord) i.next();
543 		i.hasNext();
544 		// Now we're at first and only record in ARC.
545 		ar = (ARCRecord) i.next();
546 		// Now try getting some random set of bytes out of it 
547 		// at an odd offset (used to fail because we were
548 		// doing bad math to find where in buffer to read).
549 		final byte[] buffer = new byte[17];
550 		final int maxRead = 4;
551 		int totalRead = 0;
552 		while (totalRead < maxRead) {
553 			totalRead = totalRead
554 			    + ar.read(buffer, 13 + totalRead, maxRead - totalRead);
555 			assertTrue(totalRead > 0);
556 		}
557 	}
558 }