1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.io.arc;
26
27 import java.io.ByteArrayInputStream;
28 import java.io.ByteArrayOutputStream;
29 import java.io.File;
30 import java.io.FileNotFoundException;
31 import java.io.IOException;
32 import java.io.OutputStream;
33 import java.io.PrintStream;
34 import java.util.Arrays;
35 import java.util.Date;
36 import java.util.Iterator;
37 import java.util.List;
38 import java.util.concurrent.atomic.AtomicInteger;
39
40 import org.apache.commons.io.IOUtils;
41 import org.apache.commons.io.input.NullInputStream;
42 import org.apache.commons.io.output.NullOutputStream;
43 import org.archive.io.ArchiveRecord;
44 import org.archive.io.ReplayInputStream;
45 import org.archive.io.WriterPoolMember;
46 import org.archive.util.ArchiveUtils;
47 import org.archive.util.FileUtils;
48 import org.archive.util.TmpDirTestCase;
49
50
/**
 * Test ARCWriter class.
 *
 * This code exercises both ARCWriter and ARCReader. First it writes ARCs
 * with ARCWriter, then it validates what was written with ARCReader.
 *
 * @author stack
 */
59 public class ARCWriterTest
60 extends TmpDirTestCase implements ARCConstants {
61 /*** Utility class for writing bad ARCs (with trailing junk)
62 */
63 public class CorruptibleARCWriter extends ARCWriter {
64 byte[] endJunk = null;
65 public CorruptibleARCWriter(AtomicInteger serial_no, List<File> name, String name2, boolean compress, long default_max_arc_file_size) {
66 super(serial_no,name,name2,compress,default_max_arc_file_size);
67 }
68 @Override
69 protected void postWriteRecordTasks() throws IOException {
70 if(endJunk!=null) {
71 this.write(endJunk);
72 }
73 super.postWriteRecordTasks();
74 }
75 public void setEndJunk(byte[] b) throws IOException {
76 this.endJunk = b;
77 }
78 }
79
80 /***
81 * Prefix to use for ARC files made by JUNIT.
82 */
83 private static final String SUFFIX =
84
85
86 private static final String SOME_URL = "http://www.archive.org/test/";
87
88
89 private static final AtomicInteger SERIAL_NO = new AtomicInteger();
90
91
92
93
94 protected void setUp() throws Exception {
95 super.setUp();
96 }
97
98
99
100
101 protected void tearDown() throws Exception {
102 super.tearDown();
103 }
104
105 protected static String getContent() {
106 return getContent(null);
107 }
108
109 protected static String getContent(String indexStr) {
110 String page = (indexStr != null)? "Page #" + indexStr: "Some Page";
111 return "HTTP/1.1 200 OK\r\n" +
112 "Content-Type: text/html\r\n\r\n" +
113 "<html><head><title>" + page +
114 "</title></head>" +
115 "<body>" + page +
116 "</body></html>";
117 }
118
119 protected int writeRandomHTTPRecord(ARCWriter arcWriter, int index)
120 throws IOException {
121 String indexStr = Integer.toString(index);
122 ByteArrayOutputStream baos = new ByteArrayOutputStream();
123
124 String now = ArchiveUtils.get14DigitDate();
125 int recordLength = 0;
126 byte[] record = (getContent(indexStr)).getBytes();
127 recordLength += record.length;
128 baos.write(record);
129
130 baos.write("\n".getBytes());
131 recordLength += 1;
132 arcWriter.write("http://www.one.net/id=" + indexStr, "text/html",
133 "0.1.2.3", Long.parseLong(now), recordLength, baos);
134 return recordLength;
135 }
136
137 private File writeRecords(String baseName, boolean compress,
138 long maxSize, int recordCount)
139 throws IOException {
140 cleanUpOldFiles(baseName);
141 File [] files = {getTmpDir()};
142 ARCWriter arcWriter = new ARCWriter(SERIAL_NO, Arrays.asList(files),
143 baseName + '-' + SUFFIX, compress, maxSize);
144 assertNotNull(arcWriter);
145 for (int i = 0; i < recordCount; i++) {
146 writeRandomHTTPRecord(arcWriter, i);
147 }
148 arcWriter.close();
149 assertTrue("Doesn't exist: " +
150 arcWriter.getFile().getAbsolutePath(),
151 arcWriter.getFile().exists());
152 return arcWriter.getFile();
153 }
154
155 private void validate(File arcFile, int recordCount)
156 throws FileNotFoundException, IOException {
157 ARCReader reader = ARCReaderFactory.get(arcFile);
158 assertNotNull(reader);
159 List metaDatas = null;
160 if (recordCount == -1) {
161 metaDatas = reader.validate();
162 } else {
163 metaDatas = reader.validate(recordCount);
164 }
165 reader.close();
166
167
168
169 reader = ARCReaderFactory.get(arcFile);
170 for (int i = metaDatas.size() - 1; i >= 0; i--) {
171 ARCRecordMetaData meta = (ARCRecordMetaData)metaDatas.get(i);
172 ArchiveRecord r = reader.get(meta.getOffset());
173 String mimeType = r.getHeader().getMimetype();
174 assertTrue("Record is bogus",
175 mimeType != null && mimeType.length() > 0);
176 }
177 reader.close();
178 assertTrue("Metadatas not equal", metaDatas.size() == recordCount);
179 for (Iterator i = metaDatas.iterator(); i.hasNext();) {
180 ARCRecordMetaData r = (ARCRecordMetaData)i.next();
181 assertTrue("Record is empty", r.getLength() > 0);
182 }
183 }
184
185 public void testCheckARCFileSize()
186 throws IOException {
187 runCheckARCFileSizeTest("checkARCFileSize", false);
188 }
189
190 public void testCheckARCFileSizeCompressed()
191 throws IOException {
192 runCheckARCFileSizeTest("checkARCFileSize", true);
193 }
194
195 public void testWriteRecord() throws IOException {
196 final int recordCount = 2;
197 File arcFile = writeRecords("writeRecord", false,
198 DEFAULT_MAX_ARC_FILE_SIZE, recordCount);
199 validate(arcFile, recordCount + 1);
200 }
201
202 public void testRandomAccess() throws IOException {
203 final int recordCount = 3;
204 File arcFile = writeRecords("writeRecord", true,
205 DEFAULT_MAX_ARC_FILE_SIZE, recordCount);
206 ARCReader reader = ARCReaderFactory.get(arcFile);
207
208 boolean readFirst = false;
209 String url = null;
210 long offset = -1;
211 long totalRecords = 0;
212 boolean readSecond = false;
213 for (final Iterator i = reader.iterator(); i.hasNext(); totalRecords++) {
214 ARCRecord ar = (ARCRecord)i.next();
215 if (!readFirst) {
216 readFirst = true;
217 continue;
218 }
219 if (!readSecond) {
220 url = ar.getMetaData().getUrl();
221 offset = ar.getMetaData().getOffset();
222 readSecond = true;
223 }
224 }
225
226 reader = ARCReaderFactory.get(arcFile, offset);
227 ArchiveRecord ar = reader.get();
228 assertEquals(ar.getHeader().getUrl(), url);
229 ar.close();
230
231
232 reader = ARCReaderFactory.get(arcFile, offset);
233 int count = 0;
234 for (final Iterator i = reader.iterator(); i.hasNext(); i.next()) {
235 count++;
236 }
237 reader.close();
238 assertEquals(totalRecords - 1, count);
239 }
240
241 public void testWriteRecordCompressed() throws IOException {
242 final int recordCount = 2;
243 File arcFile = writeRecords("writeRecordCompressed", true,
244 DEFAULT_MAX_ARC_FILE_SIZE, recordCount);
245 validate(arcFile, recordCount + 1
246 }
247
248 public void testWriteGiantRecord() throws IOException {
249 File [] files = {getTmpDir()};
250 PrintStream dummyStream = new PrintStream(new NullOutputStream());
251 ARCWriter arcWriter = new ARCWriter(SERIAL_NO, dummyStream,
252 new File("dummy"),
253 false, null, null);
254 assertNotNull(arcWriter);
255
256
257 long now = System.currentTimeMillis();
258 long recordLength = org.apache.commons.io.FileUtils.ONE_GB * 3;
259
260 arcWriter.write("dummy:uri", "application/octet-stream",
261 "0.1.2.3", now, recordLength, new NullInputStream(recordLength));
262 arcWriter.close();
263 }
264
265 private void runCheckARCFileSizeTest(String baseName, boolean compress)
266 throws FileNotFoundException, IOException {
267 writeRecords(baseName, compress, 1024, 15);
268
269 File [] files = FileUtils.getFilesWithPrefix(getTmpDir(), SUFFIX);
270 for (int i = 0; i < files.length; i++) {
271 validate(files[i], -1);
272 }
273 }
274
275 protected CorruptibleARCWriter createARCWriter(String NAME, boolean compress) {
276 File [] files = {getTmpDir()};
277 return new CorruptibleARCWriter(SERIAL_NO, Arrays.asList(files), NAME,
278 compress, DEFAULT_MAX_ARC_FILE_SIZE);
279 }
280
281 protected static ByteArrayInputStream getBais(String str)
282 throws IOException {
283 return new ByteArrayInputStream(str.getBytes());
284 }
285
286 /***
287 * Writes a record, suppressing normal length-checks (so that
288 * intentionally malformed records may be written).
289 */
290 protected static void writeRecord(ARCWriter writer, String url,
291 String type, int len, ByteArrayInputStream bais)
292 throws IOException {
293 writer.write(url, type, "192.168.1.1", (new Date()).getTime(), len,
294 bais, false);
295 }
296
297 protected int iterateRecords(ARCReader r)
298 throws IOException {
299 int count = 0;
300 for (Iterator i = r.iterator(); i.hasNext();) {
301 ARCRecord rec = (ARCRecord)i.next();
302 rec.close();
303 if (count != 0) {
304 assertTrue("Unexpected URL " + rec.getMetaData().getUrl(),
305 rec.getMetaData().getUrl().equals(SOME_URL));
306 }
307 count++;
308 }
309 return count;
310 }
311
312 protected CorruptibleARCWriter createArcWithOneRecord(String name,
313 boolean compressed)
314 throws IOException {
315 CorruptibleARCWriter writer = createARCWriter(name, compressed);
316 String content = getContent();
317 writeRecord(writer, SOME_URL, "text/html",
318 content.length(), getBais(content));
319 return writer;
320 }
321
322 public void testSpaceInURL() {
323 String eMessage = null;
324 try {
325 holeyUrl("testSpaceInURL-" + SUFFIX, false, " ");
326 } catch (IOException e) {
327 eMessage = e.getMessage();
328 }
329 assertTrue("Didn't get expected exception: " + eMessage,
330 eMessage.startsWith("Metadata line doesn't match"));
331 }
332
333 public void testTabInURL() {
334 String eMessage = null;
335 try {
336 holeyUrl("testTabInURL-" + SUFFIX, false, "\t");
337 } catch (IOException e) {
338 eMessage = e.getMessage();
339 }
340 assertTrue("Didn't get expected exception: " + eMessage,
341 eMessage.startsWith("Metadata line doesn't match"));
342 }
343
344 protected void holeyUrl(String name, boolean compress, String urlInsert)
345 throws IOException {
346 ARCWriter writer = createArcWithOneRecord(name, compress);
347
348 String content = getContent();
349 writeRecord(writer, SOME_URL + urlInsert + "/index.html", "text/html",
350 content.length(), getBais(content));
351 writer.close();
352 }
353
354
355
356
357
358
359
360 public void testLengthTooShortCompressed() throws IOException {
361 lengthTooShort("testLengthTooShortCompressed-" + SUFFIX, true, false);
362 }
363
364 public void testLengthTooShortCompressedStrict()
365 throws IOException {
366 String eMessage = null;
367 try {
368 lengthTooShort("testLengthTooShortCompressedStrict-" + SUFFIX,
369 true, true);
370 } catch (RuntimeException e) {
371 eMessage = e.getMessage();
372 }
373 assertTrue("Didn't get expected exception: " + eMessage,
374 eMessage.startsWith("java.io.IOException: Record ENDING at"));
375 }
376
377 protected void lengthTooShort(String name, boolean compress, boolean strict)
378 throws IOException {
379 CorruptibleARCWriter writer = createArcWithOneRecord(name, compress);
380
381 String content = getContent();
382 ByteArrayInputStream bais = getBais(content+"SOME TRAILING BYTES");
383 writeRecord(writer, SOME_URL, "text/html",
384 content.length(), bais);
385 writer.setEndJunk("SOME TRAILING BYTES".getBytes());
386 writeRecord(writer, SOME_URL, "text/html",
387 content.length(), getBais(content));
388 writer.close();
389
390
391 ByteArrayOutputStream os = new ByteArrayOutputStream();
392 System.setErr(new PrintStream(os));
393
394 ARCReader r = ARCReaderFactory.get(writer.getFile());
395 r.setStrict(strict);
396 int count = iterateRecords(r);
397 assertTrue("Count wrong " + count, count == 4);
398
399
400
401 String err = os.toString();
402 assertTrue("No message " + err, err.startsWith("WARNING") &&
403 (err.indexOf("Record ENDING at") > 0));
404 }
405
406
407
408
409
410
411
412
413
414 public void testLengthTooLongCompressed()
415 throws IOException {
416 lengthTooLong("testLengthTooLongCompressed-" + SUFFIX,
417 true, false);
418 }
419
420 public void testLengthTooLongCompressedStrict() {
421 String eMessage = null;
422 try {
423 lengthTooLong("testLengthTooLongCompressed-" + SUFFIX,
424 true, true);
425 } catch (IOException e) {
426 eMessage = e.getMessage();
427 }
428 assertTrue("Didn't get expected exception: " + eMessage,
429 eMessage.startsWith("Premature EOF before end-of-record"));
430 }
431
432 protected void lengthTooLong(String name, boolean compress,
433 boolean strict)
434 throws IOException {
435 ARCWriter writer = createArcWithOneRecord(name, compress);
436
437 String content = getContent();
438 writeRecord(writer, SOME_URL, "text/html",
439 content.length() + 10, getBais(content));
440 writeRecord(writer, SOME_URL, "text/html",
441 content.length(), getBais(content));
442 writer.close();
443
444
445 ByteArrayOutputStream os = new ByteArrayOutputStream();
446 System.setErr(new PrintStream(os));
447
448 ARCReader r = ARCReaderFactory.get(writer.getFile());
449 r.setStrict(strict);
450 int count = iterateRecords(r);
451 assertTrue("Count wrong " + count, count == 4);
452
453
454
455 String err = os.toString();
456 assertTrue("No message " + err,
457 err.startsWith("WARNING Premature EOF before end-of-record"));
458 }
459
460 public void testGapError() throws IOException {
461 ARCWriter writer = createArcWithOneRecord("testGapError", true);
462 String content = getContent();
463
464
465 ReplayInputStream ris = new ReplayInputStream(content.getBytes(),
466 content.length(), null) {
467 public long remaining() {
468 return (super.remaining()==0) ? -1 : super.remaining();
469 }
470 };
471 String message = null;
472 try {
473 writer.write(SOME_URL, "text/html", "192.168.1.1",
474 (new Date()).getTime(), content.length(), ris);
475 } catch (IOException e) {
476 message = e.getMessage();
477 } finally {
478 IOUtils.closeQuietly(ris);
479 }
480 writer.close();
481 assertTrue("No gap when should be",
482 message != null &&
483 message.indexOf("Gap between expected and actual") >= 0);
484 }
485
486 /***
487 * Write an arc file for other tests to use.
488 * @param arcdir Directory to write to.
489 * @param compress True if file should be compressed.
490 * @return ARC written.
491 * @throws IOException
492 */
493 public static File createARCFile(File arcdir, boolean compress)
494 throws IOException {
495 File [] files = {arcdir};
496 ARCWriter writer = new ARCWriter(SERIAL_NO, Arrays.asList(files),
497 "test", compress, DEFAULT_MAX_ARC_FILE_SIZE);
498 String content = getContent();
499 writeRecord(writer, SOME_URL, "text/html", content.length(),
500 getBais(content));
501 writer.close();
502 return writer.getFile();
503 }
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520 public void testValidateMetaLine() throws Exception {
521 final String line = "http://www.aandw.net/images/walden2.png " +
522 "128.197.34.86 20060111174224 image/png 2160";
523 ARCWriter w = createARCWriter("testValidateMetaLine", true);
524 try {
525 w.validateMetaLine(line);
526 w.validateMetaLine(line + LINE_SEPARATOR);
527 w.validateMetaLine(line + "//r//n");
528 } finally {
529 w.close();
530 }
531 }
532
533 public void testArcRecordOffsetReads() throws Exception {
534
535 WriterPoolMember w =
536 createArcWithOneRecord("testArcRecordInBufferStream", true);
537 w.close();
538
539 ARCReader r = ARCReaderFactory.get(w.getFile());
540 final Iterator i = r.iterator();
541
542 ARCRecord ar = (ARCRecord) i.next();
543 i.hasNext();
544
545 ar = (ARCRecord) i.next();
546
547
548
549 final byte[] buffer = new byte[17];
550 final int maxRead = 4;
551 int totalRead = 0;
552 while (totalRead < maxRead) {
553 totalRead = totalRead
554 + ar.read(buffer, 13 + totalRead, maxRead - totalRead);
555 assertTrue(totalRead > 0);
556 }
557 }
558 }