1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26 package org.archive.io.warc;
27
28 import java.io.ByteArrayInputStream;
29 import java.io.ByteArrayOutputStream;
30 import java.io.File;
31 import java.io.FileNotFoundException;
32 import java.io.IOException;
33 import java.net.URI;
34 import java.net.URISyntaxException;
35 import java.util.Arrays;
36 import java.util.Iterator;
37 import java.util.List;
38 import java.util.concurrent.atomic.AtomicInteger;
39
40 import org.archive.io.ArchiveRecord;
41 import org.archive.io.ArchiveRecordHeader;
42 import org.archive.io.UTF8Bytes;
43 import org.archive.io.WriterPoolMember;
44 import org.archive.io.warc.WARCConstants;
45 import org.archive.uid.GeneratorFactory;
46 import org.archive.util.ArchiveUtils;
47 import org.archive.util.TmpDirTestCase;
48 import org.archive.util.anvl.ANVLRecord;
49
50 /***
51 * Test Writer and Reader.
52 * @author stack
53 * @version $Date: 2006-08-29 19:35:48 -0700 (Tue, 29 Aug 2006) $ $Version$
54 */
55 public class WARCWriterTest
56 extends TmpDirTestCase implements WARCConstants {
/**
 * Shared counter passed as the first constructor argument to every
 * {@link WARCWriter} these tests create.
 * NOTE(review): presumably the serial number the writer uses when naming
 * files -- confirm against WARCWriter.
 */
private static final AtomicInteger SERIAL_NO = new AtomicInteger();

/**
 * Marker appended to the base names of WARC files made by these JUnit
 * tests.
 */
private static final String PREFIX = "IAH";

/**
 * URL of the resource record written by the one-record helpers and
 * expected back by {@link #iterateRecords(WARCReader)}.
 */
private static final String SOME_URL = "http://www.archive.org/test/";
65
66 public void testCheckHeaderLineValue() throws Exception {
67 WARCWriter writer = new WARCWriter();
68 writer.checkHeaderValue("one");
69 IOException exception = null;
70 try {
71 writer.checkHeaderValue("with space");
72 } catch(IOException e) {
73 exception = e;
74 }
75 assertNotNull(exception);
76 exception = null;
77 try {
78 writer.checkHeaderValue("with\0x0000controlcharacter");
79 } catch(IOException e) {
80 exception = e;
81 }
82 assertNotNull(exception);
83 }
84
85 public void testMimetypes() throws IOException {
86 WARCWriter writer = new WARCWriter();
87 writer.checkHeaderLineMimetypeParameter("text/xml");
88 writer.checkHeaderLineMimetypeParameter("text/xml+rdf");
89 assertEquals(writer.checkHeaderLineMimetypeParameter(
90 "text/plain; charset=SHIFT-JIS"), "text/plain; charset=SHIFT-JIS");
91 assertEquals(writer.checkHeaderLineMimetypeParameter(
92 "multipart/mixed; \r\n boundary=\"simple boundary\""),
93 "multipart/mixed; boundary=\"simple boundary\"");
94 }
95
96 public void testWriteRecord() throws IOException {
97 File [] files = {getTmpDir()};
98
99
100 WARCWriter writer =
101 new WARCWriter(SERIAL_NO, Arrays.asList(files),
102 this.getClass().getName(), "suffix", false, -1, null);
103 writeFile(writer);
104
105
106 writer = new WARCWriter(SERIAL_NO, Arrays.asList(files),
107 this.getClass().getName(), "suffix", true, -1, null);
108 writeFile(writer);
109 }
110
111 private void writeFile(final WARCWriter writer)
112 throws IOException {
113 try {
114 writeWarcinfoRecord(writer);
115 writeBasicRecords(writer);
116 } finally {
117 writer.close();
118 writer.getFile().delete();
119 }
120 }
121
122 private void writeWarcinfoRecord(WARCWriter writer)
123 throws IOException {
124 ANVLRecord meta = new ANVLRecord();
125 meta.addLabelValue("size", "1G");
126 meta.addLabelValue("operator", "igor");
127 byte [] bytes = meta.getUTF8Bytes();
128 writer.writeWarcinfoRecord(ANVLRecord.MIMETYPE, null,
129 new ByteArrayInputStream(bytes), bytes.length);
130 }
131
132 protected void writeBasicRecords(final WARCWriter writer)
133 throws IOException {
134 ANVLRecord headerFields = new ANVLRecord();
135 headerFields.addLabelValue("x", "y");
136 headerFields.addLabelValue("a", "b");
137
138 URI rid = null;
139 try {
140 rid = GeneratorFactory.getFactory().
141 getQualifiedRecordID(TYPE, METADATA);
142 } catch (URISyntaxException e) {
143
144 throw new IOException(e.getMessage());
145 }
146 final String content = "Any old content.";
147 for (int i = 0; i < 10; i++) {
148 String body = i + ". " + content;
149 byte [] bodyBytes = body.getBytes(UTF8Bytes.UTF8);
150 writer.writeRecord(METADATA, "http://www.archive.org/",
151 ArchiveUtils.get14DigitDate(), "no/type",
152 rid, headerFields, new ByteArrayInputStream(bodyBytes),
153 (long)bodyBytes.length, true);
154 }
155 }
156
157 /***
158 * @return Generic HTML Content.
159 */
160 protected static String getContent() {
161 return getContent(null);
162 }
163
164 /***
165 * @return Generic HTML Content with mention of passed <code>indexStr</code>
166 * in title and body.
167 */
168 protected static String getContent(String indexStr) {
169 String page = (indexStr != null)? "Page #" + indexStr: "Some Page";
170 return "HTTP/1.1 200 OK\r\n" +
171 "Content-Type: text/html\r\n\r\n" +
172 "<html><head><title>" + page +
173 "</title></head>" +
174 "<body>" + page +
175 "</body></html>";
176 }
177
178 /***
179 * Write random HTML Record.
180 * @param w Where to write.
181 * @param index An index to put into content.
182 * @return Length of record written.
183 * @throws IOException
184 */
185 protected int writeRandomHTTPRecord(WARCWriter w, int index)
186 throws IOException {
187 ByteArrayOutputStream baos = new ByteArrayOutputStream();
188 String indexStr = Integer.toString(index);
189 byte[] record = (getContent(indexStr)).getBytes();
190 int recordLength = record.length;
191 baos.write(record);
192
193
194 ANVLRecord r = new ANVLRecord(1);
195 r.addLabelValue(NAMED_FIELD_IP_LABEL, "127.0.0.1");
196 w.writeResourceRecord(
197 "http://www.one.net/id=" + indexStr,
198 ArchiveUtils.get14DigitDate(),
199 "text/html; charset=UTF-8",
200 r,
201 new ByteArrayInputStream(baos.toByteArray()),
202 recordLength);
203 return recordLength;
204 }
205
206 /***
207 * Fill a WARC with HTML Records.
208 * @param baseName WARC basename.
209 * @param compress Whether to compress or not.
210 * @param maxSize Maximum WARC size.
211 * @param recordCount How many records.
212 * @return The written file.
213 * @throws IOException
214 */
215 private File writeRecords(String baseName, boolean compress,
216 int maxSize, int recordCount)
217 throws IOException {
218 cleanUpOldFiles(baseName);
219 File [] files = {getTmpDir()};
220 WARCWriter w = new WARCWriter(SERIAL_NO,
221 Arrays.asList(files), baseName + '-' + PREFIX, "", compress,
222 maxSize, null);
223 assertNotNull(w);
224 for (int i = 0; i < recordCount; i++) {
225 writeRandomHTTPRecord(w, i);
226 }
227 w.close();
228 assertTrue("Doesn't exist: " + w.getFile().getAbsolutePath(),
229 w.getFile().exists());
230 return w.getFile();
231 }
232
233 /***
234 * Run validation of passed file.
235 * @param f File to validate.
236 * @param recordCount Expected count of records.
237 * @throws FileNotFoundException
238 * @throws IOException
239 */
240 private void validate(File f, int recordCount)
241 throws FileNotFoundException, IOException {
242 WARCReader reader = WARCReaderFactory.get(f);
243 assertNotNull(reader);
244 List headers = null;
245 if (recordCount == -1) {
246 headers = reader.validate();
247 } else {
248 headers = reader.validate(recordCount);
249 }
250 reader.close();
251
252
253
254
255 reader = WARCReaderFactory.get(f);
256 for (int i = headers.size() - 1; i >= 0; i--) {
257 ArchiveRecordHeader h = (ArchiveRecordHeader)headers.get(i);
258 ArchiveRecord r = reader.get(h.getOffset());
259 String mimeType = r.getHeader().getMimetype();
260 assertTrue("Record is bogus, bad mimetype "+mimeType,
261 mimeType != null && mimeType.length() > 0);
262 }
263 reader.close();
264
265 assertTrue("Metadatas not equal", headers.size() == recordCount);
266 for (Iterator i = headers.iterator(); i.hasNext();) {
267 ArchiveRecordHeader r = (ArchiveRecordHeader)i.next();
268 assertTrue("Record is empty", r.getLength() > 0);
269 }
270 }
271
272 public void testWriteRecords() throws IOException {
273 final int recordCount = 2;
274 File f = writeRecords("writeRecord", false, DEFAULT_MAX_WARC_FILE_SIZE,
275 recordCount);
276 validate(f, recordCount + 1);
277 }
278
279 public void testRandomAccess() throws IOException {
280 final int recordCount = 3;
281 File f = writeRecords("writeRecord", true, DEFAULT_MAX_WARC_FILE_SIZE,
282 recordCount);
283 WARCReader reader = WARCReaderFactory.get(f);
284
285 boolean readFirst = false;
286 String url = null;
287 long offset = -1;
288 long totalRecords = 0;
289 boolean readSecond = false;
290 for (final Iterator i = reader.iterator(); i.hasNext();
291 totalRecords++) {
292 WARCRecord ar = (WARCRecord)i.next();
293 if (!readFirst) {
294 readFirst = true;
295 continue;
296 }
297 if (!readSecond) {
298 url = ar.getHeader().getUrl();
299 offset = ar.getHeader().getOffset();
300 readSecond = true;
301 }
302 }
303
304 reader = WARCReaderFactory.get(f, offset);
305 ArchiveRecord ar = reader.get();
306 assertEquals(ar.getHeader().getUrl(), url);
307 ar.close();
308
309
310 reader = WARCReaderFactory.get(f, offset);
311 int count = 0;
312 for (final Iterator i = reader.iterator(); i.hasNext(); i.next()) {
313 count++;
314 }
315 reader.close();
316 assertEquals(totalRecords - 1, count);
317 }
318
319 public void testWriteRecordCompressed() throws IOException {
320 final int recordCount = 2;
321 File arcFile = writeRecords("writeRecordCompressed", true,
322 DEFAULT_MAX_WARC_FILE_SIZE, recordCount);
323 validate(arcFile, recordCount + 1
324 }
325
326 protected WARCWriter createWARCWriter(String NAME,
327 boolean compress) {
328 File [] files = {getTmpDir()};
329 return new WARCWriter(SERIAL_NO,
330 Arrays.asList(files), NAME, "",
331 compress, DEFAULT_MAX_WARC_FILE_SIZE, null);
332 }
333
334 protected static ByteArrayOutputStream getBaos(String str)
335 throws IOException {
336 ByteArrayOutputStream baos = new ByteArrayOutputStream();
337 baos.write(str.getBytes());
338 return baos;
339 }
340
341 protected static void writeRecord(WARCWriter w, String url,
342 String mimetype, int len, ByteArrayOutputStream baos)
343 throws IOException {
344 w.writeResourceRecord(url,
345 ArchiveUtils.get14DigitDate(),
346 mimetype,
347 null,
348 new ByteArrayInputStream(baos.toByteArray()),
349 len);
350 }
351
352 protected int iterateRecords(WARCReader r)
353 throws IOException {
354 int count = 0;
355 for (Iterator<ArchiveRecord> i = r.iterator(); i.hasNext();) {
356 ArchiveRecord ar = i.next();
357 ar.close();
358 if (count != 0) {
359 assertTrue("Unexpected URL " + ar.getHeader().getUrl(),
360 ar.getHeader().getUrl().equals(SOME_URL));
361 }
362 count++;
363 }
364 return count;
365 }
366
367 protected WARCWriter createWithOneRecord(String name,
368 boolean compressed)
369 throws IOException {
370 WARCWriter writer = createWARCWriter(name, compressed);
371 String content = getContent();
372 writeRecord(writer, SOME_URL, "text/html",
373 content.length(), getBaos(content));
374 return writer;
375 }
376
377 public void testSpaceInURL() {
378 String eMessage = null;
379 try {
380 holeyUrl("testSpaceInURL-" + PREFIX, false, " ");
381 } catch (IOException e) {
382 eMessage = e.getMessage();
383 }
384 assertTrue("Didn't get expected exception: " + eMessage,
385 eMessage.startsWith("Contains disallowed"));
386 }
387
388 public void testTabInURL() {
389 String eMessage = null;
390 try {
391 holeyUrl("testTabInURL-" + PREFIX, false, "\t");
392 } catch (IOException e) {
393 eMessage = e.getMessage();
394 }
395 assertTrue("Didn't get expected exception: " + eMessage,
396 eMessage.startsWith("Contains illegal"));
397 }
398
399 protected void holeyUrl(String name, boolean compress, String urlInsert)
400 throws IOException {
401 WARCWriter writer = createWithOneRecord(name, compress);
402
403 String content = getContent();
404 ByteArrayOutputStream baos = getBaos(content);
405 writeRecord(writer, SOME_URL + urlInsert + "/index.html", "text/html",
406 content.length(), baos);
407 writer.close();
408 }
409
410 /***
411 * Write an arc file for other tests to use.
412 * @param arcdir Directory to write to.
413 * @param compress True if file should be compressed.
414 * @return ARC written.
415 * @throws IOException
416 */
417 public static File createWARCFile(File arcdir, boolean compress)
418 throws IOException {
419 File [] files = {arcdir};
420 WARCWriter writer =
421 new WARCWriter(SERIAL_NO, Arrays.asList(files),
422 "test", "", compress, DEFAULT_MAX_WARC_FILE_SIZE, null);
423 String content = getContent();
424 writeRecord(writer, SOME_URL, "text/html", content.length(),
425 getBaos(content));
426 writer.close();
427 return writer.getFile();
428 }
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444 public void testArcRecordOffsetReads() throws Exception {
445
446 WriterPoolMember w =
447 createWithOneRecord("testArcRecordInBufferStream", true);
448 w.close();
449
450 WARCReader r = WARCReaderFactory.get(w.getFile());
451 final Iterator<ArchiveRecord> i = r.iterator();
452
453 ArchiveRecord ar = i.next();
454 i.hasNext();
455
456 ar = (WARCRecord) i.next();
457
458
459
460 final byte[] buffer = new byte[17];
461 final int maxRead = 4;
462 int totalRead = 0;
463 while (totalRead < maxRead) {
464 totalRead = totalRead
465 + ar.read(buffer, 13 + totalRead, maxRead - totalRead);
466 assertTrue(totalRead > 0);
467 }
468 }
469 }