1   /*  $Id: ExperimentalWARCWriter.java 4604 2006-09-06 05:38:18Z stack-sf $
2    *
3    * Created on July 27th, 2006
4    *
5    * Copyright (C) 2006 Internet Archive.
6    *
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    *
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   *
14   * Heritrix is distributed in the hope that it will be useful,
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   *
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.io.warc;
24  
25  import java.io.ByteArrayInputStream;
26  import java.io.File;
27  import java.io.IOException;
28  import java.io.InputStream;
29  import java.io.OutputStream;
30  import java.net.URI;
31  import java.net.URISyntaxException;
32  import java.util.Iterator;
33  import java.util.List;
34  import java.util.Map;
35  import java.util.concurrent.atomic.AtomicInteger;
36  
37  import org.archive.io.WriterPoolMember;
38  import org.archive.uid.GeneratorFactory;
39  import org.archive.util.ArchiveUtils;
40  import org.archive.util.anvl.ANVLRecord;
41  
42  
43  /***
44   * <b>Experimental</b> WARC implementation.
45   *
46   * <p>Assumption is that the caller is managing access to this
47   * ExperimentalWARCWriter ensuring only one thread accessing this WARC instance
48   * at any one time.
49   * 
50   * <p>While being written, WARCs have a '.open' suffix appended.
51   *
52   * @author stack
53   * @version $Revision: 4604 $ $Date: 2006-09-05 22:38:18 -0700 (Tue, 05 Sep 2006) $
54   */
55  public class WARCWriter extends WriterPoolMember
56  implements WARCConstants {
57  
58      /***
59       * NEWLINE as bytes.
60       */
61      public static byte [] CRLF_BYTES;
62      static {
63          try {
64              CRLF_BYTES = CRLF.getBytes(DEFAULT_ENCODING);
65          } catch(Exception e) {
66              e.printStackTrace();
67          }
68      };
69      
70      /***
71       * Metadata.
72       */
73      private final List<String> fileMetadata;
74      
75      
76      /***
77       * Shutdown Constructor
78       * Has default access so can make instance to test utility methods.
79       */
80      WARCWriter() {
81          this(null, null, "", "", true, -1, null);
82      }
83      
84      /***
85       * Constructor.
86       * Takes a stream. Use with caution. There is no upperbound check on size.
87       * Will just keep writing.  Only pass Streams that are bounded. 
88       * @param serialNo  used to generate unique file name sequences
89       * @param out Where to write.
90       * @param f File the <code>out</code> is connected to.
91       * @param cmprs Compress the content written.
92       * @param a14DigitDate If null, we'll write current time.
93       * @throws IOException
94       */
95      public WARCWriter(final AtomicInteger serialNo,
96      		final OutputStream out, final File f,
97      		final boolean cmprs, final String a14DigitDate,
98              final List<String> warcinfoData)
99      throws IOException {
100         super(serialNo, out, f, cmprs, a14DigitDate);
101         this.fileMetadata = warcinfoData;
102     }
103             
104     /***
105      * Constructor.
106      *
107      * @param dirs Where to drop files.
108      * @param prefix File prefix to use.
109      * @param cmprs Compress the records written. 
110      * @param maxSize Maximum size for ARC files written.
111      * @param suffix File tail to use.  If null, unused.
112      * @param warcinfoData File metadata for warcinfo record.
113      */
114     public WARCWriter(final AtomicInteger serialNo,
115     		final List<File> dirs, final String prefix, 
116             final String suffix, final boolean cmprs,
117             final long maxSize, final List<String> warcinfoData) {
118         super(serialNo, dirs, prefix, suffix, cmprs, maxSize,
119         	WARC_FILE_EXTENSION);
120         this.fileMetadata = warcinfoData;
121     }
122     
123     @Override
124     protected String createFile(File file) throws IOException {
125     	String filename = super.createFile(file);
126     	writeWarcinfoRecord(filename);
127         return filename;
128     }
129     
130     protected void baseCharacterCheck(final char c, final String parameter)
131     throws IOException {
132         // TODO: Too strict?  UNICODE control characters?
133         if (Character.isISOControl(c) || !Character.isValidCodePoint(c)) {
134             throw new IOException("Contains illegal character 0x" +
135                 Integer.toHexString(c) + ": " + parameter);
136         }
137     }
138     
139     protected String checkHeaderValue(final String value)
140     throws IOException {
141         for (int i = 0; i < value.length(); i++) {
142         	final char c = value.charAt(i);
143         	baseCharacterCheck(c, value);
144         	if (Character.isWhitespace(c)) {
145                 throw new IOException("Contains disallowed white space 0x" +
146                     Integer.toHexString(c) + ": " + value);
147         	}
148         }
149         return value;
150     }
151     
152     protected String checkHeaderLineMimetypeParameter(final String parameter)
153     throws IOException {
154     	StringBuilder sb = new StringBuilder(parameter.length());
155     	boolean wasWhitespace = false;
156         for (int i = 0; i < parameter.length(); i++) {
157         	char c = parameter.charAt(i);
158         	if (Character.isWhitespace(c)) {
159         		// Map all to ' ' and collapse multiples into one.
160         		// TODO: Make sure white space occurs in legal location --
161         		// before parameter or inside quoted-string.
162         		if (wasWhitespace) {
163         			continue;
164         		}
165         		wasWhitespace = true;
166         		c = ' ';
167         	} else {
168         		wasWhitespace = false;
169         		baseCharacterCheck(c, parameter);
170         	}
171         	sb.append(c);
172         }
173         
174         return sb.toString();
175     }
176 
177     protected String createRecordHeader(final String type,
178     		final String url, final String create14DigitDate,
179     		final String mimetype, final URI recordId,
180     		final ANVLRecord xtraHeaders, final long contentLength)
181     throws IOException {
182     	final StringBuilder sb =
183     		new StringBuilder(2048/*A SWAG: TODO: Do analysis.*/);
184     	sb.append(WARC_ID).append(CRLF);
185         sb.append(HEADER_KEY_TYPE).append(COLON_SPACE).append(type).
186             append(CRLF);
187         // Do not write a subject-uri if not one present.
188         if (url != null && url.length() > 0) {
189             sb.append(HEADER_KEY_URI).append(COLON_SPACE).
190                 append(checkHeaderValue(url)).append(CRLF);
191         }
192         sb.append(HEADER_KEY_DATE).append(COLON_SPACE).
193             append(create14DigitDate).append(CRLF);
194         if (xtraHeaders != null) {
195             for (final Iterator i = xtraHeaders.iterator(); i.hasNext();) {
196                 sb.append(i.next()).append(CRLF);
197             }
198         }
199 
200         sb.append(HEADER_KEY_ID).append(COLON_SPACE).append('<').
201             append(recordId.toString()).append('>').append(CRLF);
202         if (contentLength > 0) {
203             sb.append(CONTENT_TYPE).append(COLON_SPACE).append(
204                 checkHeaderLineMimetypeParameter(mimetype)).append(CRLF);
205         }
206         sb.append(CONTENT_LENGTH).append(COLON_SPACE).
207             append(Long.toString(contentLength)).append(CRLF);
208     	
209     	return sb.toString();
210     }
211 
212     /***
213      * @deprecated Use {@link #writeRecord(String,String,String,String,URI,ANVLRecord,InputStream,long,boolean)} instead
214      */
215     protected void writeRecord(final String type, final String url,
216     		final String create14DigitDate, final String mimetype,
217     		final URI recordId, ANVLRecord xtraHeaders,
218             final InputStream contentStream, final long contentLength)
219     throws IOException {
220         writeRecord(type, url, create14DigitDate, mimetype, recordId, xtraHeaders, contentStream, contentLength, true);
221     }
222 
223     protected void writeRecord(final String type, final String url,
224     		final String create14DigitDate, final String mimetype,
225     		final URI recordId, ANVLRecord xtraHeaders,
226             final InputStream contentStream, final long contentLength, boolean enforceLength)
227     throws IOException {
228     	if (!TYPES_LIST.contains(type)) {
229     		throw new IllegalArgumentException("Unknown record type: " + type);
230     	}
231     	if (contentLength == 0 &&
232                 (xtraHeaders == null || xtraHeaders.size() <= 0)) {
233     		throw new IllegalArgumentException("Cannot write record " +
234     		    "of content-length zero and base headers only.");
235     	}
236     	
237         preWriteRecordTasks();
238         try {
239             final String header = createRecordHeader(type, url,
240             	create14DigitDate, mimetype, recordId, xtraHeaders,
241             	contentLength);
242             // TODO: Revisit endcoding of header.
243             write(header.getBytes(WARC_HEADER_ENCODING));
244             
245             if (contentStream != null && contentLength > 0) {
246                 // Write out the header/body separator.
247                 write(CRLF_BYTES); // TODO: should this be written even for zero-length?
248             	copyFrom(contentStream, contentLength, enforceLength);
249             }
250             
251             // Write out the two blank lines at end of all records, per spec
252             write(CRLF_BYTES);
253             write(CRLF_BYTES);
254         } finally {
255             postWriteRecordTasks();
256         }
257     }
258     
259     protected URI generateRecordId(final Map<String, String> qualifiers)
260     throws IOException {
261     	URI rid = null;
262     	try {
263     		rid = GeneratorFactory.getFactory().
264     			getQualifiedRecordID(qualifiers);
265     	} catch (URISyntaxException e) {
266     		// Convert to IOE so can let it out.
267     		throw new IOException(e.getMessage());
268     	}
269     	return rid;
270     }
271     
272     protected URI generateRecordId(final String key, final String value)
273     throws IOException {
274     	URI rid = null;
275     	try {
276     		rid = GeneratorFactory.getFactory().
277     			getQualifiedRecordID(key, value);
278     	} catch (URISyntaxException e) {
279     		// Convert to IOE so can let it out.
280     		throw new IOException(e.getMessage());
281     	}
282     	return rid;
283     }
284     
285     public URI writeWarcinfoRecord(String filename)
286 	throws IOException {
287     	return writeWarcinfoRecord(filename, null);
288     }
289     
290     public URI writeWarcinfoRecord(String filename, final String description)
291         	throws IOException {
292         // Strip .open suffix if present.
293         if (filename.endsWith(WriterPoolMember.OCCUPIED_SUFFIX)) {
294         	filename = filename.substring(0,
295         		filename.length() - WriterPoolMember.OCCUPIED_SUFFIX.length());
296         }
297         
298         ANVLRecord headerrecord = new ANVLRecord(1);
299         headerrecord.addLabelValue(HEADER_KEY_FILENAME, filename);
300         
301         // Ugh, hate doing this but barring larger refactoring per-WARC
302         // 'metadata' is coming back as List<String> (?!?)
303         String blockfields = "";
304         if (this.fileMetadata == null) {
305             // only encountered in unit tests?
306             blockfields = "dummy: value";
307         } else {
308             for (String s :  (List<String>) fileMetadata) {
309                 blockfields += s;
310             }
311         }
312         byte[] warcinfoBody;
313         if (description != null && description.length() > 0) {
314             // reconstitute and add new description
315             ANVLRecord blockrecord = ANVLRecord.load(blockfields);
316             blockrecord.addLabelValue(CONTENT_DESCRIPTION, description);
317             warcinfoBody = blockrecord.toString().getBytes("UTF-8");
318         } else {
319             // just use in already rendered form
320             warcinfoBody = blockfields.getBytes("UTF-8");
321         }
322 
323         URI uri = writeWarcinfoRecord("application/warc-fields", headerrecord,
324             new ByteArrayInputStream(warcinfoBody), warcinfoBody.length);
325         return uri;
326     }
327     
328     /***
329      * Write a warcinfo to current file.
330      * TODO: Write crawl metadata or pointers to crawl description.
331      * @param mimetype Mimetype of the <code>fileMetadata</code> block.
332      * @param namedFields Named fields. Pass <code>null</code> if none.
333      * @param fileMetadata Metadata about this WARC as RDF, ANVL, etc.
334      * @param fileMetadataLength Length of <code>fileMetadata</code>.
335      * @throws IOException
336      * @return Generated record-id made with
337      * <a href="http://en.wikipedia.org/wiki/Data:_URL">data: scheme</a> and
338      * the current filename.
339      */
340     public URI writeWarcinfoRecord(final String mimetype,
341     	final ANVLRecord namedFields, final InputStream fileMetadata,
342     	final long fileMetadataLength)
343     throws IOException {
344     	final URI recordid = generateRecordId(TYPE, WARCINFO);
345     	writeWarcinfoRecord(ArchiveUtils.getLog14Date(), mimetype, recordid,
346             namedFields, fileMetadata, fileMetadataLength);
347     	return recordid;
348     }
349     
350     /***
351      * Write a <code>warcinfo</code> to current file.
352      * The <code>warcinfo</code> type uses its <code>recordId</code> as its URL.
353      * @param recordId URI to use for this warcinfo.
354      * @param create14DigitDate Record creation date as 14 digit date.
355      * @param mimetype Mimetype of the <code>fileMetadata</code>.
356      * @param namedFields Named fields.
357      * @param fileMetadata Metadata about this WARC as RDF, ANVL, etc.
358      * @param fileMetadataLength Length of <code>fileMetadata</code>.
359      * @throws IOException
360      */
361     public void writeWarcinfoRecord(final String create14DigitDate,
362         final String mimetype, final URI recordId, final ANVLRecord namedFields,
363     	final InputStream fileMetadata, final long fileMetadataLength)
364     throws IOException {
365     	writeRecord(WARCINFO, null, create14DigitDate, mimetype,
366         	recordId, namedFields, fileMetadata, fileMetadataLength, true);
367     }
368     
369     public void writeRequestRecord(final String url,
370         final String create14DigitDate, final String mimetype,
371         final URI recordId,
372         final ANVLRecord namedFields, final InputStream request,
373         final long requestLength)
374     throws IOException {
375         writeRecord(REQUEST, url, create14DigitDate,
376             mimetype, recordId, namedFields, request,
377             requestLength, true);
378     }
379     
380     public void writeResourceRecord(final String url,
381             final String create14DigitDate, final String mimetype,
382             final ANVLRecord namedFields, final InputStream response,
383             final long responseLength)
384     throws IOException {
385     	writeResourceRecord(url, create14DigitDate, mimetype, getRecordID(),
386     			namedFields, response, responseLength);
387     }
388     
389     public void writeResourceRecord(final String url,
390             final String create14DigitDate, final String mimetype,
391             final URI recordId,
392             final ANVLRecord namedFields, final InputStream response,
393             final long responseLength)
394     throws IOException {
395         writeRecord(RESOURCE, url, create14DigitDate,
396             mimetype, recordId, namedFields, response,
397             responseLength, true);
398     }
399 
400     public void writeResponseRecord(final String url,
401             final String create14DigitDate, final String mimetype,
402             final URI recordId,
403             final ANVLRecord namedFields, final InputStream response,
404             final long responseLength)
405     throws IOException {
406         writeRecord(RESPONSE, url, create14DigitDate,
407             mimetype, recordId, namedFields, response,
408             responseLength, true);
409     }
410     
411     public void writeRevisitRecord(final String url,
412             final String create14DigitDate, final String mimetype,
413             final URI recordId,
414             final ANVLRecord namedFields, final InputStream response,
415             final long responseLength)
416     throws IOException {
417         writeRecord(REVISIT, url, create14DigitDate,
418             mimetype, recordId, namedFields, response,
419             responseLength, false);
420     }
421     
422     public void writeMetadataRecord(final String url,
423             final String create14DigitDate, final String mimetype,
424             final URI recordId,
425             final ANVLRecord namedFields, final InputStream metadata,
426             final long metadataLength)
427     throws IOException {
428         writeRecord(METADATA, url, create14DigitDate,
429             mimetype, recordId, namedFields, metadata,
430             metadataLength, true);
431     }
432     
433     /***
434      * Convenience method for getting Record-Ids.
435      * @return A record ID.
436      * @throws IOException
437      */
438     public static URI getRecordID() throws IOException {
439         URI result;
440         try {
441             result = GeneratorFactory.getFactory().getRecordID();
442         } catch (URISyntaxException e) {
443             throw new IOException(e.toString());
444         }
445         return result;
446     }
447 }