1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.io.warc;
24
25 import java.io.ByteArrayInputStream;
26 import java.io.File;
27 import java.io.IOException;
28 import java.io.InputStream;
29 import java.io.OutputStream;
30 import java.net.URI;
31 import java.net.URISyntaxException;
32 import java.util.Iterator;
33 import java.util.List;
34 import java.util.Map;
35 import java.util.concurrent.atomic.AtomicInteger;
36
37 import org.archive.io.WriterPoolMember;
38 import org.archive.uid.GeneratorFactory;
39 import org.archive.util.ArchiveUtils;
40 import org.archive.util.anvl.ANVLRecord;
41
42
43 /***
44 * <b>Experimental</b> WARC implementation.
45 *
 * <p>Assumption is that the caller is managing access to this
 * WARCWriter, ensuring that only one thread accesses this instance
 * at any one time.
49 *
50 * <p>While being written, WARCs have a '.open' suffix appended.
51 *
52 * @author stack
53 * @version $Revision: 4604 $ $Date: 2006-09-05 22:38:18 -0700 (Tue, 05 Sep 2006) $
54 */
55 public class WARCWriter extends WriterPoolMember
56 implements WARCConstants {
57
/**
 * Byte form of {@code CRLF}, encoded with {@code DEFAULT_ENCODING}.
 * Stays {@code null} if the encoding step throws; in that case the
 * failure is only reported through a printed stack trace.
 */
public static byte [] CRLF_BYTES;
static {
    byte [] encoded = null;
    try {
        encoded = CRLF.getBytes(DEFAULT_ENCODING);
    } catch (Exception e) {
        e.printStackTrace();
    }
    CRLF_BYTES = encoded;
}

/**
 * Lines supplied at construction time that make up the body of the
 * warcinfo record. May be {@code null}.
 */
private final List<String> fileMetadata;
74
75
/**
 * No-argument constructor.
 * Package-private so that an instance can be made to test the utility
 * methods. Delegates to the directory-based constructor with no serial
 * number, no directories, an empty prefix and suffix, compression on,
 * a maximum size of -1 and no warcinfo data.
 */
WARCWriter() {
    this(null, null, "", "", true, -1L, null);
}
83
/**
 * Stream-based constructor.
 * Use with caution: there is no upper-bound check on size, so this
 * writer will just keep writing. Only pass streams that are bounded.
 *
 * @param serialNo used to generate unique file name sequences
 * @param out where to write
 * @param f file the <code>out</code> stream is connected to
 * @param cmprs whether to compress the content written
 * @param a14DigitDate if null, the current time is written
 * @param warcinfoData lines kept as the metadata for warcinfo records
 * @throws IOException propagated from the superclass constructor
 */
public WARCWriter(
        final AtomicInteger serialNo,
        final OutputStream out,
        final File f,
        final boolean cmprs,
        final String a14DigitDate,
        final List<String> warcinfoData)
        throws IOException {
    super(serialNo, out, f, cmprs, a14DigitDate);
    this.fileMetadata = warcinfoData;
}
103
/**
 * Directory-based constructor; files are created with the
 * <code>WARC_FILE_EXTENSION</code> extension.
 *
 * @param serialNo used to generate unique file name sequences
 * @param dirs where to drop files
 * @param prefix file prefix to use
 * @param suffix file tail to use; if null, unused
 * @param cmprs whether to compress the records written
 * @param maxSize maximum size for files written
 * @param warcinfoData file metadata for the warcinfo record
 */
public WARCWriter(
        final AtomicInteger serialNo,
        final List<File> dirs,
        final String prefix,
        final String suffix,
        final boolean cmprs,
        final long maxSize,
        final List<String> warcinfoData) {
    super(serialNo, dirs, prefix, suffix, cmprs, maxSize, WARC_FILE_EXTENSION);
    this.fileMetadata = warcinfoData;
}
122
123 @Override
124 protected String createFile(File file) throws IOException {
125 String filename = super.createFile(file);
126 writeWarcinfoRecord(filename);
127 return filename;
128 }
129
130 protected void baseCharacterCheck(final char c, final String parameter)
131 throws IOException {
132
133 if (Character.isISOControl(c) || !Character.isValidCodePoint(c)) {
134 throw new IOException("Contains illegal character 0x" +
135 Integer.toHexString(c) + ": " + parameter);
136 }
137 }
138
139 protected String checkHeaderValue(final String value)
140 throws IOException {
141 for (int i = 0; i < value.length(); i++) {
142 final char c = value.charAt(i);
143 baseCharacterCheck(c, value);
144 if (Character.isWhitespace(c)) {
145 throw new IOException("Contains disallowed white space 0x" +
146 Integer.toHexString(c) + ": " + value);
147 }
148 }
149 return value;
150 }
151
152 protected String checkHeaderLineMimetypeParameter(final String parameter)
153 throws IOException {
154 StringBuilder sb = new StringBuilder(parameter.length());
155 boolean wasWhitespace = false;
156 for (int i = 0; i < parameter.length(); i++) {
157 char c = parameter.charAt(i);
158 if (Character.isWhitespace(c)) {
159
160
161
162 if (wasWhitespace) {
163 continue;
164 }
165 wasWhitespace = true;
166 c = ' ';
167 } else {
168 wasWhitespace = false;
169 baseCharacterCheck(c, parameter);
170 }
171 sb.append(c);
172 }
173
174 return sb.toString();
175 }
176
177 protected String createRecordHeader(final String type,
178 final String url, final String create14DigitDate,
179 final String mimetype, final URI recordId,
180 final ANVLRecord xtraHeaders, final long contentLength)
181 throws IOException {
182 final StringBuilder sb =
183 new StringBuilder(2048
184 sb.append(WARC_ID).append(CRLF);
185 sb.append(HEADER_KEY_TYPE).append(COLON_SPACE).append(type).
186 append(CRLF);
187
188 if (url != null && url.length() > 0) {
189 sb.append(HEADER_KEY_URI).append(COLON_SPACE).
190 append(checkHeaderValue(url)).append(CRLF);
191 }
192 sb.append(HEADER_KEY_DATE).append(COLON_SPACE).
193 append(create14DigitDate).append(CRLF);
194 if (xtraHeaders != null) {
195 for (final Iterator i = xtraHeaders.iterator(); i.hasNext();) {
196 sb.append(i.next()).append(CRLF);
197 }
198 }
199
200 sb.append(HEADER_KEY_ID).append(COLON_SPACE).append('<').
201 append(recordId.toString()).append('>').append(CRLF);
202 if (contentLength > 0) {
203 sb.append(CONTENT_TYPE).append(COLON_SPACE).append(
204 checkHeaderLineMimetypeParameter(mimetype)).append(CRLF);
205 }
206 sb.append(CONTENT_LENGTH).append(COLON_SPACE).
207 append(Long.toString(contentLength)).append(CRLF);
208
209 return sb.toString();
210 }
211
212 /***
213 * @deprecated Use {@link #writeRecord(String,String,String,String,URI,ANVLRecord,InputStream,long,boolean)} instead
214 */
215 protected void writeRecord(final String type, final String url,
216 final String create14DigitDate, final String mimetype,
217 final URI recordId, ANVLRecord xtraHeaders,
218 final InputStream contentStream, final long contentLength)
219 throws IOException {
220 writeRecord(type, url, create14DigitDate, mimetype, recordId, xtraHeaders, contentStream, contentLength, true);
221 }
222
223 protected void writeRecord(final String type, final String url,
224 final String create14DigitDate, final String mimetype,
225 final URI recordId, ANVLRecord xtraHeaders,
226 final InputStream contentStream, final long contentLength, boolean enforceLength)
227 throws IOException {
228 if (!TYPES_LIST.contains(type)) {
229 throw new IllegalArgumentException("Unknown record type: " + type);
230 }
231 if (contentLength == 0 &&
232 (xtraHeaders == null || xtraHeaders.size() <= 0)) {
233 throw new IllegalArgumentException("Cannot write record " +
234 "of content-length zero and base headers only.");
235 }
236
237 preWriteRecordTasks();
238 try {
239 final String header = createRecordHeader(type, url,
240 create14DigitDate, mimetype, recordId, xtraHeaders,
241 contentLength);
242
243 write(header.getBytes(WARC_HEADER_ENCODING));
244
245 if (contentStream != null && contentLength > 0) {
246
247 write(CRLF_BYTES);
248 copyFrom(contentStream, contentLength, enforceLength);
249 }
250
251
252 write(CRLF_BYTES);
253 write(CRLF_BYTES);
254 } finally {
255 postWriteRecordTasks();
256 }
257 }
258
259 protected URI generateRecordId(final Map<String, String> qualifiers)
260 throws IOException {
261 URI rid = null;
262 try {
263 rid = GeneratorFactory.getFactory().
264 getQualifiedRecordID(qualifiers);
265 } catch (URISyntaxException e) {
266
267 throw new IOException(e.getMessage());
268 }
269 return rid;
270 }
271
272 protected URI generateRecordId(final String key, final String value)
273 throws IOException {
274 URI rid = null;
275 try {
276 rid = GeneratorFactory.getFactory().
277 getQualifiedRecordID(key, value);
278 } catch (URISyntaxException e) {
279
280 throw new IOException(e.getMessage());
281 }
282 return rid;
283 }
284
285 public URI writeWarcinfoRecord(String filename)
286 throws IOException {
287 return writeWarcinfoRecord(filename, null);
288 }
289
290 public URI writeWarcinfoRecord(String filename, final String description)
291 throws IOException {
292
293 if (filename.endsWith(WriterPoolMember.OCCUPIED_SUFFIX)) {
294 filename = filename.substring(0,
295 filename.length() - WriterPoolMember.OCCUPIED_SUFFIX.length());
296 }
297
298 ANVLRecord headerrecord = new ANVLRecord(1);
299 headerrecord.addLabelValue(HEADER_KEY_FILENAME, filename);
300
301
302
303 String blockfields = "";
304 if (this.fileMetadata == null) {
305
306 blockfields = "dummy: value";
307 } else {
308 for (String s : (List<String>) fileMetadata) {
309 blockfields += s;
310 }
311 }
312 byte[] warcinfoBody;
313 if (description != null && description.length() > 0) {
314
315 ANVLRecord blockrecord = ANVLRecord.load(blockfields);
316 blockrecord.addLabelValue(CONTENT_DESCRIPTION, description);
317 warcinfoBody = blockrecord.toString().getBytes("UTF-8");
318 } else {
319
320 warcinfoBody = blockfields.getBytes("UTF-8");
321 }
322
323 URI uri = writeWarcinfoRecord("application/warc-fields", headerrecord,
324 new ByteArrayInputStream(warcinfoBody), warcinfoBody.length);
325 return uri;
326 }
327
328 /***
329 * Write a warcinfo to current file.
330 * TODO: Write crawl metadata or pointers to crawl description.
331 * @param mimetype Mimetype of the <code>fileMetadata</code> block.
332 * @param namedFields Named fields. Pass <code>null</code> if none.
333 * @param fileMetadata Metadata about this WARC as RDF, ANVL, etc.
334 * @param fileMetadataLength Length of <code>fileMetadata</code>.
335 * @throws IOException
336 * @return Generated record-id made with
337 * <a href="http://en.wikipedia.org/wiki/Data:_URL">data: scheme</a> and
338 * the current filename.
339 */
340 public URI writeWarcinfoRecord(final String mimetype,
341 final ANVLRecord namedFields, final InputStream fileMetadata,
342 final long fileMetadataLength)
343 throws IOException {
344 final URI recordid = generateRecordId(TYPE, WARCINFO);
345 writeWarcinfoRecord(ArchiveUtils.getLog14Date(), mimetype, recordid,
346 namedFields, fileMetadata, fileMetadataLength);
347 return recordid;
348 }
349
350 /***
351 * Write a <code>warcinfo</code> to current file.
352 * The <code>warcinfo</code> type uses its <code>recordId</code> as its URL.
353 * @param recordId URI to use for this warcinfo.
354 * @param create14DigitDate Record creation date as 14 digit date.
355 * @param mimetype Mimetype of the <code>fileMetadata</code>.
356 * @param namedFields Named fields.
357 * @param fileMetadata Metadata about this WARC as RDF, ANVL, etc.
358 * @param fileMetadataLength Length of <code>fileMetadata</code>.
359 * @throws IOException
360 */
361 public void writeWarcinfoRecord(final String create14DigitDate,
362 final String mimetype, final URI recordId, final ANVLRecord namedFields,
363 final InputStream fileMetadata, final long fileMetadataLength)
364 throws IOException {
365 writeRecord(WARCINFO, null, create14DigitDate, mimetype,
366 recordId, namedFields, fileMetadata, fileMetadataLength, true);
367 }
368
369 public void writeRequestRecord(final String url,
370 final String create14DigitDate, final String mimetype,
371 final URI recordId,
372 final ANVLRecord namedFields, final InputStream request,
373 final long requestLength)
374 throws IOException {
375 writeRecord(REQUEST, url, create14DigitDate,
376 mimetype, recordId, namedFields, request,
377 requestLength, true);
378 }
379
380 public void writeResourceRecord(final String url,
381 final String create14DigitDate, final String mimetype,
382 final ANVLRecord namedFields, final InputStream response,
383 final long responseLength)
384 throws IOException {
385 writeResourceRecord(url, create14DigitDate, mimetype, getRecordID(),
386 namedFields, response, responseLength);
387 }
388
389 public void writeResourceRecord(final String url,
390 final String create14DigitDate, final String mimetype,
391 final URI recordId,
392 final ANVLRecord namedFields, final InputStream response,
393 final long responseLength)
394 throws IOException {
395 writeRecord(RESOURCE, url, create14DigitDate,
396 mimetype, recordId, namedFields, response,
397 responseLength, true);
398 }
399
400 public void writeResponseRecord(final String url,
401 final String create14DigitDate, final String mimetype,
402 final URI recordId,
403 final ANVLRecord namedFields, final InputStream response,
404 final long responseLength)
405 throws IOException {
406 writeRecord(RESPONSE, url, create14DigitDate,
407 mimetype, recordId, namedFields, response,
408 responseLength, true);
409 }
410
411 public void writeRevisitRecord(final String url,
412 final String create14DigitDate, final String mimetype,
413 final URI recordId,
414 final ANVLRecord namedFields, final InputStream response,
415 final long responseLength)
416 throws IOException {
417 writeRecord(REVISIT, url, create14DigitDate,
418 mimetype, recordId, namedFields, response,
419 responseLength, false);
420 }
421
422 public void writeMetadataRecord(final String url,
423 final String create14DigitDate, final String mimetype,
424 final URI recordId,
425 final ANVLRecord namedFields, final InputStream metadata,
426 final long metadataLength)
427 throws IOException {
428 writeRecord(METADATA, url, create14DigitDate,
429 mimetype, recordId, namedFields, metadata,
430 metadataLength, true);
431 }
432
433 /***
434 * Convenience method for getting Record-Ids.
435 * @return A record ID.
436 * @throws IOException
437 */
438 public static URI getRecordID() throws IOException {
439 URI result;
440 try {
441 result = GeneratorFactory.getFactory().getRecordID();
442 } catch (URISyntaxException e) {
443 throw new IOException(e.toString());
444 }
445 return result;
446 }
447 }