1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.io;
24
25 import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
26
27 import java.io.File;
28 import java.io.FileOutputStream;
29 import java.io.IOException;
30 import java.io.InputStream;
31 import java.io.OutputStream;
32 import java.text.DecimalFormat;
33 import java.text.NumberFormat;
34 import java.util.Iterator;
35 import java.util.List;
36 import java.util.concurrent.atomic.AtomicInteger;
37 import java.util.logging.Logger;
38 import java.util.zip.GZIPOutputStream;
39
40 import org.archive.util.ArchiveUtils;
41 import org.archive.util.IoUtils;
42 import org.archive.util.TimestampSerialno;
43
44
45 /***
46 * Member of {@link WriterPool}.
47 * Implements rotating off files, file naming with some guarantee of
48 * uniqueness, and position in file. Subclass to pick up functionality for a
49 * particular Writer type.
50 * @author stack
51 * @version $Date: 2008-01-31 02:06:18 +0000 (Thu, 31 Jan 2008) $ $Revision: 5707 $
52 */
53 public abstract class WriterPoolMember implements ArchiveFileConstants {
54 private final Logger logger = Logger.getLogger(this.getClass().getName());
55
56 public static final String UTF8 = "UTF-8";
57
58 /***
59 * Default file prefix.
60 *
61 * Stands for Internet Archive Heritrix.
62 */
63 public static final String DEFAULT_PREFIX = "IAH";
64
65 /***
66 * Value to interpolate with actual hostname.
67 */
68 public static final String HOSTNAME_VARIABLE = "${HOSTNAME}";
69
70 /***
71 * Default for file suffix.
72 */
73 public static final String DEFAULT_SUFFIX = HOSTNAME_VARIABLE;
74
75 /***
76 * Reference to file we're currently writing.
77 */
78 private File f = null;
79
80 /***
81 * Output stream for file.
82 */
83 private OutputStream out = null;
84
85 /***
86 * File output stream.
87 * This is needed so can get at channel to find current position in file.
88 */
89 private FileOutputStream fos;
90
91 private final boolean compressed;
92 private List<File> writeDirs = null;
93 private String prefix = DEFAULT_PREFIX;
94 private String suffix = DEFAULT_SUFFIX;
95 private final long maxSize;
96 private final String extension;
97
98 /***
99 * Creation date for the current file.
100 * Set by {@link #createFile()}.
101 */
102 private String createTimestamp = "UNSET!!!";
103
104 /***
105 * A running sequence used making unique file names.
106 */
107 final private AtomicInteger serialNo;
108
109 /***
110 * Directories round-robin index.
111 */
112 private static int roundRobinIndex = 0;
113
114 /***
115 * NumberFormat instance for formatting serial number.
116 *
117 * Pads serial number with zeros.
118 */
119 private static NumberFormat serialNoFormatter = new DecimalFormat("00000");
120
121
122 /***
123 * Buffer to reuse writing streams.
124 */
125 private final byte [] scratchbuffer = new byte[4 * 1024];
126
127
128 /***
129 * Constructor.
130 * Takes a stream. Use with caution. There is no upperbound check on size.
131 * Will just keep writing.
132 *
133 * @param serialNo used to create unique filename sequences
134 * @param out Where to write.
135 * @param file File the <code>out</code> is connected to.
136 * @param cmprs Compress the content written.
137 * @param a14DigitDate If null, we'll write current time.
138 * @throws IOException
139 */
140 protected WriterPoolMember(AtomicInteger serialNo,
141 final OutputStream out, final File file,
142 final boolean cmprs, String a14DigitDate)
143 throws IOException {
144 this(serialNo, null, null, cmprs, -1, null);
145 this.out = out;
146 this.f = file;
147 }
148
149 /***
150 * Constructor.
151 *
152 * @param serialNo used to create unique filename sequences
153 * @param dirs Where to drop files.
154 * @param prefix File prefix to use.
155 * @param cmprs Compress the records written.
156 * @param maxSize Maximum size for ARC files written.
157 * @param extension Extension to give file.
158 */
159 public WriterPoolMember(AtomicInteger serialNo,
160 final List<File> dirs, final String prefix,
161 final boolean cmprs, final long maxSize, final String extension) {
162 this(serialNo, dirs, prefix, "", cmprs, maxSize, extension);
163 }
164
165 /***
166 * Constructor.
167 *
168 * @param serialNo used to create unique filename sequences
169 * @param dirs Where to drop files.
170 * @param prefix File prefix to use.
171 * @param cmprs Compress the records written.
172 * @param maxSize Maximum size for ARC files written.
173 * @param suffix File tail to use. If null, unused.
174 * @param extension Extension to give file.
175 */
176 public WriterPoolMember(AtomicInteger serialNo,
177 final List<File> dirs, final String prefix,
178 final String suffix, final boolean cmprs,
179 final long maxSize, final String extension) {
180 this.suffix = suffix;
181 this.prefix = prefix;
182 this.maxSize = maxSize;
183 this.writeDirs = dirs;
184 this.compressed = cmprs;
185 this.extension = extension;
186 this.serialNo = serialNo;
187 }
188
189 /***
190 * Call this method just before/after any significant write.
191 *
192 * Call at the end of the writing of a record or just before we start
193 * writing a new record. Will close current file and open a new file
194 * if file size has passed out maxSize.
195 *
196 * <p>Creates and opens a file if none already open. One use of this method
197 * then is after construction, call this method to add the metadata, then
198 * call {@link #getPosition()} to find offset of first record.
199 *
200 * @exception IOException
201 */
202 public void checkSize() throws IOException {
203 if (this.out == null ||
204 (this.maxSize != -1 && (this.f.length() > this.maxSize))) {
205 createFile();
206 }
207 }
208
209 /***
210 * Create a new file.
211 * Rotates off the current Writer and creates a new in its place
212 * to take subsequent writes. Usually called from {@link #checkSize()}.
213 * @return Name of file created.
214 * @throws IOException
215 */
216 protected String createFile() throws IOException {
217 TimestampSerialno tsn = getTimestampSerialNo();
218 String name = this.prefix + '-' + getUniqueBasename(tsn) +
219 ((this.suffix == null || this.suffix.length() <= 0)?
220 "": "-" + this.suffix) + '.' + this.extension +
221 ((this.compressed)? '.' + COMPRESSED_FILE_EXTENSION: "") +
222 OCCUPIED_SUFFIX;
223 this.createTimestamp = tsn.getTimestamp();
224 File dir = getNextDirectory(this.writeDirs);
225 return createFile(new File(dir, name));
226 }
227
228 protected String createFile(final File file) throws IOException {
229 close();
230 this.f = file;
231 this.fos = new FileOutputStream(this.f);
232 this.out = new FastBufferedOutputStream(this.fos);
233 logger.info("Opened " + this.f.getAbsolutePath());
234 return this.f.getName();
235 }
236
237 /***
238 * @param dirs List of File objects that point at directories.
239 * @return Find next directory to write an arc too. If more
240 * than one, it tries to round-robin through each in turn.
241 * @throws IOException
242 */
243 protected File getNextDirectory(List<File> dirs)
244 throws IOException {
245 if (WriterPoolMember.roundRobinIndex >= dirs.size()) {
246 WriterPoolMember.roundRobinIndex = 0;
247 }
248 File d = null;
249 try {
250 d = checkWriteable((File)dirs.
251 get(WriterPoolMember.roundRobinIndex));
252 } catch (IndexOutOfBoundsException e) {
253
254
255 }
256 if (d == null && dirs.size() > 1) {
257 for (Iterator i = dirs.iterator(); d == null && i.hasNext();) {
258 d = checkWriteable((File)i.next());
259 }
260 } else {
261 WriterPoolMember.roundRobinIndex++;
262 }
263 if (d == null) {
264 throw new IOException("Directories unusable.");
265 }
266 return d;
267 }
268
269 protected File checkWriteable(File d) {
270 if (d == null) {
271 return d;
272 }
273
274 try {
275 IoUtils.ensureWriteableDirectory(d);
276 } catch(IOException e) {
277 logger.warning("Directory " + d.getPath() + " is not" +
278 " writeable or cannot be created: " + e.getMessage());
279 d = null;
280 }
281 return d;
282 }
283
284 protected synchronized TimestampSerialno getTimestampSerialNo() {
285 return getTimestampSerialNo(null);
286 }
287
288 /***
289 * Do static synchronization around getting of counter and timestamp so
290 * no chance of a thread getting in between the getting of timestamp and
291 * allocation of serial number throwing the two out of alignment.
292 *
293 * @param timestamp If non-null, use passed timestamp (must be 14 digit
294 * ARC format), else if null, timestamp with now.
295 * @return Instance of data structure that has timestamp and serial no.
296 */
297 protected synchronized TimestampSerialno
298 getTimestampSerialNo(final String timestamp) {
299 return new TimestampSerialno((timestamp != null)?
300 timestamp: ArchiveUtils.get14DigitDate(),
301 serialNo.getAndIncrement());
302 }
303
304 /***
305 * Return a unique basename.
306 *
307 * Name is timestamp + an every increasing sequence number.
308 *
309 * @param tsn Structure with timestamp and serial number.
310 *
311 * @return Unique basename.
312 */
313 private String getUniqueBasename(TimestampSerialno tsn) {
314 return tsn.getTimestamp() + "-" +
315 WriterPoolMember.serialNoFormatter.format(tsn.getSerialNumber());
316 }
317
318
319 /***
320 * Get the file name
321 *
322 * @return the filename, as if uncompressed
323 */
324 protected String getBaseFilename() {
325 String name = this.f.getName();
326 if (this.compressed && name.endsWith(DOT_COMPRESSED_FILE_EXTENSION)) {
327 return name.substring(0,name.length() - 3);
328 } else if(this.compressed &&
329 name.endsWith(DOT_COMPRESSED_FILE_EXTENSION +
330 OCCUPIED_SUFFIX)) {
331 return name.substring(0, name.length() -
332 (3 + OCCUPIED_SUFFIX.length()));
333 } else {
334 return name;
335 }
336 }
337
338 /***
339 * Get this file.
340 *
341 * Used by junit test to test for creation and when {@link WriterPool} wants
342 * to invalidate a file.
343 *
344 * @return The current file.
345 */
346 public File getFile() {
347 return this.f;
348 }
349
350 /***
351 * Post write tasks.
352 *
353 * Has side effects. Will open new file if we're at the upperbound.
354 * If we're writing compressed files, it will wrap output stream with a
355 * GZIP writer with side effect that GZIP header is written out on the
356 * stream.
357 *
358 * @exception IOException
359 */
360 protected void preWriteRecordTasks()
361 throws IOException {
362 checkSize();
363 if (this.compressed) {
364
365
366
367 this.out = new CompressedStream(this.out);
368 }
369 }
370
371 /***
372 * Post file write tasks.
373 * If compressed, finishes up compression and flushes stream so any
374 * subsequent checks get good reading.
375 *
376 * @exception IOException
377 */
378 protected void postWriteRecordTasks()
379 throws IOException {
380 if (this.compressed) {
381 CompressedStream o = (CompressedStream)this.out;
382 o.finish();
383 o.flush();
384 o.end();
385 this.out = o.getWrappedStream();
386 }
387 }
388
389 /***
390 * Postion in current physical file.
391 * Used making accounting of bytes written.
392 * @return Position in underlying file. Call before or after writing
393 * records *only* to be safe.
394 * @throws IOException
395 */
396 public long getPosition() throws IOException {
397 long position = 0;
398 if (this.out != null) {
399 this.out.flush();
400 }
401 if (this.fos != null) {
402
403
404 this.fos.flush();
405 position = this.fos.getChannel().position();
406 }
407 return position;
408 }
409
410 public boolean isCompressed() {
411 return compressed;
412 }
413
414 protected void write(final byte [] b) throws IOException {
415 this.out.write(b);
416 }
417
418 protected void flush() throws IOException {
419 this.out.flush();
420 }
421
422 protected void write(byte[] b, int off, int len) throws IOException {
423 this.out.write(b, off, len);
424 }
425
426 protected void write(int b) throws IOException {
427 this.out.write(b);
428 }
429
430 /***
431 * @deprecated Use {@link #copyFrom(InputStream,long,boolean)} instead
432 */
433 protected void readFullyFrom(final InputStream is, final long recordLength,
434 final byte [] b)
435 throws IOException {
436 copyFrom(is, recordLength, true);
437 }
438
439 /***
440 * @deprecated Use {@link #copyFrom(InputStream,long,boolean)} instead
441 */
442 protected void readToLimitFrom(final InputStream is, final long limit,
443 final byte [] b)
444 throws IOException {
445 copyFrom(is, limit, true);
446 }
447
448 /***
449 * Copy bytes from the provided InputStream to the target file/stream being
450 * written.
451 *
452 * @param is
453 * InputStream to copy bytes from
454 * @param recordLength
455 * expected number of bytes to copy
456 * @param enforceLength
457 * whether to throw an exception if too many/too few bytes are
458 * available from stream
459 * @throws IOException
460 */
461 protected void copyFrom(final InputStream is, final long recordLength,
462 boolean enforceLength) throws IOException {
463 int read = scratchbuffer.length;
464 long tot = 0;
465 while ((tot < recordLength)
466 && (read = is.read(scratchbuffer)) != -1) {
467 int write = read;
468
469 write = (int) Math.min(write, recordLength - tot);
470 tot += read;
471 write(scratchbuffer, 0, write);
472 }
473 if (enforceLength && tot != recordLength) {
474
475 throw new IOException("Read " + tot + " but expected "
476 + recordLength);
477 }
478 }
479
480 public void close() throws IOException {
481 if (this.out == null) {
482 return;
483 }
484 this.out.close();
485 this.out = null;
486 this.fos = null;
487 if (this.f != null && this.f.exists()) {
488 String path = this.f.getAbsolutePath();
489 if (path.endsWith(OCCUPIED_SUFFIX)) {
490 File f = new File(path.substring(0,
491 path.length() - OCCUPIED_SUFFIX.length()));
492 if (!this.f.renameTo(f)) {
493 logger.warning("Failed rename of " + path);
494 }
495 this.f = f;
496 }
497
498 logger.info("Closed " + this.f.getAbsolutePath() +
499 ", size " + this.f.length());
500 }
501 }
502
503 protected OutputStream getOutputStream() {
504 return this.out;
505 }
506
507 protected String getCreateTimestamp() {
508 return createTimestamp;
509 }
510
511
512 /***
513 * An override so we get access to underlying output stream
514 * and offer an end() that does not accompany closing underlying
515 * stream.
516 * @author stack
517 */
518 private class CompressedStream extends GZIPOutputStream {
519 public CompressedStream(OutputStream out)
520 throws IOException {
521 super(out);
522 }
523
524 /***
525 * @return Reference to stream being compressed.
526 */
527 OutputStream getWrappedStream() {
528 return this.out;
529 }
530
531 /***
532 * Release the deflater's native process resources,
533 * which otherwise would not occur until either
534 * finalization or DeflaterOutputStream.close()
535 * (which would also close underlying stream).
536 */
537 public void end() {
538 def.end();
539 }
540
541
542 }
543 }