1   /* $Id: WriterPoolMember.java 5707 2008-01-31 02:06:18Z Gojomo $
2    *
3    * Created on July 21st, 2006
4    *
5    * Copyright (C) 2006 Internet Archive.
6    *
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    *
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   *
14   * Heritrix is distributed in the hope that it will be useful,
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   *
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.io;
24  
25  import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
26  
27  import java.io.File;
28  import java.io.FileOutputStream;
29  import java.io.IOException;
30  import java.io.InputStream;
31  import java.io.OutputStream;
32  import java.text.DecimalFormat;
33  import java.text.NumberFormat;
34  import java.util.Iterator;
35  import java.util.List;
36  import java.util.concurrent.atomic.AtomicInteger;
37  import java.util.logging.Logger;
38  import java.util.zip.GZIPOutputStream;
39  
40  import org.archive.util.ArchiveUtils;
41  import org.archive.util.IoUtils;
42  import org.archive.util.TimestampSerialno;
43  
44  
45  /***
46   * Member of {@link WriterPool}.
47   * Implements rotating off files, file naming with some guarantee of
48   * uniqueness, and position in file. Subclass to pick up functionality for a
49   * particular Writer type.
50   * @author stack
51   * @version $Date: 2008-01-31 02:06:18 +0000 (Thu, 31 Jan 2008) $ $Revision: 5707 $
52   */
53  public abstract class WriterPoolMember implements ArchiveFileConstants {
54      private final Logger logger = Logger.getLogger(this.getClass().getName());
55      
56      public static final String UTF8 = "UTF-8";
57      
58      /***
59       * Default file prefix.
60       * 
61       * Stands for Internet Archive Heritrix.
62       */
63      public static final String DEFAULT_PREFIX = "IAH";
64      
65      /***
66       * Value to interpolate with actual hostname.
67       */
68      public static final String HOSTNAME_VARIABLE = "${HOSTNAME}";
69      
70      /***
71       * Default for file suffix.
72       */
73      public static final String DEFAULT_SUFFIX = HOSTNAME_VARIABLE;
74  
75      /***
76       * Reference to file we're currently writing.
77       */
78      private File f = null;
79  
80      /***
81       *  Output stream for file.
82       */
83      private OutputStream out = null;
84      
85      /***
86       * File output stream.
87       * This is needed so can get at channel to find current position in file.
88       */
89      private FileOutputStream fos;
90      
91      private final boolean compressed;
92      private List<File> writeDirs = null;
93      private String prefix = DEFAULT_PREFIX;
94      private String suffix = DEFAULT_SUFFIX;
95      private final long maxSize;
96      private final String extension;
97  
98      /***
99       * Creation date for the current file.
100      * Set by {@link #createFile()}.
101      */
102 	private String createTimestamp = "UNSET!!!";
103     
104     /***
105      * A running sequence used making unique file names.
106      */
107     final private AtomicInteger serialNo;
108     
109     /***
110      * Directories round-robin index.
111      */
112     private static int roundRobinIndex = 0;
113 
114     /***
115      * NumberFormat instance for formatting serial number.
116      *
117      * Pads serial number with zeros.
118      */
119     private static NumberFormat serialNoFormatter = new DecimalFormat("00000");
120     
121     
122     /***
123      * Buffer to reuse writing streams.
124      */
125     private final byte [] scratchbuffer = new byte[4 * 1024];
126  
127     
128     /***
129      * Constructor.
130      * Takes a stream. Use with caution. There is no upperbound check on size.
131      * Will just keep writing.
132      * 
133      * @param serialNo  used to create unique filename sequences
134      * @param out Where to write.
135      * @param file File the <code>out</code> is connected to.
136      * @param cmprs Compress the content written.
137      * @param a14DigitDate If null, we'll write current time.
138      * @throws IOException
139      */
140     protected WriterPoolMember(AtomicInteger serialNo, 
141             final OutputStream out, final File file,
142             final boolean cmprs, String a14DigitDate)
143     throws IOException {
144         this(serialNo, null, null, cmprs, -1, null);
145         this.out = out;
146         this.f = file;
147     }
148     
149     /***
150      * Constructor.
151      *
152      * @param serialNo  used to create unique filename sequences
153      * @param dirs Where to drop files.
154      * @param prefix File prefix to use.
155      * @param cmprs Compress the records written. 
156      * @param maxSize Maximum size for ARC files written.
157      * @param extension Extension to give file.
158      */
159     public WriterPoolMember(AtomicInteger serialNo, 
160             final List<File> dirs, final String prefix, 
161             final boolean cmprs, final long maxSize, final String extension) {
162         this(serialNo, dirs, prefix, "", cmprs, maxSize, extension);
163     }
164             
165     /***
166      * Constructor.
167      *
168      * @param serialNo  used to create unique filename sequences
169      * @param dirs Where to drop files.
170      * @param prefix File prefix to use.
171      * @param cmprs Compress the records written. 
172      * @param maxSize Maximum size for ARC files written.
173      * @param suffix File tail to use.  If null, unused.
174      * @param extension Extension to give file.
175      */
176     public WriterPoolMember(AtomicInteger serialNo,
177             final List<File> dirs, final String prefix, 
178             final String suffix, final boolean cmprs,
179             final long maxSize, final String extension) {
180         this.suffix = suffix;
181         this.prefix = prefix;
182         this.maxSize = maxSize;
183         this.writeDirs = dirs;
184         this.compressed = cmprs;
185         this.extension = extension;
186         this.serialNo = serialNo;
187     }
188 
189 	/***
190 	 * Call this method just before/after any significant write.
191 	 *
192 	 * Call at the end of the writing of a record or just before we start
193 	 * writing a new record.  Will close current file and open a new file
194 	 * if file size has passed out maxSize.
195 	 * 
196 	 * <p>Creates and opens a file if none already open.  One use of this method
197 	 * then is after construction, call this method to add the metadata, then
198 	 * call {@link #getPosition()} to find offset of first record.
199 	 *
200 	 * @exception IOException
201 	 */
202     public void checkSize() throws IOException {
203         if (this.out == null ||
204                 (this.maxSize != -1 && (this.f.length() > this.maxSize))) {
205             createFile();
206         }
207     }
208 
209     /***
210      * Create a new file.
211      * Rotates off the current Writer and creates a new in its place
212      * to take subsequent writes.  Usually called from {@link #checkSize()}.
213      * @return Name of file created.
214      * @throws IOException
215      */
216     protected String createFile() throws IOException {
217         TimestampSerialno tsn = getTimestampSerialNo();
218         String name = this.prefix + '-' + getUniqueBasename(tsn) +
219             ((this.suffix == null || this.suffix.length() <= 0)?
220                 "": "-" + this.suffix) + '.' + this.extension  +
221             ((this.compressed)? '.' + COMPRESSED_FILE_EXTENSION: "") +
222             OCCUPIED_SUFFIX;
223         this.createTimestamp = tsn.getTimestamp();
224         File dir = getNextDirectory(this.writeDirs);
225         return createFile(new File(dir, name));
226     }
227     
228     protected String createFile(final File file) throws IOException {
229     	close();
230         this.f = file;
231         this.fos = new FileOutputStream(this.f);
232         this.out = new FastBufferedOutputStream(this.fos);
233         logger.info("Opened " + this.f.getAbsolutePath());
234         return this.f.getName();
235     }
236     
237     /***
238      * @param dirs List of File objects that point at directories.
239      * @return Find next directory to write an arc too.  If more
240      * than one, it tries to round-robin through each in turn.
241      * @throws IOException
242      */
243     protected File getNextDirectory(List<File> dirs)
244     throws IOException {
245         if (WriterPoolMember.roundRobinIndex >= dirs.size()) {
246             WriterPoolMember.roundRobinIndex = 0;
247         }
248         File d = null;
249         try {
250             d = checkWriteable((File)dirs.
251                 get(WriterPoolMember.roundRobinIndex));
252         } catch (IndexOutOfBoundsException e) {
253             // Dirs list might be altered underneath us.
254             // If so, we get this exception -- just keep on going.
255         }
256         if (d == null && dirs.size() > 1) {
257             for (Iterator i = dirs.iterator(); d == null && i.hasNext();) {
258                 d = checkWriteable((File)i.next());
259             }
260         } else {
261             WriterPoolMember.roundRobinIndex++;
262         }
263         if (d == null) {
264             throw new IOException("Directories unusable.");
265         }
266         return d;
267     }
268         
269     protected File checkWriteable(File d) {
270         if (d == null) {
271             return d;
272         }
273         
274         try {
275             IoUtils.ensureWriteableDirectory(d);
276         } catch(IOException e) {
277             logger.warning("Directory " + d.getPath() + " is not" +
278                 " writeable or cannot be created: " + e.getMessage());
279             d = null;
280         }
281         return d;
282     }
283     
284     protected synchronized TimestampSerialno getTimestampSerialNo() {
285         return getTimestampSerialNo(null);
286     }
287     
288     /***
289      * Do static synchronization around getting of counter and timestamp so
290      * no chance of a thread getting in between the getting of timestamp and
291      * allocation of serial number throwing the two out of alignment.
292      * 
293      * @param timestamp If non-null, use passed timestamp (must be 14 digit
294      * ARC format), else if null, timestamp with now.
295      * @return Instance of data structure that has timestamp and serial no.
296      */
297     protected synchronized TimestampSerialno
298             getTimestampSerialNo(final String timestamp) {
299         return new TimestampSerialno((timestamp != null)?
300                 timestamp: ArchiveUtils.get14DigitDate(),
301                 serialNo.getAndIncrement());
302     }
303 
304     /***
305      * Return a unique basename.
306      *
307      * Name is timestamp + an every increasing sequence number.
308      *
309      * @param tsn Structure with timestamp and serial number.
310      *
311      * @return Unique basename.
312      */
313     private String getUniqueBasename(TimestampSerialno tsn) {
314         return tsn.getTimestamp() + "-" +
315            WriterPoolMember.serialNoFormatter.format(tsn.getSerialNumber());
316     }
317 
318 
319     /***
320      * Get the file name
321      * 
322      * @return the filename, as if uncompressed
323      */
324     protected String getBaseFilename() {
325         String name = this.f.getName();
326         if (this.compressed && name.endsWith(DOT_COMPRESSED_FILE_EXTENSION)) {
327             return name.substring(0,name.length() - 3);
328         } else if(this.compressed &&
329                 name.endsWith(DOT_COMPRESSED_FILE_EXTENSION +
330                     OCCUPIED_SUFFIX)) {
331             return name.substring(0, name.length() -
332                 (3 + OCCUPIED_SUFFIX.length()));
333         } else {
334             return name;
335         }
336     }
337 
338 	/***
339 	 * Get this file.
340 	 *
341 	 * Used by junit test to test for creation and when {@link WriterPool} wants
342      * to invalidate a file.
343 	 *
344 	 * @return The current file.
345 	 */
346     public File getFile() {
347         return this.f;
348     }
349 
350     /***
351      * Post write tasks.
352      * 
353      * Has side effects.  Will open new file if we're at the upperbound.
354      * If we're writing compressed files, it will wrap output stream with a
355      * GZIP writer with side effect that GZIP header is written out on the
356      * stream.
357      *
358      * @exception IOException
359      */
360     protected void preWriteRecordTasks()
361     throws IOException {
362         checkSize();
363         if (this.compressed) {
364             // Wrap stream in GZIP Writer.
365             // The below construction immediately writes the GZIP 'default'
366             // header out on the underlying stream.
367             this.out = new CompressedStream(this.out);
368         }
369     }
370 
371     /***
372      * Post file write tasks.
373      * If compressed, finishes up compression and flushes stream so any
374      * subsequent checks get good reading.
375      *
376      * @exception IOException
377      */
378     protected void postWriteRecordTasks()
379     throws IOException {
380         if (this.compressed) {
381             CompressedStream o = (CompressedStream)this.out;
382             o.finish();
383             o.flush();
384             o.end();
385             this.out = o.getWrappedStream();
386         }
387     }
388     
389 	/***
390      * Postion in current physical file.
391      * Used making accounting of bytes written.
392 	 * @return Position in underlying file.  Call before or after writing
393      * records *only* to be safe.
394 	 * @throws IOException
395 	 */
396     public long getPosition() throws IOException {
397         long position = 0;
398         if (this.out != null) {
399             this.out.flush();
400         }
401         if (this.fos != null) {
402             // Call flush on underlying file though probably not needed assuming
403             // above this.out.flush called through to this.fos.
404             this.fos.flush();
405             position = this.fos.getChannel().position();
406         }
407         return position;
408     }
409 
410     public boolean isCompressed() {
411         return compressed;
412     }
413     
414     protected void write(final byte [] b) throws IOException {
415     	this.out.write(b);
416     }
417     
418 	protected void flush() throws IOException {
419 		this.out.flush();
420 	}
421 
422 	protected void write(byte[] b, int off, int len) throws IOException {
423 		this.out.write(b, off, len);
424 	}
425 
426 	protected void write(int b) throws IOException {
427 		this.out.write(b);
428 	}
429 	
430 	/***
431      * @deprecated Use {@link #copyFrom(InputStream,long,boolean)} instead
432      */
433     protected void readFullyFrom(final InputStream is, final long recordLength,
434     		final byte [] b)
435     throws IOException {
436         copyFrom(is, recordLength, true);
437     }
438 
439     /***
440      * @deprecated Use {@link #copyFrom(InputStream,long,boolean)} instead
441      */
442 	protected void readToLimitFrom(final InputStream is, final long limit,
443 			final byte [] b)
444 	throws IOException {
445         copyFrom(is, limit, true);
446 	}
447 
448     /***
449      * Copy bytes from the provided InputStream to the target file/stream being
450      * written.
451      * 
452      * @param is
453      *            InputStream to copy bytes from
454      * @param recordLength
455      *            expected number of bytes to copy
456      * @param enforceLength
457      *            whether to throw an exception if too many/too few bytes are
458      *            available from stream
459      * @throws IOException
460      */
461     protected void copyFrom(final InputStream is, final long recordLength,
462             boolean enforceLength) throws IOException {
463         int read = scratchbuffer.length;
464         long tot = 0;
465         while ((tot < recordLength)
466                 && (read = is.read(scratchbuffer)) != -1) {
467             int write = read; 
468             // never write more than declared length
469             write = (int) Math.min(write, recordLength - tot);
470             tot += read;
471             write(scratchbuffer, 0, write);
472         }
473         if (enforceLength && tot != recordLength) {
474             // throw exception if desired for read vs. declared mismatches
475             throw new IOException("Read " + tot + " but expected "
476                     + recordLength);
477         }
478     }
479 
480     public void close() throws IOException {
481         if (this.out == null) {
482             return;
483         }
484         this.out.close();
485         this.out = null;
486         this.fos = null;
487         if (this.f != null && this.f.exists()) {
488             String path = this.f.getAbsolutePath();
489             if (path.endsWith(OCCUPIED_SUFFIX)) {
490                 File f = new File(path.substring(0,
491                         path.length() - OCCUPIED_SUFFIX.length()));
492                 if (!this.f.renameTo(f)) {
493                     logger.warning("Failed rename of " + path);
494                 }
495                 this.f = f;
496             }
497             
498             logger.info("Closed " + this.f.getAbsolutePath() +
499                     ", size " + this.f.length());
500         }
501     }
502     
503     protected OutputStream getOutputStream() {
504     	return this.out;
505     }
506     
507 	protected String getCreateTimestamp() {
508 		return createTimestamp;
509 	}
510     
511     
512     /***
513      * An override so we get access to underlying output stream
514      * and offer an end() that does not accompany closing underlying
515      * stream. 
516      * @author stack
517      */
518     private class CompressedStream extends GZIPOutputStream {
519         public CompressedStream(OutputStream out)
520         throws IOException {
521             super(out);
522         }
523         
524         /***
525          * @return Reference to stream being compressed.
526          */
527         OutputStream getWrappedStream() {
528             return this.out;
529         }
530 
531         /***
532          * Release the deflater's native process resources,
533          * which otherwise would not occur until either
534          * finalization or DeflaterOutputStream.close() 
535          * (which would also close underlying stream). 
536          */
537         public void end() {
538             def.end();
539         }
540         
541         
542     }
543 }