1   /* CrawlerJournal.java
2    *
3    * Created on Mar 6, 2007
4    *
5    * Copyright (C) 2007 Internet Archive.
6    *
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    *
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   *
14   * Heritrix is distributed in the hope that it will be useful,
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   *
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.crawler.io;
24  
25  import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
26  import it.unimi.dsi.mg4j.util.MutableString;
27  
28  import java.io.BufferedInputStream;
29  import java.io.BufferedReader;
30  import java.io.File;
31  import java.io.FileInputStream;
32  import java.io.FileNotFoundException;
33  import java.io.FileOutputStream;
34  import java.io.IOException;
35  import java.io.InputStreamReader;
36  import java.io.OutputStreamWriter;
37  import java.io.Writer;
38  import java.util.zip.GZIPInputStream;
39  import java.util.zip.GZIPOutputStream;
40  
41  import org.archive.util.ArchiveUtils;
42  
43  /***
44   * Utility class for a crawler journal/log that is compressed and 
45   * rotates by serial number at checkpoints. 
46   * 
47   * @author gojomo
48   */
49  public class CrawlerJournal {
50  
51      /*** prefix for error lines*/
52      public static final String LOG_ERROR = "E ";
53      /*** prefix for timestamp lines */
54      public static final String LOG_TIMESTAMP = "T ";
55      
56      /***
57       * Get a BufferedReader on the crawler journal given
58       * 
59       * @param source File journal
60       * @return journal buffered reader.
61       * @throws IOException
62       */
63      public static BufferedReader getBufferedReader(File source) throws IOException {
64          boolean isGzipped = source.getName().toLowerCase().
65              endsWith(GZIP_SUFFIX);
66          FileInputStream fis = new FileInputStream(source);
67          return new BufferedReader(isGzipped?
68              new InputStreamReader(new GZIPInputStream(fis)):
69              new InputStreamReader(fis));   
70      }
71  
72      /***
73       * Get a BufferedInputStream on the recovery file given.
74       *
75       * @param source file to open
76       * @return journal buffered input stream.
77       * @throws IOException
78       */
79      public static BufferedInputStream getBufferedInput(File source) throws IOException {
80          boolean isGzipped = source.getName().toLowerCase().
81              endsWith(GZIP_SUFFIX);
82          FileInputStream fis = new FileInputStream(source);
83          return isGzipped ? new BufferedInputStream(new GZIPInputStream(fis))
84                  : new BufferedInputStream(fis);
85      }
86  
87      /***
88       * Stream on which we record frontier events.
89       */
90      protected Writer out = null;
91      
92      /*** line count */ 
93      protected long lines = 0;
94      /*** number of lines between timestamps */ 
95      protected int timestamp_interval = 0; // 0 means no timestamps
96  
97      
98      /*** suffix to recognize gzipped files */
99      public static final String GZIP_SUFFIX = ".gz";
100     
101     /***
102      * File we're writing journal to.
103      * Keep a reference in case we want to rotate it off.
104      */
105     protected File gzipFile = null;
106     
107     /***
108      * Create a new crawler journal at the given location
109      * 
110      * @param path Directory to make thejournal in.
111      * @param filename Name to use for journal file.
112      * @throws IOException
113      */
114     public CrawlerJournal(String path, String filename)
115     throws IOException {
116         this.gzipFile = new File(path, filename);
117         this.out = initialize(gzipFile);
118     }
119     
120     /***
121      * Create a new crawler journal at the given location
122      * 
123      * @param file path at which to make journal
124      * @throws IOException
125      */
126     public CrawlerJournal(File file) throws IOException {
127         this.gzipFile = file;
128         this.out = initialize(gzipFile);
129     }
130     
131     /***
132      * Allocate a buffer for accumulating lines to write and reuse it.
133      */
134     protected MutableString accumulatingBuffer = new MutableString(1024);
135 
136     protected Writer initialize(final File f) throws FileNotFoundException, IOException {
137         return new OutputStreamWriter(new GZIPOutputStream(
138             new FastBufferedOutputStream(new FileOutputStream(f))));
139     }
140 
141     /***
142      * Write a line
143      * 
144      * @param string String
145      */
146     public synchronized void writeLine(String string) {
147         try {
148             this.out.write("\n");
149             this.out.write(string);
150             noteLine();
151         } catch (IOException e) {
152             e.printStackTrace();
153         }
154     }
155 
156     /***
157      * Write a line of two strings
158      * 
159      * @param s1 String
160      * @param s2 String
161      */
162     public synchronized void writeLine(String s1, String s2) {
163         try {
164             this.out.write("\n");
165             this.out.write(s1);
166             this.out.write(s2);
167             noteLine();
168         } catch (IOException e) {
169             e.printStackTrace();
170         }
171     }
172     
173     /***
174      * Write a line of three strings
175      * 
176      * @param s1 String
177      * @param s2 String
178      * @param s3 String
179      */
180     public synchronized void writeLine(String s1, String s2, String s3) {
181         try {
182             this.out.write("\n");
183             this.out.write(s1);
184             this.out.write(s2);
185             this.out.write(s3);
186             noteLine();
187         } catch (IOException e) {
188             e.printStackTrace();
189         }
190     }
191 
192     /***
193      * Write a line. 
194      * 
195      * @param mstring MutableString to write
196      */
197     public synchronized void writeLine(MutableString mstring) {
198         if (this.out == null) {
199             return;
200         }
201         try {
202             this.out.write("\n");
203             mstring.write(out);
204             noteLine();
205         } catch (IOException e) {
206             e.printStackTrace();
207         }
208     }
209 
210     /***
211      * Count and note a line
212      * 
213      * @throws IOException
214      */
215     protected void noteLine() throws IOException {
216         lines++;
217         considerTimestamp();
218     }
219 
220     /***
221      * Write a timestamp line if appropriate
222      * 
223      * @throws IOException
224      */
225     protected void considerTimestamp() throws IOException {
226         if(timestamp_interval > 0 && lines % timestamp_interval == 0) {
227             out.write("\n");
228             out.write(LOG_TIMESTAMP);
229             out.write(ArchiveUtils.getLog14Date());
230         }
231     }
232 
233     /***
234      * Flush and close the underlying IO objects.
235      */
236     public void close() {
237         if (this.out == null) {
238             return;
239         }
240         try {
241             this.out.flush();
242             this.out.close();
243             this.out = null;
244         } catch (IOException e) {
245             e.printStackTrace();
246         }
247     }
248 
249     /***
250      * Note a serious error vioa a special log line
251      * 
252      * @param err
253      */
254     public void seriousError(String err) {
255         writeLine("\n"+LOG_ERROR+ArchiveUtils.getLog14Date()+" "+err);
256     }
257 
258     /***
259      * Handle a checkpoint by rotating the current log to a checkpoint-named
260      * file and starting a new log. 
261      * 
262      * @param checkpointDir
263      * @throws IOException
264      */
265     public synchronized void checkpoint(final File checkpointDir) throws IOException {
266         if (this.out == null || !this.gzipFile.exists()) {
267             return;
268         }
269         close();
270         // Rename gzipFile with the checkpoint name as suffix.
271         this.gzipFile.renameTo(new File(this.gzipFile.getParentFile(),
272                 this.gzipFile.getName() + "." + checkpointDir.getName()));
273         // Open new gzip file.
274         this.out = initialize(this.gzipFile);
275     }
276 
277 }