1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.crawler.io;
24
25 import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
26 import it.unimi.dsi.mg4j.util.MutableString;
27
28 import java.io.BufferedInputStream;
29 import java.io.BufferedReader;
30 import java.io.File;
31 import java.io.FileInputStream;
32 import java.io.FileNotFoundException;
33 import java.io.FileOutputStream;
34 import java.io.IOException;
35 import java.io.InputStreamReader;
36 import java.io.OutputStreamWriter;
37 import java.io.Writer;
38 import java.util.zip.GZIPInputStream;
39 import java.util.zip.GZIPOutputStream;
40
41 import org.archive.util.ArchiveUtils;
42
43 /***
44 * Utility class for a crawler journal/log that is compressed and
45 * rotates by serial number at checkpoints.
46 *
47 * @author gojomo
48 */
49 public class CrawlerJournal {
50
51 /*** prefix for error lines*/
52 public static final String LOG_ERROR = "E ";
53 /*** prefix for timestamp lines */
54 public static final String LOG_TIMESTAMP = "T ";
55
56 /***
57 * Get a BufferedReader on the crawler journal given
58 *
59 * @param source File journal
60 * @return journal buffered reader.
61 * @throws IOException
62 */
63 public static BufferedReader getBufferedReader(File source) throws IOException {
64 boolean isGzipped = source.getName().toLowerCase().
65 endsWith(GZIP_SUFFIX);
66 FileInputStream fis = new FileInputStream(source);
67 return new BufferedReader(isGzipped?
68 new InputStreamReader(new GZIPInputStream(fis)):
69 new InputStreamReader(fis));
70 }
71
72 /***
73 * Get a BufferedInputStream on the recovery file given.
74 *
75 * @param source file to open
76 * @return journal buffered input stream.
77 * @throws IOException
78 */
79 public static BufferedInputStream getBufferedInput(File source) throws IOException {
80 boolean isGzipped = source.getName().toLowerCase().
81 endsWith(GZIP_SUFFIX);
82 FileInputStream fis = new FileInputStream(source);
83 return isGzipped ? new BufferedInputStream(new GZIPInputStream(fis))
84 : new BufferedInputStream(fis);
85 }
86
87 /***
88 * Stream on which we record frontier events.
89 */
90 protected Writer out = null;
91
92 /*** line count */
93 protected long lines = 0;
94 /*** number of lines between timestamps */
95 protected int timestamp_interval = 0;
96
97
98 /*** suffix to recognize gzipped files */
99 public static final String GZIP_SUFFIX = ".gz";
100
101 /***
102 * File we're writing journal to.
103 * Keep a reference in case we want to rotate it off.
104 */
105 protected File gzipFile = null;
106
107 /***
108 * Create a new crawler journal at the given location
109 *
110 * @param path Directory to make thejournal in.
111 * @param filename Name to use for journal file.
112 * @throws IOException
113 */
114 public CrawlerJournal(String path, String filename)
115 throws IOException {
116 this.gzipFile = new File(path, filename);
117 this.out = initialize(gzipFile);
118 }
119
120 /***
121 * Create a new crawler journal at the given location
122 *
123 * @param file path at which to make journal
124 * @throws IOException
125 */
126 public CrawlerJournal(File file) throws IOException {
127 this.gzipFile = file;
128 this.out = initialize(gzipFile);
129 }
130
131 /***
132 * Allocate a buffer for accumulating lines to write and reuse it.
133 */
134 protected MutableString accumulatingBuffer = new MutableString(1024);
135
136 protected Writer initialize(final File f) throws FileNotFoundException, IOException {
137 return new OutputStreamWriter(new GZIPOutputStream(
138 new FastBufferedOutputStream(new FileOutputStream(f))));
139 }
140
141 /***
142 * Write a line
143 *
144 * @param string String
145 */
146 public synchronized void writeLine(String string) {
147 try {
148 this.out.write("\n");
149 this.out.write(string);
150 noteLine();
151 } catch (IOException e) {
152 e.printStackTrace();
153 }
154 }
155
156 /***
157 * Write a line of two strings
158 *
159 * @param s1 String
160 * @param s2 String
161 */
162 public synchronized void writeLine(String s1, String s2) {
163 try {
164 this.out.write("\n");
165 this.out.write(s1);
166 this.out.write(s2);
167 noteLine();
168 } catch (IOException e) {
169 e.printStackTrace();
170 }
171 }
172
173 /***
174 * Write a line of three strings
175 *
176 * @param s1 String
177 * @param s2 String
178 * @param s3 String
179 */
180 public synchronized void writeLine(String s1, String s2, String s3) {
181 try {
182 this.out.write("\n");
183 this.out.write(s1);
184 this.out.write(s2);
185 this.out.write(s3);
186 noteLine();
187 } catch (IOException e) {
188 e.printStackTrace();
189 }
190 }
191
192 /***
193 * Write a line.
194 *
195 * @param mstring MutableString to write
196 */
197 public synchronized void writeLine(MutableString mstring) {
198 if (this.out == null) {
199 return;
200 }
201 try {
202 this.out.write("\n");
203 mstring.write(out);
204 noteLine();
205 } catch (IOException e) {
206 e.printStackTrace();
207 }
208 }
209
210 /***
211 * Count and note a line
212 *
213 * @throws IOException
214 */
215 protected void noteLine() throws IOException {
216 lines++;
217 considerTimestamp();
218 }
219
220 /***
221 * Write a timestamp line if appropriate
222 *
223 * @throws IOException
224 */
225 protected void considerTimestamp() throws IOException {
226 if(timestamp_interval > 0 && lines % timestamp_interval == 0) {
227 out.write("\n");
228 out.write(LOG_TIMESTAMP);
229 out.write(ArchiveUtils.getLog14Date());
230 }
231 }
232
233 /***
234 * Flush and close the underlying IO objects.
235 */
236 public void close() {
237 if (this.out == null) {
238 return;
239 }
240 try {
241 this.out.flush();
242 this.out.close();
243 this.out = null;
244 } catch (IOException e) {
245 e.printStackTrace();
246 }
247 }
248
249 /***
250 * Note a serious error vioa a special log line
251 *
252 * @param err
253 */
254 public void seriousError(String err) {
255 writeLine("\n"+LOG_ERROR+ArchiveUtils.getLog14Date()+" "+err);
256 }
257
258 /***
259 * Handle a checkpoint by rotating the current log to a checkpoint-named
260 * file and starting a new log.
261 *
262 * @param checkpointDir
263 * @throws IOException
264 */
265 public synchronized void checkpoint(final File checkpointDir) throws IOException {
266 if (this.out == null || !this.gzipFile.exists()) {
267 return;
268 }
269 close();
270
271 this.gzipFile.renameTo(new File(this.gzipFile.getParentFile(),
272 this.gzipFile.getName() + "." + checkpointDir.getName()));
273
274 this.out = initialize(this.gzipFile);
275 }
276
277 }