1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler.io;
26
27 import it.unimi.dsi.mg4j.util.MutableString;
28
29 import java.util.logging.Formatter;
30 import java.util.logging.LogRecord;
31
32 import org.archive.crawler.datamodel.CoreAttributeConstants;
33 import org.archive.crawler.datamodel.CrawlURI;
34 import org.archive.util.ArchiveUtils;
35 import org.archive.util.Base32;
36 import org.archive.util.MimetypeUtils;
37
38 /***
39 * Formatter for 'crawl.log'. Expects completed CrawlURI as parameter.
40 *
41 * @author gojomo
42 */
43 public class UriProcessingFormatter
44 extends Formatter implements CoreAttributeConstants {
45 private final static String NA = "-";
46 /***
47 * Guess at line length (URIs are assumed avg. of 128 bytes).
48 * Used to preallocated the buffer we accumulate the log line
49 * in. Hopefully we get it right most of the time and no need
50 * to enlarge except in the rare case.
51 */
52 private final static int GUESS_AT_LOG_LENGTH =
53 17 + 1 + 3 + 1 + 10 + 128 + + 1 + 10 + 1 + 128 + 1 + 10 + 1 + 3 +
54 14 + 1 + 32 + 4 + 128 + 1;
55
56 /***
57 * Reuseable assembly buffer.
58 */
59 private final MutableString buffer =
60 new MutableString(GUESS_AT_LOG_LENGTH);
61
62 public String format(LogRecord lr) {
63 CrawlURI curi = (CrawlURI)lr.getParameters()[0];
64 String length = NA;
65 String mime = null;
66 if (curi.isHttpTransaction()) {
67 if(curi.getContentLength() >= 0) {
68 length = Long.toString(curi.getContentLength());
69 } else if (curi.getContentSize() > 0) {
70 length = Long.toString(curi.getContentSize());
71 }
72 mime = curi.getContentType();
73 } else {
74 if (curi.getContentSize() > 0) {
75 length = Long.toString(curi.getContentSize());
76 }
77 mime = curi.getContentType();
78 }
79 mime = MimetypeUtils.truncate(mime);
80
81 long time = System.currentTimeMillis();
82 String arcTimeAndDuration;
83 if(curi.containsKey(A_FETCH_COMPLETED_TIME)) {
84 long completedTime = curi.getLong(A_FETCH_COMPLETED_TIME);
85 long beganTime = curi.getLong(A_FETCH_BEGAN_TIME);
86 arcTimeAndDuration = ArchiveUtils.get17DigitDate(beganTime) + "+"
87 + Long.toString(completedTime - beganTime);
88 } else {
89 arcTimeAndDuration = NA;
90 }
91
92 String via = curi.flattenVia();
93
94 String digest = curi.getContentDigestSchemeString();
95
96 String sourceTag = curi.containsKey(A_SOURCE_TAG)
97 ? curi.getString(A_SOURCE_TAG)
98 : null;
99
100 this.buffer.length(0);
101 return this.buffer.append(ArchiveUtils.getLog17Date(time))
102 .append(" ")
103 .append(ArchiveUtils.padTo(curi.getFetchStatus(), 5))
104 .append(" ")
105 .append(ArchiveUtils.padTo(length, 10))
106 .append(" ")
107 .append(curi.getUURI().toString())
108 .append(" ")
109 .append(checkForNull(curi.getPathFromSeed()))
110 .append(" ")
111 .append(checkForNull(via))
112 .append(" ")
113 .append(mime)
114 .append(" ")
115 .append("#")
116
117 .append(ArchiveUtils.padTo(
118 Integer.toString(curi.getThreadNumber()), 3, '0'))
119 .append(" ")
120 .append(arcTimeAndDuration)
121 .append(" ")
122 .append(checkForNull(digest))
123 .append(" ")
124 .append(checkForNull(sourceTag))
125 .append(" ")
126 .append(checkForNull(curi.getAnnotations()))
127 .append("\n").toString();
128 }
129
130 /***
131 * @param str String to check.
132 * @return Return passed string or <code>NA</code> if null.
133 */
134 protected String checkForNull(String str) {
135 return (str == null || str.length() <= 0)? NA: str;
136 }
137 }
138
139