1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26 package org.archive.io.warc;
27
28 import java.util.Arrays;
29 import java.util.List;
30
31 import org.archive.io.ArchiveFileConstants;
32
33 /***
34 * WARC Constants used by WARC readers and writers.
35 * Below constants are used by version 0.10 and 0.12 of WARC Reader/Writer.
36 * @author stack
37 * @version $Revision: 5478 $ $Date: 2007-09-19 01:37:07 +0000 (Wed, 19 Sep 2007) $
38 */
39 public interface WARCConstants extends ArchiveFileConstants {
40 /***
41 * Default maximum WARC file size.
42 * 1Gig.
43 */
44 public static final int DEFAULT_MAX_WARC_FILE_SIZE = 1024 * 1024 * 1024;
45
46 /***
47 * WARC MAGIC
48 * WARC files and records begin with this sequence.
49 */
50 public static final String WARC_MAGIC = "WARC/";
51 public static final String WARC_010_MAGIC = "WARC/";
52
53 /***
54 * Hard-coded version for WARC files made with this code.
55 * Setting to 0.10 because differs from 0.9 spec. See accompanying
56 * package documentation.
57 */
58 public static final String WARC_VERSION = "0.17";
59
60 /***
61 * Assumed maximum size of a Header Line.
62 *
63 * This 100k which seems massive but its the same as the LINE_LENGTH from
64 * <code>alexa/include/a_arcio.h</code>:
65 * <pre>
66 * #define LINE_LENGTH (100*1024)
67 * </pre>
68 */
69 public static final int MAX_WARC_HEADER_LINE_LENGTH = 1024 * 100;
70 public static final int MAX_LINE_LENGTH = MAX_WARC_HEADER_LINE_LENGTH;
71
72 /***
73 * WARC file extention.
74 */
75 public static final String WARC_FILE_EXTENSION = "warc";
76
77 /***
78 * Dot WARC file extension.
79 */
80 public static final String DOT_WARC_FILE_EXTENSION =
81 "." + WARC_FILE_EXTENSION;
82
83 public static final String DOT_COMPRESSED_FILE_EXTENSION =
84 ArchiveFileConstants.DOT_COMPRESSED_FILE_EXTENSION;
85
86 /***
87 * Compressed WARC file extension.
88 */
89 public static final String COMPRESSED_WARC_FILE_EXTENSION =
90 WARC_FILE_EXTENSION + DOT_COMPRESSED_FILE_EXTENSION;
91
92 /***
93 * Compressed dot WARC file extension.
94 */
95 public static final String DOT_COMPRESSED_WARC_FILE_EXTENSION =
96 DOT_WARC_FILE_EXTENSION + DOT_COMPRESSED_FILE_EXTENSION;
97
98 /***
99 * Encoding to use getting bytes from strings.
100 *
101 * Specify an encoding rather than leave it to chance: i.e whatever the
102 * JVMs encoding. Use an encoding that gets the stream as bytes, not chars.
103 *
104 * ARC uses ISO-8859-1. By specification, WARC uses UTF-8.
105 */
106 public static final String DEFAULT_ENCODING = "UTF-8";
107 public static final String HEADER_LINE_ENCODING = DEFAULT_ENCODING;
108
109
110
111 public static final String WARC_HEADER_ENCODING = HEADER_LINE_ENCODING;
112
113 public static final String [] HEADER_FIELD_KEYS = {
114 VERSION_FIELD_KEY,
115 LENGTH_FIELD_KEY,
116 TYPE_FIELD_KEY,
117 URL_FIELD_KEY,
118 DATE_FIELD_KEY,
119 RECORD_IDENTIFIER_FIELD_KEY,
120 MIMETYPE_FIELD_KEY
121 };
122
123 /***
124 * WARC Record Types.
125 */
126 public static final String WARCINFO = "warcinfo";
127 public static final String RESPONSE = "response";
128 public static final String RESOURCE = "resource";
129 public static final String REQUEST = "request";
130 public static final String METADATA = "metadata";
131 public static final String REVISIT = "revisit";
132 public static final String CONVERSION = "conversion";
133 public static final String CONTINUATION = "continuation";
134
135 public static final String TYPE = "type";
136
137
138 public static final String [] TYPES = {WARCINFO, RESPONSE, RESOURCE,
139 REQUEST, METADATA, REVISIT, CONVERSION, CONTINUATION};
140
141
142 public static final int WARCINFO_INDEX = 0;
143 public static final int RESPONSE_INDEX = 1;
144 public static final int RESOURCE_INDEX = 2;
145 public static final int REQUEST_INDEX = 3;
146 public static final int METADATA_INDEX = 4;
147 public static final int REVISIT_INDEX = 5;
148 public static final int CONVERSION_INDEX = 6;
149 public static final int CONTINUATION_INDEX = 7;
150
151
152 public static final List TYPES_LIST = Arrays.asList(TYPES);
153
154 /***
155 * WARC-ID
156 */
157 public static final String WARC_ID = WARC_MAGIC + WARC_VERSION;
158 public static final String WARC_010_ID = WARC_010_MAGIC + "0.10";
159
160 /***
161 * Header field seperator character.
162 */
163 public static final char HEADER_FIELD_SEPARATOR = ' ';
164
165 /***
166 * WSP
167 * One of a space or horizontal tab character.
168 * TODO: WSP undefined. Fix.
169 */
170 public static final Character [] WSP = {HEADER_FIELD_SEPARATOR, '\t'};
171
172 /***
173 * Placeholder for length in Header line.
174 * Placeholder is same size as the fixed field size allocated for length,
175 * 12 characters. 12 characters allows records of size almost 1TB.
176 */
177 public static final String PLACEHOLDER_RECORD_LENGTH_STRING =
178 "000000000000";
179
180 public static final String NAMED_FIELD_IP_LABEL = "IP-Address";
181 public static final String NAMED_FIELD_CHECKSUM_LABEL = "Checksum";
182 public static final String NAMED_FIELD_RELATED_LABEL = "References";
183 public static final String NAMED_FIELD_WARCFILENAME = "Filename";
184 public static final String NAMED_FIELD_DESCRIPTION = "Description";
185 public static final String NAMED_FIELD_FILEDESC = "ARC-FileDesc";
186 public static final String NAMED_FIELD_TRUNCATED = "Truncated";
187 public static final String NAMED_FIELD_TRUNCATED_VALUE_TIME = "time";
188 public static final String NAMED_FIELD_TRUNCATED_VALUE_LENGTH = "length";
189 public static final String NAMED_FIELD_TRUNCATED_VALUE_HEAD =
190 "long-headers";
191 public static final String NAMED_FIELD_TRUNCATED_VALUE_UNSPECIFIED = null;
192
193
194 public static final String HEADER_KEY_DATE = "WARC-Date";
195 public static final String HEADER_KEY_TYPE = "WARC-Type";
196 public static final String HEADER_KEY_ID = "WARC-Record-ID";
197
198 public static final String HEADER_KEY_URI = "WARC-Target-URI";
199 public static final String HEADER_KEY_IP = "WARC-IP-Address";
200 public static final String HEADER_KEY_BLOCK_DIGEST = "WARC-Block-Digest";
201 public static final String HEADER_KEY_PAYLOAD_DIGEST = "WARC-Payload-Digest";
202 public static final String HEADER_KEY_CONCURRENT_TO =
203 "WARC-Concurrent-To";
204 public static final String HEADER_KEY_TRUNCATED = "WARC-Truncated";
205 public static final String HEADER_KEY_PROFILE = "WARC-Profile";
206 public static final String HEADER_KEY_FILENAME = "WARC-Filename";
207 public static final String HEADER_KEY_ETAG = "WARC-Etag";
208 public static final String HEADER_KEY_LAST_MODIFIED = "WARC-Last-Modified";
209
210 public static final String PROFILE_REVISIT_IDENTICAL_DIGEST =
211 "http://netpreserve.org/warc/0.17/revisit/identical-payload-digest";
212 public static final String PROFILE_REVISIT_NOT_MODIFIED =
213 "http://netpreserve.org/warc/0.17/revisit/server-not-modified";
214
215 public static final String CONTENT_LENGTH = "Content-Length";
216 public static final String CONTENT_TYPE = "Content-Type";
217 public static final String CONTENT_DESCRIPTION = "Content-Description";
218
219 public static final String COLON_SPACE = ": ";
220
221 public static final String TRUNCATED_VALUE_UNSPECIFIED = "unspecified";
222
223
224 /***
225 * To be safe, lets use application type rather than message. Regards
226 * 'message/http', RFC says "...provided that it obeys the MIME restrictions
227 * for all 'message' types regarding line length and encodings." This
228 * usually means lines of 1000 octets max (unless a
229 * 'Content-Transfer-Encoding: binary' mime header is present).
230 * @see <a href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec19.html#sec19.1">rfc2616 section 19.1</a>
231 */
232 public static final String HTTP_REQUEST_MIMETYPE =
233 "application/http; msgtype=request";
234 public static final String HTTP_RESPONSE_MIMETYPE =
235 "application/http; msgtype=response";
236 }