1   /*
2    * WARCConstants
3    *
4    * $Id: WARCConstants.java 5478 2007-09-19 01:37:07Z gojomo $
5    *
6    * Created on July 27th, 2006
7    *
8    * Copyright (C) 2006 Internet Archive.
9    *
10   * This file is part of the Heritrix web crawler (crawler.archive.org).
11   *
12   * Heritrix is free software; you can redistribute it and/or modify
13   * it under the terms of the GNU Lesser Public License as published by
14   * the Free Software Foundation; either version 2.1 of the License, or
15   * any later version.
16   *
17   * Heritrix is distributed in the hope that it will be useful,
18   * but WITHOUT ANY WARRANTY; without even the implied warranty of
19   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20   * GNU Lesser Public License for more details.
21   *
22   * You should have received a copy of the GNU Lesser Public License
23   * along with Heritrix; if not, write to the Free Software
24   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
25   */
26  package org.archive.io.warc;
27  
28  import java.util.Arrays;
29  import java.util.List;
30  
31  import org.archive.io.ArchiveFileConstants;
32  
33  /***
34   * WARC Constants used by WARC readers and writers.
35   * Below constants are used by version 0.10 and 0.12 of WARC Reader/Writer.
36   * @author stack
37   * @version $Revision: 5478 $ $Date: 2007-09-19 01:37:07 +0000 (Wed, 19 Sep 2007) $
38   */
39  public interface WARCConstants extends ArchiveFileConstants {
40      /***
41       * Default maximum WARC file size.
42       * 1Gig.
43       */
44      public static final int DEFAULT_MAX_WARC_FILE_SIZE = 1024 * 1024 * 1024;
45      
46  	/***
47  	 * WARC MAGIC
48  	 * WARC files and records begin with this sequence.
49  	 */
50  	public static final String WARC_MAGIC = "WARC/";
51      public static final String WARC_010_MAGIC = "WARC/";
52      
53      /***
54       * Hard-coded version for WARC files made with this code.
55       * Setting to 0.10 because differs from 0.9 spec.  See accompanying
56       * package documentation.
57       */
58  	public static final String WARC_VERSION = "0.17";
59      
60      /***
61       * Assumed maximum size of a Header Line.
62       *
63       * This 100k which seems massive but its the same as the LINE_LENGTH from
64       * <code>alexa/include/a_arcio.h</code>:
65       * <pre>
66       * #define LINE_LENGTH     (100*1024)
67       * </pre>
68       */
69      public static final int MAX_WARC_HEADER_LINE_LENGTH = 1024 * 100;
70      public static final int MAX_LINE_LENGTH = MAX_WARC_HEADER_LINE_LENGTH;
71      
72      /***
73       * WARC file extention.
74       */
75      public static final String WARC_FILE_EXTENSION = "warc";
76      
77      /***
78       * Dot WARC file extension.
79       */
80      public static final String DOT_WARC_FILE_EXTENSION =
81          "." + WARC_FILE_EXTENSION;
82      
83      public static final String DOT_COMPRESSED_FILE_EXTENSION =
84          ArchiveFileConstants.DOT_COMPRESSED_FILE_EXTENSION;
85  
86      /***
87       * Compressed WARC file extension.
88       */
89      public static final String COMPRESSED_WARC_FILE_EXTENSION =
90          WARC_FILE_EXTENSION + DOT_COMPRESSED_FILE_EXTENSION;
91      
92      /***
93       * Compressed dot WARC file extension.
94       */
95      public static final String DOT_COMPRESSED_WARC_FILE_EXTENSION =
96          DOT_WARC_FILE_EXTENSION + DOT_COMPRESSED_FILE_EXTENSION;
97      
98      /***
99       * Encoding to use getting bytes from strings.
100      *
101      * Specify an encoding rather than leave it to chance: i.e whatever the
102      * JVMs encoding.  Use an encoding that gets the stream as bytes, not chars.
103      * 
104      * ARC uses ISO-8859-1. By specification, WARC uses UTF-8. 
105      */
106     public static final String DEFAULT_ENCODING = "UTF-8";
107     public static final String HEADER_LINE_ENCODING = DEFAULT_ENCODING;
108     
109     // TODO: Revisit. 8859 isn't correct, especially if we settle on RFC822
110     // headers
111     public static final String WARC_HEADER_ENCODING = HEADER_LINE_ENCODING;
112     
113     public static final String [] HEADER_FIELD_KEYS = {
114         VERSION_FIELD_KEY,
115         LENGTH_FIELD_KEY,
116         TYPE_FIELD_KEY,
117         URL_FIELD_KEY,
118         DATE_FIELD_KEY,
119         RECORD_IDENTIFIER_FIELD_KEY,
120         MIMETYPE_FIELD_KEY
121     };
122     
123     /***
124      * WARC Record Types.
125      */
126     public static final String WARCINFO = "warcinfo";
127     public static final String RESPONSE = "response";
128     public static final String RESOURCE = "resource";
129     public static final String REQUEST = "request";
130     public static final String METADATA = "metadata";
131     public static final String REVISIT = "revisit";
132     public static final String CONVERSION = "conversion";
133     public static final String CONTINUATION = "continuation";
134     
135     public static final String TYPE = "type";
136     
137     // List of all WARC Record TYPES
138     public static final String [] TYPES = {WARCINFO, RESPONSE, RESOURCE,
139     	REQUEST, METADATA, REVISIT, CONVERSION, CONTINUATION};
140     
141     // Indices into TYPES array.
142     public static final int WARCINFO_INDEX = 0;
143     public static final int RESPONSE_INDEX = 1;
144     public static final int RESOURCE_INDEX = 2;
145     public static final int REQUEST_INDEX = 3;
146     public static final int METADATA_INDEX = 4;
147     public static final int REVISIT_INDEX = 5;
148     public static final int CONVERSION_INDEX = 6;
149     public static final int CONTINUATION_INDEX = 7;
150     
151     // TYPES as List.
152     public static final List TYPES_LIST = Arrays.asList(TYPES);
153     
154     /***
155      * WARC-ID
156      */
157     public static final String WARC_ID = WARC_MAGIC + WARC_VERSION;
158     public static final String WARC_010_ID = WARC_010_MAGIC + "0.10";
159         
160     /***
161      * Header field seperator character.
162      */
163     public static final char HEADER_FIELD_SEPARATOR = ' ';
164     
165     /***
166      * WSP
167      * One of a space or horizontal tab character.
168      * TODO: WSP undefined.  Fix.
169      */
170     public static final Character [] WSP = {HEADER_FIELD_SEPARATOR, '\t'};
171 
172     /***
173      * Placeholder for length in Header line.
174      * Placeholder is same size as the fixed field size allocated for length,
175      * 12 characters.  12 characters allows records of size almost 1TB.
176      */
177     public static final String PLACEHOLDER_RECORD_LENGTH_STRING =
178         "000000000000";
179     
180     public static final String NAMED_FIELD_IP_LABEL = "IP-Address";
181     public static final String NAMED_FIELD_CHECKSUM_LABEL = "Checksum";
182     public static final String NAMED_FIELD_RELATED_LABEL = "References";
183     public static final String NAMED_FIELD_WARCFILENAME = "Filename";
184     public static final String NAMED_FIELD_DESCRIPTION = "Description";
185     public static final String NAMED_FIELD_FILEDESC = "ARC-FileDesc";
186     public static final String NAMED_FIELD_TRUNCATED = "Truncated";
187     public static final String NAMED_FIELD_TRUNCATED_VALUE_TIME = "time";
188     public static final String NAMED_FIELD_TRUNCATED_VALUE_LENGTH = "length";
189     public static final String NAMED_FIELD_TRUNCATED_VALUE_HEAD =
190         "long-headers";
191     public static final String NAMED_FIELD_TRUNCATED_VALUE_UNSPECIFIED = null;
192     
193     // Headers for version 0.17 of spec.
194     public static final String HEADER_KEY_DATE = "WARC-Date";
195     public static final String HEADER_KEY_TYPE = "WARC-Type";
196     public static final String HEADER_KEY_ID = "WARC-Record-ID";
197 
198     public static final String HEADER_KEY_URI = "WARC-Target-URI";   
199     public static final String HEADER_KEY_IP = "WARC-IP-Address";   
200     public static final String HEADER_KEY_BLOCK_DIGEST = "WARC-Block-Digest";
201     public static final String HEADER_KEY_PAYLOAD_DIGEST = "WARC-Payload-Digest";
202     public static final String HEADER_KEY_CONCURRENT_TO =
203         "WARC-Concurrent-To";
204     public static final String HEADER_KEY_TRUNCATED = "WARC-Truncated";
205     public static final String HEADER_KEY_PROFILE = "WARC-Profile";
206     public static final String HEADER_KEY_FILENAME = "WARC-Filename";
207     public static final String HEADER_KEY_ETAG = "WARC-Etag";
208     public static final String HEADER_KEY_LAST_MODIFIED = "WARC-Last-Modified";
209     
210     public static final String PROFILE_REVISIT_IDENTICAL_DIGEST = 
211     	"http://netpreserve.org/warc/0.17/revisit/identical-payload-digest";
212     public static final String PROFILE_REVISIT_NOT_MODIFIED = 
213     	"http://netpreserve.org/warc/0.17/revisit/server-not-modified";
214     
215     public static final String CONTENT_LENGTH = "Content-Length";
216     public static final String CONTENT_TYPE = "Content-Type";
217     public static final String CONTENT_DESCRIPTION = "Content-Description";
218     
219     public static final String COLON_SPACE = ": ";
220     // TODO: This is not in spec. Fix.
221     public static final String TRUNCATED_VALUE_UNSPECIFIED = "unspecified";
222     
223     
224     /***
225      * To be safe, lets use application type rather than message. Regards 
226      * 'message/http', RFC says "...provided that it obeys the MIME restrictions
227      * for all 'message' types regarding line length and encodings."  This
228      * usually means lines of 1000 octets max (unless a 
229      * 'Content-Transfer-Encoding: binary' mime header is present).
230      * @see <a href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec19.html#sec19.1">rfc2616 section 19.1</a>
231      */
232     public static final String HTTP_REQUEST_MIMETYPE =
233     	"application/http; msgtype=request";
234     public static final String HTTP_RESPONSE_MIMETYPE =
235     	"application/http; msgtype=response";
236 }