1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.io.warc;
24
25 import it.unimi.dsi.fastutil.io.RepositionableStream;
26
27 import java.io.IOException;
28 import java.io.InputStream;
29 import java.util.HashMap;
30 import java.util.Map;
31 import java.util.Set;
32 import java.util.regex.Matcher;
33 import java.util.regex.Pattern;
34
35 import org.apache.commons.httpclient.Header;
36 import org.apache.commons.httpclient.HttpParser;
37 import org.archive.io.ArchiveRecord;
38 import org.archive.io.ArchiveRecordHeader;
39
40
41 /***
42 * A WARC file Record.
43 *
44 * @author stack
45 */
46 public class WARCRecord extends ArchiveRecord implements WARCConstants {
47 private Pattern WHITESPACE = Pattern.compile("//s");
48
49 /***
50 * Constructor.
51 *
52 * @param in Stream cue'd up to be at the start of the record this instance
53 * is to represent.
54 * @throws IOException
55 */
56 public WARCRecord(InputStream in, final String identifier,
57 final long offset)
58 throws IOException {
59 this(in, identifier, offset, true, false);
60 }
61
62 /***
63 * Constructor.
64 * @param in Stream cue'd up just past Header Line and Named Fields.
65 * @param headers Header Line and ANVL Named fields.
66 * @throws IOException
67 */
68 public WARCRecord(InputStream in, ArchiveRecordHeader headers)
69 throws IOException {
70 super(in, headers, 0, true, false);
71 }
72
73 /***
74 * Constructor.
75 *
76 * @param in Stream cue'd up to be at the start of the record this instance
77 * is to represent or, if <code>headers</code> is not null, just past the
78 * Header Line and Named Fields.
79 * @param identifier Identifier for this the hosting Reader.
80 * @param offset Current offset into <code>in</code> (Used to keep
81 * <code>position</code> properly aligned). Usually 0.
82 * @param digest True if we're to calculate digest for this record. Not
83 * digesting saves about ~15% of cpu during parse.
84 * @param strict Be strict parsing (Parsing stops if file inproperly
85 * formatted).
86 * @throws IOException
87 */
88 public WARCRecord(final InputStream in, final String identifier,
89 final long offset, boolean digest, boolean strict)
90 throws IOException {
91 super(in, null, 0, digest, strict);
92 setHeader(parseHeaders(in, identifier, offset, strict));
93 }
94
95 /***
96 * Parse WARC Header Line and Named Fields.
97 * @param in Stream to read.
98 * @param identifier Identifier for the hosting Reader.
99 * @param offset Absolute offset into Reader.
100 * @param strict Whether to be loose parsing or not.
101 * @return An ArchiveRecordHeader.
102 * @throws IOException
103 */
104 protected ArchiveRecordHeader parseHeaders(final InputStream in,
105 final String identifier, final long offset, final boolean strict)
106 throws IOException {
107 final Map<Object, Object> m = new HashMap<Object, Object>();
108 m.put(ABSOLUTE_OFFSET_KEY, new Long(offset));
109 m.put(READER_IDENTIFIER_FIELD_KEY, identifier);
110
111 long startPosition = -1;
112 if (in instanceof RepositionableStream) {
113 startPosition = ((RepositionableStream)in).position();
114 }
115 String firstLine =
116 new String(HttpParser.readLine(in, WARC_HEADER_ENCODING));
117 if (firstLine == null || firstLine.length() <=0) {
118 throw new IOException("Failed to read WARC_MAGIC");
119 }
120 if (!firstLine.startsWith(WARC_MAGIC)) {
121 throw new IOException("Failed to find WARC MAGIC: " + firstLine);
122 }
123
124
125
126
127
128 Header [] h = HttpParser.parseHeaders(in, WARC_HEADER_ENCODING);
129 for (int i = 0; i < h.length; i++) {
130 m.put(h[i].getName(), h[i].getValue());
131 }
132 int headerLength = -1;
133 if (in instanceof RepositionableStream) {
134 headerLength =
135 (int)(((RepositionableStream)in).position() - startPosition);
136 }
137 final int contentOffset = headerLength;
138 incrementPosition(contentOffset);
139
140 return new ArchiveRecordHeader() {
141 private Map<Object, Object> headers = m;
142 private int contentBegin = contentOffset;
143
144 public String getDate() {
145 return (String)this.headers.get(HEADER_KEY_DATE);
146 }
147
148 public String getDigest() {
149 return null;
150
151
152
153
154 }
155
156 public String getReaderIdentifier() {
157 return (String)this.headers.get(READER_IDENTIFIER_FIELD_KEY);
158 }
159
160 public Set getHeaderFieldKeys() {
161 return this.headers.keySet();
162 }
163
164 public Map getHeaderFields() {
165 return this.headers;
166 }
167
168 public Object getHeaderValue(String key) {
169 return this.headers.get(key);
170 }
171
172 public long getLength() {
173 Object o = this.headers.get(CONTENT_LENGTH);
174 if (o == null) {
175 return -1;
176 }
177 long contentLength = (o instanceof Long)?
178 ((Long)o).longValue(): Long.parseLong((String)o);
179 return contentLength + contentOffset;
180 }
181
182 public String getMimetype() {
183 return (String)this.headers.get(CONTENT_TYPE);
184 }
185
186 public long getOffset() {
187 Object o = this.headers.get(ABSOLUTE_OFFSET_KEY);
188 if (o == null) {
189 return -1;
190 }
191 return (o instanceof Long)?
192 ((Long)o).longValue(): Long.parseLong((String)o);
193 }
194
195 public String getRecordIdentifier() {
196 return (String)this.headers.get(RECORD_IDENTIFIER_FIELD_KEY);
197 }
198
199 public String getUrl() {
200 return (String)this.headers.get(HEADER_KEY_URI);
201 }
202
203 public String getVersion() {
204 return (String)this.headers.get(VERSION_FIELD_KEY);
205 }
206
207 public int getContentBegin() {
208 return this.contentBegin;
209 }
210
211 @Override
212 public String toString() {
213 return this.headers.toString();
214 }
215 };
216 }
217
218 @Override
219 protected String getMimetype4Cdx(ArchiveRecordHeader h) {
220 final String m = super.getMimetype4Cdx(h);
221
222
223
224 Matcher matcher = WHITESPACE.matcher(m);
225 return matcher.replaceAll("");
226 }
227 }