1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.io.arc;
26
27 import java.io.ByteArrayInputStream;
28 import java.io.ByteArrayOutputStream;
29 import java.io.IOException;
30 import java.io.InputStream;
31
32 import org.apache.commons.httpclient.Header;
33 import org.apache.commons.httpclient.HttpParser;
34 import org.apache.commons.httpclient.StatusLine;
35 import org.apache.commons.httpclient.util.EncodingUtil;
36 import org.archive.io.ArchiveRecord;
37 import org.archive.io.ArchiveRecordHeader;
38 import org.archive.io.RecoverableIOException;
39
40
41 /***
42 * An ARC file record.
43 * Does not compass the ARCRecord metadata line, just the record content.
44 * @author stack
45 */
46 public class ARCRecord extends ArchiveRecord implements ARCConstants {
47 /***
48 * Http status line object.
49 *
50 * May be null if record is not http.
51 */
52 private StatusLine httpStatus = null;
53
54 /***
55 * Http header bytes.
56 *
57 * If non-null and bytes available, give out its contents before we
58 * go back to the underlying stream.
59 */
60 private InputStream httpHeaderStream = null;
61
62 /***
63 * Http headers.
64 *
65 * Only populated after reading of headers.
66 */
67 private Header [] httpHeaders = null;
68
69
70 /***
71 * Minimal http header length.
72 *
73 * I've seen in arcs content length of 1 with no
74 * header.
75 */
76 private static final long MIN_HTTP_HEADER_LENGTH =
77 "HTTP/1.1 200 OK\r\n".length();
78
79 /***
80 * Constructor.
81 *
82 * @param in Stream cue'd up to be at the start of the record this instance
83 * is to represent.
84 * @param metaData Meta data.
85 * @throws IOException
86 */
87 public ARCRecord(InputStream in, ArchiveRecordHeader metaData)
88 throws IOException {
89 this(in, metaData, 0, true, false, true);
90 }
91
92 /***
93 * Constructor.
94 *
95 * @param in Stream cue'd up to be at the start of the record this instance
96 * is to represent.
97 * @param metaData Meta data.
98 * @param bodyOffset Offset into the body. Usually 0.
99 * @param digest True if we're to calculate digest for this record. Not
100 * digesting saves about ~15% of cpu during an ARC parse.
101 * @param strict Be strict parsing (Parsing stops if ARC inproperly
102 * formatted).
103 * @param parseHttpHeaders True if we are to parse HTTP headers. Costs
104 * about ~20% of CPU during an ARC parse.
105 * @throws IOException
106 */
107 public ARCRecord(InputStream in, ArchiveRecordHeader metaData,
108 int bodyOffset, boolean digest, boolean strict,
109 final boolean parseHttpHeaders)
110 throws IOException {
111 super(in, metaData, bodyOffset, digest, strict);
112 if (parseHttpHeaders) {
113 this.httpHeaderStream = readHttpHeader();
114 }
115 }
116
117 /***
118 * Skip over the the http header if one present.
119 *
120 * Subsequent reads will get the body.
121 *
122 * <p>Calling this method in the midst of reading the header
123 * will make for strange results. Otherwise, safe to call
124 * at any time though before reading any of the arc record
125 * content is only time that it makes sense.
126 *
127 * <p>After calling this method, you can call
128 * {@link #getHttpHeaders()} to get the read http header.
129 *
130 * @throws IOException
131 */
132 public void skipHttpHeader() throws IOException {
133 if (this.httpHeaderStream != null) {
134
135 for (int available = this.httpHeaderStream.available();
136 this.httpHeaderStream != null &&
137 (available = this.httpHeaderStream.available()) > 0;) {
138
139
140 byte [] buffer = new byte[available];
141
142
143 read(buffer, 0, available);
144 }
145 }
146 }
147
148 public void dumpHttpHeader() throws IOException {
149 if (this.httpHeaderStream == null) {
150 return;
151 }
152
153 for (int available = this.httpHeaderStream.available();
154 this.httpHeaderStream != null
155 && (available = this.httpHeaderStream.available()) > 0;) {
156
157
158 byte[] buffer = new byte[available];
159
160
161 int read = read(buffer, 0, available);
162 System.out.write(buffer, 0, read);
163 }
164 }
165
166 /***
167 * Read http header if present. Technique borrowed from HttpClient HttpParse
168 * class.
169 *
170 * @return ByteArrayInputStream with the http header in it or null if no
171 * http header.
172 * @throws IOException
173 */
174 private InputStream readHttpHeader() throws IOException {
175
176
177 if(!getHeader().getUrl().startsWith("http") ||
178 getHeader().getLength() <= MIN_HTTP_HEADER_LENGTH) {
179 return null;
180 }
181 byte [] statusBytes = HttpParser.readRawLine(getIn());
182 int eolCharCount = getEolCharsCount(statusBytes);
183 if (eolCharCount <= 0) {
184 throw new IOException("Failed to read http status where one " +
185 " was expected: " + new String(statusBytes));
186 }
187 String statusLine = EncodingUtil.getString(statusBytes, 0,
188 statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING);
189 if ((statusLine == null) ||
190 !StatusLine.startsWithHTTP(statusLine)) {
191 if (statusLine.startsWith("DELETED")) {
192
193
194
195
196
197
198
199
200 throw new DeletedARCRecordIOException(statusLine);
201 } else {
202 throw new IOException("Failed parse of http status line.");
203 }
204 }
205 this.httpStatus = new StatusLine(statusLine);
206
207
208
209
210
211 ByteArrayOutputStream baos =
212 new ByteArrayOutputStream(statusBytes.length + 4 * 1024);
213 baos.write(statusBytes);
214
215
216
217 for (byte [] lineBytes = null; true;) {
218 lineBytes = HttpParser.readRawLine(getIn());
219 eolCharCount = getEolCharsCount(lineBytes);
220 if (eolCharCount <= 0) {
221 throw new IOException("Failed reading http headers: " +
222 ((lineBytes != null)? new String(lineBytes): null));
223 }
224
225 baos.write(lineBytes);
226 if ((lineBytes.length - eolCharCount) <= 0) {
227
228 break;
229 }
230 }
231
232 byte [] headerBytes = baos.toByteArray();
233
234 this.getMetaData().setContentBegin(headerBytes.length);
235 ByteArrayInputStream bais =
236 new ByteArrayInputStream(headerBytes);
237 if (!bais.markSupported()) {
238 throw new IOException("ByteArrayInputStream does not support mark");
239 }
240 bais.mark(headerBytes.length);
241
242
243 bais.read(statusBytes, 0, statusBytes.length);
244 this.httpHeaders = HttpParser.parseHeaders(bais,
245 ARCConstants.DEFAULT_ENCODING);
246 this.getMetaData().setStatusCode(Integer.toString(getStatusCode()));
247 bais.reset();
248 return bais;
249 }
250
251 private static class DeletedARCRecordIOException
252 extends RecoverableIOException {
253 public DeletedARCRecordIOException(final String reason) {
254 super(reason);
255 }
256 }
257
258 /***
259 * Return status code for this record.
260 *
261 * This method will return -1 until the http header has been read.
262 * @return Status code.
263 */
264 public int getStatusCode() {
265 return (this.httpStatus == null)? -1: this.httpStatus.getStatusCode();
266 }
267
268 /***
269 * @param bytes Array of bytes to examine for an EOL.
270 * @return Count of end-of-line characters or zero if none.
271 */
272 private int getEolCharsCount(byte [] bytes) {
273 int count = 0;
274 if (bytes != null && bytes.length >=1 &&
275 bytes[bytes.length - 1] == '\n') {
276 count++;
277 if (bytes.length >=2 && bytes[bytes.length -2] == '\r') {
278 count++;
279 }
280 }
281 return count;
282 }
283
284 /***
285 * @return Meta data for this record.
286 */
287 public ARCRecordMetaData getMetaData() {
288 return (ARCRecordMetaData)getHeader();
289 }
290
291 /***
292 * @return http headers (Only available after header has been read).
293 */
294 public Header [] getHttpHeaders() {
295 return this.httpHeaders;
296 }
297
298 /***
299 * @return Next character in this ARCRecord's content else -1 if at end of
300 * this record.
301 * @throws IOException
302 */
303 public int read() throws IOException {
304 int c = -1;
305 if (this.httpHeaderStream != null &&
306 (this.httpHeaderStream.available() > 0)) {
307
308
309 c = this.httpHeaderStream.read();
310
311 if (this.httpHeaderStream.available() <= 0) {
312 this.httpHeaderStream = null;
313 }
314 incrementPosition();
315 } else {
316 c = super.read();
317 }
318 return c;
319 }
320
321 public int read(byte [] b, int offset, int length) throws IOException {
322 int read = -1;
323 if (this.httpHeaderStream != null &&
324 (this.httpHeaderStream.available() > 0)) {
325
326
327 read = Math.min(length, this.httpHeaderStream.available());
328 if (read == 0) {
329 read = -1;
330 } else {
331 read = this.httpHeaderStream.read(b, offset, read);
332 }
333
334 if (this.httpHeaderStream.available() <= 0) {
335 this.httpHeaderStream = null;
336 }
337 incrementPosition(read);
338 } else {
339 read = super.read(b, offset, length);
340 }
341 return read;
342 }
343
344 /***
345 * @return Offset at which the body begins (Only known after
346 * header has been read) or -1 if none or if we haven't read
347 * headers yet. Usually length of HTTP headers (does not include ARC
348 * metadata line length).
349 */
350 public int getBodyOffset() {
351 return this.getMetaData().getContentBegin();
352 }
353
354 @Override
355 protected String getIp4Cdx(ArchiveRecordHeader h) {
356 String result = null;
357 if (h instanceof ARCRecordMetaData) {
358 result = ((ARCRecordMetaData)h).getIp();
359 }
360 return (result != null)? result: super.getIp4Cdx(h);
361 }
362
363 @Override
364 protected String getStatusCode4Cdx(ArchiveRecordHeader h) {
365 String result = null;
366 if (h instanceof ARCRecordMetaData) {
367 result = ((ARCRecordMetaData) h).getStatusCode();
368 }
369 return (result != null) ? result: super.getStatusCode4Cdx(h);
370 }
371
372 @Override
373 protected String getDigest4Cdx(ArchiveRecordHeader h) {
374 String result = null;
375 if (h instanceof ARCRecordMetaData) {
376 result = ((ARCRecordMetaData) h).getDigest();
377 }
378 return (result != null) ? result: super.getDigest4Cdx(h);
379 }
380 }