View Javadoc

1   /* ARCRecord
2    *
3    * $Id: ARCRecord.java 4988 2007-03-12 21:18:08Z stack-sf $
4    *
5    * Created on Jan 7, 2004
6    *
7    * Copyright (C) 2004 Internet Archive.
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.io.arc;
26  
27  import java.io.ByteArrayInputStream;
28  import java.io.ByteArrayOutputStream;
29  import java.io.IOException;
30  import java.io.InputStream;
31  
32  import org.apache.commons.httpclient.Header;
33  import org.apache.commons.httpclient.HttpParser;
34  import org.apache.commons.httpclient.StatusLine;
35  import org.apache.commons.httpclient.util.EncodingUtil;
36  import org.archive.io.ArchiveRecord;
37  import org.archive.io.ArchiveRecordHeader;
38  import org.archive.io.RecoverableIOException;
39  
40  
41  /***
42   * An ARC file record.
43   * Does not compass the ARCRecord metadata line, just the record content.
44   * @author stack
45   */
46  public class ARCRecord extends ArchiveRecord implements ARCConstants {
47      /***
48       * Http status line object.
49       * 
50       * May be null if record is not http.
51       */
52      private StatusLine httpStatus = null;
53  
54      /***
55       * Http header bytes.
56       * 
57       * If non-null and bytes available, give out its contents before we
58       * go back to the underlying stream.
59       */
60      private InputStream httpHeaderStream = null;
61      
62      /***
63       * Http headers.
64       * 
65       * Only populated after reading of headers.
66       */
67      private Header [] httpHeaders = null;
68  
69      
70      /***
71       * Minimal http header length.
72       * 
73       * I've seen in arcs content length of 1 with no 
74       * header.
75       */
76      private static final long MIN_HTTP_HEADER_LENGTH =
77          "HTTP/1.1 200 OK\r\n".length();
78      
79      /***
80       * Constructor.
81       *
82       * @param in Stream cue'd up to be at the start of the record this instance
83       * is to represent.
84       * @param metaData Meta data.
85       * @throws IOException
86       */
87      public ARCRecord(InputStream in, ArchiveRecordHeader metaData)
88      		throws IOException {
89          this(in, metaData, 0, true, false, true);
90      }
91  
92      /***
93       * Constructor.
94       *
95       * @param in Stream cue'd up to be at the start of the record this instance
96       * is to represent.
97       * @param metaData Meta data.
98       * @param bodyOffset Offset into the body.  Usually 0.
99       * @param digest True if we're to calculate digest for this record.  Not
100      * digesting saves about ~15% of cpu during an ARC parse.
101      * @param strict Be strict parsing (Parsing stops if ARC inproperly
102      * formatted).
103      * @param parseHttpHeaders True if we are to parse HTTP headers.  Costs
104      * about ~20% of CPU during an ARC parse.
105      * @throws IOException
106      */
107     public ARCRecord(InputStream in, ArchiveRecordHeader metaData,
108         int bodyOffset, boolean digest, boolean strict,
109         final boolean parseHttpHeaders) 
110     throws IOException {
111     	super(in, metaData, bodyOffset, digest, strict);
112         if (parseHttpHeaders) {
113             this.httpHeaderStream = readHttpHeader();
114         }
115     }
116     
117     /***
118      * Skip over the the http header if one present.
119      * 
120      * Subsequent reads will get the body.
121      * 
122      * <p>Calling this method in the midst of reading the header
123      * will make for strange results.  Otherwise, safe to call
124      * at any time though before reading any of the arc record
125      * content is only time that it makes sense.
126      * 
127      * <p>After calling this method, you can call
128      * {@link #getHttpHeaders()} to get the read http header.
129      * 
130      * @throws IOException
131      */
132     public void skipHttpHeader() throws IOException {
133         if (this.httpHeaderStream != null) {
134             // Empty the httpHeaderStream
135             for (int available = this.httpHeaderStream.available();
136             		this.httpHeaderStream != null &&
137             			(available = this.httpHeaderStream.available()) > 0;) {
138                 // We should be in this loop once only we should only do this
139                 // buffer allocation once.
140                 byte [] buffer = new byte[available];
141                 // The read nulls out httpHeaderStream when done with it so
142                 // need check for null in the loop control line.
143                 read(buffer, 0, available);
144             }
145         }
146     }
147     
148     public void dumpHttpHeader() throws IOException {
149 		if (this.httpHeaderStream == null) {
150 			return;
151 		}
152 		// Dump the httpHeaderStream to STDOUT
153 		for (int available = this.httpHeaderStream.available();
154 			this.httpHeaderStream != null
155 				&& (available = this.httpHeaderStream.available()) > 0;) {
156 			// We should be in this loop only once and should do this
157 			// buffer allocation once.
158 			byte[] buffer = new byte[available];
159 			// The read nulls out httpHeaderStream when done with it so
160 			// need check for null in the loop control line.
161 			int read = read(buffer, 0, available);
162 			System.out.write(buffer, 0, read);
163 		}
164 	}
165     
166     /***
167 	 * Read http header if present. Technique borrowed from HttpClient HttpParse
168 	 * class.
169 	 * 
170 	 * @return ByteArrayInputStream with the http header in it or null if no
171 	 *         http header.
172 	 * @throws IOException
173 	 */
174     private InputStream readHttpHeader() throws IOException {
175         // If judged a record that doesn't have an http header, return
176         // immediately.
177         if(!getHeader().getUrl().startsWith("http") ||
178             getHeader().getLength() <= MIN_HTTP_HEADER_LENGTH) {
179             return null;
180         }
181         byte [] statusBytes = HttpParser.readRawLine(getIn());
182         int eolCharCount = getEolCharsCount(statusBytes);
183         if (eolCharCount <= 0) {
184             throw new IOException("Failed to read http status where one " +
185                 " was expected: " + new String(statusBytes));
186         }
187         String statusLine = EncodingUtil.getString(statusBytes, 0,
188             statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING);
189         if ((statusLine == null) ||
190                 !StatusLine.startsWithHTTP(statusLine)) {
191             if (statusLine.startsWith("DELETED")) {
192                 // Some old ARCs have deleted records like following:
193                 // http://vireo.gatech.edu:80/ebt-bin/nph-dweb/dynaweb/SGI_Developer/SGITCL_PG/@Generic__BookTocView/11108%3Btd%3D2 130.207.168.42 19991010131803 text/html 29202
194                 // DELETED_TIME=20000425001133_DELETER=Kurt_REASON=alexalist
195                 // (follows ~29K spaces)
196                 // For now, throw a RecoverableIOException so if iterating over
197                 // records, we keep going.  TODO: Later make a legitimate
198                 // ARCRecord from the deleted record rather than throw
199                 // exception.
200                 throw new DeletedARCRecordIOException(statusLine);
201             } else {
202                 throw new IOException("Failed parse of http status line.");
203             }
204         }
205         this.httpStatus = new StatusLine(statusLine);
206         
207         // Save off all bytes read.  Keep them as bytes rather than
208         // convert to strings so we don't have to worry about encodings
209         // though this should never be a problem doing http headers since
210         // its all supposed to be ascii.
211         ByteArrayOutputStream baos =
212             new ByteArrayOutputStream(statusBytes.length + 4 * 1024);
213         baos.write(statusBytes);
214         
215         // Now read rest of the header lines looking for the separation
216         // between header and body.
217         for (byte [] lineBytes = null; true;) {
218             lineBytes = HttpParser.readRawLine(getIn());
219             eolCharCount = getEolCharsCount(lineBytes);
220             if (eolCharCount <= 0) {
221                 throw new IOException("Failed reading http headers: " +
222                     ((lineBytes != null)? new String(lineBytes): null));
223             }
224             // Save the bytes read.
225             baos.write(lineBytes);
226             if ((lineBytes.length - eolCharCount) <= 0) {
227                 // We've finished reading the http header.
228                 break;
229             }
230         }
231         
232         byte [] headerBytes = baos.toByteArray();
233         // Save off where body starts.
234         this.getMetaData().setContentBegin(headerBytes.length);
235         ByteArrayInputStream bais =
236             new ByteArrayInputStream(headerBytes);
237         if (!bais.markSupported()) {
238             throw new IOException("ByteArrayInputStream does not support mark");
239         }
240         bais.mark(headerBytes.length);
241         // Read the status line.  Don't let it into the parseHeaders function.
242         // It doesn't know what to do with it.
243         bais.read(statusBytes, 0, statusBytes.length);
244         this.httpHeaders = HttpParser.parseHeaders(bais,
245             ARCConstants.DEFAULT_ENCODING);
246         this.getMetaData().setStatusCode(Integer.toString(getStatusCode()));
247         bais.reset();
248         return bais;
249     }
250     
251     private static class DeletedARCRecordIOException
252     extends RecoverableIOException {
253         public DeletedARCRecordIOException(final String reason) {
254             super(reason);
255         }
256     }
257     
258     /***
259      * Return status code for this record.
260      * 
261      * This method will return -1 until the http header has been read.
262      * @return Status code.
263      */
264     public int getStatusCode() {
265         return (this.httpStatus == null)? -1: this.httpStatus.getStatusCode();
266     }
267     
268     /***
269      * @param bytes Array of bytes to examine for an EOL.
270      * @return Count of end-of-line characters or zero if none.
271      */
272     private int getEolCharsCount(byte [] bytes) {
273         int count = 0;
274         if (bytes != null && bytes.length >=1 &&
275                 bytes[bytes.length - 1] == '\n') {
276             count++;
277             if (bytes.length >=2 && bytes[bytes.length -2] == '\r') {
278                 count++;
279             }
280         }
281         return count;
282     }
283 
284     /***
285      * @return Meta data for this record.
286      */
287     public ARCRecordMetaData getMetaData() {
288         return (ARCRecordMetaData)getHeader();
289     }
290     
291     /***
292      * @return http headers (Only available after header has been read).
293      */
294     public Header [] getHttpHeaders() {
295         return this.httpHeaders;
296     }
297 
298     /***
299      * @return Next character in this ARCRecord's content else -1 if at end of
300      * this record.
301      * @throws IOException
302      */
303     public int read() throws IOException {
304         int c = -1;
305         if (this.httpHeaderStream != null &&
306                 (this.httpHeaderStream.available() > 0)) {
307             // If http header, return bytes from it before we go to underlying
308             // stream.
309             c = this.httpHeaderStream.read();
310             // If done with the header stream, null it out.
311             if (this.httpHeaderStream.available() <= 0) {
312                 this.httpHeaderStream = null;
313             }
314             incrementPosition();
315         } else {
316             c = super.read();
317         }
318         return c;
319     }
320 
321     public int read(byte [] b, int offset, int length) throws IOException {
322         int read = -1;
323         if (this.httpHeaderStream != null &&
324                 (this.httpHeaderStream.available() > 0)) {
325             // If http header, return bytes from it before we go to underlying
326             // stream.
327             read = Math.min(length, this.httpHeaderStream.available());
328             if (read == 0) {
329                 read = -1;
330             } else {
331                 read = this.httpHeaderStream.read(b, offset, read);
332             }
333             // If done with the header stream, null it out.
334             if (this.httpHeaderStream.available() <= 0) {
335                 this.httpHeaderStream = null;
336             }
337             incrementPosition(read);
338         } else {
339             read = super.read(b, offset, length);
340         }
341         return read;
342     }
343 
344     /***
345      * @return Offset at which the body begins (Only known after
346      * header has been read) or -1 if none or if we haven't read
347      * headers yet.  Usually length of HTTP headers (does not include ARC
348      * metadata line length).
349      */
350     public int getBodyOffset() {
351         return this.getMetaData().getContentBegin();
352     }
353     
354     @Override
355     protected String getIp4Cdx(ArchiveRecordHeader h) {
356     	String result = null;
357     	if (h instanceof ARCRecordMetaData) {
358     		result = ((ARCRecordMetaData)h).getIp();
359     	}
360     	return (result != null)? result: super.getIp4Cdx(h);
361     }
362     
363     @Override
364 	protected String getStatusCode4Cdx(ArchiveRecordHeader h) {
365 		String result = null;
366 		if (h instanceof ARCRecordMetaData) {
367 			result = ((ARCRecordMetaData) h).getStatusCode();
368 		}
369 		return (result != null) ? result: super.getStatusCode4Cdx(h);
370 	}
371     
372     @Override
373 	protected String getDigest4Cdx(ArchiveRecordHeader h) {
374 		String result = null;
375 		if (h instanceof ARCRecordMetaData) {
376 			result = ((ARCRecordMetaData) h).getDigest();
377 		}
378 		return (result != null) ? result: super.getDigest4Cdx(h);
379 	}
380 }