View Javadoc

1   /* $Id: WARCRecord.java 4566 2006-08-31 16:51:41Z stack-sf $
2    *
3    * Created on August 25th, 2006
4    *
5    * Copyright (C) 2006 Internet Archive.
6    *
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    *
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   *
14   * Heritrix is distributed in the hope that it will be useful,
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   *
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.io.warc;
24  
25  import it.unimi.dsi.fastutil.io.RepositionableStream;
26  
27  import java.io.IOException;
28  import java.io.InputStream;
29  import java.util.HashMap;
30  import java.util.Map;
31  import java.util.Set;
32  import java.util.regex.Matcher;
33  import java.util.regex.Pattern;
34  
35  import org.apache.commons.httpclient.Header;
36  import org.apache.commons.httpclient.HttpParser;
37  import org.archive.io.ArchiveRecord;
38  import org.archive.io.ArchiveRecordHeader;
39  
40  
41  /***
42   * A WARC file Record.
43   *
44   * @author stack
45   */
46  public class WARCRecord extends ArchiveRecord implements WARCConstants {
47      private Pattern WHITESPACE = Pattern.compile("//s");
48      
49      /***
50       * Constructor.
51       *
52       * @param in Stream cue'd up to be at the start of the record this instance
53       * is to represent.
54       * @throws IOException
55       */
56      public WARCRecord(InputStream in, final String identifier,
57      	final long offset)
58      throws IOException {
59          this(in, identifier, offset, true, false);
60      }
61      
62      /***
63       * Constructor.
64       * @param in Stream cue'd up just past Header Line and Named Fields.
65       * @param headers Header Line and ANVL Named fields.
66       * @throws IOException
67       */
68      public WARCRecord(InputStream in, ArchiveRecordHeader headers)
69      		throws IOException {
70          super(in, headers, 0, true, false);
71      }
72  
73      /***
74       * Constructor.
75       *
76       * @param in Stream cue'd up to be at the start of the record this instance
77       * is to represent or, if <code>headers</code> is not null, just past the
78       * Header Line and Named Fields.
79       * @param identifier Identifier for this the hosting Reader.
80       * @param offset Current offset into <code>in</code> (Used to keep
81       * <code>position</code> properly aligned).  Usually 0.
82       * @param digest True if we're to calculate digest for this record.  Not
83       * digesting saves about ~15% of cpu during parse.
84       * @param strict Be strict parsing (Parsing stops if file inproperly
85       * formatted).
86       * @throws IOException
87       */
88      public WARCRecord(final InputStream in, final String identifier,
89      	final long offset, boolean digest, boolean strict) 
90      throws IOException {
91          super(in, null, 0, digest, strict);
92          setHeader(parseHeaders(in, identifier, offset, strict));
93      }
94      
95      /***
96       * Parse WARC Header Line and Named Fields.
97       * @param in Stream to read.
98       * @param identifier Identifier for the hosting Reader.
99       * @param offset Absolute offset into Reader.
100      * @param strict Whether to be loose parsing or not.
101      * @return An ArchiveRecordHeader.
102      * @throws IOException 
103      */
104     protected ArchiveRecordHeader parseHeaders(final InputStream in,
105         final String identifier, final long offset, final boolean strict)
106     throws IOException {
107     	final Map<Object, Object> m = new HashMap<Object, Object>();
108     	m.put(ABSOLUTE_OFFSET_KEY, new Long(offset));
109     	m.put(READER_IDENTIFIER_FIELD_KEY, identifier);
110         
111         long startPosition = -1;
112         if (in instanceof RepositionableStream) {
113             startPosition = ((RepositionableStream)in).position();
114         }
115         String firstLine =
116             new String(HttpParser.readLine(in, WARC_HEADER_ENCODING));
117         if (firstLine == null || firstLine.length() <=0) {
118             throw new IOException("Failed to read WARC_MAGIC");
119         }
120         if (!firstLine.startsWith(WARC_MAGIC)) {
121             throw new IOException("Failed to find WARC MAGIC: " + firstLine);
122         }
123         // Here we start reading off the inputstream but we're reading the
124         // stream direct rather than going via WARCRecord#read.  The latter will
125         // keep count of bytes read, digest and fail properly if EOR too soon...
126         // We don't want digesting while reading Headers.
127         // 
128         Header [] h = HttpParser.parseHeaders(in, WARC_HEADER_ENCODING);
129         for (int i = 0; i < h.length; i++) {
130             m.put(h[i].getName(), h[i].getValue());
131         }
132         int headerLength = -1;
133         if (in instanceof RepositionableStream) {
134             headerLength =
135                 (int)(((RepositionableStream)in).position() - startPosition);
136         }
137         final int contentOffset = headerLength;
138         incrementPosition(contentOffset);
139    
140     	return new ArchiveRecordHeader() {
141     		private Map<Object, Object> headers = m;
142             private int contentBegin = contentOffset;
143 
144 			public String getDate() {
145 				return (String)this.headers.get(HEADER_KEY_DATE);
146 			}
147 
148 			public String getDigest() {
149                 return null;
150                 // TODO: perhaps return block-digest? 
151                 // superclass def implies this is calculated ("only after
152                 // read in totality"), not pulled from header
153 //				return (String)this.headers.get(HEADER_KEY_CHECKSUM);
154 			}
155 
156 			public String getReaderIdentifier() {
157 				return (String)this.headers.get(READER_IDENTIFIER_FIELD_KEY);
158 			}
159 
160 			public Set getHeaderFieldKeys() {
161 				return this.headers.keySet();
162 			}
163 
164 			public Map getHeaderFields() {
165 				return this.headers;
166 			}
167 
168 			public Object getHeaderValue(String key) {
169 				return this.headers.get(key);
170 			}
171 
172 			public long getLength() {
173 				Object o = this.headers.get(CONTENT_LENGTH);
174 				if (o == null) {
175 					return -1;
176 				}
177 				long contentLength = (o instanceof Long)?
178                     ((Long)o).longValue(): Long.parseLong((String)o);
179                 return contentLength + contentOffset;
180 			}
181 
182 			public String getMimetype() {
183 				return (String)this.headers.get(CONTENT_TYPE);
184 			}
185 
186 			public long getOffset() {
187 				Object o = this.headers.get(ABSOLUTE_OFFSET_KEY);
188 				if (o == null) {
189 					return -1;
190 				}
191 				return (o instanceof Long)?
192                     ((Long)o).longValue(): Long.parseLong((String)o);
193 			}
194 
195 			public String getRecordIdentifier() {
196 				return (String)this.headers.get(RECORD_IDENTIFIER_FIELD_KEY);
197 			}
198 
199 			public String getUrl() {
200 				return (String)this.headers.get(HEADER_KEY_URI);
201 			}
202 
203 			public String getVersion() {
204 				return (String)this.headers.get(VERSION_FIELD_KEY);
205 			}
206             
207             public int getContentBegin() {
208                 return this.contentBegin;
209             }
210             
211             @Override
212             public String toString() {
213                 return this.headers.toString();
214             }
215     	};
216     }
217     
218     @Override
219     protected String getMimetype4Cdx(ArchiveRecordHeader h) {
220         final String m = super.getMimetype4Cdx(h);
221         // Mimetypes can have spaces in WARCs.  Emitting for CDX, just
222         // squash them for now.  Later, quote them since squashing spaces won't
223         // work for params that have quoted-string values.
224         Matcher matcher = WHITESPACE.matcher(m);
225         return matcher.replaceAll("");
226     }
227 }