1   /* ANVLRecord
2   *
3   * $Id: ANVLRecord.java 4545 2006-08-26 00:33:38Z stack-sf $
4   *
5   * Created on July 26, 2006.
6   *
7   * Copyright (C) 2006 Internet Archive.
8   *
9   * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24  */
25  package org.archive.util.anvl;
26  
27  import java.io.ByteArrayOutputStream;
28  import java.io.IOException;
29  import java.io.InputStream;
30  import java.io.UnsupportedEncodingException;
31  import java.util.ArrayList;
32  import java.util.Collection;
33  import java.util.HashMap;
34  import java.util.Iterator;
35  import java.util.List;
36  import java.util.Map;
37  
38  import org.archive.io.UTF8Bytes;
39  
40  /***
41   * An ordered {@link List} with 'data' {@link Element} values.
42   * ANVLRecords end with a blank line.
43   * 
44   * @see <a
45   * href="http://www.cdlib.org/inside/diglib/ark/anvlspec.pdf">A Name-Value
46   * Language (ANVL)</a>
47   * @author stack
48   */
49  public class ANVLRecord extends ArrayList<Element> implements UTF8Bytes {
50  	private static final long serialVersionUID = -4610638888453052958L;
51  	
52  	public static final String MIMETYPE = "text/anvl";
53  	
54  	public static final ANVLRecord EMPTY_ANVL_RECORD = new ANVLRecord();
55      
56      /***
57       * Arbitrary upper bound on maximum size of ANVL Record.
58       * Will throw an IOException if exceed this size.
59       */
60      public static final long MAXIMUM_SIZE = 1024 * 10;
61  	
62  	/***
63  	 * An ANVL 'newline'.
64  	 * @see <a href="http://en.wikipedia.org/wiki/CRLF">http://en.wikipedia.org/wiki/CRLF</a>
65  	 */
66      static final String CRLF = "\r\n";
67      
68      static final String FOLD_PREFIX = CRLF + ' ';
69      
70      public ANVLRecord() {
71          super();
72      }
73  
74      public ANVLRecord(Collection<? extends Element> c) {
75          super(c);
76      }
77  
78      public ANVLRecord(int initialCapacity) {
79          super(initialCapacity);
80      }
81      
82      public boolean addLabel(final String l) {
83      	return super.add(new Element(new Label(l)));
84      }
85  
86      public boolean addLabelValue(final String l, final String v) {
87      	return super.add(new Element(new Label(l), new Value(v)));
88      }
89      
90      @Override
91      public String toString() {
92          // TODO: What to emit for empty ANVLRecord?
93          StringBuilder sb = new StringBuilder();
94          for (final Iterator<Element> i = iterator(); i.hasNext();) {
95              sb.append(i.next());
96              sb.append(CRLF);
97          }
98          // 'ANVL Records end in a blank line'.
99          sb.append(CRLF);
100         return sb.toString();
101     }
102     
103     public Map<String, String> asMap() {
104         Map<String, String> m = new HashMap<String, String>(size());
105         for (final Iterator<Element> i = iterator(); i.hasNext();) {
106             Element e = i.next();
107             m.put(e.getLabel().toString(),
108                 e.isValue()? e.getValue().toString(): (String)null);
109         }
110         return m;
111     }
112     
113     @Override
114     public ANVLRecord clone() {
115         return new ANVLRecord(this);
116     }
117     
118     /***
119      * @return This ANVLRecord as UTF8 bytes.
120      */
121     public byte [] getUTF8Bytes()
122     throws UnsupportedEncodingException {
123         return toString().getBytes(UTF8);
124     }
125     
126     /***
127      * Parses a single ANVLRecord from passed InputStream.
128      * Read as a single-byte stream until we get to a CRLFCRLF which
129      * signifies End-of-ANVLRecord. Then parse all read as a UTF-8 Stream.
130      * Doing it this way, while requiring a double-scan, it  makes it so do not
131      * need to be passed a RepositionableStream or a Stream that supports
132      * marking.  Also no danger of over-reading which can happen when we
133      * wrap passed Stream with an InputStreamReader for doing UTF-8
134      * character conversion (See the ISR class comment).
135      * @param is InputStream
136      * @return An ANVLRecord instance.
137      * @throws IOException
138      */
139     public static ANVLRecord load(final InputStream is)
140     throws IOException {
141         // It doesn't look like a CRLF sequence is possible in UTF-8 without
142     	// it signifying CRLF: The top bits are set in multibyte characters.
143     	// Was thinking of recording CRLF as I was running through this first
144     	// parse but the offsets would then be incorrect if any multibyte
145     	// characters in the intervening gaps between CRLF.
146         boolean isCRLF = false;
147         boolean recordStart = false;
148         ByteArrayOutputStream baos = new ByteArrayOutputStream(1024);
149         boolean done = false;
150         int read = 0;
151         for (int c  = -1, previousCharacter; !done;) {
152             if (read++ >= MAXIMUM_SIZE) {
153                 throw new IOException("Read " + MAXIMUM_SIZE +
154                     " bytes without finding  //r//n//r//n " +
155                     "End-Of-ANVLRecord");
156             }
157             previousCharacter = c;
158             c = is.read();
159             if (c == -1) {
160                 throw new IOException("End-Of-Stream before //r//n//r//n " +
161                     "End-Of-ANVLRecord:\n" +
162                     new String(baos.toByteArray(), UTF8));
163             }
164             if (isLF((char)c) && isCR((char)previousCharacter)) {
165                 if (isCRLF) {
166                     // If we just had a CRLF, then its two CRLFs and its end of
167                     // record.  We're done.
168                     done = true;
169                 } else {
170                     isCRLF = true;
171                 }
172             } else if (!recordStart && Character.isWhitespace(c)) {
173                 // Skip any whitespace at start of ANVLRecord.
174                 continue;
175             } else {
176                 // Clear isCRLF flag if this character is NOT a '\r'.
177                 if (isCRLF && !isCR((char)c)) {
178                     isCRLF = false;
179                 }
180                 // Not whitespace so start record if we haven't already.
181                 if (!recordStart) {
182                     recordStart = true;
183                 }
184             }
185             baos.write(c);
186         }
187         return load(new String(baos.toByteArray(), UTF8));
188     }
189     
190     /*** 
191      * Parse passed String for an ANVL Record.
192      * Looked at writing javacc grammer but preprocessing is required to
193      * handle folding: See
194      * https://javacc.dev.java.net/servlets/BrowseList?list=users&by=thread&from=56173.
195      * Looked at Terence Parr's ANTLR.  More capable.  Can set lookahead count.
196      * A value of 3 would help with folding.  But its a pain defining UNICODE
197      * grammers -- needed by ANVL -- and support seems incomplete
198      * anyways: http://www.doc.ic.ac.uk/lab/secondyear/Antlr/lexer.html#unicode.
199      * For now, go with the below hand-rolled parser.
200      * @param s String with an ANVLRecord.
201      * @return ANVLRecord parsed from passed String.
202      * @throws IOException 
203      */
204     public static ANVLRecord load(final String s)
205     throws IOException {
206         ANVLRecord record = new ANVLRecord();
207         boolean inValue = false, inLabel = false, inComment = false, 
208             inNewLine = false;
209         String label = null;
210         StringBuilder sb = new StringBuilder(s.length());
211         for (int i = 0;  i < s.length(); i++) {
212             char c = s.charAt(i);
213            
214             // Assert I can do look-ahead.
215             if ((i + 1) > s.length()) {
216                 throw new IOException("Premature End-of-ANVLRecord:\n" +
217                     s.substring(i));
218             }
219             
220             // If at LF of a CRLF, just go around again. Eat up the LF.
221             if (inNewLine && isLF(c)) {
222                 continue;
223             }
224             
225             // If we're at a CRLF and we were just on one, exit. Found Record.
226             if (inNewLine && isCR(c) && isLF(s.charAt(i + 1))) {
227                 break;
228             }
229             
230             // Check if we're on a fold inside a Value. Skip multiple white
231             // space after CRLF. 
232             if (inNewLine && inValue && Character.isWhitespace(c)) {
233                 continue;
234             }
235             
236             // Else set flag if we're at a CRLF.
237             inNewLine = isCR(c) && isLF(s.charAt(i + 1));
238             
239             if (inNewLine) {
240                 if (inComment) {
241                     inComment = false;
242                 } else if (label != null && !inValue) {
243 					// Label only 'data element'.
244 					record.addLabel(label);
245 					label = null;
246 					sb.setLength(0);
247 				} else if (inValue) {
248 					// Assert I can do look-ahead past current CRLF.
249 					if ((i + 3) > s.length()) {
250 						throw new IOException("Premature End-of-ANVLRecord "
251 							+ "(2):\n" + s.substring(i));
252 					}
253 					if (!isCR(s.charAt(i + 2)) && !isLF(s.charAt(i + 3))
254 							&& Character.isWhitespace(s.charAt(i + 2))) {
255 						// Its a fold.  Let it go around. But add in a CRLF and
256 						// space and do it here.  We don't let CRLF fall through
257 						// to the sb.append on the end of this loop.
258 						sb.append(CRLF);
259 						sb.append(' ');
260 					} else {
261 						// Next line is a new SubElement, a new Comment or
262 						// Label.
263 						record.addLabelValue(label, sb.toString());
264 						sb.setLength(0);
265 						label = null;
266 						inValue = false;
267 					}
268 				} else {
269 					// We're whitespace between label and value or whitespace
270 					// before we've figured whether label or comment.
271 				}
272 				// Don't let the '\r' or CRLF through.
273 				continue;
274 			}
275             
276             if (inComment) {
277             	continue;
278             } else if (inLabel) {
279             	if (c == Label.COLON) {
280             		label = sb.toString();
281             		sb.setLength(0);
282             		inLabel = false;
283             		continue;
284             	}
285             } else {
286             	if (!inLabel && !inValue && !inComment) {
287             		// We have no state. Figure one.
288             		if (Character.isWhitespace(c)) {
289             			// If no state, and whitespace, skip. Don't record.
290             			continue;
291             		} else if (label == null && c == '#') {
292             			inComment = true;
293             			// Don't record comments.
294             			continue;
295             		} else if (label == null) {
296             			inLabel = true;
297             		} else {
298             			inValue = true;
299             		}
300             	}
301             }
302 			sb.append(c);
303         }
304         return record;
305     }
306     
307     /***
308      * @return Count of ANVLRecord bytes. Be careful, an empty ANVLRecord is
309      * CRLFCRLF so is of size 4.  Also, expensive, since it makes String of
310      * the record so it can count bytes.
311      */
312     public synchronized int getLength() {
313         int length = -1;
314         try {
315             length = getUTF8Bytes().length;
316         } catch (UnsupportedEncodingException e) {
317             throw new RuntimeException(e);
318         }
319         return length;
320     }
321     
322     public static boolean isCROrLF(final char c) {
323         return isCR(c) || isLF(c);
324     }
325     
326     public static boolean isCR(final char c) {
327         return c == ANVLRecord.CRLF.charAt(0);
328     }
329     
330     public static boolean isLF(final char c) {
331         return c == ANVLRecord.CRLF.charAt(1);
332     }
333 }