1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.util.anvl;
26
27 import java.io.ByteArrayOutputStream;
28 import java.io.IOException;
29 import java.io.InputStream;
30 import java.io.UnsupportedEncodingException;
31 import java.util.ArrayList;
32 import java.util.Collection;
33 import java.util.HashMap;
34 import java.util.Iterator;
35 import java.util.List;
36 import java.util.Map;
37
38 import org.archive.io.UTF8Bytes;
39
40 /***
41 * An ordered {@link List} with 'data' {@link Element} values.
42 * ANVLRecords end with a blank line.
43 *
44 * @see <a
45 * href="http://www.cdlib.org/inside/diglib/ark/anvlspec.pdf">A Name-Value
46 * Language (ANVL)</a>
47 * @author stack
48 */
49 public class ANVLRecord extends ArrayList<Element> implements UTF8Bytes {
50 private static final long serialVersionUID = -4610638888453052958L;
51
52 public static final String MIMETYPE = "text/anvl";
53
54 public static final ANVLRecord EMPTY_ANVL_RECORD = new ANVLRecord();
55
56 /***
57 * Arbitrary upper bound on maximum size of ANVL Record.
58 * Will throw an IOException if exceed this size.
59 */
60 public static final long MAXIMUM_SIZE = 1024 * 10;
61
62 /***
63 * An ANVL 'newline'.
64 * @see <a href="http://en.wikipedia.org/wiki/CRLF">http://en.wikipedia.org/wiki/CRLF</a>
65 */
66 static final String CRLF = "\r\n";
67
68 static final String FOLD_PREFIX = CRLF + ' ';
69
70 public ANVLRecord() {
71 super();
72 }
73
74 public ANVLRecord(Collection<? extends Element> c) {
75 super(c);
76 }
77
78 public ANVLRecord(int initialCapacity) {
79 super(initialCapacity);
80 }
81
82 public boolean addLabel(final String l) {
83 return super.add(new Element(new Label(l)));
84 }
85
86 public boolean addLabelValue(final String l, final String v) {
87 return super.add(new Element(new Label(l), new Value(v)));
88 }
89
90 @Override
91 public String toString() {
92
93 StringBuilder sb = new StringBuilder();
94 for (final Iterator<Element> i = iterator(); i.hasNext();) {
95 sb.append(i.next());
96 sb.append(CRLF);
97 }
98
99 sb.append(CRLF);
100 return sb.toString();
101 }
102
103 public Map<String, String> asMap() {
104 Map<String, String> m = new HashMap<String, String>(size());
105 for (final Iterator<Element> i = iterator(); i.hasNext();) {
106 Element e = i.next();
107 m.put(e.getLabel().toString(),
108 e.isValue()? e.getValue().toString(): (String)null);
109 }
110 return m;
111 }
112
113 @Override
114 public ANVLRecord clone() {
115 return new ANVLRecord(this);
116 }
117
118 /***
119 * @return This ANVLRecord as UTF8 bytes.
120 */
121 public byte [] getUTF8Bytes()
122 throws UnsupportedEncodingException {
123 return toString().getBytes(UTF8);
124 }
125
126 /***
127 * Parses a single ANVLRecord from passed InputStream.
128 * Read as a single-byte stream until we get to a CRLFCRLF which
129 * signifies End-of-ANVLRecord. Then parse all read as a UTF-8 Stream.
130 * Doing it this way, while requiring a double-scan, it makes it so do not
131 * need to be passed a RepositionableStream or a Stream that supports
132 * marking. Also no danger of over-reading which can happen when we
133 * wrap passed Stream with an InputStreamReader for doing UTF-8
134 * character conversion (See the ISR class comment).
135 * @param is InputStream
136 * @return An ANVLRecord instance.
137 * @throws IOException
138 */
139 public static ANVLRecord load(final InputStream is)
140 throws IOException {
141
142
143
144
145
146 boolean isCRLF = false;
147 boolean recordStart = false;
148 ByteArrayOutputStream baos = new ByteArrayOutputStream(1024);
149 boolean done = false;
150 int read = 0;
151 for (int c = -1, previousCharacter; !done;) {
152 if (read++ >= MAXIMUM_SIZE) {
153 throw new IOException("Read " + MAXIMUM_SIZE +
154 " bytes without finding //r//n//r//n " +
155 "End-Of-ANVLRecord");
156 }
157 previousCharacter = c;
158 c = is.read();
159 if (c == -1) {
160 throw new IOException("End-Of-Stream before //r//n//r//n " +
161 "End-Of-ANVLRecord:\n" +
162 new String(baos.toByteArray(), UTF8));
163 }
164 if (isLF((char)c) && isCR((char)previousCharacter)) {
165 if (isCRLF) {
166
167
168 done = true;
169 } else {
170 isCRLF = true;
171 }
172 } else if (!recordStart && Character.isWhitespace(c)) {
173
174 continue;
175 } else {
176
177 if (isCRLF && !isCR((char)c)) {
178 isCRLF = false;
179 }
180
181 if (!recordStart) {
182 recordStart = true;
183 }
184 }
185 baos.write(c);
186 }
187 return load(new String(baos.toByteArray(), UTF8));
188 }
189
190 /***
191 * Parse passed String for an ANVL Record.
192 * Looked at writing javacc grammer but preprocessing is required to
193 * handle folding: See
194 * https://javacc.dev.java.net/servlets/BrowseList?list=users&by=thread&from=56173.
195 * Looked at Terence Parr's ANTLR. More capable. Can set lookahead count.
196 * A value of 3 would help with folding. But its a pain defining UNICODE
197 * grammers -- needed by ANVL -- and support seems incomplete
198 * anyways: http://www.doc.ic.ac.uk/lab/secondyear/Antlr/lexer.html#unicode.
199 * For now, go with the below hand-rolled parser.
200 * @param s String with an ANVLRecord.
201 * @return ANVLRecord parsed from passed String.
202 * @throws IOException
203 */
204 public static ANVLRecord load(final String s)
205 throws IOException {
206 ANVLRecord record = new ANVLRecord();
207 boolean inValue = false, inLabel = false, inComment = false,
208 inNewLine = false;
209 String label = null;
210 StringBuilder sb = new StringBuilder(s.length());
211 for (int i = 0; i < s.length(); i++) {
212 char c = s.charAt(i);
213
214
215 if ((i + 1) > s.length()) {
216 throw new IOException("Premature End-of-ANVLRecord:\n" +
217 s.substring(i));
218 }
219
220
221 if (inNewLine && isLF(c)) {
222 continue;
223 }
224
225
226 if (inNewLine && isCR(c) && isLF(s.charAt(i + 1))) {
227 break;
228 }
229
230
231
232 if (inNewLine && inValue && Character.isWhitespace(c)) {
233 continue;
234 }
235
236
237 inNewLine = isCR(c) && isLF(s.charAt(i + 1));
238
239 if (inNewLine) {
240 if (inComment) {
241 inComment = false;
242 } else if (label != null && !inValue) {
243
244 record.addLabel(label);
245 label = null;
246 sb.setLength(0);
247 } else if (inValue) {
248
249 if ((i + 3) > s.length()) {
250 throw new IOException("Premature End-of-ANVLRecord "
251 + "(2):\n" + s.substring(i));
252 }
253 if (!isCR(s.charAt(i + 2)) && !isLF(s.charAt(i + 3))
254 && Character.isWhitespace(s.charAt(i + 2))) {
255
256
257
258 sb.append(CRLF);
259 sb.append(' ');
260 } else {
261
262
263 record.addLabelValue(label, sb.toString());
264 sb.setLength(0);
265 label = null;
266 inValue = false;
267 }
268 } else {
269
270
271 }
272
273 continue;
274 }
275
276 if (inComment) {
277 continue;
278 } else if (inLabel) {
279 if (c == Label.COLON) {
280 label = sb.toString();
281 sb.setLength(0);
282 inLabel = false;
283 continue;
284 }
285 } else {
286 if (!inLabel && !inValue && !inComment) {
287
288 if (Character.isWhitespace(c)) {
289
290 continue;
291 } else if (label == null && c == '#') {
292 inComment = true;
293
294 continue;
295 } else if (label == null) {
296 inLabel = true;
297 } else {
298 inValue = true;
299 }
300 }
301 }
302 sb.append(c);
303 }
304 return record;
305 }
306
307 /***
308 * @return Count of ANVLRecord bytes. Be careful, an empty ANVLRecord is
309 * CRLFCRLF so is of size 4. Also, expensive, since it makes String of
310 * the record so it can count bytes.
311 */
312 public synchronized int getLength() {
313 int length = -1;
314 try {
315 length = getUTF8Bytes().length;
316 } catch (UnsupportedEncodingException e) {
317 throw new RuntimeException(e);
318 }
319 return length;
320 }
321
322 public static boolean isCROrLF(final char c) {
323 return isCR(c) || isLF(c);
324 }
325
326 public static boolean isCR(final char c) {
327 return c == ANVLRecord.CRLF.charAt(0);
328 }
329
330 public static boolean isLF(final char c) {
331 return c == ANVLRecord.CRLF.charAt(1);
332 }
333 }