1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.io;
24
25 import java.io.BufferedOutputStream;
26 import java.io.ByteArrayOutputStream;
27 import java.io.File;
28 import java.io.FileOutputStream;
29 import java.io.IOException;
30 import java.util.ArrayList;
31 import java.util.Iterator;
32 import java.util.List;
33 import java.util.logging.Level;
34 import java.util.logging.Logger;
35
36 import org.apache.commons.cli.CommandLine;
37 import org.apache.commons.cli.HelpFormatter;
38 import org.apache.commons.cli.Option;
39 import org.apache.commons.cli.Options;
40 import org.apache.commons.cli.ParseException;
41 import org.apache.commons.cli.PosixParser;
42 import org.archive.io.arc.ARCConstants;
43 import org.archive.io.arc.ARCReader;
44 import org.archive.io.arc.ARCReaderFactory;
45 import org.archive.io.arc.ARCRecord;
46 import org.archive.io.warc.WARCConstants;
47 import org.archive.io.warc.WARCWriter;
48 import org.archive.util.FileUtils;
49 import org.archive.util.anvl.ANVLRecord;
50
51
52 /***
53 * Convert ARCs to (sortof) WARCs.
54 * @author stack
55 * @version $Date: 2008-03-26 01:04:12 +0000 (Wed, 26 Mar 2008) $ $Revision: 5800 $
56 */
57 public class Arc2Warc {
58 private static void usage(HelpFormatter formatter, Options options,
59 int exitCode) {
60 formatter.printHelp("java org.archive.io.arc.Arc2Warc " +
61 "[--force] ARC_INPUT WARC_OUTPUT", options);
62 System.exit(exitCode);
63 }
64
65 private static String getRevision() {
66 return Warc2Arc.parseRevision("$Revision: 5800 $");
67 }
68
69 public void transform(final File arc, final File warc, final boolean force)
70 throws IOException {
71 FileUtils.isReadable(arc);
72 if (warc.exists() && !force) {
73 throw new IOException("Target WARC already exists. " +
74 "Will not overwrite.");
75 }
76
77 ARCReader reader = ARCReaderFactory.get(arc, false, 0);
78 transform(reader, warc);
79 }
80
81 protected void transform(final ARCReader reader, final File warc)
82 throws IOException {
83 WARCWriter writer = null;
84
85
86 reader.setDigest(false);
87 try {
88 BufferedOutputStream bos =
89 new BufferedOutputStream(new FileOutputStream(warc));
90
91
92 final Iterator<ArchiveRecord> i = reader.iterator();
93 ARCRecord firstRecord = (ARCRecord)i.next();
94 ByteArrayOutputStream baos =
95 new ByteArrayOutputStream((int)firstRecord.getHeader().
96 getLength());
97 firstRecord.dump(baos);
98
99 ANVLRecord ar = new ANVLRecord(1);
100 ar.addLabelValue("Filedesc", baos.toString());
101 List<String> metadata = new ArrayList<String>(1);
102 metadata.add(ar.toString());
103
104
105 writer = new WARCWriter(null, bos, warc,
106 reader.isCompressed(), null, metadata);
107
108
109 writer.writeWarcinfoRecord(warc.getName(),
110 "Made from " + reader.getReaderIdentifier() + " by " +
111 this.getClass().getName() + "/" + getRevision());
112 for (; i.hasNext();) {
113 write(writer, (ARCRecord)i.next());
114 }
115 } finally {
116 if (reader != null) {
117 reader.close();
118 }
119 if (writer != null) {
120
121
122
123
124
125 Logger l = Logger.getLogger(writer.getClass().getName());
126 Level oldLevel = l.getLevel();
127 l.setLevel(Level.WARNING);
128 try {
129 writer.close();
130 } finally {
131 l.setLevel(oldLevel);
132 }
133 }
134 }
135 }
136
137 protected void write(final WARCWriter writer,
138 final ARCRecord r)
139 throws IOException {
140 ANVLRecord ar = new ANVLRecord();
141 String ip = (String)r.getHeader().
142 getHeaderValue((ARCConstants.IP_HEADER_FIELD_KEY));
143 if (ip != null && ip.length() > 0) {
144 ar.addLabelValue(WARCConstants.NAMED_FIELD_IP_LABEL, ip);
145 }
146
147
148 writer.writeResourceRecord(r.getHeader().getUrl(),
149 r.getHeader().getDate(),
150 (r.getHeader().getContentBegin() > 0)?
151 WARCConstants.HTTP_RESPONSE_MIMETYPE:
152 r.getHeader().getMimetype(),
153 ar, r, r.getHeader().getLength());
154 }
155
156 /***
157 * Command-line interface to Arc2Warc.
158 *
159 * @param args Command-line arguments.
160 * @throws ParseException Failed parse of the command line.
161 * @throws IOException
162 * @throws java.text.ParseException
163 */
164 public static void main(String [] args)
165 throws ParseException, IOException, java.text.ParseException {
166 Options options = new Options();
167 options.addOption(new Option("h","help", false,
168 "Prints this message and exits."));
169 options.addOption(new Option("f","force", false,
170 "Force overwrite of target file."));
171 PosixParser parser = new PosixParser();
172 CommandLine cmdline = parser.parse(options, args, false);
173 List cmdlineArgs = cmdline.getArgList();
174 Option [] cmdlineOptions = cmdline.getOptions();
175 HelpFormatter formatter = new HelpFormatter();
176
177
178 if (cmdlineArgs.size() <= 0) {
179 usage(formatter, options, 0);
180 }
181
182
183 boolean force = false;
184 for (int i = 0; i < cmdlineOptions.length; i++) {
185 switch(cmdlineOptions[i].getId()) {
186 case 'h':
187 usage(formatter, options, 0);
188 break;
189
190 case 'f':
191 force = true;
192 break;
193
194 default:
195 throw new RuntimeException("Unexpected option: " +
196 + cmdlineOptions[i].getId());
197 }
198 }
199
200
201 if (cmdlineArgs.size() != 2) {
202 usage(formatter, options, 0);
203 }
204 (new Arc2Warc()).transform(new File(cmdlineArgs.get(0).toString()),
205 new File(cmdlineArgs.get(1).toString()), force);
206 }
207 }