1   /* $Id: Arc2Warc.java 5800 2008-03-26 01:04:12Z gojomo $
2    *
3    * Created Aug 29, 2006
4    *
5    * Copyright (C) 2006 Internet Archive.
6    *
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    *
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   *
14   * Heritrix is distributed in the hope that it will be useful,
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   *
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.io;
24  
25  import java.io.BufferedOutputStream;
26  import java.io.ByteArrayOutputStream;
27  import java.io.File;
28  import java.io.FileOutputStream;
29  import java.io.IOException;
30  import java.util.ArrayList;
31  import java.util.Iterator;
32  import java.util.List;
33  import java.util.logging.Level;
34  import java.util.logging.Logger;
35  
36  import org.apache.commons.cli.CommandLine;
37  import org.apache.commons.cli.HelpFormatter;
38  import org.apache.commons.cli.Option;
39  import org.apache.commons.cli.Options;
40  import org.apache.commons.cli.ParseException;
41  import org.apache.commons.cli.PosixParser;
42  import org.archive.io.arc.ARCConstants;
43  import org.archive.io.arc.ARCReader;
44  import org.archive.io.arc.ARCReaderFactory;
45  import org.archive.io.arc.ARCRecord;
46  import org.archive.io.warc.WARCConstants;
47  import org.archive.io.warc.WARCWriter;
48  import org.archive.util.FileUtils;
49  import org.archive.util.anvl.ANVLRecord;
50  
51  
52  /***
53   * Convert ARCs to (sortof) WARCs.
54   * @author stack
55   * @version $Date: 2008-03-26 01:04:12 +0000 (Wed, 26 Mar 2008) $ $Revision: 5800 $
56   */
57  public class Arc2Warc {
58     private static void usage(HelpFormatter formatter, Options options,
59             int exitCode) {
60         formatter.printHelp("java org.archive.io.arc.Arc2Warc " +
61         		"[--force] ARC_INPUT WARC_OUTPUT", options);
62         System.exit(exitCode);
63     }
64     
65     private static String getRevision() {
66         return Warc2Arc.parseRevision("$Revision: 5800 $");
67     }
68     
69     public void transform(final File arc, final File warc, final boolean force)
70     throws IOException {
71         FileUtils.isReadable(arc);
72         if (warc.exists() && !force) {
73      	   throw new IOException("Target WARC already exists. " +
74      	       "Will not overwrite.");
75         }
76  
77         ARCReader reader = ARCReaderFactory.get(arc, false, 0);
78         transform(reader, warc);
79     }
80     
81     protected void transform(final ARCReader reader, final File warc)
82     throws IOException {
83  	   WARCWriter writer = null;
84  	   // No point digesting. Digest is available after reading of ARC which
85  	   // is too late for inclusion in WARC.
86  	   reader.setDigest(false);
87  	   try {
88  		   BufferedOutputStream bos =
89  			   new BufferedOutputStream(new FileOutputStream(warc));
90  		   // Get the body of the first ARC record as a String so can dump it
91  		   // into first record of WARC.
92  		   final Iterator<ArchiveRecord> i = reader.iterator();
93  		   ARCRecord firstRecord = (ARCRecord)i.next();
94  		   ByteArrayOutputStream baos =
95  			   new ByteArrayOutputStream((int)firstRecord.getHeader().
96  			       getLength());
97  		   firstRecord.dump(baos);
98  	       // Add ARC first record content as an ANVLRecord.
99  	       ANVLRecord ar = new ANVLRecord(1);
100 	       ar.addLabelValue("Filedesc", baos.toString());
101 	       List<String> metadata = new ArrayList<String>(1);
102 	       metadata.add(ar.toString());
103 	       // Now create the writer.  If reader was compressed, lets write
104 	       // a compressed WARC.
105 		   writer = new WARCWriter(null, bos, warc,
106 		       reader.isCompressed(), null, metadata);
107 		   // Write a warcinfo record with description about how this WARC
108 		   // was made.
109 		   writer.writeWarcinfoRecord(warc.getName(),
110 		       "Made from " + reader.getReaderIdentifier() + " by " +
111 	               this.getClass().getName() + "/" + getRevision());
112 		   for (; i.hasNext();) {
113 			   write(writer, (ARCRecord)i.next());
114 		   }
115 	   } finally {
116 		   if (reader != null) {
117 			   reader.close();
118 		   }
119 		   if (writer != null) {
120 			   // I don't want the close being logged -- least, not w/o log of
121 			   // an opening (and that'd be a little silly for simple script
122 			   // like this). Currently, it logs at level INFO so that close
123 			   // of files gets written to log files.  Up the log level just
124 			   // for the close.
125 			   Logger l = Logger.getLogger(writer.getClass().getName());
126 			   Level oldLevel = l.getLevel();
127 			   l.setLevel(Level.WARNING);
128 			   try {
129 				   writer.close();
130 			   } finally {
131 				   l.setLevel(oldLevel);
132 			   }
133 		   }
134 	   }
135    }
136    
137    protected void write(final WARCWriter writer,
138 		   final ARCRecord r)
139    throws IOException {
140 	   ANVLRecord ar = new ANVLRecord();
141 	   String ip = (String)r.getHeader().
142 	       getHeaderValue((ARCConstants.IP_HEADER_FIELD_KEY));
143 	   if (ip != null && ip.length() > 0) {
144 		   ar.addLabelValue(WARCConstants.NAMED_FIELD_IP_LABEL, ip);
145 	   }
146 	   // If contentBody > 0, assume http headers.  Make the mimetype
147 	   // be application/http.  Otherwise, give it ARC mimetype.
148 	   writer.writeResourceRecord(r.getHeader().getUrl(),
149 	       r.getHeader().getDate(),
150 	       (r.getHeader().getContentBegin() > 0)?
151 	    	   WARCConstants.HTTP_RESPONSE_MIMETYPE:
152 	    	   r.getHeader().getMimetype(),
153 	    	   ar, r, r.getHeader().getLength());
154    }
155 
156    /***
157     * Command-line interface to Arc2Warc.
158     *
159     * @param args Command-line arguments.
160     * @throws ParseException Failed parse of the command line.
161     * @throws IOException
162     * @throws java.text.ParseException
163     */
164    public static void main(String [] args)
165    throws ParseException, IOException, java.text.ParseException {
166        Options options = new Options();
167        options.addOption(new Option("h","help", false,
168            "Prints this message and exits."));
169        options.addOption(new Option("f","force", false,
170        	   "Force overwrite of target file."));
171        PosixParser parser = new PosixParser();
172        CommandLine cmdline = parser.parse(options, args, false);
173        List cmdlineArgs = cmdline.getArgList();
174        Option [] cmdlineOptions = cmdline.getOptions();
175        HelpFormatter formatter = new HelpFormatter();
176        
177        // If no args, print help.
178        if (cmdlineArgs.size() <= 0) {
179            usage(formatter, options, 0);
180        }
181 
182        // Now look at options passed.
183        boolean force = false;
184        for (int i = 0; i < cmdlineOptions.length; i++) {
185            switch(cmdlineOptions[i].getId()) {
186                case 'h':
187                    usage(formatter, options, 0);
188                    break;
189                    
190                case 'f':
191                    force = true;
192                    break;
193                    
194                default:
195                    throw new RuntimeException("Unexpected option: " +
196                        + cmdlineOptions[i].getId());
197            }
198        }
199        
200        // If no args, print help.
201        if (cmdlineArgs.size() != 2) {
202            usage(formatter, options, 0);
203        }
204        (new Arc2Warc()).transform(new File(cmdlineArgs.get(0).toString()),
205            new File(cmdlineArgs.get(1).toString()), force);
206    }
207 }