1   /*
2    * ARCWriter
3    *
4    * $Id: ARCWriterProcessor.java 5478 2007-09-19 01:37:07Z gojomo $
5    *
6    * Created on Jun 5, 2003
7    *
8    * Copyright (C) 2003 Internet Archive.
9    *
10   * This file is part of the Heritrix web crawler (crawler.archive.org).
11   *
12   * Heritrix is free software; you can redistribute it and/or modify
13   * it under the terms of the GNU Lesser Public License as published by
14   * the Free Software Foundation; either version 2.1 of the License, or
15   * any later version.
16   *
17   * Heritrix is distributed in the hope that it will be useful,
18   * but WITHOUT ANY WARRANTY; without even the implied warranty of
19   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20   * GNU Lesser Public License for more details.
21   *
22   * You should have received a copy of the GNU Lesser Public License
23   * along with Heritrix; if not, write to the Free Software
24   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
25   */
26  package org.archive.crawler.writer;
27  
28  import java.io.File;
29  import java.io.FileInputStream;
30  import java.io.FileNotFoundException;
31  import java.io.IOException;
32  import java.io.InputStream;
33  import java.io.StringWriter;
34  import java.net.InetAddress;
35  import java.net.UnknownHostException;
36  import java.util.ArrayList;
37  import java.util.List;
38  import java.util.concurrent.atomic.AtomicInteger;
39  import java.util.logging.Level;
40  import java.util.logging.Logger;
41  
42  import javax.xml.transform.SourceLocator;
43  import javax.xml.transform.Templates;
44  import javax.xml.transform.Transformer;
45  import javax.xml.transform.TransformerConfigurationException;
46  import javax.xml.transform.TransformerException;
47  import javax.xml.transform.TransformerFactory;
48  import javax.xml.transform.stream.StreamResult;
49  import javax.xml.transform.stream.StreamSource;
50  
51  import org.apache.commons.io.IOUtils;
52  import org.archive.crawler.Heritrix;
53  import org.archive.crawler.datamodel.CoreAttributeConstants;
54  import org.archive.crawler.datamodel.CrawlURI;
55  import org.archive.crawler.datamodel.FetchStatusCodes;
56  import org.archive.crawler.event.CrawlStatusListener;
57  import org.archive.crawler.framework.WriterPoolProcessor;
58  import org.archive.crawler.settings.XMLSettingsHandler;
59  import org.archive.io.ReplayInputStream;
60  import org.archive.io.WriterPoolMember;
61  import org.archive.io.WriterPoolSettings;
62  import org.archive.io.arc.ARCConstants;
63  import org.archive.io.arc.ARCWriter;
64  import org.archive.io.arc.ARCWriterPool;
65  
66  
67  /***
68   * Processor module for writing the results of successful fetches (and
69   * perhaps someday, certain kinds of network failures) to the Internet Archive
70   * ARC file format.
71   *
72   * Assumption is that there is only one of these ARCWriterProcessors per
73   * Heritrix instance.
74   *
75   * @author Parker Thompson
76   */
public class ARCWriterProcessor extends WriterPoolProcessor
implements CoreAttributeConstants, ARCConstants, CrawlStatusListener,
WriterPoolSettings, FetchStatusCodes {
	private static final long serialVersionUID = 1957518408532644531L;

	// Instance (not static) logger so a subclass logs under its own class
	// name via this.getClass().getName().
	private final Logger logger = Logger.getLogger(this.getClass().getName());
    
    /**
     * Default path list: ARC files are written under an "arcs" directory
     * unless configured otherwise.
     */
    private static final String [] DEFAULT_PATH = {"arcs"};

    /**
     * Constructor.
     *
     * @param name Name of this writer.
     */
    public ARCWriterProcessor(String name) {
        super(name, "ARCWriter processor");
    }
    
    /**
     * @return Default output directory path list ({@link #DEFAULT_PATH}).
     */
    protected String [] getDefaultPath() {
    	return DEFAULT_PATH;
	}

    /**
     * Installs an {@link ARCWriterPool} as this processor's writer pool,
     * sized and configured from the superclass's pool settings.
     *
     * @param serialNo Shared serial-number source used to name new files.
     */
    protected void setupPool(final AtomicInteger serialNo) {
		setPool(new ARCWriterPool(serialNo, this, getPoolMaximumActive(),
            getPoolMaximumWait()));
    }
    
    /**
     * Writes a CrawlURI and its associated data to store file.
     *
     * Currently this method understands the following uri types: dns, http, 
     * and https.
     *
     * Skips URIs with a non-positive fetch status (failed or not yet
     * fetched) and URIs with no recorded content. Write failures are
     * recorded on the CrawlURI as a localized error and logged, but not
     * rethrown, so the crawl continues.
     *
     * @param curi CrawlURI to process.
     */
    protected void innerProcess(CrawlURI curi) {
        // If failure, or we haven't fetched the resource yet, return
        if (curi.getFetchStatus() <= 0) {
            return;
        }
        
        // If no recorded content at all, don't write record.
        long recordLength = curi.getHttpRecorder().getRecordedInput().getSize();
        if (recordLength <= 0) {
        	// getContentSize() should be > 0 if any material (even just
            // HTTP headers with zero-length body) is available. 
        	return;
        }
        
        ReplayInputStream ris = null; 
        try {
            if(shouldWrite(curi)) {
                // Replay the recorded bytes straight into the ARC record.
                ris = curi.getHttpRecorder().getRecordedInput()
                        .getReplayInputStream();
                write(curi, recordLength, ris, getHostAddress(curi));
            } else {
                logger.info("does not write " + curi.toString());
            }
        } catch (IOException e) {
            // Record the failure on the URI and log it; do not abort the
            // crawl for a single failed record write.
            curi.addLocalizedError(this.getName(), e, "WriteRecord: " +
                curi.toString());
            logger.log(Level.SEVERE, "Failed write of Record: " +
                curi.toString(), e);
        } finally {
            IOUtils.closeQuietly(ris); 
        }
    }
    
    /**
     * Writes one ARC record for the given URI using a writer borrowed from
     * the pool, maintaining the processor's total-bytes-written accounting.
     *
     * @param curi URI being written; supplies URL, content type and fetch
     *     begin time for the record header.
     * @param recordLength Length in bytes of the recorded content.
     * @param in Stream of the recorded content; a ReplayInputStream is
     *     dispatched to the ARCWriter overload that accepts it.
     * @param ip IP address string written into the record header.
     * @throws IOException On failure writing the record; the writer's
     *     current file is invalidated (renamed with a '.invalid' suffix)
     *     before the exception propagates.
     */
    protected void write(CrawlURI curi, long recordLength, InputStream in,
        String ip)
    throws IOException {
        WriterPoolMember writer = getPool().borrowFile();
        long position = writer.getPosition();
        // See if we need to open a new file because we've exceeded maxBytes.
        // Call to checkFileSize will open new file if we're at maximum for
        // current file.
        writer.checkSize();
        if (writer.getPosition() != position) {
            // We just closed the file because it was larger than maxBytes.
            // Add to the totalBytesWritten the size of the first record
            // in the file, if any.
            setTotalBytesWritten(getTotalBytesWritten() +
            	(writer.getPosition() - position));
            position = writer.getPosition();
        }
        
        ARCWriter w = (ARCWriter)writer;
        try {
            if (in instanceof ReplayInputStream) {
                w.write(curi.toString(), curi.getContentType(),
                    ip, curi.getLong(A_FETCH_BEGAN_TIME),
                    recordLength, (ReplayInputStream)in);
            } else {
                w.write(curi.toString(), curi.getContentType(),
                    ip, curi.getLong(A_FETCH_BEGAN_TIME),
                    recordLength, in);
            }
        } catch (IOException e) {
            // Invalidate this file (It gets a '.invalid' suffix).
            getPool().invalidateFile(writer);
            // Set the writer to null otherwise the pool accounting
            // of how many active writers gets skewed if we subsequently
            // do a returnWriter call on this object in the finally block.
            writer = null;
            throw e;
        } finally {
            if (writer != null) {
                // Account for the bytes this record added before the
                // writer goes back to the pool.
            	setTotalBytesWritten(getTotalBytesWritten() +
            	     (writer.getPosition() - position));
                getPool().returnFile(writer);
            }
        }
        // May trigger rotation/limits in the superclass once totals updated.
        checkBytesWritten();
    }
    
    /**
     * @return Classpath location of the XSL stylesheet applied to produce
     *     the metadata body of each ARC file's first record.
     */
    @Override
    protected String getFirstrecordStylesheet() {
        return "/arcMetaheaderBody.xsl";
    }
}