1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26 package org.archive.crawler.writer;
27
28 import java.io.File;
29 import java.io.FileInputStream;
30 import java.io.FileNotFoundException;
31 import java.io.IOException;
32 import java.io.InputStream;
33 import java.io.StringWriter;
34 import java.net.InetAddress;
35 import java.net.UnknownHostException;
36 import java.util.ArrayList;
37 import java.util.List;
38 import java.util.concurrent.atomic.AtomicInteger;
39 import java.util.logging.Level;
40 import java.util.logging.Logger;
41
42 import javax.xml.transform.SourceLocator;
43 import javax.xml.transform.Templates;
44 import javax.xml.transform.Transformer;
45 import javax.xml.transform.TransformerConfigurationException;
46 import javax.xml.transform.TransformerException;
47 import javax.xml.transform.TransformerFactory;
48 import javax.xml.transform.stream.StreamResult;
49 import javax.xml.transform.stream.StreamSource;
50
51 import org.apache.commons.io.IOUtils;
52 import org.archive.crawler.Heritrix;
53 import org.archive.crawler.datamodel.CoreAttributeConstants;
54 import org.archive.crawler.datamodel.CrawlURI;
55 import org.archive.crawler.datamodel.FetchStatusCodes;
56 import org.archive.crawler.event.CrawlStatusListener;
57 import org.archive.crawler.framework.WriterPoolProcessor;
58 import org.archive.crawler.settings.XMLSettingsHandler;
59 import org.archive.io.ReplayInputStream;
60 import org.archive.io.WriterPoolMember;
61 import org.archive.io.WriterPoolSettings;
62 import org.archive.io.arc.ARCConstants;
63 import org.archive.io.arc.ARCWriter;
64 import org.archive.io.arc.ARCWriterPool;
65
66
67 /***
68 * Processor module for writing the results of successful fetches (and
69 * perhaps someday, certain kinds of network failures) to the Internet Archive
70 * ARC file format.
71 *
72 * Assumption is that there is only one of these ARCWriterProcessors per
73 * Heritrix instance.
74 *
75 * @author Parker Thompson
76 */
77 public class ARCWriterProcessor extends WriterPoolProcessor
78 implements CoreAttributeConstants, ARCConstants, CrawlStatusListener,
79 WriterPoolSettings, FetchStatusCodes {
80 private static final long serialVersionUID = 1957518408532644531L;
81
82 private final Logger logger = Logger.getLogger(this.getClass().getName());
83
84 /***
85 * Default path list.
86 */
87 private static final String [] DEFAULT_PATH = {"arcs"};
88
89 /***
90 * @param name Name of this writer.
91 */
92 public ARCWriterProcessor(String name) {
93 super(name, "ARCWriter processor");
94 }
95
96 protected String [] getDefaultPath() {
97 return DEFAULT_PATH;
98 }
99
100 protected void setupPool(final AtomicInteger serialNo) {
101 setPool(new ARCWriterPool(serialNo, this, getPoolMaximumActive(),
102 getPoolMaximumWait()));
103 }
104
105 /***
106 * Writes a CrawlURI and its associated data to store file.
107 *
108 * Currently this method understands the following uri types: dns, http,
109 * and https.
110 *
111 * @param curi CrawlURI to process.
112 */
113 protected void innerProcess(CrawlURI curi) {
114
115 if (curi.getFetchStatus() <= 0) {
116 return;
117 }
118
119
120 long recordLength = curi.getHttpRecorder().getRecordedInput().getSize();
121 if (recordLength <= 0) {
122
123
124 return;
125 }
126
127 ReplayInputStream ris = null;
128 try {
129 if(shouldWrite(curi)) {
130 ris = curi.getHttpRecorder().getRecordedInput()
131 .getReplayInputStream();
132 write(curi, recordLength, ris, getHostAddress(curi));
133 } else {
134 logger.info("does not write " + curi.toString());
135 }
136 } catch (IOException e) {
137 curi.addLocalizedError(this.getName(), e, "WriteRecord: " +
138 curi.toString());
139 logger.log(Level.SEVERE, "Failed write of Record: " +
140 curi.toString(), e);
141 } finally {
142 IOUtils.closeQuietly(ris);
143 }
144 }
145
146 protected void write(CrawlURI curi, long recordLength, InputStream in,
147 String ip)
148 throws IOException {
149 WriterPoolMember writer = getPool().borrowFile();
150 long position = writer.getPosition();
151
152
153
154 writer.checkSize();
155 if (writer.getPosition() != position) {
156
157
158
159 setTotalBytesWritten(getTotalBytesWritten() +
160 (writer.getPosition() - position));
161 position = writer.getPosition();
162 }
163
164 ARCWriter w = (ARCWriter)writer;
165 try {
166 if (in instanceof ReplayInputStream) {
167 w.write(curi.toString(), curi.getContentType(),
168 ip, curi.getLong(A_FETCH_BEGAN_TIME),
169 recordLength, (ReplayInputStream)in);
170 } else {
171 w.write(curi.toString(), curi.getContentType(),
172 ip, curi.getLong(A_FETCH_BEGAN_TIME),
173 recordLength, in);
174 }
175 } catch (IOException e) {
176
177 getPool().invalidateFile(writer);
178
179
180
181 writer = null;
182 throw e;
183 } finally {
184 if (writer != null) {
185 setTotalBytesWritten(getTotalBytesWritten() +
186 (writer.getPosition() - position));
187 getPool().returnFile(writer);
188 }
189 }
190 checkBytesWritten();
191 }
192
193 @Override
194 protected String getFirstrecordStylesheet() {
195 return "/arcMetaheaderBody.xsl";
196 }
197 }