1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler.framework;
26
27 import java.io.DataInputStream;
28 import java.io.DataOutputStream;
29 import java.io.File;
30 import java.io.FileInputStream;
31 import java.io.FileNotFoundException;
32 import java.io.FileOutputStream;
33 import java.io.IOException;
34 import java.io.ObjectInputStream;
35 import java.io.StringWriter;
36 import java.net.InetAddress;
37 import java.net.UnknownHostException;
38 import java.util.ArrayList;
39 import java.util.Arrays;
40 import java.util.Iterator;
41 import java.util.List;
42 import java.util.concurrent.atomic.AtomicInteger;
43 import java.util.logging.Logger;
44
45 import javax.management.AttributeNotFoundException;
46 import javax.management.MBeanException;
47 import javax.management.ReflectionException;
48 import javax.xml.transform.SourceLocator;
49 import javax.xml.transform.Templates;
50 import javax.xml.transform.Transformer;
51 import javax.xml.transform.TransformerConfigurationException;
52 import javax.xml.transform.TransformerException;
53 import javax.xml.transform.TransformerFactory;
54 import javax.xml.transform.stream.StreamResult;
55 import javax.xml.transform.stream.StreamSource;
56
57 import org.archive.crawler.Heritrix;
58 import org.archive.crawler.datamodel.CoreAttributeConstants;
59 import org.archive.crawler.datamodel.CrawlHost;
60 import org.archive.crawler.datamodel.CrawlOrder;
61 import org.archive.crawler.datamodel.CrawlURI;
62 import org.archive.crawler.datamodel.FetchStatusCodes;
63 import org.archive.crawler.deciderules.recrawl.IdenticalDigestDecideRule;
64 import org.archive.crawler.event.CrawlStatusListener;
65 import org.archive.crawler.settings.SimpleType;
66 import org.archive.crawler.settings.StringList;
67 import org.archive.crawler.settings.Type;
68 import org.archive.crawler.settings.XMLSettingsHandler;
69 import org.archive.io.ObjectPlusFilesInputStream;
70 import org.archive.io.WriterPool;
71 import org.archive.io.WriterPoolMember;
72
/**
 * Abstract implementation of a file pool processor.
 * Subclass to implement for a particular {@link WriterPoolMember} instance.
 * @author Parker Thompson
 * @author stack
 */
79 public abstract class WriterPoolProcessor extends Processor
80 implements CoreAttributeConstants, CrawlStatusListener, FetchStatusCodes {
81 private final Logger logger = Logger.getLogger(this.getClass().getName());
82
83 /***
84 * Key to use asking settings for file compression value.
85 */
86 public static final String ATTR_COMPRESS = "compress";
87
88 /***
89 * Default as to whether we do compression of files.
90 */
91 public static final boolean DEFAULT_COMPRESS = true;
92
93 /***
94 * Key to use asking settings for file prefix value.
95 */
96 public static final String ATTR_PREFIX = "prefix";
97
98 /***
99 * Key to use asking settings for arc path value.
100 */
101 public static final String ATTR_PATH ="path";
102
103 /***
104 * Key to use asking settings for file suffix value.
105 */
106 public static final String ATTR_SUFFIX = "suffix";
107
108 /***
109 * Key to use asking settings for file max size value.
110 */
111 public static final String ATTR_MAX_SIZE_BYTES = "max-size-bytes";
112
113 /***
114 * Key to get maximum pool size.
115 *
116 * This key is for maximum files active in the pool.
117 */
118 public static final String ATTR_POOL_MAX_ACTIVE = "pool-max-active";
119
120 /***
121 * Key to get maximum wait on pool object before we give up and
122 * throw IOException.
123 */
124 public static final String ATTR_POOL_MAX_WAIT = "pool-max-wait";
125
126 /***
127 * Key for the maximum bytes to write attribute.
128 */
129 public static final String ATTR_MAX_BYTES_WRITTEN =
130 "total-bytes-to-write";
131
132 /***
133 * Key for whether to skip writing records of content-digest repeats
134 */
135 public static final String ATTR_SKIP_IDENTICAL_DIGESTS =
136 "skip-identical-digests";
137
138 /***
139 * CrawlURI annotation indicating no record was written
140 */
141 protected static final String ANNOTATION_UNWRITTEN = "unwritten";
142
143 /***
144 * Default maximum file size.
145 * TODO: Check that subclasses can set a different MAX_FILE_SIZE and
146 * it will be used in the constructor as default.
147 */
148 private static final int DEFAULT_MAX_FILE_SIZE = 100000000;
149
150 /***
151 * Default path list.
152 *
153 * TODO: Confirm this one gets picked up.
154 */
155 private static final String [] DEFAULT_PATH = {"crawl-store"};
156
157 /***
158 * Reference to pool.
159 */
160 transient private WriterPool pool = null;
161
162 /***
163 * Total number of bytes written to disc.
164 */
165 private long totalBytesWritten = 0;
166
167 /***
168 * Calculate metadata once only.
169 */
170 transient private List<String> cachedMetadata = null;
171
172
/**
 * Creates a processor that carries the stock description
 * "Pool of files processor".
 *
 * @param name Name of this processor.
 */
public WriterPoolProcessor(String name) {
    this(name, "Pool of files processor");
}
179
180 /***
181 * @param name Name of this processor.
182 * @param description Description for this processor.
183 */
184 public WriterPoolProcessor(final String name,
185 final String description) {
186 super(name, description);
187 Type e = addElementToDefinition(
188 new SimpleType(ATTR_COMPRESS, "Compress files when " +
189 "writing to disk.", new Boolean(DEFAULT_COMPRESS)));
190 e.setOverrideable(false);
191 e = addElementToDefinition(
192 new SimpleType(ATTR_PREFIX,
193 "File prefix. " +
194 "The text supplied here will be used as a prefix naming " +
195 "writer files. For example if the prefix is 'IAH', " +
196 "then file names will look like " +
197 "IAH-20040808101010-0001-HOSTNAME.arc.gz " +
198 "...if writing ARCs (The prefix will be " +
199 "separated from the date by a hyphen).",
200 WriterPoolMember.DEFAULT_PREFIX));
201 e = addElementToDefinition(
202 new SimpleType(ATTR_SUFFIX, "Suffix to tag onto " +
203 "files. If value is '${HOSTNAME}', will use hostname for " +
204 "suffix. If empty, no suffix will be added.",
205 WriterPoolMember.DEFAULT_SUFFIX));
206 e.setOverrideable(false);
207 e = addElementToDefinition(
208 new SimpleType(ATTR_MAX_SIZE_BYTES, "Max size of each file",
209 new Long(DEFAULT_MAX_FILE_SIZE)));
210 e.setOverrideable(false);
211 e = addElementToDefinition(
212 new StringList(ATTR_PATH, "Where to files. " +
213 "Supply absolute or relative path. If relative, files " +
214 "will be written relative to " +
215 "the " + CrawlOrder.ATTR_DISK_PATH + "setting." +
216 " If more than one path specified, we'll round-robin" +
217 " dropping files to each. This setting is safe" +
218 " to change midcrawl (You can remove and add new dirs" +
219 " as the crawler progresses).", getDefaultPath()));
220 e.setOverrideable(false);
221 e = addElementToDefinition(new SimpleType(ATTR_POOL_MAX_ACTIVE,
222 "Maximum active files in pool. " +
223 "This setting cannot be varied over the life of a crawl.",
224 new Integer(WriterPool.DEFAULT_MAX_ACTIVE)));
225 e.setOverrideable(false);
226 e = addElementToDefinition(new SimpleType(ATTR_POOL_MAX_WAIT,
227 "Maximum time to wait on pool element" +
228 " (milliseconds). This setting cannot be varied over the life" +
229 " of a crawl.",
230 new Integer(WriterPool.DEFAULT_MAXIMUM_WAIT)));
231 e.setOverrideable(false);
232 e = addElementToDefinition(new SimpleType(ATTR_MAX_BYTES_WRITTEN,
233 "Total file bytes to write to disk." +
234 " Once the size of all files on disk has exceeded this " +
235 "limit, this processor will stop the crawler. " +
236 "A value of zero means no upper limit.", new Long(0)));
237 e.setOverrideable(false);
238 e.setExpertSetting(true);
239 e = addElementToDefinition(new SimpleType(ATTR_SKIP_IDENTICAL_DIGESTS,
240 "Whether to skip the writing of a record when URI " +
241 "history information is available and indicates the " +
242 "prior fetch had an identical content digest. " +
243 "Default is false.", new Boolean(false)));
244 e.setOverrideable(true);
245 e.setExpertSetting(true);
246 }
247
248 protected String [] getDefaultPath() {
249 return DEFAULT_PATH;
250 }
251
252 public synchronized void initialTasks() {
253
254 getSettingsHandler().getOrder().getController().
255 addCrawlStatusListener(this);
256 setupPool(new AtomicInteger());
257
258 if (getSettingsHandler().getOrder().getController().
259 isCheckpointRecover()) {
260 checkpointRecover();
261 }
262 }
263
264 protected AtomicInteger getSerialNo() {
265 return ((WriterPool)getPool()).getSerialNo();
266 }
267
268 /***
269 * Set up pool of files.
270 */
271 protected abstract void setupPool(final AtomicInteger serialNo);
272
273 /***
274 * Writes a CrawlURI and its associated data to store file.
275 *
276 * Currently this method understands the following uri types: dns, http,
277 * and https.
278 *
279 * @param curi CrawlURI to process.
280 */
281 protected abstract void innerProcess(CrawlURI curi);
282
283 protected void checkBytesWritten() {
284 long max = getMaxToWrite();
285 if (max <= 0) {
286 return;
287 }
288 if (max <= this.totalBytesWritten) {
289 getController().requestCrawlStop("Finished - Maximum bytes (" +
290 Long.toString(max) + ") written");
291 }
292 }
293
294 /***
295 * Whether the given CrawlURI should be written to archive files.
296 * Annotates CrawlURI with a reason for any negative answer.
297 *
298 * @param curi CrawlURI
299 * @return true if URI should be written; false otherwise
300 */
301 protected boolean shouldWrite(CrawlURI curi) {
302
303 if(((Boolean)getUncheckedAttribute(curi, ATTR_SKIP_IDENTICAL_DIGESTS))
304 && IdenticalDigestDecideRule.hasIdenticalDigest(curi)) {
305 curi.addAnnotation(ANNOTATION_UNWRITTEN + ":identicalDigest");
306 return false;
307 }
308 String scheme = curi.getUURI().getScheme().toLowerCase();
309
310 boolean retVal;
311 if (scheme.equals("dns")) {
312 retVal = curi.getFetchStatus() == S_DNS_SUCCESS;
313 } else if (scheme.equals("http") || scheme.equals("https")) {
314 retVal = curi.getFetchStatus() > 0 && curi.isHttpTransaction();
315 } else if (scheme.equals("ftp")) {
316 retVal = curi.getFetchStatus() == 200;
317 } else {
318
319 curi.addAnnotation(ANNOTATION_UNWRITTEN + ":scheme");
320 return false;
321 }
322 if (retVal == false) {
323
324 curi.addAnnotation(ANNOTATION_UNWRITTEN + ":status");
325 return false;
326 }
327 return true;
328 }
329
330 /***
331 * Return IP address of given URI suitable for recording (as in a
332 * classic ARC 5-field header line).
333 *
334 * @param curi CrawlURI
335 * @return String of IP address
336 */
337 protected String getHostAddress(CrawlURI curi) {
338
339 if(curi.getUURI().getScheme().toLowerCase().equals("dns")) {
340 return curi.getString(A_DNS_SERVER_IP_LABEL);
341 }
342
343 CrawlHost h = getController().getServerCache().getHostFor(curi);
344 if (h == null) {
345 throw new NullPointerException("Crawlhost is null for " +
346 curi + " " + curi.getVia());
347 }
348 InetAddress a = h.getIP();
349 if (a == null) {
350 throw new NullPointerException("Address is null for " +
351 curi + " " + curi.getVia() + ". Address " +
352 ((h.getIpFetched() == CrawlHost.IP_NEVER_LOOKED_UP)?
353 "was never looked up.":
354 (System.currentTimeMillis() - h.getIpFetched()) +
355 " ms ago."));
356 }
357 return h.getIP().getHostAddress();
358 }
359
360 /***
361 * Version of getAttributes that catches and logs exceptions
362 * and returns null if failure to fetch the attribute.
363 * @param name Attribute name.
364 * @return Attribute or null.
365 */
366 public Object getAttributeUnchecked(String name) {
367 Object result = null;
368 try {
369 result = super.getAttribute(name);
370 } catch (AttributeNotFoundException e) {
371 logger.warning(e.getLocalizedMessage());
372 } catch (MBeanException e) {
373 logger.warning(e.getLocalizedMessage());
374 } catch (ReflectionException e) {
375 logger.warning(e.getLocalizedMessage());
376 }
377 return result;
378 }
379
380 /***
381 * Max size we want files to be (bytes).
382 *
383 * Default is ARCConstants.DEFAULT_MAX_ARC_FILE_SIZE. Note that ARC
384 * files will usually be bigger than maxSize; they'll be maxSize + length
385 * to next boundary.
386 * @return ARC maximum size.
387 */
388 public long getMaxSize() {
389 Object obj = getAttributeUnchecked(ATTR_MAX_SIZE_BYTES);
390 return (obj == null)? DEFAULT_MAX_FILE_SIZE: ((Long)obj).longValue();
391 }
392
393 public String getPrefix() {
394 Object obj = getAttributeUnchecked(ATTR_PREFIX);
395 return (obj == null)? WriterPoolMember.DEFAULT_PREFIX: (String)obj;
396 }
397
398 public List<File> getOutputDirs() {
399 Object obj = getAttributeUnchecked(ATTR_PATH);
400 List list = (obj == null)? Arrays.asList(DEFAULT_PATH): (StringList)obj;
401 ArrayList<File> results = new ArrayList<File>();
402 for (Iterator i = list.iterator(); i.hasNext();) {
403 String path = (String)i.next();
404 File f = new File(path);
405 if (!f.isAbsolute()) {
406 f = new File(getController().getDisk(), path);
407 }
408 if (!f.exists()) {
409 try {
410 f.mkdirs();
411 } catch (Exception e) {
412 e.printStackTrace();
413 continue;
414 }
415 }
416 results.add(f);
417 }
418 return results;
419 }
420
421 public boolean isCompressed() {
422 Object obj = getAttributeUnchecked(ATTR_COMPRESS);
423 return (obj == null)? DEFAULT_COMPRESS:
424 ((Boolean)obj).booleanValue();
425 }
426
427 /***
428 * @return Returns the poolMaximumActive.
429 */
430 public int getPoolMaximumActive() {
431 Object obj = getAttributeUnchecked(ATTR_POOL_MAX_ACTIVE);
432 return (obj == null)? WriterPool.DEFAULT_MAX_ACTIVE:
433 ((Integer)obj).intValue();
434 }
435
436 /***
437 * @return Returns the poolMaximumWait.
438 */
439 public int getPoolMaximumWait() {
440 Object obj = getAttributeUnchecked(ATTR_POOL_MAX_WAIT);
441 return (obj == null)? WriterPool.DEFAULT_MAXIMUM_WAIT:
442 ((Integer)obj).intValue();
443 }
444
445 public String getSuffix() {
446 Object obj = getAttributeUnchecked(ATTR_SUFFIX);
447 String sfx = (obj == null)?
448 WriterPoolMember.DEFAULT_SUFFIX: (String)obj;
449 if (sfx != null && sfx.trim().
450 equals(WriterPoolMember.HOSTNAME_VARIABLE)) {
451 String str = "localhost.localdomain";
452 try {
453 str = InetAddress.getLocalHost().getHostName();
454 } catch (UnknownHostException ue) {
455 logger.severe("Failed getHostAddress for this host: " + ue);
456 }
457 sfx = str;
458 }
459 return sfx;
460 }
461
462 public long getMaxToWrite() {
463 Object obj = getAttributeUnchecked(ATTR_MAX_BYTES_WRITTEN);
464 return (obj == null)? 0: ((Long)obj).longValue();
465 }
466
467 public void crawlEnding(String sExitMessage) {
468 this.pool.close();
469 }
470
471 public void crawlEnded(String sExitMessage) {
472
473 }
474
475
476
477
478 public void crawlStarted(String message) {
479
480 }
481
482 protected String getCheckpointStateFile() {
483 return this.getClass().getName() + ".state";
484 }
485
486 public void crawlCheckpoint(File checkpointDir) throws IOException {
487 int serial = getSerialNo().get();
488 if (this.pool.getNumActive() > 0) {
489
490
491
492
493
494 serial = getSerialNo().incrementAndGet();
495 }
496 saveCheckpointSerialNumber(checkpointDir, serial);
497
498 try {
499 this.pool.close();
500 } finally {
501
502 setupPool(new AtomicInteger(serial));
503 }
504 }
505
506 public void crawlPausing(String statusMessage) {
507
508 }
509
510 public void crawlPaused(String statusMessage) {
511
512 }
513
514 public void crawlResuming(String statusMessage) {
515
516 }
517
518 private void readObject(ObjectInputStream stream)
519 throws IOException, ClassNotFoundException {
520 stream.defaultReadObject();
521 ObjectPlusFilesInputStream coistream =
522 (ObjectPlusFilesInputStream)stream;
523 coistream.registerFinishTask( new Runnable() {
524 public void run() {
525 setupPool(new AtomicInteger());
526 }
527 });
528 }
529
530 protected WriterPool getPool() {
531 return pool;
532 }
533
534 protected void setPool(WriterPool pool) {
535 this.pool = pool;
536 }
537
538 protected long getTotalBytesWritten() {
539 return totalBytesWritten;
540 }
541
542 protected void setTotalBytesWritten(long totalBytesWritten) {
543 this.totalBytesWritten = totalBytesWritten;
544 }
545
546 /***
547 * Called out of {@link #initialTasks()} when recovering a checkpoint.
548 * Restore state.
549 */
550 protected void checkpointRecover() {
551 int serialNo = loadCheckpointSerialNumber();
552 if (serialNo != -1) {
553 getSerialNo().set(serialNo);
554 }
555 }
556
557 /***
558 * @return Serial number from checkpoint state file or if unreadable, -1
559 * (Client should check for -1).
560 */
561 protected int loadCheckpointSerialNumber() {
562 int result = -1;
563
564
565
566 File stateFile = new File(getSettingsHandler().getOrder()
567 .getController().getCheckpointRecover().getDirectory(),
568 getCheckpointStateFile());
569 if (!stateFile.exists()) {
570 logger.info(stateFile.getAbsolutePath()
571 + " doesn't exist so cannot restore Writer serial number.");
572 } else {
573 DataInputStream dis = null;
574 try {
575 dis = new DataInputStream(new FileInputStream(stateFile));
576 result = dis.readShort();
577 } catch (FileNotFoundException e) {
578 e.printStackTrace();
579 } catch (IOException e) {
580 e.printStackTrace();
581 } finally {
582 try {
583 if (dis != null) {
584 dis.close();
585 }
586 } catch (IOException e) {
587 e.printStackTrace();
588 }
589 }
590 }
591 return result;
592 }
593
594 protected void saveCheckpointSerialNumber(final File checkpointDir,
595 final int serialNo)
596 throws IOException {
597
598 File f = new File(checkpointDir, getCheckpointStateFile());
599 DataOutputStream dos = new DataOutputStream(new FileOutputStream(f));
600 try {
601 dos.writeShort(serialNo);
602 } finally {
603 dos.close();
604 }
605 }
606
607 /***
608 * Return list of metadatas to add to first arc file metadata record.
609 *
610 * Default is to stylesheet the order file. To specify stylesheet,
611 * override {@link #getFirstrecordStylesheet()}.
612 *
613 * Get xml files from settingshandler. Currently order file is the
614 * only xml file. We're NOT adding seeds to meta data.
615 *
616 * @return List of strings and/or files to add to arc file as metadata or
617 * null.
618 */
619 public synchronized List<String> getMetadata() {
620 if (this.cachedMetadata != null) {
621 return this.cachedMetadata;
622 }
623 return cacheMetadata();
624 }
625
626 protected synchronized List<String> cacheMetadata() {
627
628
629 if (getFirstrecordStylesheet() == null ||
630 getFirstrecordStylesheet().length() == 0) {
631 this.cachedMetadata = new ArrayList<String>(1);
632 this.cachedMetadata.add("");
633 return this.cachedMetadata;
634 }
635
636 List<String> result = null;
637 if (!XMLSettingsHandler.class.isInstance(getSettingsHandler())) {
638 logger.warning("Expected xml settings handler (No warcinfo).");
639
640 return result;
641 }
642
643 XMLSettingsHandler xsh = (XMLSettingsHandler)getSettingsHandler();
644 File orderFile = xsh.getOrderFile();
645 if (!orderFile.exists() || !orderFile.canRead()) {
646 logger.severe("File " + orderFile.getAbsolutePath() +
647 " is does not exist or is not readable.");
648 } else {
649 result = new ArrayList<String>(1);
650 result.add(getFirstrecordBody(orderFile));
651 }
652 this.cachedMetadata = result;
653 return this.cachedMetadata;
654 }
655
656 /***
657 * @preturn Full path to stylesheet (Its read off the CLASSPATH
658 * as resource).
659 */
660 protected String getFirstrecordStylesheet() {
661 return null;
662 }
663
664 /***
665 * Write the arc metadata body content.
666 *
667 * Its based on the order xml file but into this base we'll add other info
668 * such as machine ip.
669 *
670 * @param orderFile Order file.
671
672 *
673 * @return String that holds the arc metaheader body.
674 */
675 protected String getFirstrecordBody(File orderFile) {
676 String result = null;
677 TransformerFactory factory = TransformerFactory.newInstance();
678 Templates templates = null;
679 Transformer xformer = null;
680 try {
681 templates = factory.newTemplates(new StreamSource(
682 this.getClass().getResourceAsStream(getFirstrecordStylesheet())));
683 xformer = templates.newTransformer();
684
685 xformer.setParameter("software", "Heritrix " +
686 Heritrix.getVersion() + " http://crawler.archive.org");
687 xformer.setParameter("ip",
688 InetAddress.getLocalHost().getHostAddress());
689 xformer.setParameter("hostname",
690 InetAddress.getLocalHost().getHostName());
691 StreamSource source = new StreamSource(
692 new FileInputStream(orderFile));
693 StringWriter writer = new StringWriter();
694 StreamResult target = new StreamResult(writer);
695 xformer.transform(source, target);
696 result= writer.toString();
697 } catch (TransformerConfigurationException e) {
698 logger.severe("Failed transform " + e);
699 } catch (FileNotFoundException e) {
700 logger.severe("Failed transform, file not found " + e);
701 } catch (UnknownHostException e) {
702 logger.severe("Failed transform, unknown host " + e);
703 } catch(TransformerException e) {
704 SourceLocator locator = e.getLocator();
705 int col = locator.getColumnNumber();
706 int line = locator.getLineNumber();
707 String publicId = locator.getPublicId();
708 String systemId = locator.getSystemId();
709 logger.severe("Transform error " + e + ", col " + col + ", line " +
710 line + ", publicId " + publicId + ", systemId " + systemId);
711 }
712
713 return result;
714 }
715 }