1   /* PersistLogProcessor.java
2    * 
3    * Created on Feb 18, 2005
4    *
5    * Copyright (C) 2007 Internet Archive.
6    * 
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    * 
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   * 
14   * Heritrix is distributed in the hope that it will be useful, 
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   * 
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.crawler.processor.recrawl;
24  
25  import java.io.File;
26  import java.io.FileNotFoundException;
27  import java.io.IOException;
28  import java.io.PrintStream;
29  import java.util.concurrent.atomic.AtomicInteger;
30  
31  import org.apache.commons.codec.binary.Base64;
32  import org.archive.crawler.datamodel.CrawlURI;
33  import org.archive.crawler.event.CrawlStatusListener;
34  import org.archive.crawler.io.CrawlerJournal;
35  import org.archive.crawler.settings.SimpleType;
36  import org.archive.util.FileUtils;
37  import org.archive.util.IoUtils;
38  
39  
40  
41  /***
42   * Log CrawlURI attributes from latest fetch for consultation by a later 
43   * recrawl. Log must be imported into alternate data structure in order
44   * to be consulted. 
45   * 
46   * @author gojomo
47   * @version $Date: 2006-09-25 20:19:54 +0000 (Mon, 25 Sep 2006) $, $Revision: 4654 $
48   */
49  public class PersistLogProcessor extends PersistProcessor implements CrawlStatusListener {
50      private static final long serialVersionUID = 1678691994065439346L;
51      
52      protected CrawlerJournal log;
53  
54      /*** setting for log filename */
55      public static final String ATTR_LOG_FILENAME = "log-filename";
56      /*** default log filename */ 
57      public static final String DEFAULT_LOG_FILENAME = "persistlog.txtser.gz";
58      
59      /***
60       * Usual constructor
61       * 
62       * @param name
63       */
64      public PersistLogProcessor(String name) {
65          super(name, "PersistLogProcessor. Logs CrawlURI attributes " +
66                  "from latest fetch for consultation by a later recrawl.");
67          
68          addElementToDefinition(new SimpleType(ATTR_LOG_FILENAME,
69                  "Filename to which to log URI persistence information. " +
70                  "Interpreted relative to job logs directory. " +
71                  "Default is 'persistlog.txtser.gz'. ", 
72                  DEFAULT_LOG_FILENAME));
73      }
74  
75      @Override
76      protected void initialTasks() {
77          // Add this class to crawl state listeners to note checkpoints
78          getController().addCrawlStatusListener(this);
79          try {
80              File logFile = FileUtils.maybeRelative(getController().getLogsDir(),
81                      (String) getUncheckedAttribute(null, ATTR_LOG_FILENAME));
82              log = new CrawlerJournal(logFile);
83          } catch (IOException e) {
84              // TODO Auto-generated catch block
85              throw new RuntimeException(e);
86          }
87      }
88      
89      @Override
90      protected void finalTasks() {
91          log.close();
92      }
93  
94      @Override
95      protected void innerProcess(CrawlURI curi) {
96          if(shouldStore(curi)) {
97              log.writeLine(persistKeyFor(curi), " ", new String(Base64.encodeBase64(IoUtils
98                      .serializeToByteArray(curi.getPersistentAList()))));      
99          }
100     }
101 
102     public void crawlCheckpoint(File checkpointDir) throws Exception {
103         // rotate log
104         log.checkpoint(checkpointDir);
105     }
106 
107     public void crawlEnded(String sExitMessage) {
108         // ignored
109         
110     }
111 
112     public void crawlEnding(String sExitMessage) {
113         // ignored
114         
115     }
116 
117     public void crawlPaused(String statusMessage) {
118         // ignored
119         
120     }
121 
122     public void crawlPausing(String statusMessage) {
123         // ignored
124         
125     }
126 
127     public void crawlResuming(String statusMessage) {
128         // ignored
129         
130     }
131 
132     public void crawlStarted(String message) {
133         // ignored
134     }
135 }