View Javadoc

1   /* PersistProcessor.java
2    * 
3    * Created on Feb 17, 2005
4    *
5    * Copyright (C) 2007 Internet Archive.
6    * 
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    * 
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   * 
14   * Heritrix is distributed in the hope that it will be useful, 
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   * 
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.crawler.processor.recrawl;
24  
25  import java.io.BufferedReader;
26  import java.io.File;
27  import java.io.FileNotFoundException;
28  import java.io.IOException;
29  import java.io.UnsupportedEncodingException;
30  import java.util.Iterator;
31  import java.util.Map.Entry;
32  import java.util.logging.Level;
33  import java.util.logging.Logger;
34  
35  import org.apache.commons.codec.binary.Base64;
36  import org.apache.commons.io.IOUtils;
37  import org.archive.crawler.datamodel.CrawlURI;
38  import org.archive.crawler.framework.Processor;
39  import org.archive.crawler.io.CrawlerJournal;
40  import org.archive.util.IoUtils;
41  import org.archive.util.SURT;
42  import org.archive.util.bdbje.EnhancedEnvironment;
43  import org.archive.util.iterator.LineReadingIterator;
44  
45  import st.ata.util.AList;
46  
47  import com.sleepycat.bind.serial.SerialBinding;
48  import com.sleepycat.bind.serial.StoredClassCatalog;
49  import com.sleepycat.bind.tuple.StringBinding;
50  import com.sleepycat.collections.StoredIterator;
51  import com.sleepycat.collections.StoredSortedMap;
52  import com.sleepycat.je.Database;
53  import com.sleepycat.je.DatabaseConfig;
54  import com.sleepycat.je.DatabaseException;
55  import com.sleepycat.je.EnvironmentConfig;
56  
57  
58  
59  /***
60   * Superclass for Processors which utilize BDB-JE for URI state
61   * (including most notably history) persistence.
62   * 
63   * @author gojomo
64   */
65  public abstract class PersistProcessor extends Processor {
66      private static final Logger logger =
67          Logger.getLogger(PersistProcessor.class.getName());
68  
69      /*** name of history Database */
70      public static final String URI_HISTORY_DBNAME = "uri_history";
71      
72      /***
73       * @return DatabaseConfig for history Database
74       */
75      protected static DatabaseConfig historyDatabaseConfig() {
76          DatabaseConfig dbConfig = new DatabaseConfig();
77          dbConfig.setTransactional(false);
78          dbConfig.setAllowCreate(true);
79          dbConfig.setDeferredWrite(true);
80          return dbConfig;
81      }
82  
83      /***
84       * Usual constructor
85       * 
86       * @param name
87       * @param string
88       */
89      public PersistProcessor(String name, String string) {
90          super(name,string);
91      }
92  
93      /***
94       * Return a preferred String key for persisting the given CrawlURI's
95       * AList state. 
96       * 
97       * @param curi CrawlURI
98       * @return String key
99       */
100     public String persistKeyFor(CrawlURI curi) {
101         // use a case-sensitive SURT for uniqueness and sorting benefits
102         return SURT.fromURI(curi.getUURI().toString(),true);
103     }
104 
105     /***
106      * Whether the current CrawlURI's state should be persisted (to log or
107      * direct to database)
108      * 
109      * @param curi CrawlURI
110      * @return true if state should be stored; false to skip persistence
111      */
112     protected boolean shouldStore(CrawlURI curi) {
113         // TODO: don't store some codes, such as 304 unchanged?
114         return curi.isSuccess();
115     }
116 
117     /***
118      * Whether the current CrawlURI's state should be loaded
119      * 
120      * @param curi CrawlURI
121      * @return true if state should be loaded; false to skip loading
122      */
123     protected boolean shouldLoad(CrawlURI curi) {
124         // TODO: don't load some (prereqs?)
125         return true;
126     }
127 
128     /***
129      * Utility main for importing a log into a BDB-JE environment or moving a
130      * database between environments (2 arguments), or simply dumping a log
131      * to stdout in a more readable format (1 argument). 
132      * 
133      * @param args command-line arguments
134      * @throws DatabaseException
135      * @throws IOException
136      */
137     public static void main(String[] args) throws DatabaseException, IOException {
138         if(args.length==2) {
139             main2args(args);
140         } else if (args.length==1) {
141             main1arg(args);
142         } else {
143             System.out.println("Arguments: ");
144             System.out.println("    source [target]");
145             System.out.println(
146                 "...where source is either a txtser log file or BDB env dir");
147             System.out.println(
148                 "and target, if present, is a BDB env dir. ");
149             return;
150         }
151         
152     }
153 
154     /***
155      * Move the history information in the first argument (either the path 
156      * to a log or to an environment containing a uri_history database) to 
157      * the environment in the second environment (path; environment will 
158      * be created if it dow not already exist). 
159      * 
160      * @param args command-line arguments
161      * @throws DatabaseException
162      * @throws FileNotFoundException
163      * @throws UnsupportedEncodingException
164      * @throws IOException
165      */
166     private static void main2args(String[] args) throws DatabaseException, FileNotFoundException, UnsupportedEncodingException, IOException {
167         File source = new File(args[0]);
168         File env = new File(args[1]);
169         if(!env.exists()) {
170             env.mkdirs();
171         }
172         
173         // setup target environment
174         EnhancedEnvironment targetEnv = setupEnvironment(env);
175         StoredClassCatalog classCatalog = targetEnv.getClassCatalog();
176         Database historyDB = targetEnv.openDatabase(
177                 null,URI_HISTORY_DBNAME,historyDatabaseConfig());
178         StoredSortedMap historyMap = new StoredSortedMap(historyDB,
179                 new StringBinding(), new SerialBinding(classCatalog,
180                         AList.class), true);
181         
182         int count = 0;
183         
184         if(source.isFile()) {
185             // scan log, writing to database
186             BufferedReader br = CrawlerJournal.getBufferedReader(source);
187             Iterator iter = new LineReadingIterator(br);
188             while(iter.hasNext()) {
189                 String line = (String) iter.next(); 
190                 if(line.length()==0) {
191                     continue;
192                 }
193                 String[] splits = line.split(" ");
194                 if(splits.length!=2) {
195                     logger.severe("bad line: "+line);
196                     continue;
197                 }
198                 try {
199                     historyMap.put(
200                         splits[0], 
201                         IoUtils.deserializeFromByteArray(
202                             Base64.decodeBase64(splits[1].getBytes("UTF8"))));
203                 } catch (RuntimeException e) {
204                     logger.log(Level.SEVERE,"problem with line: "+line, e);
205                 }
206                 count++;
207             }
208             IOUtils.closeQuietly(br);
209         } else {
210             // open the source env history DB, copying entries to target env
211             EnhancedEnvironment sourceEnv = setupEnvironment(source);
212             StoredClassCatalog sourceClassCatalog = sourceEnv.getClassCatalog();
213             Database sourceHistoryDB = sourceEnv.openDatabase(
214                     null,URI_HISTORY_DBNAME,historyDatabaseConfig());
215             StoredSortedMap sourceHistoryMap = new StoredSortedMap(sourceHistoryDB,
216                     new StringBinding(), new SerialBinding(sourceClassCatalog,
217                             AList.class), true);
218             Iterator iter = sourceHistoryMap.entrySet().iterator();
219             while(iter.hasNext()) {
220                 Entry item = (Entry) iter.next(); 
221                 historyMap.put(item.getKey(), item.getValue());
222                 count++;
223             }
224             StoredIterator.close(iter);
225             sourceHistoryDB.close();
226             sourceEnv.close();
227         }
228         
229         // cleanup
230         historyDB.sync();
231         historyDB.close();
232         targetEnv.close();
233         System.out.println(count+" records imported from "+source+" to BDB env "+env);
234     }
235 
236     /***
237      * Dump the contents of the argument (path to a persist log) to stdout
238      * in a slightly more readable format. 
239      * 
240      * @param args command-line arguments
241      * @throws DatabaseException
242      * @throws FileNotFoundException
243      * @throws UnsupportedEncodingException
244      * @throws IOException
245      */
246     private static void main1arg(String[] args) throws DatabaseException, FileNotFoundException, UnsupportedEncodingException, IOException {
247         File source = new File(args[0]);
248         
249         int count = 0;
250         
251         if(source.isFile()) {
252             // scan log, writing to database
253             BufferedReader br = CrawlerJournal.getBufferedReader(source);
254             Iterator iter = new LineReadingIterator(br);
255             while(iter.hasNext()) {
256                 String line = (String) iter.next(); 
257                 if(line.length()==0) {
258                     continue;
259                 }
260                 String[] splits = line.split(" ");
261                 if(splits.length!=2) {
262                     logger.severe("bad line: "+line);
263                     continue;
264                 }
265                 try {
266                     AList alist = (AList)IoUtils.deserializeFromByteArray(
267                         Base64.decodeBase64(splits[1].getBytes("UTF8")));
268                     System.out.println(
269                         splits[0] + " " + alist.toPrettyString());
270                 } catch (RuntimeException e) {
271                     logger.log(Level.SEVERE,"problem with line: "+line, e);
272                 }
273                 count++;
274             }
275             IOUtils.closeQuietly(br);
276         } else {
277             // open the source env history DB, copying entries to target env
278             EnhancedEnvironment sourceEnv = setupEnvironment(source);
279             StoredClassCatalog sourceClassCatalog = sourceEnv.getClassCatalog();
280             Database sourceHistoryDB = sourceEnv.openDatabase(
281                     null,URI_HISTORY_DBNAME,historyDatabaseConfig());
282             StoredSortedMap sourceHistoryMap = new StoredSortedMap(sourceHistoryDB,
283                     new StringBinding(), new SerialBinding(sourceClassCatalog,
284                             AList.class), true);
285             Iterator iter = sourceHistoryMap.entrySet().iterator();
286             while(iter.hasNext()) {
287                 Entry item = (Entry) iter.next(); 
288                 AList alist = (AList)item.getValue();
289                 System.out.println(item.getKey() + " " + alist.toPrettyString());
290                 count++;
291             }
292             StoredIterator.close(iter);
293             sourceHistoryDB.close();
294             sourceEnv.close();
295         }
296         
297         System.out.println(count+" records dumped from "+source);
298     }
299     
300     private static EnhancedEnvironment setupEnvironment(File env) throws DatabaseException {
301         EnvironmentConfig envConfig = new EnvironmentConfig();
302         envConfig.setAllowCreate(true);
303         return new EnhancedEnvironment(env, envConfig);
304     }
305 }