1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.crawler.processor.recrawl;
24
25 import java.io.BufferedReader;
26 import java.io.File;
27 import java.io.FileNotFoundException;
28 import java.io.IOException;
29 import java.io.UnsupportedEncodingException;
30 import java.util.Iterator;
31 import java.util.Map.Entry;
32 import java.util.logging.Level;
33 import java.util.logging.Logger;
34
35 import org.apache.commons.codec.binary.Base64;
36 import org.apache.commons.io.IOUtils;
37 import org.archive.crawler.datamodel.CrawlURI;
38 import org.archive.crawler.framework.Processor;
39 import org.archive.crawler.io.CrawlerJournal;
40 import org.archive.util.IoUtils;
41 import org.archive.util.SURT;
42 import org.archive.util.bdbje.EnhancedEnvironment;
43 import org.archive.util.iterator.LineReadingIterator;
44
45 import st.ata.util.AList;
46
47 import com.sleepycat.bind.serial.SerialBinding;
48 import com.sleepycat.bind.serial.StoredClassCatalog;
49 import com.sleepycat.bind.tuple.StringBinding;
50 import com.sleepycat.collections.StoredIterator;
51 import com.sleepycat.collections.StoredSortedMap;
52 import com.sleepycat.je.Database;
53 import com.sleepycat.je.DatabaseConfig;
54 import com.sleepycat.je.DatabaseException;
55 import com.sleepycat.je.EnvironmentConfig;
56
57
58
59 /***
60 * Superclass for Processors which utilize BDB-JE for URI state
61 * (including most notably history) persistence.
62 *
63 * @author gojomo
64 */
65 public abstract class PersistProcessor extends Processor {
66 private static final Logger logger =
67 Logger.getLogger(PersistProcessor.class.getName());
68
69 /*** name of history Database */
70 public static final String URI_HISTORY_DBNAME = "uri_history";
71
72 /***
73 * @return DatabaseConfig for history Database
74 */
75 protected static DatabaseConfig historyDatabaseConfig() {
76 DatabaseConfig dbConfig = new DatabaseConfig();
77 dbConfig.setTransactional(false);
78 dbConfig.setAllowCreate(true);
79 dbConfig.setDeferredWrite(true);
80 return dbConfig;
81 }
82
83 /***
84 * Usual constructor
85 *
86 * @param name
87 * @param string
88 */
89 public PersistProcessor(String name, String string) {
90 super(name,string);
91 }
92
93 /***
94 * Return a preferred String key for persisting the given CrawlURI's
95 * AList state.
96 *
97 * @param curi CrawlURI
98 * @return String key
99 */
100 public String persistKeyFor(CrawlURI curi) {
101
102 return SURT.fromURI(curi.getUURI().toString(),true);
103 }
104
105 /***
106 * Whether the current CrawlURI's state should be persisted (to log or
107 * direct to database)
108 *
109 * @param curi CrawlURI
110 * @return true if state should be stored; false to skip persistence
111 */
112 protected boolean shouldStore(CrawlURI curi) {
113
114 return curi.isSuccess();
115 }
116
117 /***
118 * Whether the current CrawlURI's state should be loaded
119 *
120 * @param curi CrawlURI
121 * @return true if state should be loaded; false to skip loading
122 */
123 protected boolean shouldLoad(CrawlURI curi) {
124
125 return true;
126 }
127
128 /***
129 * Utility main for importing a log into a BDB-JE environment or moving a
130 * database between environments (2 arguments), or simply dumping a log
131 * to stdout in a more readable format (1 argument).
132 *
133 * @param args command-line arguments
134 * @throws DatabaseException
135 * @throws IOException
136 */
137 public static void main(String[] args) throws DatabaseException, IOException {
138 if(args.length==2) {
139 main2args(args);
140 } else if (args.length==1) {
141 main1arg(args);
142 } else {
143 System.out.println("Arguments: ");
144 System.out.println(" source [target]");
145 System.out.println(
146 "...where source is either a txtser log file or BDB env dir");
147 System.out.println(
148 "and target, if present, is a BDB env dir. ");
149 return;
150 }
151
152 }
153
154 /***
155 * Move the history information in the first argument (either the path
156 * to a log or to an environment containing a uri_history database) to
157 * the environment in the second environment (path; environment will
158 * be created if it dow not already exist).
159 *
160 * @param args command-line arguments
161 * @throws DatabaseException
162 * @throws FileNotFoundException
163 * @throws UnsupportedEncodingException
164 * @throws IOException
165 */
166 private static void main2args(String[] args) throws DatabaseException, FileNotFoundException, UnsupportedEncodingException, IOException {
167 File source = new File(args[0]);
168 File env = new File(args[1]);
169 if(!env.exists()) {
170 env.mkdirs();
171 }
172
173
174 EnhancedEnvironment targetEnv = setupEnvironment(env);
175 StoredClassCatalog classCatalog = targetEnv.getClassCatalog();
176 Database historyDB = targetEnv.openDatabase(
177 null,URI_HISTORY_DBNAME,historyDatabaseConfig());
178 StoredSortedMap historyMap = new StoredSortedMap(historyDB,
179 new StringBinding(), new SerialBinding(classCatalog,
180 AList.class), true);
181
182 int count = 0;
183
184 if(source.isFile()) {
185
186 BufferedReader br = CrawlerJournal.getBufferedReader(source);
187 Iterator iter = new LineReadingIterator(br);
188 while(iter.hasNext()) {
189 String line = (String) iter.next();
190 if(line.length()==0) {
191 continue;
192 }
193 String[] splits = line.split(" ");
194 if(splits.length!=2) {
195 logger.severe("bad line: "+line);
196 continue;
197 }
198 try {
199 historyMap.put(
200 splits[0],
201 IoUtils.deserializeFromByteArray(
202 Base64.decodeBase64(splits[1].getBytes("UTF8"))));
203 } catch (RuntimeException e) {
204 logger.log(Level.SEVERE,"problem with line: "+line, e);
205 }
206 count++;
207 }
208 IOUtils.closeQuietly(br);
209 } else {
210
211 EnhancedEnvironment sourceEnv = setupEnvironment(source);
212 StoredClassCatalog sourceClassCatalog = sourceEnv.getClassCatalog();
213 Database sourceHistoryDB = sourceEnv.openDatabase(
214 null,URI_HISTORY_DBNAME,historyDatabaseConfig());
215 StoredSortedMap sourceHistoryMap = new StoredSortedMap(sourceHistoryDB,
216 new StringBinding(), new SerialBinding(sourceClassCatalog,
217 AList.class), true);
218 Iterator iter = sourceHistoryMap.entrySet().iterator();
219 while(iter.hasNext()) {
220 Entry item = (Entry) iter.next();
221 historyMap.put(item.getKey(), item.getValue());
222 count++;
223 }
224 StoredIterator.close(iter);
225 sourceHistoryDB.close();
226 sourceEnv.close();
227 }
228
229
230 historyDB.sync();
231 historyDB.close();
232 targetEnv.close();
233 System.out.println(count+" records imported from "+source+" to BDB env "+env);
234 }
235
236 /***
237 * Dump the contents of the argument (path to a persist log) to stdout
238 * in a slightly more readable format.
239 *
240 * @param args command-line arguments
241 * @throws DatabaseException
242 * @throws FileNotFoundException
243 * @throws UnsupportedEncodingException
244 * @throws IOException
245 */
246 private static void main1arg(String[] args) throws DatabaseException, FileNotFoundException, UnsupportedEncodingException, IOException {
247 File source = new File(args[0]);
248
249 int count = 0;
250
251 if(source.isFile()) {
252
253 BufferedReader br = CrawlerJournal.getBufferedReader(source);
254 Iterator iter = new LineReadingIterator(br);
255 while(iter.hasNext()) {
256 String line = (String) iter.next();
257 if(line.length()==0) {
258 continue;
259 }
260 String[] splits = line.split(" ");
261 if(splits.length!=2) {
262 logger.severe("bad line: "+line);
263 continue;
264 }
265 try {
266 AList alist = (AList)IoUtils.deserializeFromByteArray(
267 Base64.decodeBase64(splits[1].getBytes("UTF8")));
268 System.out.println(
269 splits[0] + " " + alist.toPrettyString());
270 } catch (RuntimeException e) {
271 logger.log(Level.SEVERE,"problem with line: "+line, e);
272 }
273 count++;
274 }
275 IOUtils.closeQuietly(br);
276 } else {
277
278 EnhancedEnvironment sourceEnv = setupEnvironment(source);
279 StoredClassCatalog sourceClassCatalog = sourceEnv.getClassCatalog();
280 Database sourceHistoryDB = sourceEnv.openDatabase(
281 null,URI_HISTORY_DBNAME,historyDatabaseConfig());
282 StoredSortedMap sourceHistoryMap = new StoredSortedMap(sourceHistoryDB,
283 new StringBinding(), new SerialBinding(sourceClassCatalog,
284 AList.class), true);
285 Iterator iter = sourceHistoryMap.entrySet().iterator();
286 while(iter.hasNext()) {
287 Entry item = (Entry) iter.next();
288 AList alist = (AList)item.getValue();
289 System.out.println(item.getKey() + " " + alist.toPrettyString());
290 count++;
291 }
292 StoredIterator.close(iter);
293 sourceHistoryDB.close();
294 sourceEnv.close();
295 }
296
297 System.out.println(count+" records dumped from "+source);
298 }
299
300 private static EnhancedEnvironment setupEnvironment(File env) throws DatabaseException {
301 EnvironmentConfig envConfig = new EnvironmentConfig();
302 envConfig.setAllowCreate(true);
303 return new EnhancedEnvironment(env, envConfig);
304 }
305 }