View Javadoc

1   /* BdbFrontier
2    * 
3    * $Id: BdbFrontier.java 5440 2007-08-28 05:19:52Z gojomo $
4   * 
5    * Created on Sep 24, 2004
6    *
7    *  Copyright (C) 2004 Internet Archive.
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   *
25    */
26  package org.archive.crawler.frontier;
27  
28  import java.io.File;
29  import java.io.FileNotFoundException;
30  import java.io.IOException;
31  import java.io.Serializable;
32  import java.util.ArrayList;
33  import java.util.Collections;
34  import java.util.Iterator;
35  import java.util.List;
36  import java.util.Queue;
37  import java.util.TreeSet;
38  import java.util.concurrent.LinkedBlockingQueue;
39  import java.util.logging.Level;
40  import java.util.logging.Logger;
41  
42  import javax.management.AttributeNotFoundException;
43  
44  import org.apache.commons.collections.Closure;
45  import org.archive.crawler.datamodel.CrawlURI;
46  import org.archive.crawler.datamodel.UriUniqFilter;
47  import org.archive.crawler.framework.CrawlController;
48  import org.archive.crawler.framework.FrontierMarker;
49  import org.archive.crawler.framework.exceptions.FatalConfigurationException;
50  import org.archive.crawler.settings.SimpleType;
51  import org.archive.crawler.settings.Type;
52  import org.archive.crawler.util.BdbUriUniqFilter;
53  import org.archive.crawler.util.BloomUriUniqFilter;
54  import org.archive.crawler.util.CheckpointUtils;
55  import org.archive.crawler.util.DiskFPMergeUriUniqFilter;
56  import org.archive.crawler.util.MemFPMergeUriUniqFilter;
57  import org.archive.queue.StoredQueue;
58  import org.archive.util.ArchiveUtils;
59  
60  import com.sleepycat.je.Database;
61  import com.sleepycat.je.DatabaseException;
62  
63  /***
64   * A Frontier using several BerkeleyDB JE Databases to hold its record of
65   * known hosts (queues), and pending URIs. 
66   *
67   * @author Gordon Mohr
68   */
69  public class BdbFrontier extends WorkQueueFrontier implements Serializable {
70      // be robust against trivial implementation changes
71      private static final long serialVersionUID = ArchiveUtils
72          .classnameBasedUID(BdbFrontier.class, 1);
73  
74      private static final Logger logger =
75          Logger.getLogger(BdbFrontier.class.getName());
76  
77      /*** all URIs scheduled to be crawled */
78      protected transient BdbMultipleWorkQueues pendingUris;
79  
80      /*** all URI-already-included options available to be chosen */
81      private String[] AVAILABLE_INCLUDED_OPTIONS = new String[] {
82              BdbUriUniqFilter.class.getName(),
83              BloomUriUniqFilter.class.getName(),
84              MemFPMergeUriUniqFilter.class.getName(),
85              DiskFPMergeUriUniqFilter.class.getName()};
86      
87      /*** URI-already-included to use (by class name) */
88      public final static String ATTR_INCLUDED = "uri-included-structure";
89      
90      private final static String DEFAULT_INCLUDED =
91          BdbUriUniqFilter.class.getName();
92      
93      /*** URI-already-included to use (by class name) */
94      public final static String ATTR_DUMP_PENDING_AT_CLOSE = 
95          "dump-pending-at-close";
96      private final static Boolean DEFAULT_DUMP_PENDING_AT_CLOSE = 
97          Boolean.FALSE;
98  
99      
100     /***
101      * Constructor.
102      * @param name Name for of this Frontier.
103      */
104     public BdbFrontier(String name) {
105         this(name, "BdbFrontier. "
106             + "A Frontier using BerkeleyDB Java Edition databases for "
107             + "persistence to disk.");
108         Type t = addElementToDefinition(new SimpleType(ATTR_INCLUDED,
109                 "Structure to use for tracking already-seen URIs. Non-default " +
110                 "options may require additional configuration via system " +
111                 "properties.", DEFAULT_INCLUDED, AVAILABLE_INCLUDED_OPTIONS));
112         t.setExpertSetting(true);
113         t = addElementToDefinition(new SimpleType(ATTR_DUMP_PENDING_AT_CLOSE,
114                 "Whether to dump all URIs waiting in queues to crawl.log " +
115                 "when a crawl ends. May add a significant delay to " +
116                 "crawl termination. Dumped lines will have a zero (0) " +
117                 "status.", DEFAULT_DUMP_PENDING_AT_CLOSE));
118         t.setExpertSetting(true);
119     }
120 
121     /***
122      * Create the BdbFrontier
123      * 
124      * @param name
125      * @param description
126      */
127     public BdbFrontier(String name, String description) {
128         super(name, description);
129     }
130     
131     /***
132      * Create the single object (within which is one BDB database)
133      * inside which all the other queues live. 
134      * 
135      * @return the created BdbMultipleWorkQueues
136      * @throws DatabaseException
137      */
138     private BdbMultipleWorkQueues createMultipleWorkQueues()
139     throws DatabaseException {
140         return new BdbMultipleWorkQueues(this.controller.getBdbEnvironment(),
141             this.controller.getBdbEnvironment().getClassCatalog(),
142             this.controller.isCheckpointRecover());
143     }
144 
145     
146     @Override
147     protected void initQueuesOfQueues() {
148         if(this.controller.isCheckpointRecover()) {
149             // do not setup here; take/init from deserialized frontier
150             return; 
151         }
152         // small risk of OutOfMemoryError: if 'hold-queues' is false,
153         // readyClassQueues may grow in size without bound
154         readyClassQueues = new LinkedBlockingQueue<String>();
155 
156         try {
157             Database inactiveQueuesDb = this.controller.getBdbEnvironment()
158                     .openDatabase(null, "inactiveQueues",
159                             StoredQueue.databaseConfig());
160             inactiveQueues = new StoredQueue<String>(inactiveQueuesDb,
161                     String.class, null);
162             Database retiredQueuesDb = this.controller.getBdbEnvironment()
163                     .openDatabase(null, "retiredQueues",
164                             StoredQueue.databaseConfig());
165             retiredQueues = new StoredQueue<String>(retiredQueuesDb,
166                     String.class, null);
167         } catch (DatabaseException e) {
168             throw new RuntimeException(e);
169         }
170         
171         // small risk of OutOfMemoryError: in large crawls with many 
172         // unresponsive queues, an unbounded number of snoozed queues 
173         // may exist
174         snoozedClassQueues = Collections.synchronizedSortedSet(new TreeSet<WorkQueue>());
175     }
176 
177     protected Queue<String> reinit(Queue<String> q, String name) {
178         try {
179             // restore the innner Database/StoredSortedMap of the queue
180             Database db = this.controller.getBdbEnvironment()
181                 .openDatabase(null, name, StoredQueue.databaseConfig());
182             
183             StoredQueue<String> queue;
184             if(q instanceof StoredQueue) {
185                 queue = (StoredQueue<String>) q;
186                 queue.hookupDatabase(db, String.class, null);
187             } else {
188                 // recovery of older checkpoint; copy to StoredQueue
189                 queue = new StoredQueue<String>(db,String.class,
190                         this.controller.getBdbEnvironment().getClassCatalog()); 
191                 queue.addAll(q);
192             }
193             return queue;
194         } catch (DatabaseException e) {
195             throw new RuntimeException(e);
196         }
197     }
198     
199     /***
200      * Create a UriUniqFilter that will serve as record 
201      * of already seen URIs.
202      *
203      * @return A UURISet that will serve as a record of already seen URIs
204      * @throws IOException
205      */
206     protected UriUniqFilter createAlreadyIncluded() throws IOException {
207         UriUniqFilter uuf;
208         String c = null;
209         try {
210             c = (String)getAttribute(null, ATTR_INCLUDED);
211         } catch (AttributeNotFoundException e) {
212             // Do default action if attribute not in order.
213         }
214         // TODO: avoid all this special-casing; enable some common
215         // constructor interface usable for all alt implemenations
216         if (c != null && c.equals(BloomUriUniqFilter.class.getName())) {
217             uuf = this.controller.isCheckpointRecover()?
218                     deserializeAlreadySeen(BloomUriUniqFilter.class,
219                         this.controller.getCheckpointRecover().getDirectory()):
220                     new BloomUriUniqFilter();
221         } else if (c!=null && c.equals(MemFPMergeUriUniqFilter.class.getName())) {
222             // TODO: add checkpointing for MemFPMergeUriUniqFilter
223             uuf = new MemFPMergeUriUniqFilter();
224         } else if (c!=null && c.equals(DiskFPMergeUriUniqFilter.class.getName())) {
225             // TODO: add checkpointing for DiskFPMergeUriUniqFilter
226             uuf = new DiskFPMergeUriUniqFilter(controller.getScratchDisk());
227         } else {
228             // Assume its BdbUriUniqFilter.
229             uuf = this.controller.isCheckpointRecover()?
230                 deserializeAlreadySeen(BdbUriUniqFilter.class,
231                     this.controller.getCheckpointRecover().getDirectory()):
232                 new BdbUriUniqFilter(this.controller.getBdbEnvironment());
233             if (this.controller.isCheckpointRecover()) {
234                 // If recover, need to call reopen of the db.
235                 try {
236                     ((BdbUriUniqFilter)uuf).
237                         reopen(this.controller.getBdbEnvironment());
238                 } catch (DatabaseException e) {
239                     throw new IOException(e.getMessage());
240                 }
241             }   
242         }
243         uuf.setDestination(this);
244         return uuf;
245     }
246     
247     protected UriUniqFilter deserializeAlreadySeen(
248             final Class<? extends UriUniqFilter> cls,
249             final File dir)
250     throws FileNotFoundException, IOException {
251         UriUniqFilter uuf = null;
252         try {
253             logger.fine("Started deserializing " + cls.getName() +
254                 " of checkpoint recover.");
255             uuf = CheckpointUtils.readObjectFromFile(cls, dir);
256             logger.fine("Finished deserializing bdbje as part " +
257                 "of checkpoint recover.");
258         } catch (ClassNotFoundException e) {
259             throw new IOException("Failed to deserialize "  +
260                 cls.getName() + ": " + e.getMessage());
261         }
262         return uuf;
263     }
264 
265     /***
266      * Return the work queue for the given CrawlURI's classKey. URIs
267      * are ordered and politeness-delayed within their 'class'.
268      * 
269      * @param curi CrawlURI to base queue on
270      * @return the found or created BdbWorkQueue
271      */
272     protected WorkQueue getQueueFor(CrawlURI curi) {
273         WorkQueue wq;
274         String classKey = curi.getClassKey();
275         synchronized (allQueues) {
276             wq = (WorkQueue)allQueues.get(classKey);
277             if (wq == null) {
278                 wq = new BdbWorkQueue(classKey, this);
279                 wq.setTotalBudget(((Long)getUncheckedAttribute(
280                     curi,ATTR_QUEUE_TOTAL_BUDGET)).longValue());
281                 allQueues.put(classKey, wq);
282             }
283         }
284         return wq;
285     }
286     
287     /***
288      * Return the work queue for the given classKey, or null
289      * if no such queue exists.
290      * 
291      * @param classKey key to look for
292      * @return the found WorkQueue
293      */
294     protected WorkQueue getQueueFor(String classKey) {
295         WorkQueue wq; 
296         synchronized (allQueues) {
297             wq = (WorkQueue)allQueues.get(classKey);
298         }
299         return wq;
300     }
301 
302     public FrontierMarker getInitialMarker(String regexpr,
303             boolean inCacheOnly) {
304         return pendingUris.getInitialMarker(regexpr);
305     }
306 
307     /***
308      * Return list of urls.
309      * @param marker
310      * @param numberOfMatches
311      * @param verbose 
312      * @return List of URIs (strings).
313      */
314     public ArrayList<String> getURIsList(FrontierMarker marker, 
315             int numberOfMatches, final boolean verbose) {
316         List curis;
317         try {
318             curis = pendingUris.getFrom(marker, numberOfMatches);
319         } catch (DatabaseException e) {
320             e.printStackTrace();
321             throw new RuntimeException(e);
322         }
323         ArrayList<String> results = new ArrayList<String>(curis.size());
324         Iterator iter = curis.iterator();
325         while(iter.hasNext()) {
326             CrawlURI curi = (CrawlURI) iter.next();
327             results.add("["+curi.getClassKey()+"] "+curi.singleLineReport());
328         }
329         return results;
330     }
331     
332     protected void initQueue() throws IOException {
333         try {
334             this.pendingUris = createMultipleWorkQueues();
335         } catch(DatabaseException e) {
336             throw (IOException)new IOException(e.getMessage()).initCause(e);
337         }
338     }
339     
340     protected void closeQueue() {
341         if((Boolean)getUncheckedAttribute(null,ATTR_DUMP_PENDING_AT_CLOSE)) {
342             try {
343                 dumpAllPendingToLog();
344             } catch (DatabaseException e) {
345                 logger.log(Level.WARNING,"dump pending problem",e);
346             }
347         }
348         if (this.pendingUris != null) {
349             this.pendingUris.close();
350             this.pendingUris = null;
351         }
352     }
353         
354     protected BdbMultipleWorkQueues getWorkQueues() {
355         return pendingUris;
356     }
357 
358     protected boolean workQueueDataOnDisk() {
359         return true;
360     }
361     
362     public void initialize(CrawlController c)
363     throws FatalConfigurationException, IOException {
364         this.controller = c;
365         // fill in anything from a checkpoint recovery first (because
366         // usual initialization will skip initQueueOfQueues in checkpoint)
367         if (c.isCheckpointRecover()) {
368             // If a checkpoint recover, copy old values from serialized
369             // instance into this Frontier instance. Do it this way because 
370             // though its possible to serialize BdbFrontier, its currently not
371             // possible to set/remove frontier attribute plugging the
372             // deserialized object back into the settings system.
373             // The below copying over is error-prone because its easy
374             // to miss a value.  Perhaps there's a better way?  Introspection?
375             BdbFrontier f = null;
376             try {
377                 f = (BdbFrontier)CheckpointUtils.
378                     readObjectFromFile(this.getClass(),
379                         c.getCheckpointRecover().getDirectory());
380             } catch (FileNotFoundException e) {
381                 throw new FatalConfigurationException("Failed checkpoint " +
382                     "recover: " + e.getMessage());
383             } catch (IOException e) {
384                 throw new FatalConfigurationException("Failed checkpoint " +
385                     "recover: " + e.getMessage());
386             } catch (ClassNotFoundException e) {
387                 throw new FatalConfigurationException("Failed checkpoint " +
388                     "recover: " + e.getMessage());
389             }
390 
391             this.nextOrdinal = f.nextOrdinal;
392             this.totalProcessedBytes = f.totalProcessedBytes;
393             this.liveDisregardedUriCount = f.liveDisregardedUriCount;
394             this.liveFailedFetchCount = f.liveFailedFetchCount;
395             this.processedBytesAfterLastEmittedURI =
396                 f.processedBytesAfterLastEmittedURI;
397             this.liveQueuedUriCount = f.liveQueuedUriCount;
398             this.liveSucceededFetchCount = f.liveSucceededFetchCount;
399             this.lastMaxBandwidthKB = f.lastMaxBandwidthKB;
400             this.readyClassQueues = f.readyClassQueues;
401             this.inactiveQueues = reinit(f.inactiveQueues,"inactiveQueues");
402             this.retiredQueues = reinit(f.retiredQueues,"retiredQueues");
403             this.snoozedClassQueues = f.snoozedClassQueues;
404             this.inProcessQueues = f.inProcessQueues;
405             super.initialize(c);
406             wakeQueues();
407         } else {
408             // perform usual initialization 
409             super.initialize(c);
410         }
411     }
412 
413     
414     
415     @Override
416     public void crawlEnded(String sExitMessage) {
417         ((StoredQueue)inactiveQueues).close();
418         ((StoredQueue)retiredQueues).close();
419         super.crawlEnded(sExitMessage);
420     }
421 
422     public void crawlCheckpoint(File checkpointDir) throws Exception {
423         super.crawlCheckpoint(checkpointDir);
424         logger.fine("Started serializing already seen as part "
425             + "of checkpoint. Can take some time.");
426         // An explicit sync on the any deferred write dbs is needed to make the
427         // db recoverable. Sync'ing the environment doesn't work.
428         if (this.pendingUris != null) {
429         	this.pendingUris.sync();
430         }
431         CheckpointUtils.writeObjectToFile(this.alreadyIncluded, checkpointDir);
432         logger.fine("Finished serializing already seen as part "
433             + "of checkpoint.");
434         // Serialize ourselves.
435         CheckpointUtils.writeObjectToFile(this, checkpointDir);
436     }
437     
438     /***
439      * Dump all still-enqueued URIs to the crawl.log -- without actually
440      * dequeuing. Useful for understanding what was remaining in a
441      * crawl that was ended early, for example at a time limit. 
442      * 
443      * @throws DatabaseException
444      */
445     public void dumpAllPendingToLog() throws DatabaseException {
446         Closure tolog = new Closure() {
447             public void execute(Object curi) {
448                 log((CrawlURI)curi);
449             }};
450         pendingUris.forAllPendingDo(tolog);
451     }
452 }