1   /* CachedBdbMap
2    * 
3    * $Id: CachedBdbMap.java 5383 2007-08-09 00:37:08Z gojomo $
4    * 
5    * Created on Mar 24, 2004
6    *
7    * Copyright (C) 2004 Internet Archive.
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.util;
26  
27  import java.io.File;
28  import java.io.IOException;
29  import java.io.Serializable;
30  import java.lang.ref.PhantomReference;
31  import java.lang.ref.Reference;
32  import java.lang.ref.ReferenceQueue;
33  import java.lang.ref.SoftReference;
34  import java.lang.reflect.Field;
35  import java.util.AbstractMap;
36  import java.util.HashMap;
37  import java.util.Iterator;
38  import java.util.LinkedList;
39  import java.util.Map;
40  import java.util.Set;
41  import java.util.logging.Level;
42  import java.util.logging.Logger;
43  
44  import com.sleepycat.bind.EntryBinding;
45  import com.sleepycat.bind.serial.SerialBinding;
46  import com.sleepycat.bind.serial.StoredClassCatalog;
47  import com.sleepycat.bind.tuple.TupleBinding;
48  import com.sleepycat.collections.StoredSortedMap;
49  import com.sleepycat.je.Database;
50  import com.sleepycat.je.DatabaseConfig;
51  import com.sleepycat.je.DatabaseException;
52  import com.sleepycat.je.Environment;
53  import com.sleepycat.je.EnvironmentConfig;
54  
55  /***
56   * A BDB JE backed hashmap. It extends the normal BDB JE map implementation by
57   * holding a cache of soft referenced objects. That is objects are not written
58   * to disk until they are not referenced by any other object and therefore can be
59   * Garbage Collected.
60   * 
61   * @author John Erik Halse
62   * @author stack
63   * @author gojomo
64   *  
65   */
66  public class CachedBdbMap<K,V> extends AbstractMap<K,V> 
67  implements Map<K,V>, Serializable {
68      
69      private static final long serialVersionUID = -8655539411367047332L;
70  
71      private static final Logger logger =
72          Logger.getLogger(CachedBdbMap.class.getName());
73  
74      /*** The database name of the class definition catalog.*/
75      private static final String CLASS_CATALOG = "java_class_catalog";
76  
77      /***
78       * A map of BDB JE Environments so that we reuse the Environment for
79       * databases in the same directory.
80       */
81      private static final Map<String,DbEnvironmentEntry> dbEnvironmentMap = 
82          new HashMap<String,DbEnvironmentEntry>();
83  
84      /*** The BDB JE environment used for this instance.
85       */
86      private transient DbEnvironmentEntry dbEnvironment;
87  
88      /*** The BDB JE database used for this instance. */
89      protected transient Database db;
90  
91      /*** The Collection view of the BDB JE database used for this instance. */
92      protected transient StoredSortedMap diskMap;
93  
94      /*** The softreferenced cache */
95      private transient Map<K,SoftEntry<V>> memMap;
96  
97      protected transient ReferenceQueue<V> refQueue;
98  
99      /*** The number of objects in the diskMap StoredMap. 
100      *  (Package access for unit testing.) */
101     protected int diskMapSize = 0;
102 
103     /***
104      * Count of times we got an object from in-memory cache.
105      */
106     private long cacheHit = 0;
107 
108     /***
109      * Count of times the {@link CachedBdbMap#get(Object)} method was called.
110      */
111     private long countOfGets = 0;
112 
113     /***
114      * Count of every time we went to the disk-based map AND we found an
115      * object (Doesn't include accesses that came back null).
116      */
117     private long diskHit = 0;
118     
119     /***
120      * Name of bdbje db.
121      */
122     private String dbName = null;
123 
124     /***
125      * Reference to the Reference#referent Field.
126      */
127     protected static Field referentField;
128     static {
129         // We need access to the referent field in the PhantomReference.
130         // For more on this trick, see
131         // http://www.javaspecialists.co.za/archive/Issue098.html and for
132         // discussion:
133         // http://www.theserverside.com/tss?service=direct/0/NewsThread/threadViewer.markNoisy.link&sp=l29865&sp=l146901
134         try {
135             referentField = Reference.class.getDeclaredField("referent");
136             referentField.setAccessible(true);
137         } catch (SecurityException e) {
138             throw new RuntimeException(e);
139         } catch (NoSuchFieldException e) {
140             throw new RuntimeException(e);
141         }
142     }
143 
144     /***
145      * Simple structure to keep needed information about a DB Environment.
146      */
147     protected static class DbEnvironmentEntry {
148         Environment environment;
149         StoredClassCatalog classCatalog;
150         int openDbCount = 0;
151         File dbDir;
152     }
153     
154     /***
155      * Shudown default constructor.
156      */
157     private CachedBdbMap() {
158         super();
159     }
160     
161     /***
162      * Constructor.
163      * 
164      * You must call
165      * {@link #initialize(Environment, Class, Class, StoredClassCatalog)}
166      * to finish construction. Construction is two-stepped to support
167      * reconnecting a deserialized CachedBdbMap with its backing bdbje
168      * database.
169      * 
170      * @param dbName Name of the backing db this instance should use.
171      */
172     public CachedBdbMap(final String dbName) {
173         this();
174         this.dbName = dbName;
175     }
176 
177     /***
178      * A constructor for creating a new CachedBdbMap.
179      * 
180      * Even though the put and get methods conforms to the Collections interface
181      * taking any object as key or value, you have to submit the class of the
182      * allowed key and value objects here and will get an exception if you try
183      * to put anything else in the map.
184      * 
185      * <p>This constructor internally calls
186      * {@link #initialize(Environment, Class, Class, StoredClassCatalog)}.
187      * Do not call initialize if you use this constructor.
188      * 
189      * @param dbDir The directory where the database will be created.
190      * @param dbName The name of the database to back this map by.
191      * @param keyClass The class of the objects allowed as keys.
192      * @param valueClass The class of the objects allowed as values.
193      * 
194      * @throws DatabaseException is thrown if the underlying BDB JE database
195      *             throws an exception.
196      */
197     public CachedBdbMap(final File dbDir, final String dbName,
198             final Class<K> keyClass, final Class<V> valueClass)
199     throws DatabaseException {
200         this(dbName);
201         this.dbEnvironment = getDbEnvironment(dbDir);
202         this.dbEnvironment.openDbCount++;
203         initialize(dbEnvironment.environment, keyClass, valueClass,
204             dbEnvironment.classCatalog);
205         if (logger.isLoggable(Level.INFO)) {
206             // Write out the bdb configuration.
207             EnvironmentConfig cfg = this.dbEnvironment.environment.getConfig();
208             logger.info("BdbConfiguration: Cache percentage "  +
209                 cfg.getCachePercent() + ", cache size " + cfg.getCacheSize() +
210                 ", Map size: " + size());
211         }
212     }
213     
214     /***
215      * Call this method when you have an instance when you used the
216      * default constructor or when you have a deserialized instance that you
217      * want to reconnect with an extant bdbje environment.  Do not
218      * call this method if you used the
219      * {@link #CachedBdbMap(File, String, Class, Class)} constructor.
220      * @param env
221      * @param keyClass
222      * @param valueClass
223      * @param classCatalog
224      * @throws DatabaseException
225      */
226     public synchronized void initialize(final Environment env, final Class keyClass,
227             final Class valueClass, final StoredClassCatalog classCatalog)
228     throws DatabaseException {
229         initializeInstance();
230         this.db = openDatabase(env, this.dbName);
231         this.diskMap = createDiskMap(this.db, classCatalog, keyClass,
232             valueClass);
233     }
234     
235     /***
236      * Do any instance setup.
237      * This method is used by constructors and when deserializing an instance.
238      */
239     protected void initializeInstance() {
240         this.memMap = new HashMap<K,SoftEntry<V>>();
241         this.refQueue = new ReferenceQueue<V>();
242     }
243     
244     protected StoredSortedMap createDiskMap(Database database,
245             StoredClassCatalog classCatalog, Class keyClass, Class valueClass) {
246         EntryBinding keyBinding = TupleBinding.getPrimitiveBinding(keyClass);
247         if(keyBinding == null) {
248             keyBinding = new SerialBinding(classCatalog, keyClass);
249         }
250         EntryBinding valueBinding = TupleBinding.getPrimitiveBinding(valueClass);
251         if(valueBinding == null) {
252             valueBinding = new SerialBinding(classCatalog, valueClass);
253         }
254         return new StoredSortedMap(database, keyBinding, valueBinding, true);
255     }
256 
257     /***
258      * Get the database environment for a physical directory where data will be
259      * stored.
260      * <p>
261      * If the environment already exist it will be reused, else a new one will
262      * be created.
263      * 
264      * @param dbDir The directory where BDB JE data will be stored.
265      * @return a datastructure containing the environment and a default database
266      *         for storing class definitions.
267      */
268     private DbEnvironmentEntry getDbEnvironment(File dbDir) {
269         if (dbEnvironmentMap.containsKey(dbDir.getAbsolutePath())) {
270             return (DbEnvironmentEntry) dbEnvironmentMap.get(dbDir
271                     .getAbsolutePath());
272         }
273         EnvironmentConfig envConfig = new EnvironmentConfig();
274         envConfig.setAllowCreate(true);
275         envConfig.setTransactional(false);
276         
277         // We're doing the caching ourselves so setting these at the lowest
278         // possible level.
279         envConfig.setCachePercent(1);
280         DbEnvironmentEntry env = new DbEnvironmentEntry();
281         try {
282             env.environment = new Environment(dbDir, envConfig);
283             env.dbDir = dbDir;
284             dbEnvironmentMap.put(dbDir.getAbsolutePath(), env);
285             
286             DatabaseConfig dbConfig = new DatabaseConfig();
287             dbConfig.setTransactional(false);
288             dbConfig.setAllowCreate(true);
289             dbConfig.setDeferredWrite(true);
290             
291             Database catalogDb = env.environment.openDatabase(null,
292                     CLASS_CATALOG, dbConfig);
293             
294             env.classCatalog = new StoredClassCatalog(catalogDb);
295         } catch (DatabaseException e) {
296             e.printStackTrace();
297             //throw new FatalConfigurationException(e.getMessage());
298         }
299         return env;
300     }
301 
302     protected Database openDatabase(final Environment environment,
303             final String dbName) throws DatabaseException {
304         DatabaseConfig dbConfig = new DatabaseConfig();
305         dbConfig.setTransactional(false);
306         dbConfig.setAllowCreate(true);
307         dbConfig.setDeferredWrite(true);
308         return environment.openDatabase(null, dbName, dbConfig);
309     }
310 
311     public synchronized void close() throws DatabaseException {
312         // Close out my bdb db.
313         if (this.db != null) {
314             try {
315                 this.db.sync();
316                 this.db.close();
317             } catch (DatabaseException e) {
318                 e.printStackTrace();
319             } finally {
320                 this.db = null;
321             }
322         }
323         if (dbEnvironment != null) {
324             dbEnvironment.openDbCount--;
325             if (dbEnvironment.openDbCount <= 0) {
326                 dbEnvironment.classCatalog.close();
327                 dbEnvironment.environment.close();
328                 dbEnvironmentMap.remove(dbEnvironment.dbDir.getAbsolutePath());
329                 dbEnvironment = null;
330             }
331         }
332     }
333 
334     protected void finalize() throws Throwable {
335         close();
336         super.finalize();
337     }
338 
339     /***
340      * The keySet of the diskMap is all relevant keys. 
341      * 
342      * @see java.util.Map#keySet()
343      */
344     @SuppressWarnings("unchecked")
345     public Set<K> keySet() {
346         return diskMap.keySet();
347     }
348     
349     public Set<Map.Entry<K,V>> entrySet() {
350         // Would require complicated implementation to 
351         // maintain identity guarantees, so skipping
352         throw new UnsupportedOperationException();
353     }
354 
355     public synchronized V get(final Object object) {
356         K key = toKey(object);
357         countOfGets++;
358         expungeStaleEntries();
359         if (countOfGets % 10000 == 0) {
360             logCacheSummary();
361         }
362         SoftEntry<V> entry = memMap.get(key);
363         if (entry != null) {
364             V val = entry.get(); // get & hold, so not cleared pre-return
365             if (val != null) {
366                 cacheHit++;
367                 return val;
368             }
369             // Explicitly clear this entry from referencequeue since its
370             // value is null.
371             expungeStaleEntry(entry);
372         }
373 
374         // check backing diskMap
375         V v = diskMapGet(key);
376         if (v != null) {
377             diskHit++;
378             memMap.put(key, new SoftEntry<V>(key, v, refQueue));
379         }
380         return v;
381     }
382 
383     /***
384      * Info to log, if at FINE level, on every get()
385      */
386     private void logCacheSummary() {
387         if (!logger.isLoggable((Level.FINE))) {
388             return;
389         }
390         try {
391             long cacheHitPercent = (cacheHit * 100) / (cacheHit + diskHit);
392             logger.fine("DB name: " + this.db.getDatabaseName()
393                 + ", Cache Hit: " + cacheHitPercent
394                 + "%, Not in map: " + (countOfGets - (cacheHit + diskHit))
395                 + ", Total number of gets: " + countOfGets);
396         } catch (DatabaseException e) {
397             // This is just for logging so ignore DB Exceptions
398         }
399     }
400     
401     public synchronized V put(K key, V value) {
402         V prevVal = get(key);
403         memMap.put(key, new SoftEntry<V>(key, value, refQueue));
404         diskMap.put(key,value); // dummy
405         if(prevVal==null) {
406             diskMapSize++;
407         }
408         return prevVal;
409     }
410 
411     /***
412      * Note that a call to this method CLOSEs the underlying bdbje.
413      * This instance is no longer of any use.  It must be re-initialized.
414      * We close the db here because if this BigMap is being treated as a plain
415      * Map, this is only opportunity for cleanup.
416      */
417     public synchronized void clear() {
418         this.memMap.clear();
419         this.diskMap.clear();
420         this.diskMapSize = 0;
421         try {
422             close();
423         } catch (DatabaseException e) {
424             e.printStackTrace();
425         }
426     }
427 
428     public synchronized V remove(final Object key) {
429         V prevValue = get(key);
430         memMap.remove(key);
431         expungeStaleEntries();
432         diskMap.remove(key);
433         diskMapSize--;
434         return prevValue;
435     }
436 
437     public synchronized boolean containsKey(Object key) {
438         if (quickContainsKey(key)) {
439             return true;
440         }
441         return diskMap.containsKey(key);
442     }
443 
444     public synchronized boolean quickContainsKey(Object key) {
445         expungeStaleEntries();
446         return memMap.containsKey(key);
447     }
448 
449     public synchronized boolean containsValue(Object value) {
450         if (quickContainsValue(value)) {
451             return true;
452         }
453         return diskMap.containsValue(value);
454     }
455 
456     public synchronized boolean quickContainsValue(Object value) {
457         expungeStaleEntries();
458         // FIXME this isn't really right, as memMap is of SoftEntries
459         return memMap.containsValue(value);
460     }
461 
462     public int size() {
463         return diskMapSize;
464     }
465     
466     protected String getDatabaseName() {
467         String name = "DbName-Lookup-Failed";
468         try {
469             if (this.db != null) {
470                 name = this.db.getDatabaseName();
471             }
472         } catch (DatabaseException e) {
473             // Ignore.
474         }
475         return name;
476     }
477     
478     /***
479      * Sync in-memory map entries to backing disk store.
480      * When done, the memory map will be cleared and all entries stored
481      * on disk.
482      */
483     public synchronized void sync() {
484         String dbName = null;
485         // Sync. memory and disk.
486         long startTime = 0;
487         if (logger.isLoggable(Level.INFO)) {
488             dbName = getDatabaseName();
489             startTime = System.currentTimeMillis();
490             logger.info(dbName + " start sizes: disk " + this.diskMapSize +
491                 ", mem " + this.memMap.size());
492         }
493         expungeStaleEntries();
494         LinkedList<SoftEntry> stale = new LinkedList<SoftEntry>(); 
495         for (Iterator i = this.memMap.keySet().iterator(); i.hasNext();) {
496             Object key = i.next();
497             SoftEntry entry = (SoftEntry) memMap.get(key);
498             if (entry != null) {
499                 // Get & hold so not cleared pre-return.
500                 Object value = entry.get();
501                 if (value != null) {
502                     this.diskMap.put(key, value);
503                 } else {
504                     stale.add(entry);
505                 }
506             }
507         }
508         // for any entries above that had been cleared, ensure expunged
509         for (SoftEntry entry : stale) {
510             expungeStaleEntry(entry);
511         }   
512         
513         // force sync of deferred-writes
514         try {
515             this.db.sync();
516         } catch (DatabaseException e) {
517             // TODO Auto-generated catch block
518             throw new RuntimeException(e);
519         }
520         
521         if (logger.isLoggable(Level.INFO)) {
522             logger.info(dbName + " sync took " +
523                 (System.currentTimeMillis() - startTime) + "ms. " +
524                 "Finish sizes: disk " +
525                 this.diskMapSize + ", mem " + this.memMap.size());
526         }
527     }
528 
529     private void expungeStaleEntries() {
530         int c = 0;
531         long startTime = System.currentTimeMillis();
532         for(SoftEntry entry; (entry = refQueuePoll()) != null;) {
533             expungeStaleEntry(entry);
534             c++;
535         }
536         if (c > 0 && logger.isLoggable(Level.FINER)) {
537             long endTime = System.currentTimeMillis();
538             try {
539                 logger.finer("DB: " + db.getDatabaseName() + ",  Expunged: "
540                         + c + ", Diskmap size: " + diskMapSize
541                         + ", Cache size: " + memMap.size()
542                         + ", in "+(endTime-startTime)+"ms");
543             } catch (DatabaseException e) {
544                 logger.log(Level.FINER,"exception while logging",e);
545             }
546         }
547     }
548     
549     private void expungeStaleEntry(SoftEntry entry) {
550         // If phantom already null, its already expunged -- probably
551         // because it was purged directly first from inside in
552         // {@link #get(String)} and then it went on the poll queue and
553         // when it came off inside in expungeStaleEntries, this method
554         // was called again.
555         if (entry.getPhantom() == null) {
556             return;
557         }
558         // If the object that is in memMap is not the one passed here, then
559         // memMap has been changed -- probably by a put on top of this entry.
560         if (memMap.get(entry.getPhantom().getKey()) == entry) {
561             memMap.remove(entry.getPhantom().getKey());
562             diskMap.put(entry.getPhantom().getKey(),
563                 entry.getPhantom().doctoredGet());
564         }
565         entry.clearPhantom();
566     }
567     
568     private class PhantomEntry<T> extends PhantomReference<T> {
569         private final Object key;
570 
571         public PhantomEntry(Object key, T referent) {
572             super(referent, null);
573             this.key = key;
574         }
575 
576         /***
577          * @return Return the referent. The contract for {@link #get()}
578          * always returns a null referent.  We've cheated and doctored
579          * PhantomReference to return the actual referent value.  See notes
580          * at {@link #referentField};
581          */
582         public Object doctoredGet() {
583             try {
584                 // Here we use the referentField saved off on static
585                 // initialization of this class to get at this References'
586                 // private referent field.
587                 return referentField.get(this);
588             } catch (IllegalAccessException e) {
589                 throw new RuntimeException(e);
590             }
591         }
592 
593         /***
594          * @return Returns the key.
595          */
596         public Object getKey() {
597             return this.key;
598         }
599     }
600 
601     private class SoftEntry<T> extends SoftReference<T> {
602         private PhantomEntry<T> phantom;
603 
604         public SoftEntry(Object key, T referent, ReferenceQueue<T> q) {
605             super(referent, q);
606             this.phantom = new PhantomEntry<T>(key, referent);
607         }
608 
609         /***
610          * @return Returns the phantom reference.
611          */
612         public PhantomEntry getPhantom() {
613             return this.phantom;
614         }
615         
616         public void clearPhantom() {
617             this.phantom.clear();
618             this.phantom = null;
619             super.clear();
620         }
621     }
622     
623     private void readObject(java.io.ObjectInputStream stream)
624     throws IOException, ClassNotFoundException {
625         stream.defaultReadObject();
626         initializeInstance();
627         if (logger.isLoggable(Level.FINE)) {
628             logger.fine(getDatabaseName() + " diskMapSize: " + diskMapSize);
629         }
630     }
631     
632  
633     
634     @SuppressWarnings("unchecked")
635     private K toKey(Object o) {
636         return (K)o;
637     }
638     
639     @SuppressWarnings("unchecked")
640     private V diskMapGet(K k) {
641         return (V)diskMap.get(k);
642     }
643     
644     @SuppressWarnings("unchecked")
645     private SoftEntry<V> refQueuePoll() {
646         return (SoftEntry)refQueue.poll();
647     }
648 }