1   /* BloomFilter
2   *
3   * $Id: BloomFilter32bp2Split.java 4644 2006-09-20 22:40:21Z paul_jack $
4   *
5   * Created on Jun 21, 2005
6   *
7   * Copyright (C) 2005 Internet Archive; a slight adaptation of
8   * LGPL work (C) Sebastiano Vigna
9   *
10  * This file is part of the Heritrix web crawler (crawler.archive.org).
11  *
12  * Heritrix is free software; you can redistribute it and/or modify
13  * it under the terms of the GNU Lesser Public License as published by
14  * the Free Software Foundation; either version 2.1 of the License, or
15  * any later version.
16  *
17  * Heritrix is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20  * GNU Lesser Public License for more details.
21  *
22  * You should have received a copy of the GNU Lesser Public License
23  * along with Heritrix; if not, write to the Free Software
24  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
25  */
26  
27  package org.archive.util;
28  
29  import java.io.Serializable;
30  import java.security.SecureRandom;
31  
32  /*** A Bloom filter.
33   *
34   * SLIGHTLY ADAPTED VERSION OF MG4J it.unimi.dsi.mg4j.util.BloomFilter
35   * 
36   * <p>KEY CHANGES:
37   *
38   * <ul>
39   * <li>Adapted to use 32bit ops as much as possible... may be slightly
40   * faster on 32bit hardware/OS</li>
41   * <li>Changed to use bitfield that is a power-of-two in size, allowing
42   * hash() to use bitshifting rather than modulus... may be slightly
43   * faster</li>
44   * <li>NUMBER_OF_WEIGHTS is 2083, to better avoid collisions between 
45   * similar strings</li>
46   * <li>Removed dependence on cern.colt MersenneTwister (replaced with
47   * SecureRandom) and QuickBitVector (replaced with local methods).</li>
48   * </ul>
49   * 
50   * <hr>
51   * 
52   * <P>Instances of this class represent a set of character sequences (with false positives)
53   * using a Bloom filter. Because of the way Bloom filters work,
54   * you cannot remove elements.
55   *
56   * <P>Bloom filters have an expected error rate, depending on the number
57   * of hash functions used, on the filter size and on the number of elements in the filter. This implementation
58   * uses a variable optimal number of hash functions, depending on the expected
59   * number of elements. More precisely, a Bloom
60   * filter for <var>n</var> character sequences with <var>d</var> hash functions will use
61   * ln 2 <var>d</var><var>n</var> &#8776; 1.44 <var>d</var><var>n</var> bits;
62   * false positives will happen with probability 2<sup>-<var>d</var></sup>.
63   *
64   * <P>Hash functions are generated at creation time using universal hashing. Each hash function
65   * uses {@link #NUMBER_OF_WEIGHTS} random integers, which are cyclically multiplied by
66   * the character codes in a character sequence. The resulting integers are XOR-ed together.
67   *
68   * <P>This class exports access methods that are very similar to those of {@link java.util.Set},
69   * but it does not implement that interface, as too many non-optional methods
70   * would be unimplementable (e.g., iterators).
71   *
72   * @author Sebastiano Vigna
73   */
74  public class BloomFilter32bp2Split implements Serializable, BloomFilter {
75  
76      private static final long serialVersionUID = -1504889954381695129L;
77      
78      /*** The number of weights used to create hash functions. */
79      final public static int NUMBER_OF_WEIGHTS = 2083; // CHANGED FROM 16
80      /*** The number of bits in this filter. */
81      final public long m; 
82      /*** the power-of-two that m is */
83      final public long power; // 1<<power == m
84      /*** The number of hash functions used by this filter. */
85      final public int d;
86      /*** The underlying bit vectorS. */
87      final private int[][] bits;
88      /*** Bitshift to get first index */
89      final private int aShift;
90      /*** Mask to get second index */
91      final private int bMask;
92      /*** The random integers used to generate the hash functions. */
93      final private int[][] weight;
94  
95      /*** The number of elements currently in the filter. It may be
96       * smaller than the actual number of additions of distinct character
97       * sequences because of false positives.
98       */
99      private int size;
100 
101     /*** The natural logarithm of 2, used in the computation of the number of bits. */
102     private final static double NATURAL_LOG_OF_2 = Math.log( 2 );
103 
104     private final static boolean DEBUG = false;
105 
106     /*** Creates a new Bloom filter with given number of hash functions and expected number of elements.
107      *
108      * @param n the expected number of elements.
109      * @param d the number of hash functions; if the filter add not more than <code>n</code> elements,
110      * false positives will happen with probability 2<sup>-<var>d</var></sup>.
111      */
112     public BloomFilter32bp2Split( final int n, final int d ) {
113         this.d = d;
114         long minBits = (long) ((long)n * (long)d / NATURAL_LOG_OF_2);
115         long pow = 0;
116         while((1L<<pow) < minBits) {
117         	pow++;
118         }
119         this.power = pow;
120         this.m = 1L<<pow;
121         int len = (int) (m / 32);
122         if ( m > 1L<<32 ) {
123         	throw new IllegalArgumentException( "This filter would require " + m + " bits" );
124         }
125 
126         aShift = (int) (pow - ADDRESS_BITS_PER_UNIT - 8);
127         bMask = (1<<aShift) - 1;
128         bits = new int[256][ 1<<aShift ];
129 
130         System.out.println("power "+power+" bits "+m+" len "+len);
131         System.out.println("aShift "+aShift+" bMask "+bMask);
132 
133         if ( DEBUG ) System.err.println( "Number of bits: " + m );
134 
135         // seeded for reproduceable behavior in repeated runs; BUT: 
136         // SecureRandom's default implementation (as of 1.5) 
137         // seems to mix in its own seeding.
138         final SecureRandom random = new SecureRandom(new byte[] {19,96});
139         weight = new int[ d ][];
140         for( int i = 0; i < d; i++ ) {
141             weight[ i ] = new int[ NUMBER_OF_WEIGHTS ];
142             for( int j = 0; j < NUMBER_OF_WEIGHTS; j++ )
143                  weight[ i ][ j ] = random.nextInt();
144         }
145     }
146 
147     /*** The number of character sequences in the filter.
148      *
149      * @return the number of character sequences in the filter (but see {@link #contains(CharSequence)}).
150      */
151 
152     public int size() {
153         return size;
154     }
155 
156     /*** Hashes the given sequence with the given hash function.
157      *
158      * @param s a character sequence.
159      * @param l the length of <code>s</code>.
160      * @param k a hash function index (smaller than {@link #d}).
161      * @return the position in the filter corresponding to <code>s</code> for the hash function <code>k</code>.
162      */
163 	private int hash( final CharSequence s, final int l, final int k ) {
164 		final int[] w = weight[ k ];
165 		int h = 0, i = l;
166 		while( i-- != 0 ) h ^= s.charAt( i ) * w[ i % NUMBER_OF_WEIGHTS ];
167 		return h >>> (32-power); 
168 	}
169 
170     /*** Checks whether the given character sequence is in this filter.
171      *
172      * <P>Note that this method may return true on a character sequence that is has
173      * not been added to the filter. This will happen with probability 2<sub>-<var>d</var></sub>,
174      * where <var>d</var> is the number of hash functions specified at creation time, if
175      * the number of the elements in the filter is less than <var>n</var>, the number
176      * of expected elements specified at creation time.
177      *
178      * @param s a character sequence.
179      * @return true if the sequence is in the filter (or if a sequence with the
180      * same hash sequence is in the filter).
181      */
182 
183     public boolean contains( final CharSequence s ) {
184         int i = d, l = s.length();
185         while( i-- != 0 ) if ( ! getBit( hash( s, l, i ) ) ) return false;
186         return true;
187     }
188 
189     /*** Adds a character sequence to the filter.
190      *
191      * @param s a character sequence.
192      * @return true if the character sequence was not in the filter (but see {@link #contains(CharSequence)}).
193      */
194 
195     public boolean add( final CharSequence s ) {
196         boolean result = false;
197         int i = d, l = s.length();
198         int h;
199         while( i-- != 0 ) {
200             h = hash( s, l, i );
201             if ( ! setGetBit( h ) ) result = true;
202         }
203         if ( result ) size++;
204         return result;
205     }
206     
207     protected final static int ADDRESS_BITS_PER_UNIT = 5; // 32=2^5
208     protected final static int BIT_INDEX_MASK = 31; // = BITS_PER_UNIT - 1;
209 
210     /***
211      * Returns from the local bitvector the value of the bit with 
212      * the specified index. The value is <tt>true</tt> if the bit 
213      * with the index <tt>bitIndex</tt> is currently set; otherwise, 
214      * returns <tt>false</tt>.
215      *
216      * (adapted from cern.colt.bitvector.QuickBitVector)
217      * 
218      * @param     bitIndex   the bit index.
219      * @return    the value of the bit with the specified index.
220      */
221     protected boolean getBit(int bitIndex) {
222         int intIndex = (int)(bitIndex >>> ADDRESS_BITS_PER_UNIT);
223         return ((bits[intIndex>>>aShift][intIndex&bMask] & (1 << (bitIndex & BIT_INDEX_MASK))) != 0);
224     }
225 
226     /***
227      * Changes the bit with index <tt>bitIndex</tt> in local bitvector.
228      *
229      * (adapted from cern.colt.bitvector.QuickBitVector)
230      * 
231      * @param     bitIndex   the index of the bit to be set.
232      */
233     protected void setBit(int bitIndex) {
234         int intIndex = (int)(bitIndex >>> ADDRESS_BITS_PER_UNIT);
235         bits[intIndex>>>aShift][intIndex&bMask] |= 1 << (bitIndex & BIT_INDEX_MASK);
236     }
237     
238     /***
239      * Sets the bit with index <tt>bitIndex</tt> in local bitvector -- 
240      * returning the old value. 
241      *
242      * (adapted from cern.colt.bitvector.QuickBitVector)
243      * 
244      * @param     bitIndex   the index of the bit to be set.
245      */
246     protected boolean setGetBit(int bitIndex) {
247         int intIndex = (int)(bitIndex >>> ADDRESS_BITS_PER_UNIT);
248         int a = intIndex>>>aShift;
249         int b = intIndex&bMask;
250         int mask = 1 << (bitIndex & BIT_INDEX_MASK);
251         boolean ret = ((bits[a][b] & (mask)) != 0);
252         bits[a][b] |= mask;
253         return ret;
254     }
255     
256 	/* (non-Javadoc)
257 	 * @see org.archive.util.BloomFilter#getSizeBytes()
258 	 */
259 	public long getSizeBytes() {
260 		return bits.length*bits[0].length*4;
261 	}
262 }