1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 package org.archive.util;
28
29 import java.io.Serializable;
30 import java.security.SecureRandom;
31
32 /*** A Bloom filter.
33 *
34 * SLIGHTLY ADAPTED VERSION OF MG4J it.unimi.dsi.mg4j.util.BloomFilter
35 *
36 * <p>KEY CHANGES:
37 *
38 * <ul>
39 * <li>Adapted to use 32bit ops as much as possible... may be slightly
40 * faster on 32bit hardware/OS</li>
41 * <li>Changed to use bitfield that is a power-of-two in size, allowing
42 * hash() to use bitshifting rather than modulus... may be slightly
43 * faster</li>
44 * <li>NUMBER_OF_WEIGHTS is 2083, to better avoid collisions between
45 * similar strings</li>
46 * <li>Removed dependence on cern.colt MersenneTwister (replaced with
47 * SecureRandom) and QuickBitVector (replaced with local methods).</li>
48 * </ul>
49 *
50 * <hr>
51 *
52 * <P>Instances of this class represent a set of character sequences (with false positives)
53 * using a Bloom filter. Because of the way Bloom filters work,
54 * you cannot remove elements.
55 *
56 * <P>Bloom filters have an expected error rate, depending on the number
57 * of hash functions used, on the filter size and on the number of elements in the filter. This implementation
58 * uses a variable optimal number of hash functions, depending on the expected
59 * number of elements. More precisely, a Bloom
60 * filter for <var>n</var> character sequences with <var>d</var> hash functions will use
61 * ln 2 <var>d</var><var>n</var> ≈ 1.44 <var>d</var><var>n</var> bits;
62 * false positives will happen with probability 2<sup>-<var>d</var></sup>.
63 *
64 * <P>Hash functions are generated at creation time using universal hashing. Each hash function
65 * uses {@link #NUMBER_OF_WEIGHTS} random integers, which are cyclically multiplied by
66 * the character codes in a character sequence. The resulting integers are XOR-ed together.
67 *
68 * <P>This class exports access methods that are very similar to those of {@link java.util.Set},
69 * but it does not implement that interface, as too many non-optional methods
70 * would be unimplementable (e.g., iterators).
71 *
72 * @author Sebastiano Vigna
73 */
74 public class BloomFilter32bp2Split implements Serializable, BloomFilter {
75
76 private static final long serialVersionUID = -1504889954381695129L;
77
78 /*** The number of weights used to create hash functions. */
79 final public static int NUMBER_OF_WEIGHTS = 2083;
80 /*** The number of bits in this filter. */
81 final public long m;
82 /*** the power-of-two that m is */
83 final public long power;
84 /*** The number of hash functions used by this filter. */
85 final public int d;
86 /*** The underlying bit vectorS. */
87 final private int[][] bits;
88 /*** Bitshift to get first index */
89 final private int aShift;
90 /*** Mask to get second index */
91 final private int bMask;
92 /*** The random integers used to generate the hash functions. */
93 final private int[][] weight;
94
95 /*** The number of elements currently in the filter. It may be
96 * smaller than the actual number of additions of distinct character
97 * sequences because of false positives.
98 */
99 private int size;
100
101 /*** The natural logarithm of 2, used in the computation of the number of bits. */
102 private final static double NATURAL_LOG_OF_2 = Math.log( 2 );
103
104 private final static boolean DEBUG = false;
105
106 /*** Creates a new Bloom filter with given number of hash functions and expected number of elements.
107 *
108 * @param n the expected number of elements.
109 * @param d the number of hash functions; if the filter add not more than <code>n</code> elements,
110 * false positives will happen with probability 2<sup>-<var>d</var></sup>.
111 */
112 public BloomFilter32bp2Split( final int n, final int d ) {
113 this.d = d;
114 long minBits = (long) ((long)n * (long)d / NATURAL_LOG_OF_2);
115 long pow = 0;
116 while((1L<<pow) < minBits) {
117 pow++;
118 }
119 this.power = pow;
120 this.m = 1L<<pow;
121 int len = (int) (m / 32);
122 if ( m > 1L<<32 ) {
123 throw new IllegalArgumentException( "This filter would require " + m + " bits" );
124 }
125
126 aShift = (int) (pow - ADDRESS_BITS_PER_UNIT - 8);
127 bMask = (1<<aShift) - 1;
128 bits = new int[256][ 1<<aShift ];
129
130 System.out.println("power "+power+" bits "+m+" len "+len);
131 System.out.println("aShift "+aShift+" bMask "+bMask);
132
133 if ( DEBUG ) System.err.println( "Number of bits: " + m );
134
135
136
137
138 final SecureRandom random = new SecureRandom(new byte[] {19,96});
139 weight = new int[ d ][];
140 for( int i = 0; i < d; i++ ) {
141 weight[ i ] = new int[ NUMBER_OF_WEIGHTS ];
142 for( int j = 0; j < NUMBER_OF_WEIGHTS; j++ )
143 weight[ i ][ j ] = random.nextInt();
144 }
145 }
146
147 /*** The number of character sequences in the filter.
148 *
149 * @return the number of character sequences in the filter (but see {@link #contains(CharSequence)}).
150 */
151
152 public int size() {
153 return size;
154 }
155
156 /*** Hashes the given sequence with the given hash function.
157 *
158 * @param s a character sequence.
159 * @param l the length of <code>s</code>.
160 * @param k a hash function index (smaller than {@link #d}).
161 * @return the position in the filter corresponding to <code>s</code> for the hash function <code>k</code>.
162 */
163 private int hash( final CharSequence s, final int l, final int k ) {
164 final int[] w = weight[ k ];
165 int h = 0, i = l;
166 while( i-- != 0 ) h ^= s.charAt( i ) * w[ i % NUMBER_OF_WEIGHTS ];
167 return h >>> (32-power);
168 }
169
170 /*** Checks whether the given character sequence is in this filter.
171 *
172 * <P>Note that this method may return true on a character sequence that is has
173 * not been added to the filter. This will happen with probability 2<sub>-<var>d</var></sub>,
174 * where <var>d</var> is the number of hash functions specified at creation time, if
175 * the number of the elements in the filter is less than <var>n</var>, the number
176 * of expected elements specified at creation time.
177 *
178 * @param s a character sequence.
179 * @return true if the sequence is in the filter (or if a sequence with the
180 * same hash sequence is in the filter).
181 */
182
183 public boolean contains( final CharSequence s ) {
184 int i = d, l = s.length();
185 while( i-- != 0 ) if ( ! getBit( hash( s, l, i ) ) ) return false;
186 return true;
187 }
188
189 /*** Adds a character sequence to the filter.
190 *
191 * @param s a character sequence.
192 * @return true if the character sequence was not in the filter (but see {@link #contains(CharSequence)}).
193 */
194
195 public boolean add( final CharSequence s ) {
196 boolean result = false;
197 int i = d, l = s.length();
198 int h;
199 while( i-- != 0 ) {
200 h = hash( s, l, i );
201 if ( ! setGetBit( h ) ) result = true;
202 }
203 if ( result ) size++;
204 return result;
205 }
206
207 protected final static int ADDRESS_BITS_PER_UNIT = 5;
208 protected final static int BIT_INDEX_MASK = 31;
209
210 /***
211 * Returns from the local bitvector the value of the bit with
212 * the specified index. The value is <tt>true</tt> if the bit
213 * with the index <tt>bitIndex</tt> is currently set; otherwise,
214 * returns <tt>false</tt>.
215 *
216 * (adapted from cern.colt.bitvector.QuickBitVector)
217 *
218 * @param bitIndex the bit index.
219 * @return the value of the bit with the specified index.
220 */
221 protected boolean getBit(int bitIndex) {
222 int intIndex = (int)(bitIndex >>> ADDRESS_BITS_PER_UNIT);
223 return ((bits[intIndex>>>aShift][intIndex&bMask] & (1 << (bitIndex & BIT_INDEX_MASK))) != 0);
224 }
225
226 /***
227 * Changes the bit with index <tt>bitIndex</tt> in local bitvector.
228 *
229 * (adapted from cern.colt.bitvector.QuickBitVector)
230 *
231 * @param bitIndex the index of the bit to be set.
232 */
233 protected void setBit(int bitIndex) {
234 int intIndex = (int)(bitIndex >>> ADDRESS_BITS_PER_UNIT);
235 bits[intIndex>>>aShift][intIndex&bMask] |= 1 << (bitIndex & BIT_INDEX_MASK);
236 }
237
238 /***
239 * Sets the bit with index <tt>bitIndex</tt> in local bitvector --
240 * returning the old value.
241 *
242 * (adapted from cern.colt.bitvector.QuickBitVector)
243 *
244 * @param bitIndex the index of the bit to be set.
245 */
246 protected boolean setGetBit(int bitIndex) {
247 int intIndex = (int)(bitIndex >>> ADDRESS_BITS_PER_UNIT);
248 int a = intIndex>>>aShift;
249 int b = intIndex&bMask;
250 int mask = 1 << (bitIndex & BIT_INDEX_MASK);
251 boolean ret = ((bits[a][b] & (mask)) != 0);
252 bits[a][b] |= mask;
253 return ret;
254 }
255
256
257
258
259 public long getSizeBytes() {
260 return bits.length*bits[0].length*4;
261 }
262 }