1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 package org.archive.util;
28
29 import java.io.Serializable;
30 import java.security.SecureRandom;
31
32 /*** A Bloom filter.
33 *
34 * SLIGHTLY ADAPTED VERSION OF MG4J it.unimi.dsi.mg4j.util.BloomFilter
35 *
36 * <p>KEY CHANGES:
37 *
38 * <ul>
39 * <li>Adapted to use 32bit ops as much as possible... may be slightly
40 * faster on 32bit hardware/OS</li>
41 * <li>NUMBER_OF_WEIGHTS is 2083, to better avoid collisions between
42 * similar strings</li>
43 * <li>Removed dependence on cern.colt MersenneTwister (replaced with
44 * SecureRandom) and QuickBitVector (replaced with local methods).</li>
45 * </ul>
46 *
47 * <hr>
48 *
49 * <P>Instances of this class represent a set of character sequences (with false positives)
50 * using a Bloom filter. Because of the way Bloom filters work,
51 * you cannot remove elements.
52 *
53 * <P>Bloom filters have an expected error rate, depending on the number
54 * of hash functions used, on the filter size and on the number of elements in the filter. This implementation
55 * uses a variable optimal number of hash functions, depending on the expected
56 * number of elements. More precisely, a Bloom
57 * filter for <var>n</var> character sequences with <var>d</var> hash functions will use
58 * ln 2 <var>d</var><var>n</var> ≈ 1.44 <var>d</var><var>n</var> bits;
59 * false positives will happen with probability 2<sup>-<var>d</var></sup>.
60 *
61 * <P>Hash functions are generated at creation time using universal hashing. Each hash function
62 * uses {@link #NUMBER_OF_WEIGHTS} random integers, which are cyclically multiplied by
63 * the character codes in a character sequence. The resulting integers are XOR-ed together.
64 *
65 * <P>This class exports access methods that are very similar to those of {@link java.util.Set},
66 * but it does not implement that interface, as too many non-optional methods
67 * would be unimplementable (e.g., iterators).
68 *
69 * @author Sebastiano Vigna
70 */
71 public class BloomFilter32bitSplit implements Serializable, BloomFilter {
72
73 private static final long serialVersionUID = -164106965277863971L;
74
75 /*** The number of weights used to create hash functions. */
76 final public static int NUMBER_OF_WEIGHTS = 2083;
77 /*** The number of bits in this filter. */
78 final public long m;
79 /*** The number of hash functions used by this filter. */
80 final public int d;
81 /*** The underlying bit vectorS. */
82
83 final private int[][] bits;
84 /*** The random integers used to generate the hash functions. */
85 final private int[][] weight;
86
87 /*** The number of elements currently in the filter. It may be
88 * smaller than the actual number of additions of distinct character
89 * sequences because of false positives.
90 */
91 private int size;
92
93 /*** The natural logarithm of 2, used in the computation of the number of bits. */
94 private final static double NATURAL_LOG_OF_2 = Math.log( 2 );
95
96 /*** number of ints in 1MB. */
97 private final static int ONE_MB_INTS = 1 << 18;
98
99 private final static boolean DEBUG = false;
100
101 /*** Creates a new Bloom filter with given number of hash functions and expected number of elements.
102 *
103 * @param n the expected number of elements.
104 * @param d the number of hash functions; if the filter add not more than <code>n</code> elements,
105 * false positives will happen with probability 2<sup>-<var>d</var></sup>.
106 */
107 public BloomFilter32bitSplit( final int n, final int d ) {
108 this.d = d;
109 int len =
110 (int)Math.ceil( ( (long)n * (long)d / NATURAL_LOG_OF_2 ) / 32 );
111
112 len = ((len / ONE_MB_INTS)+1)*ONE_MB_INTS;
113 this.m = len*32L;
114 if ( m >= 1L<<54 ) {
115 throw new IllegalArgumentException( "This filter would require " + m + " bits" );
116 }
117
118 bits = new int[ len/ONE_MB_INTS ][ONE_MB_INTS];
119
120 if ( DEBUG ) System.err.println( "Number of bits: " + m );
121
122
123
124
125 final SecureRandom random = new SecureRandom(new byte[] {19,96});
126 weight = new int[ d ][];
127 for( int i = 0; i < d; i++ ) {
128 weight[ i ] = new int[ NUMBER_OF_WEIGHTS ];
129 for( int j = 0; j < NUMBER_OF_WEIGHTS; j++ )
130 weight[ i ][ j ] = random.nextInt();
131 }
132 }
133
134 /*** The number of character sequences in the filter.
135 *
136 * @return the number of character sequences in the filter (but see {@link #contains(CharSequence)}).
137 */
138
139 public int size() {
140 return size;
141 }
142
143 /*** Hashes the given sequence with the given hash function.
144 *
145 * @param s a character sequence.
146 * @param l the length of <code>s</code>.
147 * @param k a hash function index (smaller than {@link #d}).
148 * @return the position in the filter corresponding to <code>s</code> for the hash function <code>k</code>.
149 */
150 private long hash( final CharSequence s, final int l, final int k ) {
151 final int[] w = weight[ k ];
152 int h = 0, i = l;
153 while( i-- != 0 ) h ^= s.charAt( i ) * w[ i % NUMBER_OF_WEIGHTS ];
154 return ((long)h-Integer.MIN_VALUE) % m;
155 }
156
157 /*** Checks whether the given character sequence is in this filter.
158 *
159 * <P>Note that this method may return true on a character sequence that is has
160 * not been added to the filter. This will happen with probability 2<sub>-<var>d</var></sub>,
161 * where <var>d</var> is the number of hash functions specified at creation time, if
162 * the number of the elements in the filter is less than <var>n</var>, the number
163 * of expected elements specified at creation time.
164 *
165 * @param s a character sequence.
166 * @return true if the sequence is in the filter (or if a sequence with the
167 * same hash sequence is in the filter).
168 */
169
170 public boolean contains( final CharSequence s ) {
171 int i = d, l = s.length();
172 while( i-- != 0 ) if ( ! getBit( hash( s, l, i ) ) ) return false;
173 return true;
174 }
175
176 /*** Adds a character sequence to the filter.
177 *
178 * @param s a character sequence.
179 * @return true if the character sequence was not in the filter (but see {@link #contains(CharSequence)}).
180 */
181
182 public boolean add( final CharSequence s ) {
183 boolean result = false;
184 int i = d, l = s.length();
185 long h;
186 while( i-- != 0 ) {
187 h = hash( s, l, i );
188 if ( ! setGetBit( h ) ) result = true;
189 }
190 if ( result ) size++;
191 return result;
192 }
193
194 protected final static long ADDRESS_BITS_PER_UNIT = 5;
195 protected final static long BIT_INDEX_MASK = 31;
196
197 /***
198 * Returns from the local bitvector the value of the bit with
199 * the specified index. The value is <tt>true</tt> if the bit
200 * with the index <tt>bitIndex</tt> is currently set; otherwise,
201 * returns <tt>false</tt>.
202 *
203 * (adapted from cern.colt.bitvector.QuickBitVector)
204 *
205 * @param bitIndex the bit index.
206 * @return the value of the bit with the specified index.
207 */
208 protected boolean getBit(long bitIndex) {
209 long intIndex = (bitIndex >>> ADDRESS_BITS_PER_UNIT);
210 return ((bits[(int)(intIndex / ONE_MB_INTS)][(int)(intIndex % ONE_MB_INTS)]
211 & (1 << (bitIndex & BIT_INDEX_MASK))) != 0);
212 }
213
214 /***
215 * Changes the bit with index <tt>bitIndex</tt> in local bitvector.
216 *
217 * (adapted from cern.colt.bitvector.QuickBitVector)
218 *
219 * @param bitIndex the index of the bit to be set.
220 */
221 protected void setBit(long bitIndex) {
222 long intIndex = (bitIndex >>> ADDRESS_BITS_PER_UNIT);
223 bits[(int)(intIndex / ONE_MB_INTS)][(int)(intIndex % ONE_MB_INTS)]
224 |= 1 << (bitIndex & BIT_INDEX_MASK);
225 }
226
227 /***
228 * Sets the bit with index <tt>bitIndex</tt> in local bitvector --
229 * returning the old value.
230 *
231 * (adapted from cern.colt.bitvector.QuickBitVector)
232 *
233 * @param bitIndex the index of the bit to be set.
234 */
235 protected boolean setGetBit(long bitIndex) {
236 long intIndex = (int) (bitIndex >>> ADDRESS_BITS_PER_UNIT);
237 int a = (int)(intIndex / ONE_MB_INTS);
238 int b = (int)(intIndex % ONE_MB_INTS);
239 int mask = 1 << (bitIndex & BIT_INDEX_MASK);
240 boolean ret = ((bits[a][b] & (mask)) != 0);
241 bits[a][b] |= mask;
242 return ret;
243 }
244
245
246
247
248 public long getSizeBytes() {
249 return bits.length*bits[0].length*4;
250 }
251 }