Browse Source

Add `DeriveSize()` to assist in sizing the bloom filter.

Yawning Angel 1 year ago
parent
commit
b6eb2cf6d2
2 changed files with 31 additions and 6 deletions
  1. 24 2
      bloom.go
  2. 7 4
      bloom_test.go

+ 24 - 2
bloom.go

@@ -21,7 +21,10 @@ import (
 	"github.com/dchest/siphash"
 )
 
-const maxNrHashes = 32
+const (
+	maxNrHashes = 32                  // upper limit on the hash count; presumably enforced in New() (check not visible in this hunk)
+	ln2         = 0.69314718055994529 // natural logarithm of 2
+)
 
 // Filter is a bloom filter.
 type Filter struct {
@@ -35,11 +38,26 @@ type Filter struct {
 	nrEntries    int
 }
 
+// DeriveSize returns the size of a filter in bits, expressed as a base-2
+// exponent (a return value of k means 2^k bits), required to hold at least
+// n entries with a false positive rate of p.
+//
+// The returned value is never negative, and is suitable for use as the
+// mLn2 parameter passed to New().  Note that New() still rejects values
+// that exceed its own upper limit.
+//
+// DeriveSize panics if n is not positive, or if p is NaN or outside of the
+// open interval (0, 1).
+func DeriveSize(n int, p float64) int {
+	if n <= 0 {
+		panic("non-positive number of entries")
+	}
+	// NaN compares false against everything, so it must be rejected
+	// explicitly or it would slip past the range check below.
+	if math.IsNaN(p) || p <= 0.0 || p >= 1.0 {
+		panic("invalid false positive rate")
+	}
+
+	// Optimal bloom filter size: m = -n * ln(p) / (ln 2)^2.  The divisor
+	// ln(1 / 2^(ln 2)) is equal to -(ln 2)^2.
+	m := (float64(n) * math.Log(p)) / math.Log(1.0/math.Pow(2.0, ln2))
+
+	// Round up to the next power of 2.  When m < 1, Log2(m) is negative;
+	// clamp to 0 (a 1 bit filter) since a negative exponent is not a
+	// meaningful size.
+	mLn2 := int(math.Ceil(math.Log2(m)))
+	if mLn2 < 0 {
+		return 0
+	}
+	return mLn2
+}
+
 // New constructs a new Filter with a filter set size 2^mLn2 bits, and false
 // positive rate p.
 func New(rand io.Reader, mLn2 int, p float64) (*Filter, error) {
 	const (
-		ln2         = 0.69314718055994529
 		ln2Sq       = 0.48045301391820139
 		maxMln2     = strconv.IntSize - 1
 		maxHeapSize = 1 << 39 // 512 GiB
@@ -50,6 +68,10 @@ func New(rand io.Reader, mLn2 int, p float64) (*Filter, error) {
 		return nil, err
 	}
 
+	if p <= 0.0 || p >= 1.0 {
+		return nil, fmt.Errorf("invalid false positive rate: %v", p)
+	}
+
 	if mLn2 > maxMln2 {
 		return nil, fmt.Errorf("requested filter too large: %d", mLn2)
 	}

+ 7 - 4
bloom_test.go

@@ -23,6 +23,9 @@ func TestFilter(t *testing.T) {
 		entryLength       = 32
 		falsePositiveRate = 0.01
 		filterSize        = 15 // 2^15 bits = 4 KiB
+
+		expectedEntries = 3418
+		expectedHashes  = 7
 	)
 
 	assert := assert.New(t)
@@ -34,10 +37,10 @@ func TestFilter(t *testing.T) {
 	assert.Equal(0, f.Entries(), "Entries(), empty filter")
 	assert.Equal(4096, len(f.b), "Backing store size")
 
-	// I could assert on these since the values calculated by New() are
-	// supposed to be optimal, but I won't for now.
-	t.Logf("Hashes: %v", f.nrHashes)         // 7 hashes is "ideal".
-	t.Logf("MaxEntries: %v", f.MaxEntries()) // 3418 entries with these params.
+	// Assert that the bloom filter math is correct.
+	assert.Equal(expectedEntries, f.MaxEntries(), "Max entries")
+	assert.Equal(expectedHashes, f.nrHashes, "Hashes")
+	assert.Equal(filterSize, DeriveSize(f.MaxEntries(), falsePositiveRate), "DeriveSize()")
 
 	// Generate enough entries to fully saturate the filter.
 	max := f.MaxEntries()