bloom.go 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167
  1. // bloom.go - Bloom filter.
  2. // Written in 2015 by Yawning Angel
  3. //
  4. // To the extent possible under law, the author(s) have dedicated all copyright
  5. // and related and neighboring rights to this software to the public domain
  6. // worldwide. This software is distributed without any warranty.
  7. //
  8. // You should have received a copy of the CC0 Public Domain Dedication along
  9. // with this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
  10. // Package bloom implements a Bloom Filter.
  11. package bloom
  12. import (
  13. "encoding/binary"
  14. "fmt"
  15. "io"
  16. "math"
  17. "strconv"
  18. "github.com/dchest/siphash"
  19. )
  20. const (
  21. maxNrHashes = 32
  22. ln2 = 0.69314718055994529
  23. )
  24. // Filter is a bloom filter.
  25. type Filter struct {
  26. b []byte
  27. hashMask uint64
  28. k1, k2 uint64
  29. nrHashes int
  30. nrEntriesMax int
  31. nrEntries int
  32. }
  33. // DeriveSize returns the size of a filter (as a power of 2) in bits,
  34. // required to hold at least n entries with a p false positive rate.
  35. //
  36. // The returned value is directly suitable for use as the mLn2 parameter
  37. // passed to New().
  38. func DeriveSize(n int, p float64) int {
  39. if n <= 0 {
  40. panic("negative number of entries")
  41. }
  42. if p <= 0.0 || p >= 1.0 {
  43. panic("invalid false positive rate")
  44. }
  45. m := (float64(n) * math.Log(p)) / math.Log(1.0/math.Pow(2.0, ln2))
  46. return int(math.Ceil(math.Log2(m)))
  47. }
  48. // New constructs a new Filter with a filter set size 2^mLn2 bits, and false
  49. // postive rate p.
  50. func New(rand io.Reader, mLn2 int, p float64) (*Filter, error) {
  51. const (
  52. ln2Sq = 0.48045301391820139
  53. maxMln2 = strconv.IntSize - 1
  54. )
  55. var key [16]byte
  56. if _, err := io.ReadFull(rand, key[:]); err != nil {
  57. return nil, err
  58. }
  59. if p <= 0.0 || p >= 1.0 {
  60. return nil, fmt.Errorf("invalid false positive rate: %v", p)
  61. }
  62. if mLn2 > maxMln2 {
  63. return nil, fmt.Errorf("requested filter too large: %d", mLn2)
  64. }
  65. m := 1 << uint64(mLn2)
  66. n := -1.0 * float64(m) * ln2Sq / math.Log(p)
  67. k := int((float64(m) * ln2 / n) + 0.5)
  68. if uint64(n) > (1 << uint(maxMln2)) {
  69. return nil, fmt.Errorf("requested filter too large (nrEntriesMax overflow): %d", mLn2)
  70. }
  71. f := new(Filter)
  72. f.k1 = binary.BigEndian.Uint64(key[0:8])
  73. f.k2 = binary.BigEndian.Uint64(key[8:16])
  74. f.nrEntriesMax = int(n)
  75. f.nrHashes = k
  76. f.hashMask = uint64(m - 1)
  77. if f.nrHashes < 2 {
  78. f.nrHashes = 2
  79. } else if f.nrHashes > maxNrHashes {
  80. return nil, fmt.Errorf("requested parameters need too many hashes")
  81. }
  82. f.b = make([]byte, m/8)
  83. return f, nil
  84. }
  85. // MaxEntries returns the maximum capacity of the Filter.
  86. func (f *Filter) MaxEntries() int {
  87. return f.nrEntriesMax
  88. }
  89. // Entries returns the number of entries that have been inserted into the
  90. // Filter.
  91. func (f *Filter) Entries() int {
  92. return f.nrEntries
  93. }
  94. // TestAndSet tests the Filter for a given value's membership, adds the value
  95. // to the filter, and returns true iff it was present at the time of the call.
  96. func (f *Filter) TestAndSet(b []byte) bool {
  97. var hashes [maxNrHashes]uint64
  98. f.getHashes(b, &hashes)
  99. // Just return true iff the entry is present.
  100. if f.test(&hashes) {
  101. return true
  102. }
  103. // Add and return false.
  104. f.add(&hashes)
  105. f.nrEntries++
  106. return false
  107. }
  108. // Test tests the Filter for a given value's membership and returns true iff
  109. // it is present (or a false positive).
  110. func (f *Filter) Test(b []byte) bool {
  111. var hashes [maxNrHashes]uint64
  112. f.getHashes(b, &hashes)
  113. return f.test(&hashes)
  114. }
  115. func (f *Filter) getHashes(b []byte, hashes *[maxNrHashes]uint64) {
  116. // Per "Less Hashing, Same Performance: Building a Better Bloom Filter"
  117. // (Kirsch and Miteznmacher), with a suitably good PRF, only two calls to
  118. // the hash algorithm are needed. This is done with the "experimental"
  119. // 128 bit digest variant of SipHash, split into 2 64 bit unsigned
  120. // integers.
  121. hashes[0], hashes[1] = siphash.Hash128(f.k1, f.k2, b)
  122. for i := 2; i < f.nrHashes; i++ {
  123. hashes[i] = hashes[0] + uint64(i)*hashes[1]
  124. }
  125. }
  126. func (f *Filter) test(hashes *[maxNrHashes]uint64) bool {
  127. for i := 0; i < f.nrHashes; i++ {
  128. idx := hashes[i] & f.hashMask
  129. if 0 == f.b[idx/8]&(1<<(idx&7)) {
  130. // Break out early if there is a miss.
  131. return false
  132. }
  133. }
  134. return true
  135. }
  136. func (f *Filter) add(hashes *[maxNrHashes]uint64) {
  137. for i := 0; i < f.nrHashes; i++ {
  138. idx := hashes[i] & f.hashMask
  139. f.b[idx/8] |= (1 << (idx & 7))
  140. }
  141. }