mirror of
https://github.com/dashpay/dash.git
synced 2024-12-26 12:32:48 +01:00
Switch to a more efficient rolling Bloom filter
For each 'bit' in the filter we really maintain 2 bits, which store either 0 (not set) or 1-3 (set in generation N). After (nElements / 2) insertions, we switch to a new generation and wipe the entries which already had the new generation number, effectively switching from the last 1.5 * nElements set to the last 1.0 * nElements set. This is 25% more space efficient than the previous implementation, and can (at peak) store 1.5 times the requested amount of history (though only 1.0 times the requested history is guaranteed). The existing unit tests should be sufficient.
This commit is contained in:
parent
92aa7311d6
commit
086ee67d83
@ -216,30 +216,54 @@ void CBloomFilter::UpdateEmptyFull()
|
|||||||
isEmpty = empty;
|
isEmpty = empty;
|
||||||
}
|
}
|
||||||
|
|
||||||
CRollingBloomFilter::CRollingBloomFilter(unsigned int nElements, double fpRate) :
|
CRollingBloomFilter::CRollingBloomFilter(unsigned int nElements, double fpRate)
|
||||||
b1(nElements * 2, fpRate, 0), b2(nElements * 2, fpRate, 0)
|
|
||||||
{
|
{
|
||||||
// Implemented using two bloom filters of 2 * nElements each.
|
double logFpRate = log(fpRate);
|
||||||
// We fill them up, and clear them, staggered, every nElements
|
/* The optimal number of hash functions is log(fpRate) / log(0.5), but
|
||||||
// inserted, so at least one always contains the last nElements
|
* restrict it to the range 1-50. */
|
||||||
// inserted.
|
nHashFuncs = std::max(1, std::min((int)round(logFpRate / log(0.5)), 50));
|
||||||
nInsertions = 0;
|
/* In this rolling bloom filter, we'll store between 2 and 3 generations of nElements / 2 entries. */
|
||||||
nBloomSize = nElements * 2;
|
nEntriesPerGeneration = (nElements + 1) / 2;
|
||||||
|
uint32_t nMaxElements = nEntriesPerGeneration * 3;
|
||||||
|
/* The maximum fpRate = pow(1.0 - exp(-nHashFuncs * nMaxElements / nFilterBits), nHashFuncs)
|
||||||
|
* => pow(fpRate, 1.0 / nHashFuncs) = 1.0 - exp(-nHashFuncs * nMaxElements / nFilterBits)
|
||||||
|
* => 1.0 - pow(fpRate, 1.0 / nHashFuncs) = exp(-nHashFuncs * nMaxElements / nFilterBits)
|
||||||
|
* => log(1.0 - pow(fpRate, 1.0 / nHashFuncs)) = -nHashFuncs * nMaxElements / nFilterBits
|
||||||
|
* => nFilterBits = -nHashFuncs * nMaxElements / log(1.0 - pow(fpRate, 1.0 / nHashFuncs))
|
||||||
|
* => nFilterBits = -nHashFuncs * nMaxElements / log(1.0 - exp(logFpRate / nHashFuncs))
|
||||||
|
*/
|
||||||
|
uint32_t nFilterBits = (uint32_t)ceil(-1.0 * nHashFuncs * nMaxElements / log(1.0 - exp(logFpRate / nHashFuncs)));
|
||||||
|
data.clear();
|
||||||
|
/* We store up to 16 'bits' per data element. */
|
||||||
|
data.resize((nFilterBits + 15) / 16);
|
||||||
reset();
|
reset();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Similar to CBloomFilter::Hash */
|
||||||
|
inline unsigned int CRollingBloomFilter::Hash(unsigned int nHashNum, const std::vector<unsigned char>& vDataToHash) const {
|
||||||
|
return MurmurHash3(nHashNum * 0xFBA4C795 + nTweak, vDataToHash) % (data.size() * 16);
|
||||||
|
}
|
||||||
|
|
||||||
void CRollingBloomFilter::insert(const std::vector<unsigned char>& vKey)
|
void CRollingBloomFilter::insert(const std::vector<unsigned char>& vKey)
|
||||||
{
|
{
|
||||||
if (nInsertions == 0) {
|
if (nEntriesThisGeneration == nEntriesPerGeneration) {
|
||||||
b1.clear();
|
nEntriesThisGeneration = 0;
|
||||||
} else if (nInsertions == nBloomSize / 2) {
|
nGeneration++;
|
||||||
b2.clear();
|
if (nGeneration == 4) {
|
||||||
|
nGeneration = 1;
|
||||||
|
}
|
||||||
|
/* Wipe old entries that used this generation number. */
|
||||||
|
for (uint32_t p = 0; p < data.size() * 16; p++) {
|
||||||
|
if (get(p) == nGeneration) {
|
||||||
|
put(p, 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
b1.insert(vKey);
|
nEntriesThisGeneration++;
|
||||||
b2.insert(vKey);
|
|
||||||
if (++nInsertions == nBloomSize) {
|
for (int n = 0; n < nHashFuncs; n++) {
|
||||||
nInsertions = 0;
|
uint32_t h = Hash(n, vKey);
|
||||||
|
put(h, nGeneration);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -251,10 +275,13 @@ void CRollingBloomFilter::insert(const uint256& hash)
|
|||||||
|
|
||||||
bool CRollingBloomFilter::contains(const std::vector<unsigned char>& vKey) const
|
bool CRollingBloomFilter::contains(const std::vector<unsigned char>& vKey) const
|
||||||
{
|
{
|
||||||
if (nInsertions < nBloomSize / 2) {
|
for (int n = 0; n < nHashFuncs; n++) {
|
||||||
return b2.contains(vKey);
|
uint32_t h = Hash(n, vKey);
|
||||||
|
if (get(h) == 0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return b1.contains(vKey);
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool CRollingBloomFilter::contains(const uint256& hash) const
|
bool CRollingBloomFilter::contains(const uint256& hash) const
|
||||||
@ -265,8 +292,10 @@ bool CRollingBloomFilter::contains(const uint256& hash) const
|
|||||||
|
|
||||||
void CRollingBloomFilter::reset()
|
void CRollingBloomFilter::reset()
|
||||||
{
|
{
|
||||||
unsigned int nNewTweak = GetRand(std::numeric_limits<unsigned int>::max());
|
nTweak = GetRand(std::numeric_limits<unsigned int>::max());
|
||||||
b1.reset(nNewTweak);
|
nEntriesThisGeneration = 0;
|
||||||
b2.reset(nNewTweak);
|
nGeneration = 1;
|
||||||
nInsertions = 0;
|
for (std::vector<uint32_t>::iterator it = data.begin(); it != data.end(); it++) {
|
||||||
|
*it = 0;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
26
src/bloom.h
26
src/bloom.h
@ -110,8 +110,11 @@ public:
|
|||||||
* reset() is provided, which also changes nTweak to decrease the impact of
|
* reset() is provided, which also changes nTweak to decrease the impact of
|
||||||
* false-positives.
|
* false-positives.
|
||||||
*
|
*
|
||||||
* contains(item) will always return true if item was one of the last N things
|
* contains(item) will always return true if item was one of the last N to 1.5*N
|
||||||
* insert()'ed ... but may also return true for items that were not inserted.
|
* insert()'ed ... but may also return true for items that were not inserted.
|
||||||
|
*
|
||||||
|
* It needs around 1.8 bytes per element per factor 0.1 of false positive rate.
|
||||||
|
* (More accurately: 3/(log(256)*log(2)) * log(1/fpRate) * nElements bytes)
|
||||||
*/
|
*/
|
||||||
class CRollingBloomFilter
|
class CRollingBloomFilter
|
||||||
{
|
{
|
||||||
@ -129,10 +132,23 @@ public:
|
|||||||
void reset();
|
void reset();
|
||||||
|
|
||||||
private:
|
private:
|
||||||
unsigned int nBloomSize;
|
int nEntriesPerGeneration;
|
||||||
unsigned int nInsertions;
|
int nEntriesThisGeneration;
|
||||||
CBloomFilter b1, b2;
|
int nGeneration;
|
||||||
|
std::vector<uint32_t> data;
|
||||||
|
unsigned int nTweak;
|
||||||
|
int nHashFuncs;
|
||||||
|
|
||||||
|
unsigned int Hash(unsigned int nHashNum, const std::vector<unsigned char>& vDataToHash) const;
|
||||||
|
|
||||||
|
inline int get(uint32_t position) const {
|
||||||
|
return (data[(position >> 4) % data.size()] >> (2 * (position & 0xF))) & 0x3;
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void put(uint32_t position, uint32_t val) {
|
||||||
|
uint32_t& cell = data[(position >> 4) % data.size()];
|
||||||
|
cell = (cell & ~(((uint32_t)3) << (2 * (position & 0xF)))) | (val << (2 * (position & 0xF)));
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
#endif // BITCOIN_BLOOM_H
|
#endif // BITCOIN_BLOOM_H
|
||||||
|
@ -180,7 +180,7 @@ namespace {
|
|||||||
* million to make it highly unlikely for users to have issues with this
|
* million to make it highly unlikely for users to have issues with this
|
||||||
* filter.
|
* filter.
|
||||||
*
|
*
|
||||||
* Memory used: 1.7MB
|
* Memory used: 1.3 MB
|
||||||
*/
|
*/
|
||||||
boost::scoped_ptr<CRollingBloomFilter> recentRejects;
|
boost::scoped_ptr<CRollingBloomFilter> recentRejects;
|
||||||
uint256 hashRecentRejectsChainTip;
|
uint256 hashRecentRejectsChainTip;
|
||||||
|
Loading…
Reference in New Issue
Block a user