fix: Improve quorum data caching and cleanup (#5731)

## Issue being fixed or feature implemented

## What was done?

## How Has This Been Tested?

## Breaking Changes

## Checklist:
- [x] I have performed a self-review of my own code
- [x] I have commented my code, particularly in hard-to-understand areas
- [ ] I have added or updated relevant unit/integration/functional/e2e tests
- [ ] I have made corresponding changes to the documentation
- [x] I have assigned this pull request to a milestone _(for repository code-owners and collaborators only)_

---------

Co-authored-by: PastaPastaPasta <6443210+PastaPastaPasta@users.noreply.github.com>
2024-12-25 03:52:49 +01:00 · 2023-11-29 17:17:58 +03:00 · 2023-11-29 17:17:58 +03:00 · 4092abc10d
commit 4092abc10d
parent 96d4a30510
5 changed files with 67 additions and 31 deletions
--- a/src/llmq/params.h
+++ b/src/llmq/params.h
@ -104,6 +104,13 @@ struct LLMQParams {
    // For rotated quorums it should be equal to 2 x active quorums set.
    int keepOldConnections;

+    // The number of quorums for which we should keep keys. Usually it's equal to keepOldConnections.
+    // Unlike for other quorum types, we want to keep data (secret key shares and vvec)
+    // for Platform quorums for much longer, because Platform can be restarted and
+    // it must then still be able to re-sign using that data.
+
+    int keepOldKeys;
+
    // How many members should we try to send all sigShares to before we give up.
    int recoveryMembers;
 };
@ -138,6 +145,7 @@ static constexpr std::array<LLMQParams, 14> available_llmqs = {
        .signingActiveQuorumCount = 2, // just a few ones to allow easier testing

        .keepOldConnections = 3,
+        .keepOldKeys = 3,
        .recoveryMembers = 3,
    },

@ -163,6 +171,7 @@ static constexpr std::array<LLMQParams, 14> available_llmqs = {
        .signingActiveQuorumCount = 2, // just a few ones to allow easier testing

        .keepOldConnections = 3,
+        .keepOldKeys = 3,
        .recoveryMembers = 3,
    },

@ -188,6 +197,7 @@ static constexpr std::array<LLMQParams, 14> available_llmqs = {
        .signingActiveQuorumCount = 2, // just a few ones to allow easier testing

        .keepOldConnections = 3,
+        .keepOldKeys = 3,
        .recoveryMembers = 3,
    },

@ -213,6 +223,7 @@ static constexpr std::array<LLMQParams, 14> available_llmqs = {
        .signingActiveQuorumCount = 2, // just a few ones to allow easier testing

        .keepOldConnections = 4,
+        .keepOldKeys = 4,
        .recoveryMembers = 3,
    },

@ -238,6 +249,7 @@ static constexpr std::array<LLMQParams, 14> available_llmqs = {
        .signingActiveQuorumCount = 2, // just a few ones to allow easier testing

        .keepOldConnections = 4,
+        .keepOldKeys = 24 * 30 * 2, // 2 months of quorums
        .recoveryMembers = 3,
    },

@ -263,6 +275,7 @@ static constexpr std::array<LLMQParams, 14> available_llmqs = {
        .signingActiveQuorumCount = 4, // just a few ones to allow easier testing

        .keepOldConnections = 5,
+        .keepOldKeys = 5,
        .recoveryMembers = 6,
    },

@ -288,6 +301,7 @@ static constexpr std::array<LLMQParams, 14> available_llmqs = {
        .signingActiveQuorumCount = 2, // just a few ones to allow easier testing

        .keepOldConnections = 4,
+        .keepOldKeys = 4,
        .recoveryMembers = 4,
    },

@ -313,6 +327,7 @@ static constexpr std::array<LLMQParams, 14> available_llmqs = {
        .signingActiveQuorumCount = 4, // just a few ones to allow easier testing

        .keepOldConnections = 5,
+        .keepOldKeys = 24 * 30 * 2, // 2 months of quorums
        .recoveryMembers = 6,
    },

@ -338,6 +353,7 @@ static constexpr std::array<LLMQParams, 14> available_llmqs = {

        .signingActiveQuorumCount = 24, // a full day worth of LLMQs
        .keepOldConnections = 25,
+        .keepOldKeys = 25,
        .recoveryMembers = 25,
    },

@ -363,6 +379,7 @@ static constexpr std::array<LLMQParams, 14> available_llmqs = {

        .signingActiveQuorumCount = 32,
        .keepOldConnections = 64,
+        .keepOldKeys = 64,
        .recoveryMembers = 25,
    },

@ -389,6 +406,7 @@ static constexpr std::array<LLMQParams, 14> available_llmqs = {
        .signingActiveQuorumCount = 4, // two days worth of LLMQs

        .keepOldConnections = 5,
+        .keepOldKeys = 5,
        .recoveryMembers = 100,
    },

@ -416,6 +434,7 @@ static constexpr std::array<LLMQParams, 14> available_llmqs = {
        .signingActiveQuorumCount = 4, // four days worth of LLMQs

        .keepOldConnections = 5,
+        .keepOldKeys = 5,
        .recoveryMembers = 100,
    },

@ -443,6 +462,7 @@ static constexpr std::array<LLMQParams, 14> available_llmqs = {
        .signingActiveQuorumCount = 24, // a full day worth of LLMQs

        .keepOldConnections = 25,
+        .keepOldKeys = 24 * 30 * 2, // 2 months of quorums
        .recoveryMembers = 50,
    },

@ -470,6 +490,7 @@ static constexpr std::array<LLMQParams, 14> available_llmqs = {
        .signingActiveQuorumCount = 24, // a full day worth of LLMQs

        .keepOldConnections = 25,
+        .keepOldKeys = 24 * 30 * 2, // 2 months of quorums
        .recoveryMembers = 12,
    },

--- a/src/llmq/quorums.cpp
+++ b/src/llmq/quorums.cpp
@ -200,8 +200,8 @@ CQuorumManager::CQuorumManager(CBLSWorker& _blsWorker, CChainState& chainstate,
    m_mn_sync(mn_sync),
    m_peerman(peerman)
 {
-    utils::InitQuorumsCache(mapQuorumsCache);
-    utils::InitQuorumsCache(scanQuorumsCache);
+    utils::InitQuorumsCache(mapQuorumsCache, false);
+    utils::InitQuorumsCache(scanQuorumsCache, false);

    quorumThreadInterrupt.reset();
 }
@ -296,7 +296,7 @@ void CQuorumManager::UpdatedBlockTip(const CBlockIndex* pindexNew, bool fInitial
    }

    TriggerQuorumDataRecoveryThreads(pindexNew);
-    CleanupOldQuorumData(pindexNew);
+    StartCleanupOldQuorumDataThread(pindexNew);
 }

 void CQuorumManager::CheckQuorumConnections(const Consensus::LLMQParams& llmqParams, const CBlockIndex* pindexNew) const
@ -956,7 +956,7 @@ void CQuorumManager::StartQuorumDataRecoveryThread(const CQuorumCPtr pQuorum, co
    });
 }

-static void DataCleanupHelper(CDBWrapper& db, std::set<uint256> skip_list)
+static void DataCleanupHelper(CDBWrapper& db, std::set<uint256> skip_list, bool compact = false)
 {
    const auto prefixes = {DB_QUORUM_QUORUM_VVEC, DB_QUORUM_SK_SHARE};

@ -990,39 +990,54 @@ static void DataCleanupHelper(CDBWrapper& db, std::set<uint256> skip_list)

        db.WriteBatch(batch);

-        LogPrint(BCLog::LLMQ, "CQuorumManager::%d -- %s removed %d\n", __func__, prefix, count);
+        LogPrint(BCLog::LLMQ, "CQuorumManager::%s -- %s removed %d\n", __func__, prefix, count);
    }

    pcursor.reset();
+
+    if (compact) {
+        // Avoid using this on regular cleanups; use it on db migrations only
+        LogPrint(BCLog::LLMQ, "CQuorumManager::%s -- compact start\n", __func__);
        db.CompactFull();
+        LogPrint(BCLog::LLMQ, "CQuorumManager::%s -- compact end\n", __func__);
+    }
 }

-void CQuorumManager::CleanupOldQuorumData(const CBlockIndex* pIndex) const
+void CQuorumManager::StartCleanupOldQuorumDataThread(const CBlockIndex* pIndex) const
 {
-    if (!fMasternodeMode || pIndex == nullptr || (pIndex->nHeight % 576 != 0)) {
+    // Note: this function is CPU-heavy and we don't want it to be running during DKGs.
+    // The largest dkgMiningWindowStart for a related quorum type is 42 (LLMQ_60_75).
+    // At the same time, most quorums use dkgInterval = 24, so the next DKG for them
+    // (after block 576 + 42) will start at block 576 + 24 * 2. That's only a 6-block
+    // window and it's better to have more room, so we pick the next cycle.
+    // dkgMiningWindowStart for small quorums is 10, i.e. a safe block to start
+    // these calculations is at height 576 + 24 * 2 + 10 = 576 + 58.
+    if (!fMasternodeMode || pIndex == nullptr || (pIndex->nHeight % 576 != 58)) {
        return;
    }

+    cxxtimer::Timer t(/*start=*/ true);
+    LogPrint(BCLog::LLMQ, "CQuorumManager::%s -- start\n", __func__);
+
+    // do not block the caller thread
+    workerPool.push([pIndex, t, this](int threadId) {
        std::set<uint256> dbKeysToSkip;

-    LogPrint(BCLog::LLMQ, "CQuorumManager::%d -- start\n", __func__);
-    // Platform quorums in all networks are created every 24 blocks (~1h).
-    // Unlike for other quorum types we want to keep data (secret key shares and vvec)
-    // for Platform quorums for at least 2 months because Platform can be restarted and
-    // it must be able to re-sign stuff. During a month, 24 * 30 quorums are created.
-    constexpr auto numPlatformQuorumsDataToKeep = 24 * 30 * 2;
-
        for (const auto& params : Params().GetConsensus().llmqs) {
-        auto nQuorumsToKeep = params.type == Params().GetConsensus().llmqTypePlatform ? numPlatformQuorumsDataToKeep : params.keepOldConnections;
-        const auto vecQuorums = ScanQuorums(params.type, pIndex, nQuorumsToKeep);
-        for (const auto& pQuorum : vecQuorums) {
+            if (quorumThreadInterrupt) {
+                break;
+            }
+            for (const auto& pQuorum : ScanQuorums(params.type, pIndex, params.keepOldKeys)) {
                dbKeysToSkip.insert(MakeQuorumKey(*pQuorum));
            }
        }

+        if (!quorumThreadInterrupt) {
            DataCleanupHelper(m_evoDb.GetRawDB(), dbKeysToSkip);
+        }

-    LogPrint(BCLog::LLMQ, "CQuorumManager::%d -- done\n", __func__);
+        LogPrint(BCLog::LLMQ, "CQuorumManager::StartCleanupOldQuorumDataThread -- done. time=%d\n", t.count());
+    });
 }

 } // namespace llmq
--- a/src/llmq/quorums.h
+++ b/src/llmq/quorums.h
@ -277,7 +277,7 @@ private:
    void StartCachePopulatorThread(const CQuorumCPtr pQuorum) const;
    void StartQuorumDataRecoveryThread(const CQuorumCPtr pQuorum, const CBlockIndex* pIndex, uint16_t nDataMask) const;

-    void CleanupOldQuorumData(const CBlockIndex* pIndex) const;
+    void StartCleanupOldQuorumDataThread(const CBlockIndex* pIndex) const;
 };

 extern std::unique_ptr<CQuorumManager> quorumManager;
--- a/src/llmq/utils.cpp
+++ b/src/llmq/utils.cpp
@ -1110,17 +1110,17 @@ std::map<Consensus::LLMQType, QvvecSyncMode> GetEnabledQuorumVvecSyncEntries()
 }

 template <typename CacheType>
-void InitQuorumsCache(CacheType& cache)
+void InitQuorumsCache(CacheType& cache, bool limit_by_connections)
 {
    for (const auto& llmq : Params().GetConsensus().llmqs) {
        cache.emplace(std::piecewise_construct, std::forward_as_tuple(llmq.type),
-                      std::forward_as_tuple(llmq.keepOldConnections));
+                      std::forward_as_tuple(limit_by_connections ? llmq.keepOldConnections : llmq.keepOldKeys));
    }
 }
-template void InitQuorumsCache<std::map<Consensus::LLMQType, unordered_lru_cache<uint256, bool, StaticSaltedHasher>>>(std::map<Consensus::LLMQType, unordered_lru_cache<uint256, bool, StaticSaltedHasher>>& cache);
-template void InitQuorumsCache<std::map<Consensus::LLMQType, unordered_lru_cache<uint256, std::vector<CQuorumCPtr>, StaticSaltedHasher>>>(std::map<Consensus::LLMQType, unordered_lru_cache<uint256, std::vector<CQuorumCPtr>, StaticSaltedHasher>>& cache);
-template void InitQuorumsCache<std::map<Consensus::LLMQType, unordered_lru_cache<uint256, std::shared_ptr<llmq::CQuorum>, StaticSaltedHasher, 0ul, 0ul>, std::less<Consensus::LLMQType>, std::allocator<std::pair<Consensus::LLMQType const, unordered_lru_cache<uint256, std::shared_ptr<llmq::CQuorum>, StaticSaltedHasher, 0ul, 0ul>>>>>(std::map<Consensus::LLMQType, unordered_lru_cache<uint256, std::shared_ptr<llmq::CQuorum>, StaticSaltedHasher, 0ul, 0ul>, std::less<Consensus::LLMQType>, std::allocator<std::pair<Consensus::LLMQType const, unordered_lru_cache<uint256, std::shared_ptr<llmq::CQuorum>, StaticSaltedHasher, 0ul, 0ul>>>>&);
-template void InitQuorumsCache<std::map<Consensus::LLMQType, unordered_lru_cache<uint256, int, StaticSaltedHasher>>>(std::map<Consensus::LLMQType, unordered_lru_cache<uint256, int, StaticSaltedHasher>>& cache);
+template void InitQuorumsCache<std::map<Consensus::LLMQType, unordered_lru_cache<uint256, bool, StaticSaltedHasher>>>(std::map<Consensus::LLMQType, unordered_lru_cache<uint256, bool, StaticSaltedHasher>>& cache, bool limit_by_connections);
+template void InitQuorumsCache<std::map<Consensus::LLMQType, unordered_lru_cache<uint256, std::vector<CQuorumCPtr>, StaticSaltedHasher>>>(std::map<Consensus::LLMQType, unordered_lru_cache<uint256, std::vector<CQuorumCPtr>, StaticSaltedHasher>>& cache, bool limit_by_connections);
+template void InitQuorumsCache<std::map<Consensus::LLMQType, unordered_lru_cache<uint256, std::shared_ptr<llmq::CQuorum>, StaticSaltedHasher, 0ul, 0ul>, std::less<Consensus::LLMQType>, std::allocator<std::pair<Consensus::LLMQType const, unordered_lru_cache<uint256, std::shared_ptr<llmq::CQuorum>, StaticSaltedHasher, 0ul, 0ul>>>>>(std::map<Consensus::LLMQType, unordered_lru_cache<uint256, std::shared_ptr<llmq::CQuorum>, StaticSaltedHasher, 0ul, 0ul>, std::less<Consensus::LLMQType>, std::allocator<std::pair<Consensus::LLMQType const, unordered_lru_cache<uint256, std::shared_ptr<llmq::CQuorum>, StaticSaltedHasher, 0ul, 0ul>>>>&cache, bool limit_by_connections);
+template void InitQuorumsCache<std::map<Consensus::LLMQType, unordered_lru_cache<uint256, int, StaticSaltedHasher>>>(std::map<Consensus::LLMQType, unordered_lru_cache<uint256, int, StaticSaltedHasher>>& cache, bool limit_by_connections);

 } // namespace utils

--- a/src/llmq/utils.h
+++ b/src/llmq/utils.h
@ -120,7 +120,7 @@ void IterateNodesRandom(NodesContainer& nodeStates, Continue&& cont, Callback&&
 }

 template <typename CacheType>
-void InitQuorumsCache(CacheType& cache);
+void InitQuorumsCache(CacheType& cache, bool limit_by_connections = true);

 } // namespace utils