From c60fd71a65e841efe187992f46c583a704cc37f5 Mon Sep 17 00:00:00 2001
From: Suhas Daftuar <sdaftuar@chaincode.com>
Date: Wed, 11 Oct 2017 08:55:14 -0400
Subject: [PATCH 1/3] Disconnecting from bad outbound peers in IBD

When in IBD, we'd like to use all our outbound peers to help us
sync the chain.  Disconnect any outbound peers whose headers have
insufficient work.
---
 src/net_processing.cpp          | 18 ++++++++++++++++++
 test/functional/minchainwork.py |  8 ++++++++
 2 files changed, 26 insertions(+)

diff --git a/src/net_processing.cpp b/src/net_processing.cpp
index 6c26cd4cee..8551e84558 100644
--- a/src/net_processing.cpp
+++ b/src/net_processing.cpp
@@ -2383,6 +2383,24 @@ bool static ProcessMessage(CNode* pfrom, const std::string& strCommand, CDataStr
                 }
             }
         }
+        // If we're in IBD, we want outbound peers that will serve us a useful
+        // chain. Disconnect peers that are on chains with insufficient work.
+        if (IsInitialBlockDownload() && nCount != MAX_HEADERS_RESULTS) {
+            // When nCount < MAX_HEADERS_RESULTS, we know we have no more
+            // headers to fetch from this peer.
+            if (nodestate->pindexBestKnownBlock && nodestate->pindexBestKnownBlock->nChainWork < nMinimumChainWork) {
+                // This peer has too little work on their headers chain to help
+                // us sync -- disconnect if using an outbound slot (unless
+                // whitelisted or addnode).
+                // Note: We compare their tip to nMinimumChainWork (rather than
+                // chainActive.Tip()) because we won't start block download
+                // until we have a headers chain that has at least
+                // nMinimumChainWork, even if a peer has a chain past our tip,
+                if (!(pfrom->fInbound || pfrom->fWhitelisted || pfrom->m_manual_connection)) {
+                    pfrom->fDisconnect = true;
+                }
+            }
+        }
         }
     }
 
diff --git a/test/functional/minchainwork.py b/test/functional/minchainwork.py
index c7579d2548..35cd7ad141 100755
--- a/test/functional/minchainwork.py
+++ b/test/functional/minchainwork.py
@@ -27,6 +27,7 @@ class MinimumChainWorkTest(BitcoinTestFramework):
     def set_test_params(self):
         self.setup_clean_chain = True
         self.num_nodes = 3
+
         self.extra_args = [[], ["-minimumchainwork=0x65"], ["-minimumchainwork=0x65"]]
         self.node_min_work = [0, 101, 101]
 
@@ -74,6 +75,13 @@ class MinimumChainWorkTest(BitcoinTestFramework):
         self.nodes[0].generate(1)
 
         self.log.info("Verifying nodes are all synced")
+
+        # Because nodes in regtest are all manual connections (eg using
+        # addnode), node1 should not have disconnected node0. If not for that,
+        # we'd expect node1 to have disconnected node0 for serving an
+        # insufficient work chain, in which case we'd need to reconnect them to
+        # continue the test.
+
         self.sync_all()
         self.log.info("Blockcounts: %s", [n.getblockcount() for n in self.nodes])
 

From 5a6d00c6defc587e22c93e63029fdd538ce8858d Mon Sep 17 00:00:00 2001
From: Suhas Daftuar <sdaftuar@gmail.com>
Date: Thu, 12 Oct 2017 13:55:43 -0400
Subject: [PATCH 2/3] Permit disconnection of outbound peers on bad/slow chains

Currently we have no rotation of outbound peers.  If an outbound peer
stops serving us blocks, or is on a consensus-incompatible chain with
less work than our tip (but otherwise valid headers), then we will never
disconnect that peer, even though that peer is using one of our 8
outbound connection slots.  Because we rely on our outbound peers to
find an honest node in order to reach consensus, allowing an
incompatible peer to occupy one of those slots is undesirable,
particularly if it is possible for all such slots to be occupied by such
peers.

Protect against this by always checking to see if a peer's best known
block has less work than our tip, and if so, set a 20 minute timeout --
if the peer is still not known to have caught up to a chain with as much
work as ours after 20 minutes, then send a single getheaders message,
wait 2 more minutes, and if a better header hasn't been received by then,
disconnect that peer.

Note:

- we do not require that our peer sync to the same tip as ours, just an
equal or greater work tip.  (Doing otherwise would risk partitioning the
network in the event of a chain split, and is also unnecessary.)

- we pick 4 of our outbound peers and do not subject them to this logic,
to be more conservative. We don't wish to permit temporary network
issues (or an attacker) to excessively disrupt network topology.
---
 src/net_processing.cpp | 113 ++++++++++++++++++++++++++++++++++++++++-
 src/net_processing.h   |   8 +++
 2 files changed, 120 insertions(+), 1 deletion(-)

diff --git a/src/net_processing.cpp b/src/net_processing.cpp
index 8551e84558..f83a20102e 100644
--- a/src/net_processing.cpp
+++ b/src/net_processing.cpp
@@ -124,6 +124,9 @@ namespace {
     /** Number of peers from which we're downloading blocks. */
     int nPeersWithValidatedDownloads = 0;
 
+    /** Number of outbound peers with m_chain_sync.m_protect. */
+    int g_outbound_peers_with_protect_from_disconnect = 0;
+
     /** Relay map, protected by cs_main. */
     typedef std::map<uint256, CTransactionRef> MapRelay;
     MapRelay mapRelay;
@@ -201,6 +204,33 @@ struct CNodeState {
      */
     bool fSupportsDesiredCmpctVersion;
 
+    /** State used to enforce CHAIN_SYNC_TIMEOUT
+      * Only in effect for outbound, non-manual connections, with
+      * m_protect == false
+      * Algorithm: if a peer's best known block has less work than our tip,
+      * set a timeout CHAIN_SYNC_TIMEOUT seconds in the future:
+      *   - If at timeout their best known block now has more work than our tip
+      *     when the timeout was set, then either reset the timeout or clear it
+      *     (after comparing against our current tip's work)
+      *   - If at timeout their best known block still has less work than our
+      *     tip did when the timeout was set, then send a getheaders message,
+      *     and set a shorter timeout, HEADERS_RESPONSE_TIME seconds in future.
+      *     If their best known block is still behind when that new timeout is
+      *     reached, disconnect.
+      */
+    struct ChainSyncTimeoutState {
+        //! A timeout used for checking whether our peer has sufficiently synced
+        int64_t m_timeout;
+        //! A header with the work we require on our peer's chain
+        const CBlockIndex * m_work_header;
+        //! After timeout is reached, set to true after sending getheaders
+        bool m_sent_getheaders;
+        //! Whether this peer is protected from disconnection due to a bad/slow chain
+        bool m_protect;
+    };
+
+    ChainSyncTimeoutState m_chain_sync;
+
     CNodeState(CAddress addrIn, std::string addrNameIn) : address(addrIn), name(addrNameIn) {
         fCurrentlyConnected = false;
         nMisbehavior = 0;
@@ -223,6 +253,7 @@ struct CNodeState {
         fHaveWitness = false;
         fWantsCmpctWitness = false;
         fSupportsDesiredCmpctVersion = false;
+        m_chain_sync = { 0, nullptr, false, false };
     }
 };
 
@@ -502,6 +533,13 @@ void FindNextBlocksToDownload(NodeId nodeid, unsigned int count, std::vector<con
 
 } // namespace
 
+// Returns true for outbound peers, excluding manual connections, feelers, and
+// one-shots
+bool IsOutboundDisconnectionCandidate(const CNode *node)
+{
+    return !(node->fInbound || node->m_manual_connection || node->fFeeler || node->fOneShot);
+}
+
 void PeerLogicValidation::InitializeNode(CNode *pnode) {
     CAddress addr = pnode->addr;
     std::string addrName = pnode->GetAddrName();
@@ -534,6 +572,8 @@ void PeerLogicValidation::FinalizeNode(NodeId nodeid, bool& fUpdateConnectionTim
     nPreferredDownload -= state->fPreferredDownload;
     nPeersWithValidatedDownloads -= (state->nBlocksInFlightValidHeaders != 0);
     assert(nPeersWithValidatedDownloads >= 0);
+    g_outbound_peers_with_protect_from_disconnect -= state->m_chain_sync.m_protect;
+    assert(g_outbound_peers_with_protect_from_disconnect >= 0);
 
     mapNodeState.erase(nodeid);
 
@@ -542,6 +582,7 @@ void PeerLogicValidation::FinalizeNode(NodeId nodeid, bool& fUpdateConnectionTim
         assert(mapBlocksInFlight.empty());
         assert(nPreferredDownload == 0);
         assert(nPeersWithValidatedDownloads == 0);
+        assert(g_outbound_peers_with_protect_from_disconnect == 0);
     }
     LogPrint(BCLog::NET, "Cleared nodestate for peer=%d\n", nodeid);
 }
@@ -2324,6 +2365,10 @@ bool static ProcessMessage(CNode* pfrom, const std::string& strCommand, CDataStr
         assert(pindexLast);
         UpdateBlockAvailability(pfrom->GetId(), pindexLast->GetBlockHash());
 
+        // From here, pindexBestKnownBlock should be guaranteed to be non-null,
+        // because it is set in UpdateBlockAvailability. Some nullptr checks
+        // are still present, however, as belt-and-suspenders.
+
         if (nCount == MAX_HEADERS_RESULTS) {
             // Headers message had its maximum size; the peer may have more headers.
             // TODO: optimize: if pindexLast is an ancestor of chainActive.Tip or pindexBestHeader, continue
@@ -2396,11 +2441,22 @@ bool static ProcessMessage(CNode* pfrom, const std::string& strCommand, CDataStr
                 // chainActive.Tip()) because we won't start block download
                 // until we have a headers chain that has at least
                 // nMinimumChainWork, even if a peer has a chain past our tip,
-                if (!(pfrom->fInbound || pfrom->fWhitelisted || pfrom->m_manual_connection)) {
+                // as an anti-DoS measure.
+                if (IsOutboundDisconnectionCandidate(pfrom)) {
+                    LogPrintf("Disconnecting outbound peer %d -- headers chain has insufficient work\n", pfrom->GetId());
                     pfrom->fDisconnect = true;
                 }
             }
         }
+
+        if (!pfrom->fDisconnect && IsOutboundDisconnectionCandidate(pfrom) && nodestate->pindexBestKnownBlock != nullptr) {
+            // If this is an outbound peer, check to see if we should protect
+            // it from the bad/lagging chain logic.
+            if (g_outbound_peers_with_protect_from_disconnect < MAX_OUTBOUND_PEERS_TO_PROTECT_FROM_DISCONNECT && nodestate->pindexBestKnownBlock->nChainWork >= chainActive.Tip()->nChainWork && !nodestate->m_chain_sync.m_protect) {
+                nodestate->m_chain_sync.m_protect = true;
+                ++g_outbound_peers_with_protect_from_disconnect;
+            }
+        }
         }
     }
 
@@ -2799,6 +2855,58 @@ bool PeerLogicValidation::ProcessMessages(CNode* pfrom, std::atomic<bool>& inter
     return fMoreWork;
 }
 
+void PeerLogicValidation::ConsiderEviction(CNode *pto, int64_t time_in_seconds)
+{
+    AssertLockHeld(cs_main);
+
+    CNodeState &state = *State(pto->GetId());
+    const CNetMsgMaker msgMaker(pto->GetSendVersion());
+
+    if (!state.m_chain_sync.m_protect && IsOutboundDisconnectionCandidate(pto) && state.fSyncStarted) {
+        // This is an outbound peer subject to disconnection if they don't
+        // announce a block with as much work as the current tip within
+        // CHAIN_SYNC_TIMEOUT + HEADERS_RESPONSE_TIME seconds (note: if
+        // their chain has more work than ours, we should sync to it,
+        // unless it's invalid, in which case we should find that out and
+        // disconnect from them elsewhere).
+        if (state.pindexBestKnownBlock != nullptr && state.pindexBestKnownBlock->nChainWork >= chainActive.Tip()->nChainWork) {
+            if (state.m_chain_sync.m_timeout != 0) {
+                state.m_chain_sync.m_timeout = 0;
+                state.m_chain_sync.m_work_header = nullptr;
+                state.m_chain_sync.m_sent_getheaders = false;
+            }
+        } else if (state.m_chain_sync.m_timeout == 0 || (state.m_chain_sync.m_work_header != nullptr && state.pindexBestKnownBlock != nullptr && state.pindexBestKnownBlock->nChainWork >= state.m_chain_sync.m_work_header->nChainWork)) {
+            // Our best block known by this peer is behind our tip, and we're either noticing
+            // that for the first time, OR this peer was able to catch up to some earlier point
+            // where we checked against our tip.
+            // Either way, set a new timeout based on current tip.
+            state.m_chain_sync.m_timeout = time_in_seconds + CHAIN_SYNC_TIMEOUT;
+            state.m_chain_sync.m_work_header = chainActive.Tip();
+            state.m_chain_sync.m_sent_getheaders = false;
+        } else if (state.m_chain_sync.m_timeout > 0 && time_in_seconds > state.m_chain_sync.m_timeout) {
+            // No evidence yet that our peer has synced to a chain with work equal to that
+            // of our tip, when we first detected it was behind. Send a single getheaders
+            // message to give the peer a chance to update us.
+            if (state.m_chain_sync.m_sent_getheaders) {
+                // They've run out of time to catch up!
+                LogPrintf("Disconnecting outbound peer %d for old chain, best known block = %s\n", pto->GetId(), state.pindexBestKnownBlock != nullptr ? state.pindexBestKnownBlock->GetBlockHash().ToString() : "<none>");
+                pto->fDisconnect = true;
+            } else {
+                LogPrint(BCLog::NET, "sending getheaders to outbound peer=%d to verify chain work (current best known block:%s, benchmark blockhash: %s)\n", pto->GetId(), state.pindexBestKnownBlock != nullptr ? state.pindexBestKnownBlock->GetBlockHash().ToString() : "<none>", state.m_chain_sync.m_work_header->GetBlockHash().ToString());
+                connman->PushMessage(pto, msgMaker.Make(NetMsgType::GETHEADERS, chainActive.GetLocator(state.m_chain_sync.m_work_header->pprev), uint256()));
+                state.m_chain_sync.m_sent_getheaders = true;
+                constexpr int64_t HEADERS_RESPONSE_TIME = 120; // 2 minutes
+                // Bump the timeout to allow a response, which could clear the timeout
+                // (if the response shows the peer has synced), reset the timeout (if
+                // the peer syncs to the required work but not to our tip), or result
+                // in disconnect (if we advance to the timeout and pindexBestKnownBlock
+                // has not sufficiently progressed)
+                state.m_chain_sync.m_timeout = time_in_seconds + HEADERS_RESPONSE_TIME;
+            }
+        }
+    }
+}
+
 class CompareInvMempoolOrder
 {
     CTxMemPool *mp;
@@ -3265,6 +3373,9 @@ bool PeerLogicValidation::SendMessages(CNode* pto, std::atomic<bool>& interruptM
             }
         }
 
+        // Check that outbound peers have reasonable chains
+        // GetTime() is used by this anti-DoS logic so we can test this using mocktime
+        ConsiderEviction(pto, GetTime());
 
         //
         // Message: getdata (blocks)
diff --git a/src/net_processing.h b/src/net_processing.h
index 79745cdd42..656324bba0 100644
--- a/src/net_processing.h
+++ b/src/net_processing.h
@@ -21,6 +21,12 @@ static const unsigned int DEFAULT_BLOCK_RECONSTRUCTION_EXTRA_TXN = 100;
  *  Timeout = base + per_header * (expected number of headers) */
 static constexpr int64_t HEADERS_DOWNLOAD_TIMEOUT_BASE = 15 * 60 * 1000000; // 15 minutes
 static constexpr int64_t HEADERS_DOWNLOAD_TIMEOUT_PER_HEADER = 1000; // 1ms/header
+/** Protect at least this many outbound peers from disconnection due to slow/
+ * behind headers chain.
+ */
+static constexpr int32_t MAX_OUTBOUND_PEERS_TO_PROTECT_FROM_DISCONNECT = 4;
+/** Timeout for (unprotected) outbound peers to sync to our chainwork, in seconds */
+static constexpr int64_t CHAIN_SYNC_TIMEOUT = 20 * 60; // 20 minutes
 
 class PeerLogicValidation : public CValidationInterface, public NetEventsInterface {
 private:
@@ -47,6 +53,8 @@ public:
     * @return                      True if there is more work to be done
     */
     bool SendMessages(CNode* pto, std::atomic<bool>& interrupt) override;
+
+    void ConsiderEviction(CNode *pto, int64_t time_in_seconds);
 };
 
 struct CNodeStateStats {

From e065249c014a070a8799b2ff947af5b8f012c5c1 Mon Sep 17 00:00:00 2001
From: Suhas Daftuar <sdaftuar@gmail.com>
Date: Thu, 19 Oct 2017 11:42:47 -0400
Subject: [PATCH 3/3] Add unit test for outbound peer eviction

---
 src/test/DoS_tests.cpp | 55 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

diff --git a/src/test/DoS_tests.cpp b/src/test/DoS_tests.cpp
index b88ad5ed1b..7bcf304833 100644
--- a/src/test/DoS_tests.cpp
+++ b/src/test/DoS_tests.cpp
@@ -42,6 +42,51 @@ static NodeId id = 0;
 
 BOOST_FIXTURE_TEST_SUITE(DoS_tests, TestingSetup)
 
+// Test eviction of an outbound peer whose chain never advances
+// Mock a node connection, and use mocktime to simulate a peer
+// which never sends any headers messages.  PeerLogic should
+// decide to evict that outbound peer, after the appropriate timeouts.
+// Note that we protect 4 outbound nodes from being subject to
+// this logic; this test takes advantage of that protection only
+// being applied to nodes which send headers with sufficient
+// work.
+BOOST_AUTO_TEST_CASE(outbound_slow_chain_eviction)
+{
+    std::atomic<bool> interruptDummy(false);
+
+    // Mock an outbound peer
+    CAddress addr1(ip(0xa0b0c001), NODE_NONE);
+    CNode dummyNode1(id++, ServiceFlags(NODE_NETWORK|NODE_WITNESS), 0, INVALID_SOCKET, addr1, 0, 0, CAddress(), "", /*fInboundIn=*/ false);
+    dummyNode1.SetSendVersion(PROTOCOL_VERSION);
+
+    peerLogic->InitializeNode(&dummyNode1);
+    dummyNode1.nVersion = 1;
+    dummyNode1.fSuccessfullyConnected = true;
+
+    // This test requires that we have a chain with non-zero work.
+    BOOST_CHECK(chainActive.Tip() != nullptr);
+    BOOST_CHECK(chainActive.Tip()->nChainWork > 0);
+
+    // Test starts here
+    peerLogic->SendMessages(&dummyNode1, interruptDummy); // should result in getheaders
+    BOOST_CHECK(dummyNode1.vSendMsg.size() > 0);
+    dummyNode1.vSendMsg.clear();
+
+    int64_t nStartTime = GetTime();
+    // Wait 21 minutes
+    SetMockTime(nStartTime+21*60);
+    peerLogic->SendMessages(&dummyNode1, interruptDummy); // should result in getheaders
+    BOOST_CHECK(dummyNode1.vSendMsg.size() > 0);
+    // Wait 3 more minutes
+    SetMockTime(nStartTime+24*60);
+    peerLogic->SendMessages(&dummyNode1, interruptDummy); // should result in disconnect
+    BOOST_CHECK(dummyNode1.fDisconnect == true);
+    SetMockTime(0);
+
+    bool dummy;
+    peerLogic->FinalizeNode(dummyNode1.GetId(), dummy);
+}
+
 BOOST_AUTO_TEST_CASE(DoS_banning)
 {
     std::atomic<bool> interruptDummy(false);
@@ -71,6 +116,10 @@ BOOST_AUTO_TEST_CASE(DoS_banning)
     Misbehaving(dummyNode2.GetId(), 50);
     peerLogic->SendMessages(&dummyNode2, interruptDummy);
     BOOST_CHECK(connman->IsBanned(addr2));
+
+    bool dummy;
+    peerLogic->FinalizeNode(dummyNode1.GetId(), dummy);
+    peerLogic->FinalizeNode(dummyNode2.GetId(), dummy);
 }
 
 BOOST_AUTO_TEST_CASE(DoS_banscore)
@@ -95,6 +144,9 @@ BOOST_AUTO_TEST_CASE(DoS_banscore)
     peerLogic->SendMessages(&dummyNode1, interruptDummy);
     BOOST_CHECK(connman->IsBanned(addr1));
     gArgs.ForceSetArg("-banscore", std::to_string(DEFAULT_BANSCORE_THRESHOLD));
+
+    bool dummy;
+    peerLogic->FinalizeNode(dummyNode1.GetId(), dummy);
 }
 
 BOOST_AUTO_TEST_CASE(DoS_bantime)
@@ -121,6 +173,9 @@ BOOST_AUTO_TEST_CASE(DoS_bantime)
 
     SetMockTime(nStartTime+60*60*24+1);
     BOOST_CHECK(!connman->IsBanned(addr));
+
+    bool dummy;
+    peerLogic->FinalizeNode(dummyNode.GetId(), dummy);
 }
 
 CTransactionRef RandomOrphan()