From 176c021d085f5a45bc9e038e760942aa648dd797 Mon Sep 17 00:00:00 2001 From: Suhas Daftuar Date: Thu, 15 Jun 2017 17:08:48 -0400 Subject: [PATCH] [qa] Test non-atomic chainstate writes Adds new functional test, dbcrash.py, which uses -dbcrashratio to exercise the logic for recovering from a crash during chainstate flush. dbcrash.py is added to the extended tests, as it may take ~10 minutes to run Use _Exit() instead of exit() for crash simulation This eliminates stderr output such as: terminate called without an active exception or Assertion failed: (!pthread_mutex_destroy(&m)), function ~recursive_mutex, file /usr/local/include/boost/thread/pthread/recursive_mutex.hpp, line 104. Eliminating the stderr output on crash simulation allows testing with test_runner.py, which reports a test as failed if stderr is produced. --- src/txdb.cpp | 2 +- test/functional/dbcrash.py | 268 +++++++++++++++++++++++++ test/functional/test_framework/util.py | 2 + test/functional/test_runner.py | 1 + 4 files changed, 272 insertions(+), 1 deletion(-) create mode 100755 test/functional/dbcrash.py diff --git a/src/txdb.cpp b/src/txdb.cpp index 4cdfc7d6f5..d24162ba2d 100644 --- a/src/txdb.cpp +++ b/src/txdb.cpp @@ -124,7 +124,7 @@ bool CCoinsViewDB::BatchWrite(CCoinsMap &mapCoins, const uint256 &hashBlock) { static FastRandomContext rng; if (rng.randrange(crash_simulate) == 0) { LogPrintf("Simulating a crash. Goodbye.\n"); - exit(0); + _Exit(0); } } } diff --git a/test/functional/dbcrash.py b/test/functional/dbcrash.py new file mode 100755 index 0000000000..4a10743f04 --- /dev/null +++ b/test/functional/dbcrash.py @@ -0,0 +1,268 @@ +#!/usr/bin/env python3 +# Copyright (c) 2017 The Bitcoin Core developers +# Distributed under the MIT software license, see the accompanying +# file COPYING or http://www.opensource.org/licenses/mit-license.php. +"""Test recovery from a crash during chainstate writing.""" + +from test_framework.test_framework import BitcoinTestFramework +from test_framework.util import * +from test_framework.script import * +from test_framework.mininode import * +import random +try: + import http.client as httplib +except ImportError: + import httplib +import errno + +''' +Test structure: + +- 4 nodes + * node0, node1, and node2 will have different dbcrash ratios, and different + dbcache sizes + * node3 will be a regular node, with no crashing. + * The nodes will not connect to each other. + +- use default test framework starting chain. initialize starting_tip_height to + tip height. + +- Main loop: + * generate lots of transactions on node3, enough to fill up a block. + * uniformly randomly pick a tip height from starting_tip_height to + tip_height; with probability 1/(height_difference+4), invalidate this block. + * mine enough blocks to overtake tip_height at start of loop. + * for each node in [node0,node1,node2]: + - for each mined block: + * submit block to node + * if node crashed on/after submitting: + - restart until recovery succeeds + - check that utxo matches node3 using gettxoutsetinfo +''' + +class ChainstateWriteCrashTest(BitcoinTestFramework): + + def __init__(self): + super().__init__() + self.num_nodes = 4 + self.setup_clean_chain = False + + # Set -maxmempool=0 to turn off mempool memory sharing with dbcache + # Set -rpcservertimeout=900 to reduce socket disconnects in this + # long-running test + self.base_args = ["-limitdescendantsize=0", "-maxmempool=0", "-rpcservertimeout=900"] + + # Set different crash ratios and cache sizes. Note that not all of + # -dbcache goes to pcoinsTip. + self.node0_args = ["-dbcrashratio=8", "-dbcache=4", "-dbbatchsize=200000"] + self.base_args + self.node1_args = ["-dbcrashratio=16", "-dbcache=8", "-dbbatchsize=200000"] + self.base_args + self.node2_args = ["-dbcrashratio=24", "-dbcache=16", "-dbbatchsize=200000"] + self.base_args + + # Node3 is a normal node with default args, except will mine full blocks + self.node3_args = ["-blockmaxweight=4000000"] + self.extra_args = [self.node0_args, self.node1_args, self.node2_args, self.node3_args] + + # We'll track some test coverage statistics + self.restart_counts = [0, 0, 0] # Track the restarts for nodes 0-2 + self.crashed_on_restart = 0 # Track count of crashes during recovery + + def setup_network(self): + self.setup_nodes() + # Leave them unconnected, we'll use submitblock directly in this test + + # Starts up a given node id, waits for the tip to reach the given block + # hash, and calculates the utxo hash. Exceptions on startup should + # indicate node crash (due to -dbcrashratio), in which case we try again. + # Give up after 60 seconds. + # Returns the utxo hash of the given node. + def restart_node(self, node_index, expected_tip): + time_start = time.time() + while time.time() - time_start < 60: + try: + # Any of these RPC calls could throw due to node crash + self.nodes[node_index] = self.start_node(node_index, self.options.tmpdir, self.extra_args[node_index]) + self.nodes[node_index].waitforblock(expected_tip) + utxo_hash = self.nodes[node_index].gettxoutsetinfo()['hash_serialized_2'] + return utxo_hash + except: + # An exception here should mean the node is about to crash. + # If bitcoind exits, then try again. wait_for_node_exit() + # should raise an exception if bitcoind doesn't exit. + wait_for_node_exit(node_index, timeout=10) + self.crashed_on_restart += 1 + time.sleep(1) + + # If we got here, bitcoind isn't coming back up on restart. Could be a + # bug in bitcoind, or we've gotten unlucky with our dbcrash ratio -- + # perhaps we generated a test case that blew up our cache? + # TODO: If this happens a lot, we should try to restart without -dbcrashratio + # and make sure that recovery happens. + raise AssertionError("Unable to successfully restart node %d in allotted time", node_index) + + # Try submitting a block to the given node. + # Catch any exceptions that indicate the node has crashed. + # Returns true if the block was submitted successfully; false otherwise. + def submit_block_catch_error(self, node_index, block): + try: + self.nodes[node_index].submitblock(block) + return True + except (httplib.CannotSendRequest, httplib.RemoteDisconnected) as e: + self.log.debug("node %d submitblock raised exception: %s", node_index, e) + return False + except OSError as e: + self.log.debug("node %d submitblock raised OSError exception: errno=%s", node_index, e.errno) + if e.errno in [errno.EPIPE, errno.ECONNREFUSED, errno.ECONNRESET]: + # The node has likely crashed + return False + else: + # Unexpected exception, raise + raise + + # Use submitblock to sync node3's chain with the other nodes + # If submitblock fails, restart the node and get the new utxo hash. + def sync_node3blocks(self, block_hashes): + # If any nodes crash while updating, we'll compare utxo hashes to + # ensure recovery was successful. + node3_utxo_hash = self.nodes[3].gettxoutsetinfo()['hash_serialized_2'] + + # Retrieve all the blocks from node3 + blocks = [] + for block_hash in block_hashes: + blocks.append([block_hash, self.nodes[3].getblock(block_hash, 0)]) + + # Deliver each block to each other node + for i in range(3): + nodei_utxo_hash = None + self.log.debug("Syncing blocks to node %d", i) + for (block_hash, block) in blocks: + # Get the block from node3, and submit to node_i + self.log.debug("submitting block %s", block_hash) + if not self.submit_block_catch_error(i, block): + # TODO: more carefully check that the crash is due to -dbcrashratio + # (change the exit code perhaps, and check that here?) + wait_for_node_exit(i, timeout=30) + self.log.debug("Restarting node %d after block hash %s", i, block_hash) + nodei_utxo_hash = self.restart_node(i, block_hash) + assert nodei_utxo_hash is not None + self.restart_counts[i] += 1 + else: + # Clear it out after successful submitblock calls -- the cached + # utxo hash will no longer be correct + nodei_utxo_hash = None + + # Check that the utxo hash matches node3's utxo set + # NOTE: we only check the utxo set if we had to restart the node + # after the last block submitted: + # - checking the utxo hash causes a cache flush, which we don't + # want to do every time; so + # - we only update the utxo cache after a node restart, since flushing + # the cache is a no-op at that point + if nodei_utxo_hash is not None: + self.log.debug("Checking txoutsetinfo matches for node %d", i) + assert_equal(nodei_utxo_hash, node3_utxo_hash) + + # Verify that the utxo hash of each node matches node3. + # Restart any nodes that crash while querying. + def verify_utxo_hash(self): + node3_utxo_hash = self.nodes[3].gettxoutsetinfo()['hash_serialized_2'] + self.log.info("Verifying utxo hash matches for all nodes") + + for i in range(3): + try: + nodei_utxo_hash = self.nodes[i].gettxoutsetinfo()['hash_serialized_2'] + except OSError: + # probably a crash on db flushing + nodei_utxo_hash = self.restart_node(i, self.nodes[3].getbestblockhash()) + assert_equal(nodei_utxo_hash, node3_utxo_hash) + + + def generate_small_transactions(self, node, count, utxo_list): + FEE = 1000 # TODO: replace this with node relay fee based calculation + num_transactions = 0 + random.shuffle(utxo_list) + while len(utxo_list) >= 2 and num_transactions < count: + tx = CTransaction() + input_amount = 0 + for i in range(2): + utxo = utxo_list.pop() + tx.vin.append(CTxIn(COutPoint(int(utxo['txid'], 16), utxo['vout']))) + input_amount += int(utxo['amount']*COIN) + output_amount = (input_amount - FEE)//3 + + if output_amount <= 0: + # Sanity check -- if we chose inputs that are too small, skip + continue + + for i in range(3): + tx.vout.append(CTxOut(output_amount, hex_str_to_bytes(utxo['scriptPubKey']))) + + # Sign and send the transaction to get into the mempool + tx_signed_hex = node.signrawtransaction(ToHex(tx))['hex'] + node.sendrawtransaction(tx_signed_hex) + num_transactions += 1 + + def run_test(self): + + # Start by creating a lot of utxos on node3 + initial_height = self.nodes[3].getblockcount() + utxo_list = create_confirmed_utxos(self.nodes[3].getnetworkinfo()['relayfee'], self.nodes[3], 5000) + self.log.info("Prepped %d utxo entries", len(utxo_list)) + + # Sync these blocks with the other nodes + block_hashes_to_sync = [] + for height in range(initial_height+1, self.nodes[3].getblockcount()+1): + block_hashes_to_sync.append(self.nodes[3].getblockhash(height)) + + self.log.debug("Syncing %d blocks with other nodes", len(block_hashes_to_sync)) + # Syncing the blocks could cause nodes to crash, so the test begins here. + self.sync_node3blocks(block_hashes_to_sync) + + starting_tip_height = self.nodes[3].getblockcount() + + # Main test loop: + # each time through the loop, generate a bunch of transactions, + # and then either mine a single new block on the tip, or some-sized reorg. + for i in range(40): + self.log.info("Iteration %d, generating 2500 transactions %s", i, self.restart_counts) + # Generate a bunch of small-ish transactions + self.generate_small_transactions(self.nodes[3], 2500, utxo_list) + # Pick a random block between current tip, and starting tip + current_height = self.nodes[3].getblockcount() + random_height = random.randint(starting_tip_height, current_height) + self.log.debug("At height %d, considering height %d", current_height, random_height) + if random_height > starting_tip_height: + # Randomly reorg from this point with some probability (1/4 for + # tip, 1/5 for tip-1, ...) + if random.random() < 1.0/(current_height + 4 - random_height): + self.log.debug("Invalidating block at height %d", random_height) + self.nodes[3].invalidateblock(self.nodes[3].getblockhash(random_height)) + + # Now generate new blocks until we pass the old tip height + self.log.debug("Mining longer tip") + block_hashes = self.nodes[3].generate(current_height+1-self.nodes[3].getblockcount()) + self.log.debug("Syncing %d new blocks...", len(block_hashes)) + self.sync_node3blocks(block_hashes) + utxo_list = self.nodes[3].listunspent() + self.log.debug("Node3 utxo count: %d", len(utxo_list)) + + # Check that the utxo hashes agree with node3 + # Useful side effect: each utxo cache gets flushed here, so that we + # won't get crashes on shutdown at the end of the test. + self.verify_utxo_hash() + + # Check the test coverage + self.log.info("Restarted nodes: %s; crashes on restart: %d", self.restart_counts, self.crashed_on_restart) + + # If no nodes were restarted, we didn't test anything. + assert self.restart_counts != [0, 0, 0] + + # Make sure we tested the case of crash-during-recovery. + assert self.crashed_on_restart > 0 + + # Warn if any of the nodes escaped restart. + for i in range(3): + if self.restart_counts[i] == 0: + self.log.warn("Node %d never crashed during utxo flush!", i) + +if __name__ == "__main__": + ChainstateWriteCrashTest().main() diff --git a/test/functional/test_framework/util.py b/test/functional/test_framework/util.py index fa6388bf96..2a4f3104aa 100644 --- a/test/functional/test_framework/util.py +++ b/test/functional/test_framework/util.py @@ -249,6 +249,8 @@ def wait_for_bitcoind_start(process, datadir, i, rpchost=None): raise time.sleep(0.25) +def wait_for_node_exit(node_index, timeout): + bitcoind_processes[node_index].wait(timeout) def _start_node(i, dirname, extra_args=None, rpchost=None, timewait=None, binary=None, stderr=None): """Start a bitcoind and return RPC connection to it diff --git a/test/functional/test_runner.py b/test/functional/test_runner.py index 0dca318af8..70a7b2a1ff 100755 --- a/test/functional/test_runner.py +++ b/test/functional/test_runner.py @@ -124,6 +124,7 @@ EXTENDED_SCRIPTS = [ # vv Tests less than 5m vv 'maxuploadtarget.py', 'mempool_packages.py', + 'dbcrash.py', # vv Tests less than 2m vv 'bip68-sequence.py', 'getblocktemplate_longpoll.py',