From 573d3a0e0219758535d68ee18ef2809e50874590 Mon Sep 17 00:00:00 2001
From: fanquake <fanquake@gmail.com>
Date: Wed, 5 Feb 2020 08:29:57 +0800
Subject: [PATCH] Merge #17336: scripts: search for first block file for
 linearize-data with some block files pruned

317fb96de9c6257972f1213b4ef2c3fe87dde99f Add search for first blk file with pruned node (Rjected)

Pull request description:

  <!--
  *** Please remove the following help text before submitting: ***

  Pull requests without a rationale and clear improvement may be closed
  immediately.
  -->

  <!--
  Please provide clear motivation for your patch and explain how it improves
  Bitcoin Core user experience or Bitcoin Core developer experience
  significantly:

  * Any test improvements or new tests that improve coverage are always welcome.
  * All other changes should have accompanying unit tests (see `src/test/`) or
    functional tests (see `test/`). Contributors should note which tests cover
    modified code. If no tests exist for a region of modified code, new tests
    should accompany the change.
  * Bug fixes are most welcome when they come with steps to reproduce or an
    explanation of the potential issue as well as reasoning for the way the bug
    was fixed.
  * Features are welcome, but might be rejected due to design or scope issues.
    If a feature is based on a lot of dependencies, contributors should first
    consider building the system outside of Bitcoin Core, if possible.
  * Refactoring changes are only accepted if they are required for a feature or
    bug fix or otherwise improve developer experience significantly. For example,
    most "code style" refactoring changes require a thorough explanation why they
    are useful, what downsides they have and why they *significantly* improve
    developer experience or avoid serious programming bugs. Note that code style
    is often a subjective matter. Unless they are explicitly mentioned to be
    preferred in the [developer notes](/doc/developer-notes.md), stylistic code
    changes are usually rejected.
  -->
  When bitcoind is running in pruned mode, producing a hashlist with `./linearize-hashes.py linearize.cfg > hashlist.txt` and then executing `linearize-data.py linearize.cfg` will produce:
  ```
  Read 313001 hashes
  Input file /home/dan/.bitcoin/blocks/blk00000.dat
  Premature end of block data
  ```
  This happens because `linearize-data` starts by attempting to process `blk00000.dat` regardless of whether or not `blk00000.dat` actually exists - this may not be the case if working with a pruned node.
  This PR adds a function which finds the first block file that does exist, and calls that function when the `BlockDataCopier` is initialized.

  This is a refactor of #16431.

  <!--
  Bitcoin Core has a thorough review process and even the most trivial change
  needs to pass a lot of eyes and requires non-zero or even substantial time
  effort to review. There is a huge lack of active reviewers on the project, so
  patches often sit for a long time.
  -->

ACKs for top commit:
  darosior:
    ACK 317fb96de9c6257972f1213b4ef2c3fe87dde99f
  laanwj:
    Code review ACK 317fb96de9c6257972f1213b4ef2c3fe87dde99f
  theStack:
    Code review ACK https://github.com/bitcoin/bitcoin/pull/17336/commits/317fb96de9c6257972f1213b4ef2c3fe87dde99f

Tree-SHA512: fc8014282df6cfe7b267e64db8ce7d82b86b758c302fbfea4a3c39b62d93512f5c2e31a0de4e9c5ec18fc0268c917f011257d37b45afaef6033eec90e4aa585f
---
 contrib/linearize/linearize-data.py | 29 ++++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/contrib/linearize/linearize-data.py b/contrib/linearize/linearize-data.py
index b94d2885d4..108058ac83 100755
--- a/contrib/linearize/linearize-data.py
+++ b/contrib/linearize/linearize-data.py
@@ -16,6 +16,7 @@ import sys
 import dash_hash
 import datetime
 import time
+import glob
 from collections import namedtuple
 from binascii import hexlify, unhexlify
 
@@ -95,6 +96,30 @@ def mkblockmap(blkindex):
         blkmap[hash] = height
     return blkmap
 
+# This gets the first block file ID that exists from the input block
+# file directory.
+def getFirstBlockFileId(block_dir_path):
+    # First, this sets up a pattern to search for block files, for
+    # example 'blkNNNNN.dat'.
+    blkFilePattern = os.path.join(block_dir_path, "blk[0-9][0-9][0-9][0-9][0-9].dat")
+
+    # This search is done with glob
+    blkFnList = glob.glob(blkFilePattern)
+
+    if len(blkFnList) == 0:
+        print("blocks not pruned - starting at 0")
+        return 0
+    # We then get the lexicographic minimum, which should be the first
+    # block file name.
+    firstBlkFilePath = min(blkFnList)
+    firstBlkFn = os.path.basename(firstBlkFilePath)
+
+    # now, the string should be ['b','l','k','N','N','N','N','N','.','d','a','t']
+    # So get the ID by choosing:              3   4   5   6   7
+    # The ID is not necessarily 0 if this is a pruned node.
+    blkId = int(firstBlkFn[3:8])
+    return blkId
+
 # Block header and extent on disk
 BlockExtent = namedtuple('BlockExtent', ['fn', 'offset', 'inhdr', 'blkhdr', 'size'])
 
@@ -104,7 +129,9 @@ class BlockDataCopier:
         self.blkindex = blkindex
         self.blkmap = blkmap
 
-        self.inFn = 0
+        # Get first occurring block file id - for pruned nodes this
+        # will not necessarily be 0
+        self.inFn = getFirstBlockFileId(self.settings['input'])
         self.inF = None
         self.outFn = 0
         self.outsz = 0