dash/contrib/linearize/linearize-data.py
MarcoFalke 17acd6b472
Merge #12987: tests/tools: Enable additional Python flake8 rules for automatic linting via Travis
643aad17fa Enable additional flake8 rules (practicalswift)
f020aca297 Minor Python cleanups to make flake8 pass with the new rules enabled (practicalswift)

Pull request description:

  Enabled rules:

  ```
  * E242: tab after ','
  * E266: too many leading '#' for block comment
  * E401: multiple imports on one line
  * E402: module level import not at top of file
  * E701: multiple statements on one line (colon)
  * E901: SyntaxError: invalid syntax
  * E902: TokenError: EOF in multi-line string
  * F821: undefined name 'Foo'
  * W293: blank line contains whitespace
  * W606: 'async' and 'await' are reserved keywords starting with Python 3.7
  ```
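
  For illustration (not part of the PR itself), a hypothetical snippet like

  ```python
  import os, sys        # E401: multiple imports on one line
  if True: print("hi")  # E701: multiple statements on one line (colon)
  ```

  would now fail linting and would have to be rewritten as

  ```python
  import os
  import sys

  if True:
      print("hi")
  ```

  before Travis would pass.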

  Note to reviewers:
  * In general we don't allow whitespace cleanups to existing code, but a few smaller whitespace cleanups had to be made as part of this PR in order to enable Travis checking for these rules.
  * Use [this `?w=1` link](https://github.com/bitcoin/bitcoin/pull/12987/files?w=1) to show a diff without whitespace changes.

  Before this commit:

  ```
  $ flake8 -qq --statistics --ignore=B,C,E,F,I,N,W --select=E112,E113,E115,E116,E125,E131,E133,E223,E224,E242,E266,E271,E272,E273,E274,E275,E304,E306,E401,E402,E502,E701,E702,E703,E714,E721,E741,E742,E743,F401,E901,E902,F402,F404,F406,F407,F601,F602,F621,F622,F631,F701,F702,F703,F704,F705,F706,F707,F811,F812,F821,F822,F823,F831,F841,W292,W293,W504,W601,W602,W603,W604,W605,W606 .
  5     E266 too many leading '#' for block comment
  4     E401 multiple imports on one line
  6     E402 module level import not at top of file
  5     E701 multiple statements on one line (colon)
  1     F812 list comprehension redefines 'n' from line 159
  4     F821 undefined name 'ConnectionRefusedError'
  28    W293 blank line contains whitespace
  ```

  After this commit:

  ```
  $ flake8 -qq --statistics --ignore=B,C,E,F,I,N,W --select=E112,E113,E115,E116,E125,E131,E133,E223,E224,E242,E266,E271,E272,E273,E274,E275,E304,E306,E401,E402,E502,E701,E702,E703,E714,E721,E741,E742,E743,F401,E901,E902,F402,F404,F406,F407,F601,F602,F621,F622,F631,F701,F702,F703,F704,F705,F706,F707,F811,F812,F821,F822,F823,F831,F841,W292,W293,W504,W601,W602,W603,W604,W605,W606 .
  $
  ```

Tree-SHA512: fc7d5e752298a50d4248afc620ee2c173135b4ca008e48e02913ac968e5a24a5fd5396926047ec62f1d580d537434ccae01f249bb2f3338fa59dc630bf97ca7a
Signed-off-by: pasta <pasta@dashboost.org>
2020-07-22 12:12:46 -05:00

#!/usr/bin/env python3
#
# linearize-data.py: Construct a linear, no-fork version of the chain.
#
# Copyright (c) 2013-2014 The Bitcoin Core developers
# Distributed under the MIT software license, see the accompanying
# file COPYING or http://www.opensource.org/licenses/mit-license.php.
#
from __future__ import print_function, division
import struct
import re
import os
import os.path
import sys
import dash_hash
import datetime
import time
from collections import namedtuple
from binascii import hexlify, unhexlify

settings = {}

def hex_switchEndian(s):
    """ Switches the endianness of a hex string (in pairs of hex chars) """
    pairList = [s[i:i+2].encode() for i in range(0, len(s), 2)]
    return b''.join(pairList[::-1]).decode()
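# Worked example (illustrative): hex_switchEndian('1a2b3c') -> '3c2b1a';
# the hex pairs '1a', '2b', '3c' are reversed as units, not as single chars.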

def uint32(x):
    return x & 0xffffffff

def bytereverse(x):
    return uint32((((x) << 24) | (((x) << 8) & 0x00ff0000) |
                   (((x) >> 8) & 0x0000ff00) | ((x) >> 24)))
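# Worked example: bytereverse(0x12345678) -> 0x78563412 (the four bytes of a
# 32-bit word in reverse order).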

def bufreverse(in_buf):
    out_words = []
    for i in range(0, len(in_buf), 4):
        word = struct.unpack('@I', in_buf[i:i+4])[0]
        out_words.append(struct.pack('@I', bytereverse(word)))
    return b''.join(out_words)

def wordreverse(in_buf):
    out_words = []
    for i in range(0, len(in_buf), 4):
        out_words.append(in_buf[i:i+4])
    out_words.reverse()
    return b''.join(out_words)
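# Applied in sequence (as in calc_hash_str below), bufreverse() and
# wordreverse() reverse every byte of the buffer: the first swaps bytes within
# each 32-bit word, the second reverses the order of the words. This yields
# the byte order conventionally used when block hashes are displayed as hex.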

def calc_hdr_hash(blk_hdr):
    # Dash uses the X11 PoW hash instead of Bitcoin's double-SHA256
    # (the commented-out code below):
    #hash1 = hashlib.sha256()
    #hash1.update(blk_hdr)
    #hash1_o = hash1.digest()
    #hash2 = hashlib.sha256()
    #hash2.update(hash1_o)
    #hash2_o = hash2.digest()
    #return hash2_o
    pow_hash = dash_hash.getPoWHash(blk_hdr)
    return pow_hash

def calc_hash_str(blk_hdr):
    hash = calc_hdr_hash(blk_hdr)
    hash = bufreverse(hash)
    hash = wordreverse(hash)
    hash_str = hexlify(hash).decode('utf-8')
    return hash_str

def get_blk_dt(blk_hdr):
    members = struct.unpack("<I", blk_hdr[68:68+4])
    nTime = members[0]
    dt = datetime.datetime.fromtimestamp(nTime)
    dt_ym = datetime.datetime(dt.year, dt.month, 1)
    return (dt_ym, nTime)
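# (nTime sits at byte offset 68 of the 80-byte header: 4 bytes of version,
# 32 bytes of previous-block hash and 32 bytes of merkle root precede it.)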

# When getting the list of block hashes, undo any byte reversals.
def get_block_hashes(settings):
    blkindex = []
    with open(settings['hashlist'], "r", encoding="utf8") as f:
        for line in f:
            line = line.rstrip()
            if settings['rev_hash_bytes'] == 'true':
                line = hex_switchEndian(line)
            blkindex.append(line)

    print("Read " + str(len(blkindex)) + " hashes")

    return blkindex

# The block map shouldn't give or receive byte-reversed hashes.
def mkblockmap(blkindex):
    blkmap = {}
    for height, hash in enumerate(blkindex):
        blkmap[hash] = height
    return blkmap

# Block header and extent on disk
BlockExtent = namedtuple('BlockExtent', ['fn', 'offset', 'inhdr', 'blkhdr', 'size'])
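# fn/offset locate the block payload in the input blk*.dat file (offset points
# just past the 80-byte header); inhdr is the 8-byte magic+length prefix,
# blkhdr the 80-byte block header, and size the remaining payload length.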

class BlockDataCopier:
    def __init__(self, settings, blkindex, blkmap):
        self.settings = settings
        self.blkindex = blkindex
        self.blkmap = blkmap
        self.inFn = 0
        self.inF = None
        self.outFn = 0
        self.outsz = 0
        self.outF = None
        self.outFname = None
        self.blkCountIn = 0
        self.blkCountOut = 0

        self.lastDate = datetime.datetime(2000, 1, 1)
        self.highTS = 1408893517 - 315360000
        self.timestampSplit = False
        self.fileOutput = True
        self.setFileTime = False
        self.maxOutSz = settings['max_out_sz']
        if 'output' in settings:
            self.fileOutput = False
        if settings['file_timestamp'] != 0:
            self.setFileTime = True
        if settings['split_timestamp'] != 0:
            self.timestampSplit = True
        # Extents and cache for out-of-order blocks
        self.blockExtents = {}
        self.outOfOrderData = {}
        self.outOfOrderSize = 0 # running total size for items in outOfOrderData

    def writeBlock(self, inhdr, blk_hdr, rawblock):
        blockSizeOnDisk = len(inhdr) + len(blk_hdr) + len(rawblock)
        if not self.fileOutput and ((self.outsz + blockSizeOnDisk) > self.maxOutSz):
            self.outF.close()
            if self.setFileTime:
                os.utime(self.outFname, (int(time.time()), self.highTS))
            self.outF = None
            self.outFname = None
            self.outFn = self.outFn + 1
            self.outsz = 0

        (blkDate, blkTS) = get_blk_dt(blk_hdr)
        if self.timestampSplit and (blkDate > self.lastDate):
            print("New month " + blkDate.strftime("%Y-%m") + " @ " + self.hash_str)
            self.lastDate = blkDate
            if self.outF:
                self.outF.close()
                if self.setFileTime:
                    os.utime(self.outFname, (int(time.time()), self.highTS))
                self.outF = None
                self.outFname = None
                self.outFn = self.outFn + 1
                self.outsz = 0

        if not self.outF:
            if self.fileOutput:
                self.outFname = self.settings['output_file']
            else:
                self.outFname = os.path.join(self.settings['output'], "blk%05d.dat" % self.outFn)
            print("Output file " + self.outFname)
            self.outF = open(self.outFname, "wb")

        self.outF.write(inhdr)
        self.outF.write(blk_hdr)
        self.outF.write(rawblock)
        self.outsz = self.outsz + len(inhdr) + len(blk_hdr) + len(rawblock)

        self.blkCountOut = self.blkCountOut + 1
        if blkTS > self.highTS:
            self.highTS = blkTS

        if (self.blkCountOut % 1000) == 0:
            print('%i blocks scanned, %i blocks written (of %i, %.1f%% complete)' %
                  (self.blkCountIn, self.blkCountOut, len(self.blkindex), 100.0 * self.blkCountOut / len(self.blkindex)))

    def inFileName(self, fn):
        return os.path.join(self.settings['input'], "blk%05d.dat" % fn)

    def fetchBlock(self, extent):
        '''Fetch block contents from disk given extents'''
        with open(self.inFileName(extent.fn), "rb") as f:
            f.seek(extent.offset)
            return f.read(extent.size)

    def copyOneBlock(self):
        '''Find the next block to be written in the input, and copy it to the output.'''
        extent = self.blockExtents.pop(self.blkCountOut)
        if self.blkCountOut in self.outOfOrderData:
            # If the data is cached, use it from memory and remove from the cache
            rawblock = self.outOfOrderData.pop(self.blkCountOut)
            self.outOfOrderSize -= len(rawblock)
        else: # Otherwise look up data on disk
            rawblock = self.fetchBlock(extent)

        self.writeBlock(extent.inhdr, extent.blkhdr, rawblock)

    def run(self):
        while self.blkCountOut < len(self.blkindex):
            if not self.inF:
                fname = self.inFileName(self.inFn)
                print("Input file " + fname)
                try:
                    self.inF = open(fname, "rb")
                except IOError:
                    print("Premature end of block data")
                    return

            inhdr = self.inF.read(8)
            # inhdr is bytes, so index 0 is an int: compare against 0, not "\0",
            # to detect EOF or zero padding at the end of the file.
            if (not inhdr or (inhdr[0] == 0)):
                self.inF.close()
                self.inF = None
                self.inFn = self.inFn + 1
                continue

            inMagic = inhdr[:4]
            if (inMagic != self.settings['netmagic']):
                print("Invalid magic: " + hexlify(inMagic).decode('utf-8'))
                return
            inLenLE = inhdr[4:]
            su = struct.unpack("<I", inLenLE)
            inLen = su[0] - 80 # length without header
            blk_hdr = self.inF.read(80)
            inExtent = BlockExtent(self.inFn, self.inF.tell(), inhdr, blk_hdr, inLen)

            self.hash_str = calc_hash_str(blk_hdr)
            if self.hash_str not in self.blkmap:
                # Because blocks can be written to files out-of-order as of 0.10, the script
                # may encounter blocks it doesn't know about. Treat as debug output.
                if self.settings['debug_output'] == 'true':
                    print("Skipping unknown block " + self.hash_str)
                self.inF.seek(inLen, os.SEEK_CUR)
                continue

            blkHeight = self.blkmap[self.hash_str]
            self.blkCountIn += 1

            if self.blkCountOut == blkHeight:
                # If in-order block, just copy
                rawblock = self.inF.read(inLen)
                self.writeBlock(inhdr, blk_hdr, rawblock)

                # See if we can catch up to prior out-of-order blocks
                while self.blkCountOut in self.blockExtents:
                    self.copyOneBlock()

            else: # If out-of-order, skip over block data for now
                self.blockExtents[blkHeight] = inExtent
                if self.outOfOrderSize < self.settings['out_of_order_cache_sz']:
                    # If there is space in the cache, read the data
                    # Reading the data in file sequence instead of seeking and fetching it later is preferred,
                    # but we don't want to fill up memory
                    self.outOfOrderData[blkHeight] = self.inF.read(inLen)
                    self.outOfOrderSize += inLen
                else: # If no space in cache, seek forward
                    self.inF.seek(inLen, os.SEEK_CUR)

        print("Done (%i blocks written)" % (self.blkCountOut))

if __name__ == '__main__':
    if len(sys.argv) != 2:
        print("Usage: linearize-data.py CONFIG-FILE")
        sys.exit(1)
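
    # An illustrative CONFIG-FILE (example values; these are the keys read
    # below, and unset keys fall back to the Dash mainnet defaults):
    #
    #   netmagic=bf0c6bbd
    #   input=/home/user/.dashcore/blocks
    #   hashlist=hashlist.txt
    #   output_file=/home/user/bootstrap.dat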
    f = open(sys.argv[1], encoding="utf8")
    for line in f:
        # skip comment lines (raw strings avoid invalid-escape warnings)
        m = re.search(r'^\s*#', line)
        if m:
            continue

        # parse key=value lines
        m = re.search(r'^(\w+)\s*=\s*(\S.*)$', line)
        if m is None:
            continue
        settings[m.group(1)] = m.group(2)
    f.close()

    # Force hash byte format setting to be lowercase to make comparisons easier.
    # Also place upfront in case any settings need to know about it.
    if 'rev_hash_bytes' not in settings:
        settings['rev_hash_bytes'] = 'false'
    settings['rev_hash_bytes'] = settings['rev_hash_bytes'].lower()

    if 'netmagic' not in settings:
        settings['netmagic'] = 'bf0c6bbd'
    if 'genesis' not in settings:
        settings['genesis'] = '00000ffd590b1485b3caadc19b22e6379c733355108f107a430458cdf3407ab6'
    if 'input' not in settings:
        settings['input'] = 'input'
    if 'hashlist' not in settings:
        settings['hashlist'] = 'hashlist.txt'
    if 'file_timestamp' not in settings:
        settings['file_timestamp'] = 0
    if 'split_timestamp' not in settings:
        settings['split_timestamp'] = 0
    if 'max_out_sz' not in settings:
        settings['max_out_sz'] = 1000 * 1000 * 1000
    if 'out_of_order_cache_sz' not in settings:
        settings['out_of_order_cache_sz'] = 100 * 1000 * 1000
    if 'debug_output' not in settings:
        settings['debug_output'] = 'false'

    settings['max_out_sz'] = int(settings['max_out_sz'])
    settings['split_timestamp'] = int(settings['split_timestamp'])
    settings['file_timestamp'] = int(settings['file_timestamp'])
    settings['netmagic'] = unhexlify(settings['netmagic'].encode('utf-8'))
    settings['out_of_order_cache_sz'] = int(settings['out_of_order_cache_sz'])
    settings['debug_output'] = settings['debug_output'].lower()

    if 'output_file' not in settings and 'output' not in settings:
        print("Missing output file / directory")
        sys.exit(1)

    blkindex = get_block_hashes(settings)
    blkmap = mkblockmap(blkindex)

    # Block hash map won't be byte-reversed. Neither should the genesis hash.
    if settings['genesis'] not in blkmap:
        print("Genesis block not found in hashlist")
    else:
        BlockDataCopier(settings, blkindex, blkmap).run()