From aedc74dfa688306c5a139a88782da74f69ba6757 Mon Sep 17 00:00:00 2001 From: "Wladimir J. van der Laan" Date: Mon, 6 Oct 2014 17:55:55 +0200 Subject: [PATCH] contrib: make linearize-data.py cope with out-of-order blocks Make it possible to read blocks in any order. This will be required after headers-first (#4468), so should be merged before that. - Read block header. For expected blocks, continue, else skip. - For in-order blocks: copy block contents directly. Write prior out-of-order blocks if this connects a consecutive span. - For out-of-order blocks, store extents of block data for later retrieval. Cache out-of-order blocks in memory up to 100MB (configurable). --- contrib/linearize/example-linearize.cfg | 2 + contrib/linearize/linearize-data.py | 242 +++++++++++++++--------- 2 files changed, 154 insertions(+), 90 deletions(-) diff --git a/contrib/linearize/example-linearize.cfg b/contrib/linearize/example-linearize.cfg index 071345f23..e0fef1388 100644 --- a/contrib/linearize/example-linearize.cfg +++ b/contrib/linearize/example-linearize.cfg @@ -15,3 +15,5 @@ output_file=/home/example/Downloads/bootstrap.dat hashlist=hashlist.txt split_year=1 +# Maximum size in bytes of out-of-order blocks cache in memory +out_of_order_cache_sz = 100000000 diff --git a/contrib/linearize/linearize-data.py b/contrib/linearize/linearize-data.py index 3b5d198c1..2dac3a614 100755 --- a/contrib/linearize/linearize-data.py +++ b/contrib/linearize/linearize-data.py @@ -2,11 +2,12 @@ # # linearize-data.py: Construct a linear, no-fork version of the chain. # -# Copyright (c) 2013 The Bitcoin developers +# Copyright (c) 2013-2014 The Bitcoin developers # Distributed under the MIT/X11 software license, see the accompanying # file COPYING or http://www.opensource.org/licenses/mit-license.php. 
# +from __future__ import print_function, division import json import struct import re @@ -17,10 +18,10 @@ import sys import hashlib import datetime import time +from collections import namedtuple settings = {} - def uint32(x): return x & 0xffffffffL @@ -78,116 +79,174 @@ def get_block_hashes(settings): return blkindex -def mkblockset(blkindex): +def mkblockmap(blkindex): blkmap = {} - for hash in blkindex: - blkmap[hash] = True + for height,hash in enumerate(blkindex): + blkmap[hash] = height return blkmap -def copydata(settings, blkindex, blkset): - inFn = 0 - inF = None - outFn = 0 - outsz = 0 - outF = None - outFname = None - blkCount = 0 +# Block header and extent on disk +BlockExtent = namedtuple('BlockExtent', ['fn', 'offset', 'inhdr', 'blkhdr', 'size']) - lastDate = datetime.datetime(2000, 1, 1) - highTS = 1408893517 - 315360000 - timestampSplit = False - fileOutput = True - setFileTime = False - maxOutSz = settings['max_out_sz'] - if 'output' in settings: - fileOutput = False - if settings['file_timestamp'] != 0: - setFileTime = True - if settings['split_timestamp'] != 0: - timestampSplit = True +class BlockDataCopier: + def __init__(self, settings, blkindex, blkmap): + self.settings = settings + self.blkindex = blkindex + self.blkmap = blkmap - while True: - if not inF: - fname = "%s/blk%05d.dat" % (settings['input'], inFn) - print("Input file" + fname) - try: - inF = open(fname, "rb") - except IOError: - print "Done" - return + self.inFn = 0 + self.inF = None + self.outFn = 0 + self.outsz = 0 + self.outF = None + self.outFname = None + self.blkCountIn = 0 + self.blkCountOut = 0 - inhdr = inF.read(8) - if (not inhdr or (inhdr[0] == "\0")): - inF.close() - inF = None - inFn = inFn + 1 - continue + self.lastDate = datetime.datetime(2000, 1, 1) + self.highTS = 1408893517 - 315360000 + self.timestampSplit = False + self.fileOutput = True + self.setFileTime = False + self.maxOutSz = settings['max_out_sz'] + if 'output' in settings: + self.fileOutput = False + 
if settings['file_timestamp'] != 0: + self.setFileTime = True + if settings['split_timestamp'] != 0: + self.timestampSplit = True + # Extents and cache for out-of-order blocks + self.blockExtents = {} + self.outOfOrderData = {} + self.outOfOrderSize = 0 # running total size for items in outOfOrderData - inMagic = inhdr[:4] - if (inMagic != settings['netmagic']): - print("Invalid magic:" + inMagic) - return - inLenLE = inhdr[4:] - su = struct.unpack(" maxOutSz): - outF.close() - if setFileTime: + def writeBlock(self, inhdr, blk_hdr, rawblock): + if not self.fileOutput and ((self.outsz + self.inLen) > self.maxOutSz): + self.outF.close() + if self.setFileTime: os.utime(outFname, (int(time.time()), highTS)) - outF = None - outFname = None - outFn = outFn + 1 - outsz = 0 + self.outF = None + self.outFname = None + self.outFn = outFn + 1 + self.outsz = 0 (blkDate, blkTS) = get_blk_dt(blk_hdr) - if timestampSplit and (blkDate > lastDate): + if self.timestampSplit and (blkDate > self.lastDate): print("New month " + blkDate.strftime("%Y-%m") + " @ " + hash_str) lastDate = blkDate if outF: outF.close() if setFileTime: os.utime(outFname, (int(time.time()), highTS)) - outF = None - outFname = None - outFn = outFn + 1 - outsz = 0 + self.outF = None + self.outFname = None + self.outFn = self.outFn + 1 + self.outsz = 0 - if not outF: - if fileOutput: - outFname = settings['output_file'] + if not self.outF: + if self.fileOutput: + outFname = self.settings['output_file'] else: - outFname = "%s/blk%05d.dat" % (settings['output'], outFn) + outFname = "%s/blk%05d.dat" % (self.settings['output'], outFn) print("Output file" + outFname) - outF = open(outFname, "wb") + self.outF = open(outFname, "wb") - outF.write(inhdr) - outF.write(rawblock) - outsz = outsz + inLen + 8 + self.outF.write(inhdr) + self.outF.write(blk_hdr) + self.outF.write(rawblock) + self.outsz = self.outsz + len(inhdr) + len(blk_hdr) + len(rawblock) - blkCount = blkCount + 1 - if blkTS > highTS: - highTS = blkTS + 
self.blkCountOut = self.blkCountOut + 1 + if blkTS > self.highTS: + self.highTS = blkTS - if (blkCount % 1000) == 0: - print("Wrote " + str(blkCount) + " blocks") + if (self.blkCountOut % 1000) == 0: + print('%i blocks scanned, %i blocks written (of %i, %.1f%% complete)' % + (self.blkCountIn, self.blkCountOut, len(self.blkindex), 100.0 * self.blkCountOut / len(self.blkindex))) + + def inFileName(self, fn): + return "%s/blk%05d.dat" % (self.settings['input'], fn) + + def fetchBlock(self, extent): + '''Fetch block contents from disk given extents''' + with open(self.inFileName(extent.fn), "rb") as f: + f.seek(extent.offset) + return f.read(extent.size) + + def copyOneBlock(self): + '''Find the next block to be written in the input, and copy it to the output.''' + extent = self.blockExtents.pop(self.blkCountOut) + if self.blkCountOut in self.outOfOrderData: + # If the data is cached, use it from memory and remove from the cache + rawblock = self.outOfOrderData.pop(self.blkCountOut) + self.outOfOrderSize -= len(rawblock) + else: # Otherwise look up data on disk + rawblock = self.fetchBlock(extent) + + self.writeBlock(extent.inhdr, extent.blkhdr, rawblock) + + def run(self): + while self.blkCountOut < len(self.blkindex): + if not self.inF: + fname = self.inFileName(self.inFn) + print("Input file" + fname) + try: + self.inF = open(fname, "rb") + except IOError: + print("Premature end of block data") + return + + inhdr = self.inF.read(8) + if (not inhdr or (inhdr[0] == "\0")): + self.inF.close() + self.inF = None + self.inFn = self.inFn + 1 + continue + + inMagic = inhdr[:4] + if (inMagic != self.settings['netmagic']): + print("Invalid magic:" + inMagic) + return + inLenLE = inhdr[4:] + su = struct.unpack("