From b9d95bd9a1b3e8613373ecd228805518f2852985 Mon Sep 17 00:00:00 2001 From: Douglas Roark Date: Wed, 18 Jan 2017 22:22:46 -0800 Subject: [PATCH] Fix various minor linearization script issues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - The last-timestamp-encountered variable wasn’t being used properly. Rewrite code to properly allow for new blockchain files to be written when split by month. - Properly set a blockchain file’s access and modify times. - Add a “debug output” option to quiet certain output that might not always be desirable. - Update the README. --- contrib/linearize/README.md | 10 +++++-- contrib/linearize/example-linearize.cfg | 13 ++++++++- contrib/linearize/linearize-data.py | 36 ++++++++++++++----------- 3 files changed, 41 insertions(+), 18 deletions(-) diff --git a/contrib/linearize/README.md b/contrib/linearize/README.md index adc9a559cc..0971e7816b 100644 --- a/contrib/linearize/README.md +++ b/contrib/linearize/README.md @@ -32,8 +32,11 @@ Required configuration file settings: * `output`: Output directory for linearized `blocks/blkNNNNN.dat` output. Optional config file setting for linearize-data: -* `file_timestamp`: Set each file's last-modified time to that of the most -recent block in that file. +* `debug_output`: Some printouts may not always be desired. If true, such output +will be printed. +* `file_timestamp`: Set each file's last-accessed and last-modified times, +respectively, to the current time and to the timestamp of the most recent block +written to the script's blockchain. * `genesis`: The hash of the genesis block in the blockchain. * `input`: bitcoind blocks/ directory containing blkNNNNN.dat * `hashlist`: text file containing list of block hashes created by @@ -41,6 +44,9 @@ linearize-hashes.py. * `max_out_sz`: Maximum size for files created by the `output_file` option. (Default: `1000*1000*1000 bytes`) * `netmagic`: Network magic number. +* `out_of_order_cache_sz`: If out-of-order blocks are being read, the block can +be written to a cache so that the blockchain doesn't have to be seeked again. +This option specifies the cache size. (Default: `100*1000*1000 bytes`) * `rev_hash_bytes`: If true, the block hash list written by linearize-hashes.py will be byte-reversed when read by linearize-data.py. See the linearize-hashes entry for more information. diff --git a/contrib/linearize/example-linearize.cfg b/contrib/linearize/example-linearize.cfg index cccdd79213..db53dc0ef6 100644 --- a/contrib/linearize/example-linearize.cfg +++ b/contrib/linearize/example-linearize.cfg @@ -1,4 +1,3 @@ - # bitcoind RPC settings (linearize-hashes) rpcuser=someuser rpcpassword=somepassword @@ -21,6 +20,9 @@ input=/home/example/.bitcoin/blocks #genesis=000000000933ea01ad0ee984209779baaec3ced90fa3f408719526f8d77f4943 #input=/home/example/.bitcoin/testnet3/blocks +# "output" option causes blockchain files to be written to the given location, +# with "output_file" ignored. If not used, "output_file" is used instead. +# output=/home/example/blockchain_directory output_file=/home/example/Downloads/bootstrap.dat hashlist=hashlist.txt @@ -29,3 +31,12 @@ out_of_order_cache_sz = 100000000 # Do we want the reverse the hash bytes coming from getblockhash? rev_hash_bytes = False + +# On a new month, do we want to set the access and modify times of the new +# blockchain file? +file_timestamp = 0 +# Do we want to split the blockchain files given a new month or specific height? +split_timestamp = 0 + +# Do we want debug printouts? +debug_output = False diff --git a/contrib/linearize/linearize-data.py b/contrib/linearize/linearize-data.py index 3fdec134b8..afcec2b60a 100755 --- a/contrib/linearize/linearize-data.py +++ b/contrib/linearize/linearize-data.py @@ -134,7 +134,7 @@ class BlockDataCopier: if not self.fileOutput and ((self.outsz + blockSizeOnDisk) > self.maxOutSz): self.outF.close() if self.setFileTime: - os.utime(outFname, (int(time.time()), highTS)) + os.utime(self.outFname, (int(time.time()), self.highTS)) self.outF = None self.outFname = None self.outFn = self.outFn + 1 @@ -142,12 +142,12 @@ class BlockDataCopier: (blkDate, blkTS) = get_blk_dt(blk_hdr) if self.timestampSplit and (blkDate > self.lastDate): - print("New month " + blkDate.strftime("%Y-%m") + " @ " + hash_str) - lastDate = blkDate - if outF: - outF.close() - if setFileTime: - os.utime(outFname, (int(time.time()), highTS)) + print("New month " + blkDate.strftime("%Y-%m") + " @ " + self.hash_str) + self.lastDate = blkDate + if self.outF: + self.outF.close() + if self.setFileTime: + os.utime(self.outFname, (int(time.time()), self.highTS)) self.outF = None self.outFname = None self.outFn = self.outFn + 1 @@ -155,11 +155,11 @@ class BlockDataCopier: if not self.outF: if self.fileOutput: - outFname = self.settings['output_file'] + self.outFname = self.settings['output_file'] else: - outFname = os.path.join(self.settings['output'], "blk%05d.dat" % self.outFn) - print("Output file " + outFname) - self.outF = open(outFname, "wb") + self.outFname = os.path.join(self.settings['output'], "blk%05d.dat" % self.outFn) + print("Output file " + self.outFname) + self.outF = open(self.outFname, "wb") self.outF.write(inhdr) self.outF.write(blk_hdr) @@ -223,13 +223,16 @@ class BlockDataCopier: blk_hdr = self.inF.read(80) inExtent = BlockExtent(self.inFn, self.inF.tell(), inhdr, blk_hdr, inLen) - hash_str = calc_hash_str(blk_hdr) - if not hash_str in blkmap: - print("Skipping unknown block " + hash_str) + self.hash_str = calc_hash_str(blk_hdr) + if not self.hash_str in blkmap: + # Because blocks can be written to files out-of-order as of 0.10, the script + # may encounter blocks it doesn't know about. Treat as debug output. + if settings['debug_output'] == 'true': + print("Skipping unknown block " + self.hash_str) self.inF.seek(inLen, os.SEEK_CUR) continue - blkHeight = self.blkmap[hash_str] + blkHeight = self.blkmap[self.hash_str] self.blkCountIn += 1 if self.blkCountOut == blkHeight: @@ -295,12 +298,15 @@ if __name__ == '__main__': settings['max_out_sz'] = 1000 * 1000 * 1000 if 'out_of_order_cache_sz' not in settings: settings['out_of_order_cache_sz'] = 100 * 1000 * 1000 + if 'debug_output' not in settings: + settings['debug_output'] = 'false' settings['max_out_sz'] = int(settings['max_out_sz']) settings['split_timestamp'] = int(settings['split_timestamp']) settings['file_timestamp'] = int(settings['file_timestamp']) settings['netmagic'] = unhexlify(settings['netmagic'].encode('utf-8')) settings['out_of_order_cache_sz'] = int(settings['out_of_order_cache_sz']) + settings['debug_output'] = settings['debug_output'].lower() if 'output_file' not in settings and 'output' not in settings: print("Missing output file / directory")