Merge bitcoin/bitcoin#32621: contrib: utxo_to_sqlite.py: add option to store txid/spk as BLOBs

7378f27b4f test: run utxo-to-sqlite script test with spk/txid format option combinations (Sebastian Falbesoner)
b30fca7498 contrib: utxo_to_sqlite.py: add options to store txid/spk as BLOBs (Sebastian Falbesoner)

Pull request description:

  This PR is a late follow-up to https://github.com/bitcoin/bitcoin/pull/27432, introducing an option for the utxo-to-sqlite script to store the txid/scriptPubKey columns as bytes (= `BLOB` storage class in sqlite, see e.g. https://www.sqlite.org/datatype3.html) rather than hex strings. This was proposed in earlier reviews (https://github.com/bitcoin/bitcoin/pull/27432#issuecomment-1516857024, https://github.com/bitcoin/bitcoin/pull/27432#issuecomment-1653739351) and has the obvious advantages of a significantly smaller resulting database (and with that, faster conversion) and the avoidance of hex-to-bytes conversion for further processing of the data [1]. The rationale for why hex strings were chosen back then (and still remain the default, if only for compatibility reasons) is laid out in https://github.com/bitcoin/bitcoin/pull/27432#issuecomment-1516922824 [2].

  The approach taken is to introduce two new parameters, `--spk` and `--txid`: `--spk` accepts the values "hex" or "raw" (for the scriptPubKey), and `--txid` accepts "hex", "raw", or "rawle" (for the txid). Thanks to ajtowns for providing this suggestion. Happy to take further inputs on naming and thoughts on future extensibility etc.

  [1] For a concrete example, I found that having these columns as bytes would be nice while working on a SwiftSync hints generator tool (https://github.com/theStack/swiftsync-hints-gen), which takes the result of the utxo-to-sqlite tool as input.
  [2] Note that, in contrast to what I wrote back then, I think there is no ambiguity on byte-string-serialization of txids; they are ultimately just hash results and hence, they should be stored as such, and adding a big/little endian knob wouldn't make much sense. The drawback of not being able to immediately show txid-strings (as one would need to do the bytes-reversal step first, which is not possible in sqlite, see e.g. https://github.com/bitcoin/bitcoin/pull/24952#issuecomment-1165499803) still remains though.

ACKs for top commit:
  ajtowns:
    ACK 7378f27b4f
  w0xlt:
    reACK 7378f27b4f
  sedited:
    ACK 7378f27b4f

Tree-SHA512: 265991a1f00e3d69e06dd9adc34684720affd416042789db2d76226e4b31cf20adc433a74d14140f17739707dee57e6703f72c20bd0f8dd08b6d383d3f28b450
This commit is contained in:
merge-script
2026-02-08 10:37:45 +01:00
2 changed files with 59 additions and 21 deletions

View File

@@ -9,6 +9,9 @@ $ bitcoin-cli dumptxoutset ~/utxos.dat latest
The created database contains a table `utxos` with the following schema:
(txid TEXT, vout INT, value INT, coinbase INT, height INT, scriptpubkey TEXT)
If --txid=raw or --txid=rawle is specified, txid will be BLOB instead;
if --spk=raw, then scriptpubkey will be BLOB instead.
"""
import argparse
import os
@@ -111,7 +114,9 @@ def main():
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('infile', help='filename of compact-serialized UTXO set (input)')
parser.add_argument('outfile', help='filename of created SQLite3 database (output)')
parser.add_argument('-v', '--verbose', action='store_true', help='show details about each UTXO')
parser.add_argument('--verbose', action='store_true', help='show details about each UTXO')
parser.add_argument('--spk', choices=['hex', 'raw'], default='hex', help='encode scriptPubKey as hex or raw bytes')
parser.add_argument('--txid', choices=['hex', 'raw', 'rawle'], default='hex', help='encode txid as hex, raw bytes (sha256 byteorder), or reversed raw bytes (little endian)')
args = parser.parse_args()
if not os.path.exists(args.infile):
@@ -122,9 +127,15 @@ def main():
print(f"Error: provided output file '{args.outfile}' already exists.")
sys.exit(1)
spk_hex = (args.spk == 'hex')
txid_hex = (args.txid == 'hex')
txid_reverse = (args.txid != 'raw')
# create database table
txid_fmt = "TEXT" if txid_hex else "BLOB"
spk_fmt = "TEXT" if spk_hex else "BLOB"
con = sqlite3.connect(args.outfile)
con.execute("CREATE TABLE utxos(txid TEXT, vout INT, value INT, coinbase INT, height INT, scriptpubkey TEXT)")
con.execute(f"CREATE TABLE utxos(txid {txid_fmt}, vout INT, value INT, coinbase INT, height INT, scriptpubkey {spk_fmt})")
# read metadata (magic bytes, version, network magic, block hash, UTXO count)
f = open(args.infile, 'rb')
@@ -153,7 +164,7 @@ def main():
for coin_idx in range(1, num_utxos+1):
# read key (COutPoint)
if coins_per_hash_left == 0: # read next prevout hash
prevout_hash = f.read(32)[::-1].hex()
prevout_hash = f.read(32)
coins_per_hash_left = read_compactsize(f)
prevout_index = read_compactsize(f)
# read value (Coin)
@@ -161,17 +172,21 @@ def main():
height = code >> 1
is_coinbase = code & 1
amount = decompress_amount(read_varint(f))
scriptpubkey = decompress_script(f).hex()
write_batch.append((prevout_hash, prevout_index, amount, is_coinbase, height, scriptpubkey))
scriptpubkey = decompress_script(f)
scriptpubkey_write = scriptpubkey.hex() if spk_hex else scriptpubkey
txid_write = prevout_hash[::-1] if txid_reverse else prevout_hash
txid_write = txid_write.hex() if txid_hex else txid_write
write_batch.append((txid_write, prevout_index, amount, is_coinbase, height, scriptpubkey_write))
if height > max_height:
max_height = height
coins_per_hash_left -= 1
if args.verbose:
print(f"Coin {coin_idx}/{num_utxos}:")
print(f" prevout = {prevout_hash}:{prevout_index}")
print(f" prevout = {prevout_hash[::-1].hex()}:{prevout_index}")
print(f" amount = {amount}, height = {height}, coinbase = {is_coinbase}")
print(f" scriptPubKey = {scriptpubkey}\n")
print(f" scriptPubKey = {scriptpubkey.hex()}\n")
if coin_idx % (16*1024) == 0 or coin_idx == num_utxos:
# write utxo batch to database

View File

@@ -3,6 +3,7 @@
# Distributed under the MIT software license, see the accompanying
# file COPYING or http://www.opensource.org/licenses/mit-license.php.
"""Test utxo-to-sqlite conversion tool"""
from itertools import product
import os.path
try:
import sqlite3
@@ -15,6 +16,7 @@ from test_framework.key import ECKey
from test_framework.messages import (
COutPoint,
CTxOut,
uint256_from_str,
)
from test_framework.crypto.muhash import MuHash3072
from test_framework.script import (
@@ -38,15 +40,33 @@ from test_framework.util import (
from test_framework.wallet import MiniWallet
def calculate_muhash_from_sqlite_utxos(filename):
def calculate_muhash_from_sqlite_utxos(filename, txid_format, spk_format):
muhash = MuHash3072()
con = sqlite3.connect(filename)
cur = con.cursor()
for (txid_hex, vout, value, coinbase, height, spk_hex) in cur.execute("SELECT * FROM utxos"):
for (txid, vout, value, coinbase, height, spk) in cur.execute("SELECT * FROM utxos"):
match txid_format:
case "hex":
assert type(txid) is str
txid_bytes = bytes.fromhex(txid)[::-1]
case "raw":
assert type(txid) is bytes
txid_bytes = txid
case "rawle":
assert type(txid) is bytes
txid_bytes = txid[::-1]
match spk_format:
case "hex":
assert type(spk) is str
spk_bytes = bytes.fromhex(spk)
case "raw":
assert type(spk) is bytes
spk_bytes = spk
# serialize UTXO for MuHash (see function `TxOutSer` in the coinstats module)
utxo_ser = COutPoint(int(txid_hex, 16), vout).serialize()
utxo_ser = COutPoint(uint256_from_str(txid_bytes), vout).serialize()
utxo_ser += (height * 2 + coinbase).to_bytes(4, 'little')
utxo_ser += CTxOut(value, bytes.fromhex(spk_hex)).serialize()
utxo_ser += CTxOut(value, spk_bytes).serialize()
muhash.insert(utxo_ser)
con.close()
return muhash.digest()[::-1].hex()
@@ -100,17 +120,20 @@ class UtxoToSqliteTest(BitcoinTestFramework):
input_filename = os.path.join(self.options.tmpdir, "utxos.dat")
node.dumptxoutset(input_filename, "latest")
self.log.info('Convert UTXO set from compact-serialized format to sqlite format')
output_filename = os.path.join(self.options.tmpdir, "utxos.sqlite")
base_dir = self.config["environment"]["SRCDIR"]
utxo_to_sqlite_path = os.path.join(base_dir, "contrib", "utxo-tools", "utxo_to_sqlite.py")
subprocess.run([sys.executable, utxo_to_sqlite_path, input_filename, output_filename],
check=True, stderr=subprocess.STDOUT)
for i, (txid_format, spk_format) in enumerate(product(["hex", "raw", "rawle"], ["hex", "raw"])):
self.log.info(f'Test utxo-to-sqlite script using txid format "{txid_format}" and spk format "{spk_format}" ({i+1})')
self.log.info('-> Convert UTXO set from compact-serialized format to sqlite format')
output_filename = os.path.join(self.options.tmpdir, f"utxos_{i+1}.sqlite")
base_dir = self.config["environment"]["SRCDIR"]
utxo_to_sqlite_path = os.path.join(base_dir, "contrib", "utxo-tools", "utxo_to_sqlite.py")
arguments = [input_filename, output_filename, f'--txid={txid_format}', f'--spk={spk_format}']
subprocess.run([sys.executable, utxo_to_sqlite_path] + arguments, check=True, stderr=subprocess.STDOUT)
self.log.info('Verify that both UTXO sets match by comparing their MuHash')
muhash_sqlite = calculate_muhash_from_sqlite_utxos(output_filename)
muhash_compact_serialized = node.gettxoutsetinfo('muhash')['muhash']
assert_equal(muhash_sqlite, muhash_compact_serialized)
self.log.info('-> Verify that both UTXO sets match by comparing their MuHash')
muhash_sqlite = calculate_muhash_from_sqlite_utxos(output_filename, txid_format, spk_format)
muhash_compact_serialized = node.gettxoutsetinfo('muhash')['muhash']
assert_equal(muhash_sqlite, muhash_compact_serialized)
self.log.info('')
if __name__ == "__main__":