checkpoint: pre list-to-vector refactor, fixes dictionary crash, mostly
- Add uncompressed dictionary (.dict) file support to avoid decompression memory issues - Implement chunked on-demand parsing for large definitions - Add backward navigation with re-parse capability - Limit cached pages to MAX_CACHED_PAGES (4) to prevent memory exhaustion - Add helper script for extracting/recompressing dictzip files
This commit is contained in:
335
scripts/recompress_dictzip.py
Normal file
335
scripts/recompress_dictzip.py
Normal file
@@ -0,0 +1,335 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Recompress a dictzip file with a custom chunk size.
|
||||
|
||||
Dictzip is a gzip-compatible format that allows random access by compressing
|
||||
data in independent chunks. The standard dictzip uses ~58KB chunks, but this
|
||||
can cause memory issues on embedded devices like ESP32.
|
||||
|
||||
This script recompresses dictionary files with smaller chunks (default 16KB)
|
||||
to reduce memory requirements during decompression.
|
||||
|
||||
Usage:
|
||||
# From uncompressed .dict file:
|
||||
python recompress_dictzip.py reader.dict reader.dict.dz --chunk-size 16384
|
||||
|
||||
# From existing .dict.dz file (will decompress first):
|
||||
python recompress_dictzip.py reader.dict.dz reader_small.dict.dz --chunk-size 16384
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import gzip
|
||||
import struct
|
||||
import sys
|
||||
import time
|
||||
import zlib
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def read_input_file(input_path: Path) -> bytes:
    """Return the full contents of *input_path* as uncompressed bytes.

    Files whose final suffix is ``.dz`` or ``.gz`` (case-insensitive) are
    read through :mod:`gzip`; anything else is read verbatim. A short
    progress report is printed in either case.
    """
    needs_gunzip = input_path.suffix.lower() in ('.dz', '.gz')

    # Plain files first: nothing to decode, just slurp the bytes.
    if not needs_gunzip:
        print(f"Reading {input_path}...")
        payload = input_path.read_bytes()
        print(f" Size: {len(payload):,} bytes")
        return payload

    print(f"Decompressing {input_path}...")
    with gzip.open(input_path, 'rb') as gz_stream:
        payload = gz_stream.read()
    print(f" Decompressed size: {len(payload):,} bytes")
    return payload
|
||||
|
||||
|
||||
def compress_chunk(data: bytes, level: int = 9) -> bytes:
    """Deflate *data* as one self-contained raw stream and return the bytes.

    A negative ``wbits`` (-15) selects raw deflate with a 32 KiB window:
    no zlib header and no Adler-32 trailer are emitted. A fresh compressor
    is used per call, so no state is shared between chunks.
    """
    # NOTE(review): flush() with no argument finishes the stream, so every
    # chunk ends with a final deflate block. Concatenated chunks are then
    # separate streams rather than one continuous stream; a plain gzip reader
    # would presumably stop after the first chunk - confirm whether whole-file
    # gzip compatibility is required before changing this.
    deflater = zlib.compressobj(level, zlib.DEFLATED, wbits=-15)
    body = deflater.compress(data)
    tail = deflater.flush()
    return body + tail
|
||||
|
||||
|
||||
def create_dictzip(data: bytes, output_path: Path, chunk_size: int = 16384,
                   compression_level: int = 9) -> None:
    """
    Create a dictzip file from uncompressed data.

    Layout of the file that is written:
    - gzip header with only the FEXTRA flag set
    - extra field holding one 'RA' subfield: VER, CHLEN, CHCNT and one
      16-bit compressed size per chunk
    - the compressed chunks (raw deflate, produced by ``compress_chunk``)
    - gzip trailer (CRC32 + ISIZE of the uncompressed data)

    Args:
        data: Uncompressed dictionary contents. May be empty.
        output_path: Destination file; it is created or overwritten.
        chunk_size: Uncompressed bytes per chunk (1024..65535).
        compression_level: zlib level passed to ``compress_chunk``.

    Raises:
        ValueError: If ``chunk_size`` is out of range, if the chunk table
            cannot fit in the 16-bit gzip extra field, or if any single
            compressed chunk is larger than 65535 bytes.
    """
    # Validate chunk size (CHLEN is a 16-bit field)
    if chunk_size > 65535:
        raise ValueError(f"Chunk size {chunk_size} exceeds maximum of 65535")
    if chunk_size < 1024:
        raise ValueError(f"Chunk size {chunk_size} is too small (minimum 1024)")

    # Ceiling division; zero chunks for empty input
    num_chunks = (len(data) + chunk_size - 1) // chunk_size

    # XLEN is 16 bits and counts the WHOLE extra field: the 4-byte subfield
    # header (SI1, SI2, LEN), the 6-byte RA header (VER, CHLEN, CHCNT) and
    # 2 bytes per chunk. Leaving out the subfield header would let XLEN
    # overflow for the largest chunk counts.
    max_chunks = (65535 - 4 - 6) // 2
    if num_chunks > max_chunks:
        raise ValueError(f"Too many chunks ({num_chunks}) for dictzip format (max {max_chunks})")

    print(f"Compressing into {num_chunks} chunks of {chunk_size} bytes...")

    # Compress each chunk independently; fail fast if one cannot be indexed
    compressed_chunks = []
    for i in range(num_chunks):
        start = i * chunk_size
        compressed = compress_chunk(data[start:start + chunk_size], compression_level)
        if len(compressed) > 65535:
            # Each table entry is 16 bits wide
            raise ValueError(f"Compressed chunk size {len(compressed)} exceeds 65535 bytes")
        compressed_chunks.append(compressed)

        if (i + 1) % 500 == 0 or i == num_chunks - 1:
            print(f" Compressed chunk {i + 1}/{num_chunks}")

    # gzip trailer values are computed over the uncompressed data
    crc32 = zlib.crc32(data) & 0xffffffff
    isize = len(data) & 0xffffffff

    # Extra field: SI1 SI2 LEN | VER CHLEN CHCNT | sizes[CHCNT]
    ra_subfield_len = 6 + 2 * num_chunks
    extra_field = (
        b'RA'
        + struct.pack('<H', ra_subfield_len)
        + struct.pack('<HHH', 1, chunk_size, num_chunks)
        + struct.pack(f'<{num_chunks}H', *(len(chunk) for chunk in compressed_chunks))
    )

    # gzip header: magic, CM=8 (deflate), FLG=FEXTRA, MTIME, XFL, OS=unknown, XLEN
    timestamp = int(time.time())
    xfl = 2 if compression_level == 9 else (4 if compression_level == 1 else 0)
    header = struct.pack('<2sBBIBBH', b'\x1f\x8b', 0x08, 0x04,
                         timestamp, xfl, 0xff, len(extra_field))

    print(f"Writing {output_path}...")
    with open(output_path, 'wb') as f:
        f.write(header)
        f.write(extra_field)
        f.writelines(compressed_chunks)
        f.write(struct.pack('<II', crc32, isize))

    # Report stats; an empty input has no meaningful ratio, so report 0
    output_size = output_path.stat().st_size
    ratio = (1 - output_size / len(data)) * 100 if data else 0.0
    print(f" Output size: {output_size:,} bytes ({ratio:.1f}% compression)")
    print(f" Chunk size: {chunk_size} bytes")
    print(f" Number of chunks: {num_chunks}")
|
||||
|
||||
|
||||
def _read_exact(f, count: int) -> bytes:
    """Read exactly *count* bytes from binary file *f*; raise EOFError if short."""
    blob = f.read(count)
    if len(blob) != count:
        raise EOFError(f"wanted {count} bytes, got {len(blob)}")
    return blob


def _skip_zero_terminated(f) -> None:
    """Advance *f* past a NUL-terminated gzip header string (FNAME/FCOMMENT)."""
    while _read_exact(f, 1) != b'\x00':
        pass


def verify_dictzip(path: Path) -> bool:
    """Verify a dictzip file by reading its header and decompressing chunk by chunk.

    The gzip header is parsed, the 'RA' subfield is located in the extra
    field, every chunk listed there is inflated independently as raw
    deflate, and the CRC32/ISIZE of the result are compared with the gzip
    trailer (the last 8 bytes of the file).

    Optional FNAME, FCOMMENT and FHCRC header fields are skipped so that the
    chunk data is located correctly when they are present.

    Args:
        path: File to check.

    Returns:
        True when the file parses, inflates and matches its trailer; False
        otherwise. Diagnostics are printed; malformed or truncated input is
        reported rather than raised.
    """
    print(f"Verifying {path}...")

    with open(path, 'rb') as f:
        # Fixed part of the gzip header
        try:
            if f.read(2) != b'\x1f\x8b':
                print(" ERROR: Invalid gzip magic number")
                return False

            method = _read_exact(f, 1)[0]
            if method != 8:
                print(f" ERROR: Unknown compression method: {method}")
                return False

            flags = _read_exact(f, 1)[0]
            if not (flags & 0x04):
                print(" ERROR: FEXTRA flag not set - not a dictzip file")
                return False

            _read_exact(f, 6)  # MTIME (4) + XFL (1) + OS (1)

            xlen = struct.unpack('<H', _read_exact(f, 2))[0]
            extra = _read_exact(f, xlen)
        except EOFError as e:
            print(f" ERROR: Truncated gzip header: {e}")
            return False

        # Walk the subfields (SI1 SI2 LEN data...) looking for 'RA'
        chunk_sizes = None
        pos = 0
        while pos + 4 <= len(extra):
            slen = struct.unpack_from('<H', extra, pos + 2)[0]

            if extra[pos:pos + 2] == b'RA':
                ra_data = extra[pos + 4:pos + 4 + slen]
                if len(ra_data) < 6:
                    print(" ERROR: RA subfield too short")
                    return False

                ver, chlen, chcnt = struct.unpack_from('<HHH', ra_data, 0)
                print(f" Version: {ver}")
                print(f" Chunk size: {chlen} bytes")
                print(f" Chunk count: {chcnt}")

                # The size table must fill the rest of the subfield exactly
                if len(ra_data) != 6 + 2 * chcnt:
                    print(" ERROR: Chunk sizes array length mismatch")
                    return False

                chunk_sizes = struct.unpack_from(f'<{chcnt}H', ra_data, 6)
                print(f" Total compressed data: {sum(chunk_sizes):,} bytes")
                break

            pos += 4 + slen

        if chunk_sizes is None:
            print(" ERROR: RA subfield not found - not a dictzip file")
            return False

        try:
            # Optional header fields follow the extra field in this order.
            # Skipping them is what makes the chunk offsets line up when a
            # file stores its original name.
            if flags & 0x08:  # FNAME
                _skip_zero_terminated(f)
            if flags & 0x10:  # FCOMMENT
                _skip_zero_terminated(f)
            if flags & 0x02:  # FHCRC
                _read_exact(f, 2)

            # Chunks are stored back to back, so read them sequentially and
            # keep only a running CRC and length instead of the whole payload.
            actual_crc = 0
            actual_size = 0
            for comp_size in chunk_sizes:
                compressed_chunk = f.read(comp_size)

                # Raw inflate (no zlib header), fresh state per chunk
                inflater = zlib.decompressobj(-15)
                piece = inflater.decompress(compressed_chunk) + inflater.flush()
                actual_crc = zlib.crc32(piece, actual_crc)
                actual_size += len(piece)

            print(f" Decompressed size: {actual_size:,} bytes")

            # Trailer: CRC32 + ISIZE in the last 8 bytes of the file
            f.seek(-8, 2)
            expected_crc, expected_size = struct.unpack('<II', f.read(8))

            actual_crc &= 0xffffffff
            actual_size &= 0xffffffff

            if actual_crc != expected_crc:
                print(f" ERROR: CRC mismatch: expected {expected_crc:08x}, got {actual_crc:08x}")
                return False

            if actual_size != expected_size:
                print(f" ERROR: Size mismatch: expected {expected_size}, got {actual_size}")
                return False

            print(f" CRC32: {actual_crc:08x} (verified)")
            print(" Verification: PASSED")
            return True

        except (zlib.error, struct.error, OSError, EOFError) as e:
            print(f" ERROR: Decompression failed: {e}")
            return False
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: recompress a dictionary file or verify one.

    With ``--verify`` only the input file is checked and the process exits
    with status 0 on success or 1 on failure. Otherwise the input is read
    (and decompressed if needed), written out as a dictzip file with the
    requested chunk size, and the result is verified. Expected failures
    (missing files, invalid chunk size, unreadable input) are reported as a
    one-line error with exit status 1 instead of a traceback.
    """
    parser = argparse.ArgumentParser(
        description='Recompress a dictzip file with a custom chunk size.',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Recompress with 16KB chunks (recommended for ESP32):
%(prog)s reader.dict reader.dict.dz --chunk-size 16384

# Recompress from existing .dz file:
%(prog)s reader.dict.dz reader_small.dict.dz --chunk-size 16384

# Verify a dictzip file:
%(prog)s --verify reader.dict.dz
""")

    parser.add_argument('input', nargs='?', help='Input .dict or .dict.dz file')
    parser.add_argument('output', nargs='?', help='Output .dict.dz file')
    parser.add_argument('--chunk-size', '-c', type=int, default=16384,
                        help='Chunk size in bytes (default: 16384, i.e., 16KB)')
    parser.add_argument('--compression-level', '-l', type=int, default=9,
                        choices=range(1, 10), metavar='1-9',
                        help='Compression level 1-9 (default: 9)')
    parser.add_argument('--verify', '-v', action='store_true',
                        help='Verify a dictzip file instead of compressing')

    args = parser.parse_args()

    # Verify-only mode: needs just the input path
    if args.verify:
        if not args.input:
            parser.error("Input file required for verification")
        input_path = Path(args.input)
        if not input_path.exists():
            print(f"Error: File not found: {input_path}")
            sys.exit(1)
        success = verify_dictzip(input_path)
        sys.exit(0 if success else 1)

    if not args.input or not args.output:
        parser.error("Both input and output files are required")

    input_path = Path(args.input)
    output_path = Path(args.output)

    if not input_path.exists():
        print(f"Error: Input file not found: {input_path}")
        sys.exit(1)

    if output_path.exists():
        try:
            response = input(f"Output file {output_path} exists. Overwrite? [y/N] ")
        except EOFError:
            # Non-interactive stdin: take the safe default and do not overwrite
            response = ''
        if response.lower() != 'y':
            print("Aborted.")
            sys.exit(1)

    try:
        # Read and decompress input if needed
        data = read_input_file(input_path)

        # Create new dictzip with specified chunk size
        create_dictzip(data, output_path, args.chunk_size, args.compression_level)
    except (ValueError, OSError, EOFError, zlib.error) as exc:
        # Bad --chunk-size, too many chunks, or unreadable/corrupt input
        print(f"Error: {exc}")
        sys.exit(1)

    # Verify the output
    print()
    if verify_dictzip(output_path):
        print(f"\nSuccess! Created {output_path} with {args.chunk_size}-byte chunks.")
    else:
        print("\nError: Verification failed!")
        sys.exit(1)
|
||||
|
||||
|
||||
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user