checkpoint: pre list-to-vector refactor, fixes dictionary crash, mostly

- Add uncompressed dictionary (.dict) file support to avoid decompression memory issues
- Implement chunked on-demand parsing for large definitions
- Add backward navigation with re-parse capability
- Limit cached pages to MAX_CACHED_PAGES (4) to prevent memory exhaustion
- Add helper script for extracting/recompressing dictzip files
2026-01-29 09:33:40 -05:00
parent 8b41dccfb9
commit 62643ae933
5 changed files with 770 additions and 55 deletions
--- a/scripts/recompress_dictzip.py
+++ b/scripts/recompress_dictzip.py
@@ -0,0 +1,335 @@
+#!/usr/bin/env python3
+"""
+Recompress a dictzip file with a custom chunk size.
+
+Dictzip is a gzip-compatible format that allows random access by compressing
+data in independent chunks. The standard dictzip uses ~58KB chunks, but this
+can cause memory issues on embedded devices like ESP32.
+
+This script recompresses dictionary files with smaller chunks (default 16KB)
+to reduce memory requirements during decompression.
+
+Usage:
+    # From uncompressed .dict file:
+    python recompress_dictzip.py reader.dict reader.dict.dz --chunk-size 16384
+
+    # From existing .dict.dz file (will decompress first):
+    python recompress_dictzip.py reader.dict.dz reader_small.dict.dz --chunk-size 16384
+"""
+
+import argparse
+import gzip
+import struct
+import sys
+import time
+import zlib
+from pathlib import Path
+
+
+def read_input_file(input_path: Path) -> bytes:
+    """Return the uncompressed contents of *input_path*.
+
+    Files whose last suffix is ``.dz`` or ``.gz`` (case-insensitive) are
+    decoded through the gzip module; every other file is returned verbatim.
+    A short progress report is printed to stdout in both cases.
+    """
+    is_gzip_like = input_path.suffix.lower() in ('.dz', '.gz')
+
+    # Plain file: hand back the raw bytes.
+    if not is_gzip_like:
+        print(f"Reading {input_path}...")
+        payload = input_path.read_bytes()
+        print(f"  Size: {len(payload):,} bytes")
+        return payload
+
+    # Compressed file: let gzip decode the whole stream in one go.
+    print(f"Decompressing {input_path}...")
+    with gzip.open(input_path, 'rb') as gz_stream:
+        payload = gz_stream.read()
+    print(f"  Decompressed size: {len(payload):,} bytes")
+    return payload
+
+
+def compress_chunk(data: bytes, level: int = 9) -> bytes:
+    """Deflate *data* as one self-contained raw stream and return the bytes.
+
+    The result carries neither a zlib nor a gzip wrapper.
+    """
+    # Negative wbits asks zlib for headerless output; 15 is the window size.
+    deflater = zlib.compressobj(level, zlib.DEFLATED, -15)
+    return deflater.compress(data) + deflater.flush()
+
+
+def create_dictzip(data: bytes, output_path: Path, chunk_size: int = 16384,
+                   compression_level: int = 9) -> None:
+    """
+    Create a dictzip file from uncompressed data.
+
+    Dictzip format:
+    - Standard gzip header with FEXTRA flag
+    - Extra field containing 'RA' subfield with chunk info
+    - Compressed chunks (raw deflate, no headers)
+    - Standard gzip trailer (CRC32 + ISIZE)
+
+    All chunks come from ONE deflate stream that is full-flushed at every
+    chunk boundary. A full flush byte-aligns the output and resets the
+    compressor's history, so each chunk can still be inflated on its own,
+    while the concatenation stays a single valid deflate stream that ordinary
+    gzip readers (including ``gzip.open``) can decode. Chunks therefore do not
+    carry a final-block marker; the stream is terminated by a short tail
+    written after the last chunk.
+
+    Args:
+        data: Uncompressed dictionary contents (may be empty).
+        output_path: Destination file; overwritten if it exists.
+        chunk_size: Uncompressed bytes per chunk, 1024-65535.
+        compression_level: zlib compression level.
+
+    Raises:
+        ValueError: If chunk_size is out of range, the chunk table does not
+            fit in the gzip extra field, or a compressed chunk is larger
+            than 65535 bytes.
+    """
+    # Validate chunk size (must fit in 16-bit field)
+    if chunk_size > 65535:
+        raise ValueError(f"Chunk size {chunk_size} exceeds maximum of 65535")
+    if chunk_size < 1024:
+        raise ValueError(f"Chunk size {chunk_size} is too small (minimum 1024)")
+
+    # Calculate number of chunks
+    num_chunks = (len(data) + chunk_size - 1) // chunk_size
+
+    # XLEN is a 16-bit field. The RA subfield costs 4 bytes of subfield
+    # header (SI1, SI2, LEN) plus 6 bytes (VER, CHLEN, CHCNT) plus 2 bytes
+    # per chunk, so 10 bytes of overhead must be reserved, not 6.
+    max_chunks = (65535 - 10) // 2
+    if num_chunks > max_chunks:
+        raise ValueError(f"Too many chunks ({num_chunks}) for dictzip format (max {max_chunks})")
+
+    print(f"Compressing into {num_chunks} chunks of {chunk_size} bytes...")
+
+    # One raw-deflate compressor shared by every chunk (-15 = no zlib header).
+    compressor = zlib.compressobj(compression_level, zlib.DEFLATED, -15)
+    compressed_chunks = []
+
+    for i in range(num_chunks):
+        start = i * chunk_size
+        chunk = compressor.compress(data[start:start + chunk_size])
+        # Full flush: ends the chunk on a byte boundary with no references
+        # into earlier data, which is what makes random access possible.
+        chunk += compressor.flush(zlib.Z_FULL_FLUSH)
+
+        # Fail before anything is written if the size cannot be recorded.
+        if len(chunk) > 65535:
+            raise ValueError(f"Compressed chunk size {len(chunk)} exceeds 65535 bytes")
+        compressed_chunks.append(chunk)
+
+        if (i + 1) % 500 == 0 or i == num_chunks - 1:
+            print(f"  Compressed chunk {i + 1}/{num_chunks}")
+
+    # Terminates the deflate stream; not part of any chunk in the RA table.
+    stream_tail = compressor.flush(zlib.Z_FINISH)
+
+    # Calculate CRC32 and size for gzip trailer
+    crc32 = zlib.crc32(data) & 0xffffffff
+    isize = len(data) & 0xffffffff
+
+    # Build the extra field
+    # SI1 SI2 LEN, then RA payload: VER(2) + CHLEN(2) + CHCNT(2) + sizes[CHCNT](2 each)
+    ra_subfield_len = 6 + 2 * num_chunks
+    extra_field = (
+        b'RA'
+        + struct.pack('<HHHH', ra_subfield_len, 1, chunk_size, num_chunks)
+        + struct.pack(f'<{num_chunks}H', *(len(c) for c in compressed_chunks))
+    )
+
+    # Build gzip header
+    timestamp = int(time.time())
+    # XFL: 2 = maximum compression, 4 = fastest, 0 otherwise
+    xfl = 2 if compression_level == 9 else (4 if compression_level == 1 else 0)
+    header = struct.pack(
+        '<2sBBIBBH',
+        b'\x1f\x8b',       # Magic number
+        0x08,              # Compression method (deflate)
+        0x04,              # Flags: FEXTRA
+        timestamp,         # MTIME
+        xfl,               # XFL
+        0xff,              # OS (unknown)
+        len(extra_field),  # XLEN
+    ) + extra_field
+
+    # Write output file
+    print(f"Writing {output_path}...")
+    with open(output_path, 'wb') as f:
+        f.write(header)
+        f.writelines(compressed_chunks)
+        f.write(stream_tail)
+        f.write(struct.pack('<II', crc32, isize))
+
+    # Report stats (an empty input has no meaningful ratio; report 0)
+    output_size = output_path.stat().st_size
+    ratio = (1 - output_size / len(data)) * 100 if data else 0.0
+    print(f"  Output size: {output_size:,} bytes ({ratio:.1f}% compression)")
+    print(f"  Chunk size: {chunk_size} bytes")
+    print(f"  Number of chunks: {num_chunks}")
+
+
+def verify_dictzip(path: Path) -> bool:
+    """Verify a dictzip file by reading its header and decompressing chunk by chunk.
+
+    Checks performed:
+    - gzip magic, deflate method and FEXTRA flag
+    - presence and internal consistency of the 'RA' chunk table
+    - every chunk inflates on its own with a fresh raw-deflate decoder
+    - every chunk but the last inflates to exactly CHLEN bytes
+    - CRC32 and ISIZE in the trailer match the decompressed data
+
+    Progress and error details are printed to stdout.
+
+    Args:
+        path: File to verify.
+
+    Returns:
+        True when every check passes; False for any malformed, truncated or
+        corrupt file.
+    """
+    print(f"Verifying {path}...")
+
+    with open(path, 'rb') as f:
+        # Fixed gzip header (10 bytes) followed by the 2-byte XLEN
+        fixed = f.read(12)
+        if fixed[:2] != b'\x1f\x8b':
+            print(f"  ERROR: Invalid gzip magic number")
+            return False
+        if len(fixed) < 12:
+            print(f"  ERROR: Truncated gzip header")
+            return False
+
+        method = fixed[2]
+        if method != 8:
+            print(f"  ERROR: Unknown compression method: {method}")
+            return False
+
+        flags = fixed[3]
+        if not (flags & 0x04):
+            print(f"  ERROR: FEXTRA flag not set - not a dictzip file")
+            return False
+
+        # Read extra field
+        xlen = struct.unpack_from('<H', fixed, 10)[0]
+        extra = f.read(xlen)
+        if len(extra) != xlen:
+            print(f"  ERROR: Truncated extra field")
+            return False
+
+        # Skip the optional header fields that follow FEXTRA so the file
+        # position ends up on the first compressed byte.
+        for flag_bit in (0x08, 0x10):  # FNAME, FCOMMENT: zero-terminated
+            if flags & flag_bit:
+                for byte in iter(lambda: f.read(1), b''):
+                    if byte == b'\x00':
+                        break
+        if flags & 0x02:  # FHCRC: 2-byte header CRC
+            f.read(2)
+
+        # Walk the subfields looking for 'RA'; stop when fewer than the
+        # 4 bytes of a subfield header (SI1, SI2, LEN) remain.
+        ra_data = None
+        pos = 0
+        while pos + 4 <= len(extra):
+            slen = struct.unpack_from('<H', extra, pos + 2)[0]
+            if extra[pos:pos + 2] == b'RA':
+                ra_data = extra[pos + 4:pos + 4 + slen]
+                break
+            pos += 4 + slen
+
+        if ra_data is None:
+            print(f"  ERROR: RA subfield not found - not a dictzip file")
+            return False
+
+        if len(ra_data) < 6:
+            print(f"  ERROR: RA subfield too short")
+            return False
+
+        ver, chlen, chcnt = struct.unpack_from('<HHH', ra_data)
+        print(f"  Version: {ver}")
+        print(f"  Chunk size: {chlen} bytes")
+        print(f"  Chunk count: {chcnt}")
+
+        # Verify chunk sizes array
+        if len(ra_data) != 6 + 2 * chcnt:
+            print(f"  ERROR: Chunk sizes array length mismatch")
+            return False
+
+        chunk_sizes = struct.unpack_from(f'<{chcnt}H', ra_data, 6)
+        print(f"  Total compressed data: {sum(chunk_sizes):,} bytes")
+
+        # Decompress chunk by chunk (like the firmware does). Chunks are
+        # stored back to back, so sequential reads land on each one; the CRC
+        # and length are accumulated instead of buffering the whole output.
+        running_crc = 0
+        total_size = 0
+
+        try:
+            for i, comp_size in enumerate(chunk_sizes):
+                compressed_chunk = f.read(comp_size)
+
+                # Decompress using raw inflate (no zlib header)
+                decompressor = zlib.decompressobj(-15)
+                decompressed_chunk = decompressor.decompress(compressed_chunk)
+                decompressed_chunk += decompressor.flush()
+
+                # Random access computes the chunk index from offset // CHLEN,
+                # so only the last chunk may be shorter than CHLEN.
+                is_last = i == chcnt - 1
+                if len(decompressed_chunk) > chlen or (
+                        not is_last and len(decompressed_chunk) != chlen):
+                    print(f"  ERROR: Chunk {i} decompressed to "
+                          f"{len(decompressed_chunk)} bytes, expected {chlen}")
+                    return False
+
+                running_crc = zlib.crc32(decompressed_chunk, running_crc)
+                total_size += len(decompressed_chunk)
+
+            print(f"  Decompressed size: {total_size:,} bytes")
+
+            # Verify CRC32 from trailer
+            f.seek(-8, 2)  # Seek to 8 bytes before end
+            expected_crc, expected_size = struct.unpack('<II', f.read(8))
+
+            actual_crc = running_crc & 0xffffffff
+            actual_size = total_size & 0xffffffff
+
+            if actual_crc != expected_crc:
+                print(f"  ERROR: CRC mismatch: expected {expected_crc:08x}, got {actual_crc:08x}")
+                return False
+
+            if actual_size != expected_size:
+                print(f"  ERROR: Size mismatch: expected {expected_size}, got {actual_size}")
+                return False
+
+            print(f"  CRC32: {actual_crc:08x} (verified)")
+            print(f"  Verification: PASSED")
+            return True
+
+        except (zlib.error, struct.error, OSError) as e:
+            print(f"  ERROR: Decompression failed: {e}")
+            return False
+
+
+def main():
+    """Command-line entry point.
+
+    With ``--verify`` the input file is checked and the process exits with
+    status 0 on success or 1 on failure. Otherwise the input is read
+    (decompressing when needed), rewritten as a dictzip file with the
+    requested chunk size, and the result is verified; the process exits with
+    status 1 if the input is missing, the user declines to overwrite an
+    existing output, or verification of the new file fails.
+    """
+    cli = argparse.ArgumentParser(
+        description='Recompress a dictzip file with a custom chunk size.',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  # Recompress with 16KB chunks (recommended for ESP32):
+  %(prog)s reader.dict reader.dict.dz --chunk-size 16384
+
+  # Recompress from existing .dz file:
+  %(prog)s reader.dict.dz reader_small.dict.dz --chunk-size 16384
+
+  # Verify a dictzip file:
+  %(prog)s --verify reader.dict.dz
+""")
+
+    # Both positionals are optional at parse time because --verify needs
+    # only one of them; the required combination is enforced below.
+    cli.add_argument('input', nargs='?', help='Input .dict or .dict.dz file')
+    cli.add_argument('output', nargs='?', help='Output .dict.dz file')
+    cli.add_argument('--chunk-size', '-c', type=int, default=16384,
+                     help='Chunk size in bytes (default: 16384, i.e., 16KB)')
+    cli.add_argument('--compression-level', '-l', type=int, default=9,
+                     choices=range(1, 10), metavar='1-9',
+                     help='Compression level 1-9 (default: 9)')
+    cli.add_argument('--verify', '-v', action='store_true',
+                     help='Verify a dictzip file instead of compressing')
+
+    opts = cli.parse_args()
+
+    # --- Verify-only mode -------------------------------------------------
+    if opts.verify:
+        if not opts.input:
+            cli.error("Input file required for verification")
+        target = Path(opts.input)
+        if not target.exists():
+            print(f"Error: File not found: {target}")
+            sys.exit(1)
+        sys.exit(0 if verify_dictzip(target) else 1)
+
+    # --- Recompress mode --------------------------------------------------
+    if not (opts.input and opts.output):
+        cli.error("Both input and output files are required")
+
+    source = Path(opts.input)
+    destination = Path(opts.output)
+
+    if not source.exists():
+        print(f"Error: Input file not found: {source}")
+        sys.exit(1)
+
+    if destination.exists():
+        answer = input(f"Output file {destination} exists. Overwrite? [y/N] ")
+        if answer.lower() != 'y':
+            print("Aborted.")
+            sys.exit(1)
+
+    # Load the source (decompressed if necessary), then write the new file.
+    payload = read_input_file(source)
+    create_dictzip(payload, destination, opts.chunk_size, opts.compression_level)
+
+    # Check the file that was just written.
+    print()
+    if not verify_dictzip(destination):
+        print(f"\nError: Verification failed!")
+        sys.exit(1)
+    print(f"\nSuccess! Created {destination} with {opts.chunk_size}-byte chunks.")
+
+
+# Run the command-line interface only when this file is executed as a
+# script, not when it is imported as a module.
+if __name__ == '__main__':
+    main()