checkpoint: pre list-to-vector refactor, fixes dictionary crash, mostly

- Add uncompressed dictionary (.dict) file support to avoid decompression memory issues
- Implement chunked on-demand parsing for large definitions
- Add backward navigation with re-parse capability
- Limit cached pages to MAX_CACHED_PAGES (4) to prevent memory exhaustion
- Add helper script for extracting/recompressing dictzip files
cottongin 2026-01-29 09:33:40 -05:00
parent 8b41dccfb9
commit 62643ae933
5 changed files with 770 additions and 55 deletions
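Before the diffs, a note on the chunk arithmetic that readDefinitionDirect() and decompressDefinition() below both hinge on. The sketch is illustrative only: DictzipRandomAccess is a hypothetical stand-in for the firmware's dzInfo fields (chunkLength, chunkCount, chunkSizes), not code from this commit.

#include <cstdint>
#include <vector>

// Hypothetical stand-in for the firmware's dzInfo fields.
struct DictzipRandomAccess {
    uint16_t chunkLength;              // CHLEN: uncompressed bytes per chunk
    uint16_t chunkCount;               // CHCNT: number of chunks
    std::vector<uint16_t> chunkSizes;  // compressed byte count of each chunk
};

// Map a definition at (offset, size) in the uncompressed stream onto the
// chunk range that must be inflated - the same arithmetic the diff uses.
struct ChunkRange {
    uint32_t startChunk;
    uint32_t endChunk;            // inclusive
    uint32_t startOffsetInChunk;
};

ChunkRange chunkRangeFor(const DictzipRandomAccess& ra, uint32_t offset, uint32_t size) {
    ChunkRange r;
    r.startChunk = offset / ra.chunkLength;
    r.endChunk = (offset + size - 1) / ra.chunkLength;
    r.startOffsetInChunk = offset % ra.chunkLength;
    return r;
}

The compressed file offset of chunk i is then the gzip header size plus the sum of chunkSizes[0..i-1], which is why the loops below accumulate fileOffset += dzInfo.chunkSizes[i].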

StarDict.cpp

@@ -205,6 +205,19 @@ bool StarDict::loadDictzipHeader() {
bool StarDict::begin() {
if (!loadInfo()) return false;
// Try uncompressed .dict file first (preferred - no memory overhead)
const std::string dictPath = basePath + ".dict";
FsFile testFile;
if (SdMan.openFileForRead("DICT", dictPath, testFile)) {
testFile.close();
useUncompressed = true;
Serial.printf("[%lu] [DICT] Using uncompressed .dict file (no decompression needed)\n", millis());
return true;
}
// Fall back to compressed .dict.dz
useUncompressed = false;
if (!loadDictzipHeader()) return false;
return true;
}
@@ -238,12 +251,46 @@ bool StarDict::readWordAtPosition(FsFile& idxFile, uint32_t& position, std::stri
return true;
}
bool StarDict::readDefinitionDirect(uint32_t offset, uint32_t size, std::string& definition) {
// Read directly from uncompressed .dict file - no decompression needed!
const std::string dictPath = basePath + ".dict";
FsFile file;
if (!SdMan.openFileForRead("DICT", dictPath, file)) {
Serial.printf("[DICT-DBG] Failed to open .dict file\n");
return false;
}
// Seek to the definition offset
if (!file.seek(offset)) {
Serial.printf("[DICT-DBG] Failed to seek to offset %lu\n", offset);
file.close();
return false;
}
// Read the definition directly into the string
definition.resize(size);
const int bytesRead = file.read(&definition[0], size);
file.close();
if (bytesRead != static_cast<int>(size)) {
Serial.printf("[DICT-DBG] Read %d bytes, expected %lu\n", bytesRead, size);
definition.clear();
return false;
}
return true;
}
bool StarDict::decompressDefinition(uint32_t offset, uint32_t size, std::string& definition) {
- if (!dzInfo.loaded) return false;
+ if (!dzInfo.loaded) {
+ Serial.printf("[DICT-DBG] dzInfo not loaded!\n");
+ return false;
+ }
const std::string dzPath = basePath + ".dict.dz";
FsFile file;
if (!SdMan.openFileForRead("DICT", dzPath, file)) {
Serial.printf("[DICT-DBG] Failed to open dict.dz file\n");
return false;
}
@@ -252,7 +299,11 @@ bool StarDict::decompressDefinition(uint32_t offset, uint32_t size, std::string&
const uint32_t endChunk = (offset + size - 1) / dzInfo.chunkLength;
const uint32_t startOffsetInChunk = offset % dzInfo.chunkLength;
Serial.printf("[DICT-DBG] Chunks: start=%lu, end=%lu, total=%u\n",
startChunk, endChunk, dzInfo.chunkCount);
if (endChunk >= dzInfo.chunkCount) {
Serial.printf("[DICT-DBG] endChunk %lu >= chunkCount %u\n", endChunk, dzInfo.chunkCount);
file.close();
return false;
}
@@ -263,13 +314,38 @@ bool StarDict::decompressDefinition(uint32_t offset, uint32_t size, std::string&
fileOffset += dzInfo.chunkSizes[i];
}
- // Allocate buffers
- const uint32_t maxCompressedSize = 65536; // Max compressed chunk size
+ // Calculate actual max compressed size needed for the chunks we'll process
+ uint32_t maxCompressedSize = 0;
+ for (uint32_t i = startChunk; i <= endChunk; i++) {
+ if (dzInfo.chunkSizes[i] > maxCompressedSize) {
+ maxCompressedSize = dzInfo.chunkSizes[i];
+ }
+ }
+ // Allocate buffers - allocate inflator FIRST (smallest) to reduce fragmentation impact
+ // tinfl_decompressor is ~11KB, so total allocations are ~85KB
+ Serial.printf("[DICT-DBG] Allocating inflator=%u, comp=%lu, decomp=%u bytes\n",
+ sizeof(tinfl_decompressor), maxCompressedSize, dzInfo.chunkLength);
+ auto* inflator = static_cast<tinfl_decompressor*>(malloc(sizeof(tinfl_decompressor)));
+ if (!inflator) {
+ Serial.printf("[DICT-DBG] inflator alloc failed! (need %u bytes)\n", sizeof(tinfl_decompressor));
+ file.close();
+ return false;
+ }
auto* compressedBuf = static_cast<uint8_t*>(malloc(maxCompressedSize));
+ if (!compressedBuf) {
+ Serial.printf("[DICT-DBG] compressedBuf alloc failed!\n");
+ free(inflator);
+ file.close();
+ return false;
+ }
auto* decompressedBuf = static_cast<uint8_t*>(malloc(dzInfo.chunkLength));
- if (!compressedBuf || !decompressedBuf) {
+ if (!decompressedBuf) {
+ Serial.printf("[DICT-DBG] decompressedBuf alloc failed!\n");
+ free(inflator);
free(compressedBuf);
- free(decompressedBuf);
file.close();
return false;
}
@@ -277,13 +353,15 @@ bool StarDict::decompressDefinition(uint32_t offset, uint32_t size, std::string&
definition.clear();
definition.reserve(size);
- // Process each needed chunk
+ // Process each needed chunk (reusing inflator allocation)
for (uint32_t chunk = startChunk; chunk <= endChunk; chunk++) {
const uint16_t compressedSize = dzInfo.chunkSizes[chunk];
// Seek and read compressed data
file.seek(fileOffset);
if (file.read(compressedBuf, compressedSize) != compressedSize) {
Serial.printf("[DICT-DBG] File read failed at offset %lu, size %u\n", fileOffset, compressedSize);
free(inflator);
free(compressedBuf);
free(decompressedBuf);
file.close();
@@ -291,13 +369,6 @@ bool StarDict::decompressDefinition(uint32_t offset, uint32_t size, std::string&
}
// Decompress using raw inflate (no zlib header)
- auto* inflator = static_cast<tinfl_decompressor*>(malloc(sizeof(tinfl_decompressor)));
- if (!inflator) {
- free(compressedBuf);
- free(decompressedBuf);
- file.close();
- return false;
- }
tinfl_init(inflator);
size_t inBytes = compressedSize;
@@ -306,19 +377,13 @@ bool StarDict::decompressDefinition(uint32_t offset, uint32_t size, std::string&
tinfl_decompress(inflator, compressedBuf, &inBytes, decompressedBuf, decompressedBuf, &outBytes,
TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF | TINFL_FLAG_PARSE_ZLIB_HEADER);
- free(inflator);
if (status != TINFL_STATUS_DONE && status != TINFL_STATUS_HAS_MORE_OUTPUT) {
// Try without zlib header flag
- inflator = static_cast<tinfl_decompressor*>(malloc(sizeof(tinfl_decompressor)));
- if (inflator) {
- tinfl_init(inflator);
- inBytes = compressedSize;
- outBytes = dzInfo.chunkLength;
- tinfl_decompress(inflator, compressedBuf, &inBytes, decompressedBuf, decompressedBuf, &outBytes,
- TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF);
- free(inflator);
- }
+ tinfl_init(inflator);
+ inBytes = compressedSize;
+ outBytes = dzInfo.chunkLength;
+ tinfl_decompress(inflator, compressedBuf, &inBytes, decompressedBuf, decompressedBuf, &outBytes,
+ TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF);
}
// Extract the portion we need from this chunk
@@ -342,6 +407,7 @@ bool StarDict::decompressDefinition(uint32_t offset, uint32_t size, std::string&
fileOffset += compressedSize;
}
+ free(inflator);
free(compressedBuf);
free(decompressedBuf);
file.close();
@@ -349,9 +415,9 @@ bool StarDict::decompressDefinition(uint32_t offset, uint32_t size, std::string&
return true;
}
- // StarDict comparison function: case-insensitive first, then case-sensitive as tiebreaker
+ // StarDict comparison function: case-insensitive matching
int StarDict::stardictStrcmp(const std::string& a, const std::string& b) {
- // First: case-insensitive comparison (like g_ascii_strcasecmp)
+ // Case-insensitive comparison (like g_ascii_strcasecmp)
size_t i = 0;
while (i < a.length() && i < b.length()) {
const int ca = std::tolower(static_cast<unsigned char>(a[i]));
@@ -362,8 +428,8 @@ int StarDict::stardictStrcmp(const std::string& a, const std::string& b) {
if (a.length() != b.length()) {
return static_cast<int>(a.length()) - static_cast<int>(b.length());
}
- // If case-insensitive equal, use case-sensitive as tiebreaker
- return a.compare(b);
+ // Case-insensitive match found
+ return 0;
}
std::string StarDict::normalizeWord(const std::string& word) {
@@ -403,6 +469,9 @@ StarDict::LookupResult StarDict::lookup(const std::string& word) {
return result;
}
Serial.printf("[DICT-DBG] Searching for: '%s' (normalized: '%s')\n",
word.c_str(), normalizedSearch.c_str());
// First try .idx (main entries) - use prefix jump table for fast lookup
const std::string idxPath = basePath + ".idx";
FsFile idxFile;
@@ -418,7 +487,10 @@ StarDict::LookupResult StarDict::lookup(const std::string& word) {
const uint16_t prefixIdx = DictPrefixIndex::prefixToIndex(normalizedSearch[0], normalizedSearch[1]);
position = DictPrefixIndex::dictPrefixOffsets[prefixIdx];
}
Serial.printf("[DICT-DBG] Starting at position %lu (prefix: %c%c)\n",
position, normalizedSearch[0], normalizedSearch[1]);
bool found = false;
uint32_t wordCount = 0;
while (position < info.idxfilesize) {
std::string currentWord;
@@ -427,13 +499,24 @@ StarDict::LookupResult StarDict::lookup(const std::string& word) {
if (!readWordAtPosition(idxFile, position, currentWord, dictOffset, dictSize)) {
break;
}
wordCount++;
if (wordCount % 50000 == 0) {
Serial.printf("[DICT-DBG] Progress: %lu words scanned, pos=%lu, current='%s'\n",
wordCount, position, currentWord.c_str());
}
// Use stardictStrcmp for case-insensitive matching
const int cmp = stardictStrcmp(normalizedSearch, currentWord);
if (cmp == 0) {
Serial.printf("[DICT-DBG] MATCH: '%s' == '%s' (offset=%lu, size=%lu)\n",
normalizedSearch.c_str(), currentWord.c_str(), dictOffset, dictSize);
std::string definition;
- if (decompressDefinition(dictOffset, dictSize, definition)) {
+ const bool loaded = useUncompressed
+ ? readDefinitionDirect(dictOffset, dictSize, definition)
+ : decompressDefinition(dictOffset, dictSize, definition);
+ if (loaded) {
Serial.printf("[DICT-DBG] Definition loaded, %u bytes\n", definition.length());
if (!found) {
result.word = currentWord;
result.definition = definition;
@@ -442,14 +525,20 @@ StarDict::LookupResult StarDict::lookup(const std::string& word) {
} else {
result.definition += "</html>" + definition;
}
} else {
Serial.printf("[DICT-DBG] Definition load FAILED!\n");
}
// Continue scanning for additional matches (same word, different case)
- } else if (cmp < 0) {
- // Passed where target would be (file is sorted)
+ } else if (found) {
+ // We had matches but now moved past them - safe to stop
break;
}
+ // Note: Cannot use early-break before first match because prefix index
+ // may not land exactly at target position
}
Serial.printf("[DICT-DBG] Search complete: %lu words scanned, found=%s\n",
wordCount, found ? "YES" : "NO");
idxFile.close();
// If not found in main index, try synonym file with prefix jump
@@ -502,7 +591,10 @@ StarDict::LookupResult StarDict::lookup(const std::string& word) {
uint32_t dictOffset, dictSize;
if (readWordAtPosition(idxFile2, pos, mainWord, dictOffset, dictSize)) {
std::string definition;
- if (decompressDefinition(dictOffset, dictSize, definition)) {
+ const bool loaded = useUncompressed
+ ? readDefinitionDirect(dictOffset, dictSize, definition)
+ : decompressDefinition(dictOffset, dictSize, definition);
+ if (loaded) {
result.word = synWord;
result.definition = definition;
result.found = true;
@@ -513,10 +605,9 @@ StarDict::LookupResult StarDict::lookup(const std::string& word) {
idxFile2.close();
}
break; // Found a match, stop searching
- } else if (cmp < 0) {
- // Passed where it would be (file is sorted)
- break;
}
+ // Note: Cannot use early-break optimization here because prefix index
+ // may not land exactly at target position
}
synFile.close();
}
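The tiebreaker removal above is easiest to see with a standalone copy of the comparison. The loop body is reconstructed (g_ascii_strcasecmp-style) where the hunk is truncated, so treat this as a sketch, not the exact firmware source; equality across case is what lets lookup() keep scanning and concatenate same-word, different-case entries into one result.

#include <cassert>
#include <cctype>
#include <string>

// Standalone copy of the revised comparison, for illustration only.
static int stardictStrcmpDemo(const std::string& a, const std::string& b) {
    size_t i = 0;
    while (i < a.length() && i < b.length()) {
        const int ca = std::tolower(static_cast<unsigned char>(a[i]));
        const int cb = std::tolower(static_cast<unsigned char>(b[i]));
        if (ca != cb) return ca - cb;
        i++;
    }
    if (a.length() != b.length()) {
        return static_cast<int>(a.length()) - static_cast<int>(b.length());
    }
    return 0;  // was `return a.compare(b)` before this commit
}

int main() {
    assert(stardictStrcmpDemo("Apple", "apple") == 0);   // ties no longer broken by case
    assert(stardictStrcmpDemo("apple", "applet") < 0);   // shorter word first on shared prefix
    assert(stardictStrcmpDemo("zebra", "apple") > 0);
}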

StarDict.h

@@ -6,7 +6,7 @@
#include <string>
// StarDict dictionary lookup library
- // Supports .ifo/.idx/.dict.dz format with linear scan lookup
+ // Supports .ifo/.idx/.dict (uncompressed) and .ifo/.idx/.dict.dz (compressed) formats
class StarDict {
public:
struct DictInfo {
@@ -38,16 +38,22 @@ class StarDict {
};
DictzipInfo dzInfo;
// Whether to use uncompressed .dict file (preferred) or compressed .dict.dz
bool useUncompressed = false;
// Parse .ifo file
bool loadInfo();
- // Load dictzip header for random access
+ // Load dictzip header for random access (only if using compressed)
bool loadDictzipHeader();
// Read word at given index file position, returns word and advances position
bool readWordAtPosition(FsFile& idxFile, uint32_t& position, std::string& word, uint32_t& dictOffset,
uint32_t& dictSize);
// Read definition directly from uncompressed .dict file (no decompression needed)
bool readDefinitionDirect(uint32_t offset, uint32_t size, std::string& definition);
// Decompress a portion of the .dict.dz file
bool decompressDefinition(uint32_t offset, uint32_t size, std::string& definition);

recompress_dictzip.py

@@ -0,0 +1,335 @@
#!/usr/bin/env python3
"""
Recompress a dictzip file with a custom chunk size.
Dictzip is a gzip-compatible format that allows random access by compressing
data in independent chunks. The standard dictzip uses ~58KB chunks, but this
can cause memory issues on embedded devices like ESP32.
This script recompresses dictionary files with smaller chunks (default 16KB)
to reduce memory requirements during decompression.
Usage:
# From uncompressed .dict file:
python recompress_dictzip.py reader.dict reader.dict.dz --chunk-size 16384
# From existing .dict.dz file (will decompress first):
python recompress_dictzip.py reader.dict.dz reader_small.dict.dz --chunk-size 16384
"""
import argparse
import gzip
import struct
import sys
import time
import zlib
from pathlib import Path
def read_input_file(input_path: Path) -> bytes:
"""Read input file, decompressing if it's a .dz or .gz file."""
suffix = input_path.suffix.lower()
if suffix in ('.dz', '.gz'):
print(f"Decompressing {input_path}...")
with gzip.open(input_path, 'rb') as f:
data = f.read()
print(f" Decompressed size: {len(data):,} bytes")
return data
else:
print(f"Reading {input_path}...")
with open(input_path, 'rb') as f:
data = f.read()
print(f" Size: {len(data):,} bytes")
return data
def compress_chunk(data: bytes, level: int = 9) -> bytes:
"""Compress a single chunk using raw deflate (no zlib header)."""
# wbits=-15 selects raw deflate (no zlib header) with the full 32KB window
compressor = zlib.compressobj(level, zlib.DEFLATED, -15)
compressed = compressor.compress(data)
compressed += compressor.flush()
return compressed
def create_dictzip(data: bytes, output_path: Path, chunk_size: int = 16384,
compression_level: int = 9) -> None:
"""
Create a dictzip file from uncompressed data.
Dictzip format:
- Standard gzip header with FEXTRA flag
- Extra field containing 'RA' subfield with chunk info
- Compressed chunks (raw deflate, no headers)
- Standard gzip trailer (CRC32 + ISIZE)
"""
# Validate chunk size (must fit in 16-bit field)
if chunk_size > 65535:
raise ValueError(f"Chunk size {chunk_size} exceeds maximum of 65535")
if chunk_size < 1024:
raise ValueError(f"Chunk size {chunk_size} is too small (minimum 1024)")
# Calculate number of chunks
num_chunks = (len(data) + chunk_size - 1) // chunk_size
# Check if we can fit all chunk sizes in the extra field
# Extra field max is 65535 bytes, each chunk size takes 2 bytes, plus 6 bytes header
max_chunks = (65535 - 6) // 2
if num_chunks > max_chunks:
raise ValueError(f"Too many chunks ({num_chunks}) for dictzip format (max {max_chunks})")
print(f"Compressing into {num_chunks} chunks of {chunk_size} bytes...")
# Compress each chunk and collect sizes
compressed_chunks = []
chunk_sizes = []
for i in range(num_chunks):
start = i * chunk_size
end = min(start + chunk_size, len(data))
chunk_data = data[start:end]
compressed = compress_chunk(chunk_data, compression_level)
compressed_chunks.append(compressed)
chunk_sizes.append(len(compressed))
if (i + 1) % 500 == 0 or i == num_chunks - 1:
print(f" Compressed chunk {i + 1}/{num_chunks}")
# Calculate CRC32 and size for gzip trailer
crc32 = zlib.crc32(data) & 0xffffffff
isize = len(data) & 0xffffffff
# Build the extra field
# RA subfield: VER(2) + CHLEN(2) + CHCNT(2) + sizes[CHCNT](2 each)
ra_subfield_len = 6 + 2 * num_chunks
extra_field = bytearray()
extra_field.extend(b'RA') # SI1, SI2
extra_field.extend(struct.pack('<H', ra_subfield_len)) # LEN
extra_field.extend(struct.pack('<H', 1)) # VER
extra_field.extend(struct.pack('<H', chunk_size)) # CHLEN
extra_field.extend(struct.pack('<H', num_chunks)) # CHCNT
for size in chunk_sizes:
if size > 65535:
raise ValueError(f"Compressed chunk size {size} exceeds 65535 bytes")
extra_field.extend(struct.pack('<H', size))
xlen = len(extra_field)
# Build gzip header
# Flags: FEXTRA (0x04)
timestamp = int(time.time())
xfl = 2 if compression_level == 9 else (4 if compression_level == 1 else 0)
header = bytearray()
header.extend(b'\x1f\x8b') # Magic number
header.append(0x08) # Compression method (deflate)
header.append(0x04) # Flags: FEXTRA
header.extend(struct.pack('<I', timestamp)) # MTIME
header.append(xfl) # XFL
header.append(0xff) # OS (unknown)
header.extend(struct.pack('<H', xlen)) # XLEN
header.extend(extra_field)
# Write output file
print(f"Writing {output_path}...")
with open(output_path, 'wb') as f:
f.write(header)
for chunk in compressed_chunks:
f.write(chunk)
f.write(struct.pack('<I', crc32))
f.write(struct.pack('<I', isize))
# Report stats
output_size = output_path.stat().st_size
ratio = (1 - output_size / len(data)) * 100
print(f" Output size: {output_size:,} bytes ({ratio:.1f}% compression)")
print(f" Chunk size: {chunk_size} bytes")
print(f" Number of chunks: {num_chunks}")
def verify_dictzip(path: Path) -> bool:
"""Verify a dictzip file by reading its header and decompressing chunk by chunk."""
print(f"Verifying {path}...")
with open(path, 'rb') as f:
# Read gzip header
magic = f.read(2)
if magic != b'\x1f\x8b':
print(f" ERROR: Invalid gzip magic number")
return False
method = f.read(1)[0]
if method != 8:
print(f" ERROR: Unknown compression method: {method}")
return False
flags = f.read(1)[0]
if not (flags & 0x04):
print(f" ERROR: FEXTRA flag not set - not a dictzip file")
return False
f.read(4) # MTIME
f.read(1) # XFL
f.read(1) # OS
# Read extra field
xlen = struct.unpack('<H', f.read(2))[0]
extra = f.read(xlen)
# Parse extra field for RA subfield
pos = 0
found_ra = False
chlen = 0
chcnt = 0
chunk_sizes = []
while pos < len(extra):
si1 = extra[pos]
si2 = extra[pos + 1]
slen = struct.unpack('<H', extra[pos + 2:pos + 4])[0]
if si1 == ord('R') and si2 == ord('A'):
found_ra = True
ra_data = extra[pos + 4:pos + 4 + slen]
ver = struct.unpack('<H', ra_data[0:2])[0]
chlen = struct.unpack('<H', ra_data[2:4])[0]
chcnt = struct.unpack('<H', ra_data[4:6])[0]
print(f" Version: {ver}")
print(f" Chunk size: {chlen} bytes")
print(f" Chunk count: {chcnt}")
# Verify chunk sizes array
if len(ra_data) != 6 + 2 * chcnt:
print(f" ERROR: Chunk sizes array length mismatch")
return False
for i in range(chcnt):
size = struct.unpack('<H', ra_data[6 + 2*i:8 + 2*i])[0]
chunk_sizes.append(size)
print(f" Total compressed data: {sum(chunk_sizes):,} bytes")
break
pos += 4 + slen
if not found_ra:
print(f" ERROR: RA subfield not found - not a dictzip file")
return False
# Decompress chunk by chunk (like the firmware does)
data_start = f.tell()
decompressed_data = bytearray()
try:
for i, comp_size in enumerate(chunk_sizes):
f.seek(data_start + sum(chunk_sizes[:i]))
compressed_chunk = f.read(comp_size)
# Decompress using raw inflate (no zlib header)
decompressor = zlib.decompressobj(-15)
decompressed_chunk = decompressor.decompress(compressed_chunk)
decompressed_chunk += decompressor.flush()
decompressed_data.extend(decompressed_chunk)
print(f" Decompressed size: {len(decompressed_data):,} bytes")
# Verify CRC32 from trailer
f.seek(-8, 2) # Seek to 8 bytes before end
expected_crc = struct.unpack('<I', f.read(4))[0]
expected_size = struct.unpack('<I', f.read(4))[0]
actual_crc = zlib.crc32(bytes(decompressed_data)) & 0xffffffff
actual_size = len(decompressed_data) & 0xffffffff
if actual_crc != expected_crc:
print(f" ERROR: CRC mismatch: expected {expected_crc:08x}, got {actual_crc:08x}")
return False
if actual_size != expected_size:
print(f" ERROR: Size mismatch: expected {expected_size}, got {actual_size}")
return False
print(f" CRC32: {actual_crc:08x} (verified)")
print(f" Verification: PASSED")
return True
except Exception as e:
print(f" ERROR: Decompression failed: {e}")
return False
def main():
parser = argparse.ArgumentParser(
description='Recompress a dictzip file with a custom chunk size.',
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
# Recompress with 16KB chunks (recommended for ESP32):
%(prog)s reader.dict reader.dict.dz --chunk-size 16384
# Recompress from existing .dz file:
%(prog)s reader.dict.dz reader_small.dict.dz --chunk-size 16384
# Verify a dictzip file:
%(prog)s --verify reader.dict.dz
""")
parser.add_argument('input', nargs='?', help='Input .dict or .dict.dz file')
parser.add_argument('output', nargs='?', help='Output .dict.dz file')
parser.add_argument('--chunk-size', '-c', type=int, default=16384,
help='Chunk size in bytes (default: 16384, i.e., 16KB)')
parser.add_argument('--compression-level', '-l', type=int, default=9,
choices=range(1, 10), metavar='1-9',
help='Compression level 1-9 (default: 9)')
parser.add_argument('--verify', '-v', action='store_true',
help='Verify a dictzip file instead of compressing')
args = parser.parse_args()
if args.verify:
if not args.input:
parser.error("Input file required for verification")
input_path = Path(args.input)
if not input_path.exists():
print(f"Error: File not found: {input_path}")
sys.exit(1)
success = verify_dictzip(input_path)
sys.exit(0 if success else 1)
if not args.input or not args.output:
parser.error("Both input and output files are required")
input_path = Path(args.input)
output_path = Path(args.output)
if not input_path.exists():
print(f"Error: Input file not found: {input_path}")
sys.exit(1)
if output_path.exists():
response = input(f"Output file {output_path} exists. Overwrite? [y/N] ")
if response.lower() != 'y':
print("Aborted.")
sys.exit(1)
# Read and decompress input if needed
data = read_input_file(input_path)
# Create new dictzip with specified chunk size
create_dictzip(data, output_path, args.chunk_size, args.compression_level)
# Verify the output
print()
if verify_dictzip(output_path):
print(f"\nSuccess! Created {output_path} with {args.chunk_size}-byte chunks.")
else:
print(f"\nError: Verification failed!")
sys.exit(1)
if __name__ == '__main__':
main()
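For the firmware side of the same header, here is a sketch of the walk over the bytes create_dictzip() emits - the same checks verify_dictzip() performs. parseRaHeader() and readLE16() are illustrative helpers, not the firmware's actual loadDictzipHeader() API.

#include <cstddef>
#include <cstdint>
#include <vector>

struct RaInfo {
    uint16_t chunkLength = 0;          // CHLEN
    std::vector<uint16_t> chunkSizes;  // compressed size per chunk
    size_t dataStart = 0;              // file offset of the first compressed chunk
};

static uint16_t readLE16(const uint8_t* p) {
    return static_cast<uint16_t>(p[0] | (p[1] << 8));
}

// Walks a gzip header with FEXTRA set, looking for the 'RA' subfield the
// script writes: VER(2) CHLEN(2) CHCNT(2) followed by CHCNT little-endian sizes.
// 'buf' must hold at least the first 12 + XLEN bytes of the .dict.dz file.
static bool parseRaHeader(const uint8_t* buf, size_t len, RaInfo& out) {
    if (len < 12 || buf[0] != 0x1f || buf[1] != 0x8b) return false;  // gzip magic
    if (buf[2] != 0x08) return false;                                // deflate only
    if (!(buf[3] & 0x04)) return false;                              // FEXTRA required
    const uint16_t xlen = readLE16(buf + 10);
    if (len < 12u + xlen) return false;
    const uint8_t* extra = buf + 12;
    size_t pos = 0;
    while (pos + 4 <= xlen) {                      // subfield layout: SI1 SI2 LEN data
        const uint16_t slen = readLE16(extra + pos + 2);
        if (pos + 4 + slen > xlen) return false;
        if (extra[pos] == 'R' && extra[pos + 1] == 'A') {
            const uint8_t* ra = extra + pos + 4;
            const uint16_t chcnt = readLE16(ra + 4);
            if (slen != 6 + 2 * chcnt) return false;  // sizes array must match CHCNT
            out.chunkLength = readLE16(ra + 2);
            out.chunkSizes.resize(chcnt);
            for (uint16_t i = 0; i < chcnt; i++) {
                out.chunkSizes[i] = readLE16(ra + 6 + 2 * i);
            }
            out.dataStart = 12 + static_cast<size_t>(xlen);  // no FNAME/FCOMMENT follows
            return true;
        }
        pos += 4u + slen;
    }
    return false;
}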

DictionaryResultActivity.cpp

@@ -3,6 +3,10 @@
#include <DictHtmlParser.h>
#include <GfxRenderer.h>
#include <algorithm>
#include <cctype>
#include <cstring>
#include "DictionaryMargins.h"
#include "MappedInputManager.h"
#include "fontIds.h"
@@ -15,22 +19,28 @@ void DictionaryResultActivity::taskTrampoline(void* param) {
void DictionaryResultActivity::onEnter() {
Activity::onEnter();
Serial.printf("[DICT-DBG] DictionaryResult onEnter, defLen=%u\n", rawDefinition.length());
renderingMutex = xSemaphoreCreateMutex();
currentPage = 0;
// Process definition for display
if (!notFound) {
Serial.printf("[DICT-DBG] Starting paginateDefinition...\n");
paginateDefinition();
Serial.printf("[DICT-DBG] Pagination done, %u pages\n", pages.size());
}
updateRequired = true;
Serial.printf("[DICT-DBG] Creating display task...\n");
xTaskCreate(&DictionaryResultActivity::taskTrampoline, "DictResultTask",
4096, // Stack size
this, // Parameters
1, // Priority
&displayTaskHandle // Task handle
);
Serial.printf("[DICT-DBG] Task created\n");
}
void DictionaryResultActivity::onExit() {
@@ -61,24 +71,51 @@ void DictionaryResultActivity::loop() {
}
// Handle page navigation - use orientation-aware PageBack/PageForward buttons
- if (!notFound && pages.size() > 1) {
+ if (!notFound && !pages.empty()) {
const bool prevPressed = mappedInput.wasPressed(MappedInputManager::Button::PageBack) ||
mappedInput.wasPressed(MappedInputManager::Button::Left);
const bool nextPressed = mappedInput.wasPressed(MappedInputManager::Button::PageForward) ||
mappedInput.wasPressed(MappedInputManager::Button::Right);
- if (prevPressed && currentPage > 0) {
- currentPage--;
- updateRequired = true;
- } else if (nextPressed && currentPage < static_cast<int>(pages.size()) - 1) {
- currentPage++;
- updateRequired = true;
if (prevPressed) {
if (currentPage > 0) {
// Navigate within cached pages
currentPage--;
updateRequired = true;
} else if (firstPageNumber > 1) {
// At first cached page but earlier pages exist - re-parse to get them
const int targetPage = firstPageNumber - 1; // Go to the page before current first
Serial.printf("[DICT-DBG] Re-parsing to reach page %d\n", targetPage);
reparseToPage(targetPage);
updateRequired = true;
}
} else if (nextPressed) {
// Check if we can navigate to existing cached page
if (currentPage < static_cast<int>(pages.size()) - 1) {
currentPage++;
updateRequired = true;
} else if (hasMoreContent) {
// At end of cached pages but more content available - parse next chunk
Serial.printf("[DICT-DBG] Parsing next chunk on navigation (page %d)\n", currentPage);
const size_t pagesBefore = pages.size();
parseNextChunk();
// If new pages were added, navigate to the next one
if (pages.size() > pagesBefore) {
currentPage++;
updateRequired = true;
}
}
// else: at true end of content, do nothing
}
}
}
void DictionaryResultActivity::paginateDefinition() {
pages.clear();
parsePosition = 0;
hasMoreContent = false;
firstPageNumber = 1;
if (rawDefinition.empty()) {
notFound = true;
@@ -99,14 +136,55 @@ void DictionaryResultActivity::paginateDefinition() {
const int textWidth = pageWidth - textMargin - marginRight - 10;
const int textHeight = pageHeight - marginTop - marginBottom - headerHeight - footerHeight;
const int lineHeight = renderer.getLineHeight(UI_10_FONT_ID);
const int linesPerPage = textHeight / lineHeight;
- // Collect all TextBlocks from the HTML parser
// For chunked parsing, we estimate how much HTML to parse at a time
// Roughly: each line is ~40-60 chars, so one page ≈ linesPerPage * 60 bytes of text
// With HTML overhead, multiply by ~2, plus buffer for finding break points
constexpr size_t CHUNK_SIZE_BASE = 1500; // Base chunk size
const size_t chunkSize = std::max(CHUNK_SIZE_BASE, static_cast<size_t>(linesPerPage * 120));
Serial.printf("[DICT-DBG] Chunked parsing: defLen=%u, chunkSize=%u, linesPerPage=%d\n",
rawDefinition.length(), chunkSize, linesPerPage);
// Determine how much to parse for first page
size_t parseEnd;
if (rawDefinition.length() <= chunkSize) {
// Small definition - parse it all
parseEnd = rawDefinition.length();
hasMoreContent = false;
} else {
// Large definition - find a good break point
parseEnd = findHtmlBreakPoint(rawDefinition, chunkSize / 2, chunkSize);
hasMoreContent = (parseEnd < rawDefinition.length());
}
// Extract the chunk to parse
std::string chunk = rawDefinition.substr(0, parseEnd);
parsePosition = parseEnd;
Serial.printf("[DICT-DBG] Parsing first chunk: 0-%u of %u, hasMore=%d\n",
parseEnd, rawDefinition.length(), hasMoreContent);
// Parse this chunk into TextBlocks
std::vector<std::shared_ptr<TextBlock>> allBlocks;
- DictHtmlParser::parse(rawDefinition, UI_10_FONT_ID, renderer, textWidth,
- [&allBlocks](std::shared_ptr<TextBlock> block) { allBlocks.push_back(block); });
+ DictHtmlParser::parse(chunk, UI_10_FONT_ID, renderer, textWidth,
+ [&allBlocks](std::shared_ptr<TextBlock> block) {
+ allBlocks.push_back(block);
+ });
Serial.printf("[DICT-DBG] First chunk parsed, %u TextBlocks\n", allBlocks.size());
if (allBlocks.empty()) {
- notFound = true;
+ // Check if there's more to parse - maybe first chunk had no displayable content
+ if (hasMoreContent) {
+ // Try parsing more
+ parseNextChunk();
+ if (pages.empty()) {
+ notFound = true;
+ }
+ } else {
+ notFound = true;
+ }
return;
}
@@ -131,6 +209,189 @@ void DictionaryResultActivity::paginateDefinition() {
if (!currentPageBlocks.empty()) {
pages.push_back(currentPageBlocks);
}
Serial.printf("[DICT-DBG] Initial pagination: %u pages\n", pages.size());
}
size_t DictionaryResultActivity::findHtmlBreakPoint(const std::string& html, size_t searchStart, size_t maxPos) {
// Search backwards from maxPos for good HTML break points
// Priority: </li>, </p>, </ol>, </ul>, </div> then any '>' then whitespace
if (maxPos >= html.length()) {
return html.length();
}
// Clamp searchStart to not exceed maxPos
if (searchStart > maxPos) {
searchStart = maxPos;
}
// Search for closing block tags (best break points)
const char* closingTags[] = {"</li>", "</p>", "</ol>", "</ul>", "</div>", "</dd>", "</dt>"};
size_t bestBreak = std::string::npos;
for (const char* tag : closingTags) {
size_t pos = html.rfind(tag, maxPos);
if (pos != std::string::npos && pos >= searchStart) {
// Found a closing tag - break after it
size_t breakAfter = pos + strlen(tag);
if (bestBreak == std::string::npos || breakAfter > bestBreak) {
bestBreak = breakAfter;
}
}
}
if (bestBreak != std::string::npos) {
return bestBreak;
}
// Fallback: search for any '>' (end of tag)
size_t tagEnd = html.rfind('>', maxPos);
if (tagEnd != std::string::npos && tagEnd >= searchStart) {
return tagEnd + 1;
}
// Last resort: search for whitespace
for (size_t i = maxPos; i >= searchStart && i != std::string::npos; i--) {
if (std::isspace(static_cast<unsigned char>(html[i]))) {
return i + 1;
}
if (i == 0) break;
}
// No good break point found - use maxPos
return maxPos;
}
void DictionaryResultActivity::parseNextChunk() {
if (!hasMoreContent || parsePosition >= rawDefinition.length()) {
hasMoreContent = false;
return;
}
Serial.printf("[DICT-DBG] parseNextChunk starting at position %u of %u\n",
parsePosition, rawDefinition.length());
// Get margins for calculating page dimensions
int marginTop, marginRight, marginBottom, marginLeft;
getDictionaryContentMargins(renderer, &marginTop, &marginRight, &marginBottom, &marginLeft);
const auto pageWidth = renderer.getScreenWidth();
const auto pageHeight = renderer.getScreenHeight();
// Calculate text area dimensions (must match paginateDefinition and render)
constexpr int headerHeight = 80;
constexpr int footerHeight = 30;
const int textMargin = marginLeft + 10;
const int textWidth = pageWidth - textMargin - marginRight - 10;
const int textHeight = pageHeight - marginTop - marginBottom - headerHeight - footerHeight;
const int lineHeight = renderer.getLineHeight(UI_10_FONT_ID);
const int linesPerPage = textHeight / lineHeight;
// Chunk size estimation (same as paginateDefinition)
constexpr size_t CHUNK_SIZE_BASE = 1500;
const size_t chunkSize = std::max(CHUNK_SIZE_BASE, static_cast<size_t>(linesPerPage * 120));
// Determine parse range for this chunk
size_t parseStart = parsePosition;
size_t parseEnd;
if (parsePosition + chunkSize >= rawDefinition.length()) {
// This will be the last chunk
parseEnd = rawDefinition.length();
hasMoreContent = false;
} else {
// Find a good break point
parseEnd = findHtmlBreakPoint(rawDefinition, parsePosition + chunkSize / 2, parsePosition + chunkSize);
hasMoreContent = (parseEnd < rawDefinition.length());
}
// Extract the chunk to parse
std::string chunk = rawDefinition.substr(parseStart, parseEnd - parseStart);
parsePosition = parseEnd;
Serial.printf("[DICT-DBG] Parsing chunk %u-%u, hasMore=%d\n", parseStart, parseEnd, hasMoreContent);
// Parse this chunk into TextBlocks
std::vector<std::shared_ptr<TextBlock>> allBlocks;
DictHtmlParser::parse(chunk, UI_10_FONT_ID, renderer, textWidth,
[&allBlocks](std::shared_ptr<TextBlock> block) {
allBlocks.push_back(block);
});
Serial.printf("[DICT-DBG] Chunk parsed, %u TextBlocks\n", allBlocks.size());
if (allBlocks.empty()) {
// No content in this chunk - try parsing more if available
if (hasMoreContent) {
parseNextChunk();
}
return;
}
// Paginate: group TextBlocks into pages based on available height
std::vector<std::shared_ptr<TextBlock>> currentPageBlocks;
int currentY = 0;
for (const auto& block : allBlocks) {
if (currentY + lineHeight > textHeight && !currentPageBlocks.empty()) {
// Page is full, start new page
pages.push_back(currentPageBlocks);
currentPageBlocks.clear();
currentY = 0;
}
currentPageBlocks.push_back(block);
currentY += lineHeight;
}
// Add remaining blocks as last page
if (!currentPageBlocks.empty()) {
pages.push_back(currentPageBlocks);
}
// Trim old pages if we exceed the limit to prevent memory exhaustion
while (static_cast<int>(pages.size()) > MAX_CACHED_PAGES && currentPage > 0) {
// Remove the oldest page and adjust indices
pages.erase(pages.begin());
currentPage--;
firstPageNumber++;
Serial.printf("[DICT-DBG] Trimmed old page, firstPageNumber now %d\n", firstPageNumber);
}
Serial.printf("[DICT-DBG] After chunk: %u cached pages (pages %d-%d)\n",
pages.size(), firstPageNumber, firstPageNumber + static_cast<int>(pages.size()) - 1);
}
void DictionaryResultActivity::reparseToPage(int targetPageNumber) {
// Re-parse from the beginning to reach an earlier page that was trimmed
// This allows backward navigation through the entire definition
Serial.printf("[DICT-DBG] reparseToPage: target=%d, clearing and re-parsing\n", targetPageNumber);
// Clear current state and start fresh
pages.clear();
parsePosition = 0;
firstPageNumber = 1;
hasMoreContent = !rawDefinition.empty();
// Parse chunks until we have the target page
while (hasMoreContent && firstPageNumber + static_cast<int>(pages.size()) - 1 < targetPageNumber) {
parseNextChunk();
}
// Now position currentPage to show the target page
if (targetPageNumber >= firstPageNumber &&
targetPageNumber < firstPageNumber + static_cast<int>(pages.size())) {
currentPage = targetPageNumber - firstPageNumber;
} else {
// Target page doesn't exist (definition is shorter than expected)
currentPage = static_cast<int>(pages.size()) - 1;
if (currentPage < 0) currentPage = 0;
}
Serial.printf("[DICT-DBG] reparseToPage done: currentPage=%d, firstPageNumber=%d, pages=%u\n",
currentPage, firstPageNumber, pages.size());
}
void DictionaryResultActivity::displayTaskLoop() {
@@ -181,17 +442,29 @@ void DictionaryResultActivity::render() const {
y += lineHeight;
}
- // Draw page indicator if multiple pages
- if (pages.size() > 1) {
- char pageIndicator[32];
- snprintf(pageIndicator, sizeof(pageIndicator), "Page %d of %d", currentPage + 1, static_cast<int>(pages.size()));
// Draw page indicator if multiple pages or more content available
const bool hasMultiplePages = pages.size() > 1 || hasMoreContent || firstPageNumber > 1;
if (hasMultiplePages) {
char pageIndicator[48];
const int displayPageNum = firstPageNumber + currentPage;
const int lastKnownPage = firstPageNumber + static_cast<int>(pages.size()) - 1;
if (hasMoreContent) {
// More content to load - show "Page X of Y+" to indicate more pages coming
snprintf(pageIndicator, sizeof(pageIndicator), "Page %d of %d+", displayPageNum, lastKnownPage);
} else {
snprintf(pageIndicator, sizeof(pageIndicator), "Page %d of %d", displayPageNum, lastKnownPage);
}
renderer.drawCenteredText(SMALL_FONT_ID, pageHeight - marginBottom - 5, pageIndicator);
}
}
// Draw button hints
- const char* leftHint = (pages.size() > 1 && currentPage > 0) ? "< Prev" : "";
- const char* rightHint = (pages.size() > 1 && currentPage < static_cast<int>(pages.size()) - 1) ? "Next >" : "";
// Show navigation hints when there are multiple pages or more content to load
// canGoBack is true if we have previous cached pages OR if earlier pages were trimmed
const bool canGoBack = currentPage > 0 || firstPageNumber > 1;
const bool canGoForward = currentPage < static_cast<int>(pages.size()) - 1 || hasMoreContent;
const char* leftHint = canGoBack ? "< Prev" : "";
const char* rightHint = canGoForward ? "Next >" : "";
const auto labels = mappedInput.mapLabels("\xc2\xab Back", "Search", leftHint, rightHint);
renderer.drawButtonHints(UI_10_FONT_ID, labels.btn1, labels.btn2, labels.btn3, labels.btn4);
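The firstPageNumber/currentPage bookkeeping above is easier to check with the cache window modeled in isolation. A toy model follows (std::deque stands in for the pages vector; none of this is firmware API):

#include <cstdio>
#include <deque>

struct PageWindow {
    static constexpr int MAX_CACHED_PAGES = 4;  // same limit as the header below
    std::deque<int> pages;                      // stand-in for pages of TextBlocks
    int currentPage = 0;                        // index into the cached window
    int firstPageNumber = 1;                    // display number of pages[0]

    void appendPage(int id) {
        pages.push_back(id);
        // Same trim rule as parseNextChunk(): evict the oldest page and shift
        // both indices so the page the user is viewing keeps its number.
        while (static_cast<int>(pages.size()) > MAX_CACHED_PAGES && currentPage > 0) {
            pages.pop_front();
            currentPage--;
            firstPageNumber++;
        }
    }
    int displayPageNumber() const { return firstPageNumber + currentPage; }
    int lastKnownPage() const { return firstPageNumber + static_cast<int>(pages.size()) - 1; }
};

int main() {
    PageWindow w;
    for (int p = 1; p <= 6; p++) {
        w.appendPage(p);
        w.currentPage = static_cast<int>(w.pages.size()) - 1;  // reader is on the newest page
    }
    // Only pages 3..6 remain cached, yet the indicator still reads "Page 6 of 6+".
    std::printf("Page %d of %d+\n", w.displayPageNumber(), w.lastKnownPage());
}

Trimming only while currentPage > 0 guarantees the page being viewed is never evicted, which is exactly why the backward path needs reparseToPage() once earlier pages have been dropped.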

DictionaryResultActivity.h

@@ -26,14 +26,24 @@ class DictionaryResultActivity final : public Activity {
const std::function<void()> onSearchAnother;
// Pagination - each page contains TextBlocks with styled text
// We limit cached pages to prevent memory exhaustion on long definitions
static constexpr int MAX_CACHED_PAGES = 4;
std::vector<std::vector<std::shared_ptr<TextBlock>>> pages;
- int currentPage = 0;
+ int currentPage = 0; // Index into pages vector
int firstPageNumber = 1; // The page number of pages[0] (1-based for display)
bool notFound = false;
// Chunked parsing state - parse definition on-demand as user navigates
size_t parsePosition = 0; // Current position in rawDefinition HTML
bool hasMoreContent = false; // True if more HTML remains to parse
static void taskTrampoline(void* param);
[[noreturn]] void displayTaskLoop();
void render() const;
void paginateDefinition();
void parseNextChunk();
void reparseToPage(int targetPageNumber); // Re-parse from beginning to reach earlier page
static size_t findHtmlBreakPoint(const std::string& html, size_t searchStart, size_t maxPos);
public:
/**