checkpoint: before list-to-vector refactor; mostly fixes the dictionary crash

- Add uncompressed dictionary (.dict) file support to avoid decompression memory issues
- Implement chunked on-demand parsing for large definitions
- Add backward navigation with re-parse capability
- Limit cached pages to MAX_CACHED_PAGES (4) to prevent memory exhaustion
- Add helper script for extracting/recompressing dictzip files
2026-01-29 09:33:40 -05:00
parent 8b41dccfb9
commit 62643ae933
5 changed files with 770 additions and 55 deletions
--- a/lib/StarDict/StarDict.cpp
+++ b/lib/StarDict/StarDict.cpp
@@ -205,6 +205,19 @@ bool StarDict::loadDictzipHeader() {

 bool StarDict::begin() {
  if (!loadInfo()) return false;
+
+  // Prefer an uncompressed .dict file: lookups then read definitions directly, skipping dictzip decompression
+  const std::string dictPath = basePath + ".dict";
+  FsFile testFile;
+  if (SdMan.openFileForRead("DICT", dictPath, testFile)) {
+    testFile.close();
+    useUncompressed = true;
+    Serial.printf("[%lu] [DICT] Using uncompressed .dict file (no decompression needed)\n", millis());
+    return true;
+  }
+
+  // No readable .dict: fall back to compressed .dict.dz, which needs the dictzip header loaded
+  useUncompressed = false;
  if (!loadDictzipHeader()) return false;
  return true;
 }
@@ -238,12 +251,46 @@ bool StarDict::readWordAtPosition(FsFile& idxFile, uint32_t& position, std::stri
  return true;
 }

+// Reads `size` bytes at `offset` of the uncompressed <basePath>.dict file into `definition`.
+// Returns false, leaving `definition` empty, if the file cannot be opened or the range is not fully readable.
+bool StarDict::readDefinitionDirect(uint32_t offset, uint32_t size, std::string& definition) {
+  definition.clear();  // every failure path below leaves the output empty
+  FsFile file;
+  if (!SdMan.openFileForRead("DICT", basePath + ".dict", file)) {
+    Serial.printf("[DICT-DBG] Failed to open .dict file\n");
+    return false;
+  }
+
+  // offset/size come from the .idx file; reject ranges past EOF before allocating `size` bytes
+  const uint64_t fileSize = file.size();
+  if (offset > fileSize || size > fileSize - offset || !file.seek(offset)) {
+    Serial.printf("[DICT-DBG] Range offset=%lu size=%lu is outside .dict file or seek failed\n", offset, size);
+    file.close();
+    return false;
+  }
+
+  definition.resize(size);
+  const int bytesRead = file.read(&definition[0], size);
+  file.close();
+
+  if (bytesRead != static_cast<int>(size)) {
+    Serial.printf("[DICT-DBG] Read %d bytes, expected %lu\n", bytesRead, size);
+    definition.clear();
+    return false;
+  }
+  return true;
+}
+
 bool StarDict::decompressDefinition(uint32_t offset, uint32_t size, std::string& definition) {
-  if (!dzInfo.loaded) return false;
+  if (!dzInfo.loaded) {
+    Serial.printf("[DICT-DBG] dzInfo not loaded!\n");
+    return false;
+  }

  const std::string dzPath = basePath + ".dict.dz";
  FsFile file;
  if (!SdMan.openFileForRead("DICT", dzPath, file)) {
+    Serial.printf("[DICT-DBG] Failed to open dict.dz file\n");
    return false;
  }

@@ -252,7 +299,11 @@ bool StarDict::decompressDefinition(uint32_t offset, uint32_t size, std::string&
  const uint32_t endChunk = (offset + size - 1) / dzInfo.chunkLength;
  const uint32_t startOffsetInChunk = offset % dzInfo.chunkLength;

+  Serial.printf("[DICT-DBG] Chunks: start=%lu, end=%lu, total=%u\n", 
+                startChunk, endChunk, dzInfo.chunkCount);
+
  if (endChunk >= dzInfo.chunkCount) {
+    Serial.printf("[DICT-DBG] endChunk %lu >= chunkCount %u\n", endChunk, dzInfo.chunkCount);
    file.close();
    return false;
  }
@@ -263,13 +314,38 @@ bool StarDict::decompressDefinition(uint32_t offset, uint32_t size, std::string&
    fileOffset += dzInfo.chunkSizes[i];
  }

-  // Allocate buffers
-  const uint32_t maxCompressedSize = 65536;  // Max compressed chunk size
+  // Calculate actual max compressed size needed for the chunks we'll process
+  uint32_t maxCompressedSize = 0;
+  for (uint32_t i = startChunk; i <= endChunk; i++) {
+    if (dzInfo.chunkSizes[i] > maxCompressedSize) {
+      maxCompressedSize = dzInfo.chunkSizes[i];
+    }
+  }
+
+  // Allocate buffers - allocate inflator FIRST (smallest) to reduce fragmentation impact
+  // tinfl_decompressor is ~11KB, so total allocations are ~85KB
+  Serial.printf("[DICT-DBG] Allocating inflator=%u, comp=%lu, decomp=%u bytes\n", 
+                sizeof(tinfl_decompressor), maxCompressedSize, dzInfo.chunkLength);
+  
+  auto* inflator = static_cast<tinfl_decompressor*>(malloc(sizeof(tinfl_decompressor)));
+  if (!inflator) {
+    Serial.printf("[DICT-DBG] inflator alloc failed! (need %u bytes)\n", sizeof(tinfl_decompressor));
+    file.close();
+    return false;
+  }
+  
  auto* compressedBuf = static_cast<uint8_t*>(malloc(maxCompressedSize));
+  if (!compressedBuf) {
+    Serial.printf("[DICT-DBG] compressedBuf alloc failed!\n");
+    free(inflator);
+    file.close();
+    return false;
+  }
  auto* decompressedBuf = static_cast<uint8_t*>(malloc(dzInfo.chunkLength));
-  if (!compressedBuf || !decompressedBuf) {
+  if (!decompressedBuf) {
+    Serial.printf("[DICT-DBG] decompressedBuf alloc failed!\n");
+    free(inflator);
    free(compressedBuf);
-    free(decompressedBuf);
    file.close();
    return false;
  }
@@ -277,13 +353,15 @@ bool StarDict::decompressDefinition(uint32_t offset, uint32_t size, std::string&
  definition.clear();
  definition.reserve(size);

-  // Process each needed chunk
+  // Process each needed chunk (reusing inflator allocation)
  for (uint32_t chunk = startChunk; chunk <= endChunk; chunk++) {
    const uint16_t compressedSize = dzInfo.chunkSizes[chunk];

    // Seek and read compressed data
    file.seek(fileOffset);
    if (file.read(compressedBuf, compressedSize) != compressedSize) {
+      Serial.printf("[DICT-DBG] File read failed at offset %lu, size %u\n", fileOffset, compressedSize);
+      free(inflator);
      free(compressedBuf);
      free(decompressedBuf);
      file.close();
@@ -291,13 +369,6 @@ bool StarDict::decompressDefinition(uint32_t offset, uint32_t size, std::string&
    }

    // Decompress using raw inflate (no zlib header)
-    auto* inflator = static_cast<tinfl_decompressor*>(malloc(sizeof(tinfl_decompressor)));
-    if (!inflator) {
-      free(compressedBuf);
-      free(decompressedBuf);
-      file.close();
-      return false;
-    }
    tinfl_init(inflator);

    size_t inBytes = compressedSize;
@@ -306,19 +377,13 @@ bool StarDict::decompressDefinition(uint32_t offset, uint32_t size, std::string&
        tinfl_decompress(inflator, compressedBuf, &inBytes, decompressedBuf, decompressedBuf, &outBytes,
                         TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF | TINFL_FLAG_PARSE_ZLIB_HEADER);

-    free(inflator);
-
    if (status != TINFL_STATUS_DONE && status != TINFL_STATUS_HAS_MORE_OUTPUT) {
      // Try without zlib header flag
-      inflator = static_cast<tinfl_decompressor*>(malloc(sizeof(tinfl_decompressor)));
-      if (inflator) {
-        tinfl_init(inflator);
-        inBytes = compressedSize;
-        outBytes = dzInfo.chunkLength;
-        tinfl_decompress(inflator, compressedBuf, &inBytes, decompressedBuf, decompressedBuf, &outBytes,
-                         TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF);
-        free(inflator);
-      }
+      tinfl_init(inflator);
+      inBytes = compressedSize;
+      outBytes = dzInfo.chunkLength;
+      tinfl_decompress(inflator, compressedBuf, &inBytes, decompressedBuf, decompressedBuf, &outBytes,
+                       TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF);
    }

    // Extract the portion we need from this chunk
@@ -342,6 +407,7 @@ bool StarDict::decompressDefinition(uint32_t offset, uint32_t size, std::string&
    fileOffset += compressedSize;
  }

+  free(inflator);
  free(compressedBuf);
  free(decompressedBuf);
  file.close();
@@ -349,9 +415,9 @@ bool StarDict::decompressDefinition(uint32_t offset, uint32_t size, std::string&
  return true;
 }

-// StarDict comparison function: case-insensitive first, then case-sensitive as tiebreaker
+// StarDict comparison function: case-insensitive matching
 int StarDict::stardictStrcmp(const std::string& a, const std::string& b) {
-  // First: case-insensitive comparison (like g_ascii_strcasecmp)
+  // Case-insensitive comparison (like g_ascii_strcasecmp)
  size_t i = 0;
  while (i < a.length() && i < b.length()) {
    const int ca = std::tolower(static_cast<unsigned char>(a[i]));
@@ -362,8 +428,8 @@ int StarDict::stardictStrcmp(const std::string& a, const std::string& b) {
  if (a.length() != b.length()) {
    return static_cast<int>(a.length()) - static_cast<int>(b.length());
  }
-  // If case-insensitive equal, use case-sensitive as tiebreaker
-  return a.compare(b);
+  // Case-insensitive match found
+  return 0;
 }

 std::string StarDict::normalizeWord(const std::string& word) {
@@ -403,6 +469,9 @@ StarDict::LookupResult StarDict::lookup(const std::string& word) {
    return result;
  }

+  Serial.printf("[DICT-DBG] Searching for: '%s' (normalized: '%s')\n", 
+                word.c_str(), normalizedSearch.c_str());
+
  // First try .idx (main entries) - use prefix jump table for fast lookup
  const std::string idxPath = basePath + ".idx";
  FsFile idxFile;
@@ -418,7 +487,10 @@ StarDict::LookupResult StarDict::lookup(const std::string& word) {
    const uint16_t prefixIdx = DictPrefixIndex::prefixToIndex(normalizedSearch[0], normalizedSearch[1]);
    position = DictPrefixIndex::dictPrefixOffsets[prefixIdx];
  }
+  Serial.printf("[DICT-DBG] Starting at position %lu (prefix: %c%c)\n", 
+                position, normalizedSearch[0], normalizedSearch[1]);
  bool found = false;
+  uint32_t wordCount = 0;

  while (position < info.idxfilesize) {
    std::string currentWord;
@@ -427,13 +499,24 @@ StarDict::LookupResult StarDict::lookup(const std::string& word) {
    if (!readWordAtPosition(idxFile, position, currentWord, dictOffset, dictSize)) {
      break;
    }
+    wordCount++;
+    if (wordCount % 50000 == 0) {
+      Serial.printf("[DICT-DBG] Progress: %lu words scanned, pos=%lu, current='%s'\n",
+                    wordCount, position, currentWord.c_str());
+    }

    // Use stardictStrcmp for case-insensitive matching
    const int cmp = stardictStrcmp(normalizedSearch, currentWord);

    if (cmp == 0) {
+      Serial.printf("[DICT-DBG] MATCH: '%s' == '%s' (offset=%lu, size=%lu)\n", 
+                    normalizedSearch.c_str(), currentWord.c_str(), dictOffset, dictSize);
      std::string definition;
-      if (decompressDefinition(dictOffset, dictSize, definition)) {
+      const bool loaded = useUncompressed 
+          ? readDefinitionDirect(dictOffset, dictSize, definition)
+          : decompressDefinition(dictOffset, dictSize, definition);
+      if (loaded) {
+        Serial.printf("[DICT-DBG] Definition loaded, %u bytes\n", definition.length());
        if (!found) {
          result.word = currentWord;
          result.definition = definition;
@@ -442,14 +525,20 @@ StarDict::LookupResult StarDict::lookup(const std::string& word) {
        } else {
          result.definition += "</html>" + definition;
        }
+      } else {
+        Serial.printf("[DICT-DBG] Definition load FAILED!\n");
      }
      // Continue scanning for additional matches (same word, different case)
-    } else if (cmp < 0) {
-      // Passed where target would be (file is sorted)
+    } else if (found) {
+      // We had matches but now moved past them - safe to stop
      break;
    }
+    // Note: Cannot use early-break before first match because prefix index
+    // may not land exactly at target position
  }

+  Serial.printf("[DICT-DBG] Search complete: %lu words scanned, found=%s\n",
+                wordCount, found ? "YES" : "NO");
  idxFile.close();

  // If not found in main index, try synonym file with prefix jump
@@ -502,7 +591,10 @@ StarDict::LookupResult StarDict::lookup(const std::string& word) {
              uint32_t dictOffset, dictSize;
              if (readWordAtPosition(idxFile2, pos, mainWord, dictOffset, dictSize)) {
                std::string definition;
-                if (decompressDefinition(dictOffset, dictSize, definition)) {
+                const bool loaded = useUncompressed 
+                    ? readDefinitionDirect(dictOffset, dictSize, definition)
+                    : decompressDefinition(dictOffset, dictSize, definition);
+                if (loaded) {
                  result.word = synWord;
                  result.definition = definition;
                  result.found = true;
@@ -513,10 +605,9 @@ StarDict::LookupResult StarDict::lookup(const std::string& word) {
            idxFile2.close();
          }
          break;  // Found a match, stop searching
-        } else if (cmp < 0) {
-          // Passed where it would be (file is sorted)
-          break;
        }
+        // Note: Cannot use early-break optimization here because prefix index
+        // may not land exactly at target position
      }
      synFile.close();
    }
--- a/lib/StarDict/StarDict.h
+++ b/lib/StarDict/StarDict.h
@@ -6,7 +6,7 @@
 #include <string>

 // StarDict dictionary lookup library
-// Supports .ifo/.idx/.dict.dz format with linear scan lookup
+// Supports .ifo/.idx/.dict (uncompressed) and .ifo/.idx/.dict.dz (compressed) formats
 class StarDict {
 public:
  struct DictInfo {
@@ -38,16 +38,22 @@ class StarDict {
  };
  DictzipInfo dzInfo;

+  // Whether to use uncompressed .dict file (preferred) or compressed .dict.dz
+  bool useUncompressed = false;
+
  // Parse .ifo file
  bool loadInfo();

-  // Load dictzip header for random access
+  // Load dictzip header for random access (only if using compressed)
  bool loadDictzipHeader();

  // Read word at given index file position, returns word and advances position
  bool readWordAtPosition(FsFile& idxFile, uint32_t& position, std::string& word, uint32_t& dictOffset,
                          uint32_t& dictSize);

+  // Read definition directly from uncompressed .dict file (no decompression needed)
+  bool readDefinitionDirect(uint32_t offset, uint32_t size, std::string& definition);
+
  // Decompress a portion of the .dict.dz file
  bool decompressDefinition(uint32_t offset, uint32_t size, std::string& definition);