checkpoint: pre list-to-vector refactor, fixes dictionary crash, mostly

- Add uncompressed dictionary (.dict) file support to avoid decompression memory issues
- Implement chunked on-demand parsing for large definitions
- Add backward navigation with re-parse capability
- Limit cached pages to MAX_CACHED_PAGES (4) to prevent memory exhaustion
- Add helper script for extracting/recompressing dictzip files
This commit is contained in:
cottongin
2026-01-29 09:33:40 -05:00
parent 8b41dccfb9
commit 62643ae933
5 changed files with 770 additions and 55 deletions

View File

@@ -205,6 +205,19 @@ bool StarDict::loadDictzipHeader() {
bool StarDict::begin() {
if (!loadInfo()) return false;
// Try uncompressed .dict file first (preferred - no memory overhead)
const std::string dictPath = basePath + ".dict";
FsFile testFile;
if (SdMan.openFileForRead("DICT", dictPath, testFile)) {
testFile.close();
useUncompressed = true;
Serial.printf("[%lu] [DICT] Using uncompressed .dict file (no decompression needed)\n", millis());
return true;
}
// Fall back to compressed .dict.dz
useUncompressed = false;
if (!loadDictzipHeader()) return false;
return true;
}
@@ -238,12 +251,46 @@ bool StarDict::readWordAtPosition(FsFile& idxFile, uint32_t& position, std::stri
return true;
}
bool StarDict::readDefinitionDirect(uint32_t offset, uint32_t size, std::string& definition) {
// Read directly from uncompressed .dict file - no decompression needed!
const std::string dictPath = basePath + ".dict";
FsFile file;
if (!SdMan.openFileForRead("DICT", dictPath, file)) {
Serial.printf("[DICT-DBG] Failed to open .dict file\n");
return false;
}
// Seek to the definition offset
if (!file.seek(offset)) {
Serial.printf("[DICT-DBG] Failed to seek to offset %lu\n", offset);
file.close();
return false;
}
// Read the definition directly into the string
definition.resize(size);
const int bytesRead = file.read(&definition[0], size);
file.close();
if (bytesRead != static_cast<int>(size)) {
Serial.printf("[DICT-DBG] Read %d bytes, expected %lu\n", bytesRead, size);
definition.clear();
return false;
}
return true;
}
bool StarDict::decompressDefinition(uint32_t offset, uint32_t size, std::string& definition) {
if (!dzInfo.loaded) return false;
if (!dzInfo.loaded) {
Serial.printf("[DICT-DBG] dzInfo not loaded!\n");
return false;
}
const std::string dzPath = basePath + ".dict.dz";
FsFile file;
if (!SdMan.openFileForRead("DICT", dzPath, file)) {
Serial.printf("[DICT-DBG] Failed to open dict.dz file\n");
return false;
}
@@ -252,7 +299,11 @@ bool StarDict::decompressDefinition(uint32_t offset, uint32_t size, std::string&
const uint32_t endChunk = (offset + size - 1) / dzInfo.chunkLength;
const uint32_t startOffsetInChunk = offset % dzInfo.chunkLength;
Serial.printf("[DICT-DBG] Chunks: start=%lu, end=%lu, total=%u\n",
startChunk, endChunk, dzInfo.chunkCount);
if (endChunk >= dzInfo.chunkCount) {
Serial.printf("[DICT-DBG] endChunk %lu >= chunkCount %u\n", endChunk, dzInfo.chunkCount);
file.close();
return false;
}
@@ -263,13 +314,38 @@ bool StarDict::decompressDefinition(uint32_t offset, uint32_t size, std::string&
fileOffset += dzInfo.chunkSizes[i];
}
// Allocate buffers
const uint32_t maxCompressedSize = 65536; // Max compressed chunk size
// Calculate actual max compressed size needed for the chunks we'll process
uint32_t maxCompressedSize = 0;
for (uint32_t i = startChunk; i <= endChunk; i++) {
if (dzInfo.chunkSizes[i] > maxCompressedSize) {
maxCompressedSize = dzInfo.chunkSizes[i];
}
}
// Allocate buffers - allocate inflator FIRST (smallest) to reduce fragmentation impact
// tinfl_decompressor is ~11KB, so total allocations are ~85KB
Serial.printf("[DICT-DBG] Allocating inflator=%u, comp=%lu, decomp=%u bytes\n",
sizeof(tinfl_decompressor), maxCompressedSize, dzInfo.chunkLength);
auto* inflator = static_cast<tinfl_decompressor*>(malloc(sizeof(tinfl_decompressor)));
if (!inflator) {
Serial.printf("[DICT-DBG] inflator alloc failed! (need %u bytes)\n", sizeof(tinfl_decompressor));
file.close();
return false;
}
auto* compressedBuf = static_cast<uint8_t*>(malloc(maxCompressedSize));
if (!compressedBuf) {
Serial.printf("[DICT-DBG] compressedBuf alloc failed!\n");
free(inflator);
file.close();
return false;
}
auto* decompressedBuf = static_cast<uint8_t*>(malloc(dzInfo.chunkLength));
if (!compressedBuf || !decompressedBuf) {
if (!decompressedBuf) {
Serial.printf("[DICT-DBG] decompressedBuf alloc failed!\n");
free(inflator);
free(compressedBuf);
free(decompressedBuf);
file.close();
return false;
}
@@ -277,13 +353,15 @@ bool StarDict::decompressDefinition(uint32_t offset, uint32_t size, std::string&
definition.clear();
definition.reserve(size);
// Process each needed chunk
// Process each needed chunk (reusing inflator allocation)
for (uint32_t chunk = startChunk; chunk <= endChunk; chunk++) {
const uint16_t compressedSize = dzInfo.chunkSizes[chunk];
// Seek and read compressed data
file.seek(fileOffset);
if (file.read(compressedBuf, compressedSize) != compressedSize) {
Serial.printf("[DICT-DBG] File read failed at offset %lu, size %u\n", fileOffset, compressedSize);
free(inflator);
free(compressedBuf);
free(decompressedBuf);
file.close();
@@ -291,13 +369,6 @@ bool StarDict::decompressDefinition(uint32_t offset, uint32_t size, std::string&
}
// Decompress using raw inflate (no zlib header)
auto* inflator = static_cast<tinfl_decompressor*>(malloc(sizeof(tinfl_decompressor)));
if (!inflator) {
free(compressedBuf);
free(decompressedBuf);
file.close();
return false;
}
tinfl_init(inflator);
size_t inBytes = compressedSize;
@@ -306,19 +377,13 @@ bool StarDict::decompressDefinition(uint32_t offset, uint32_t size, std::string&
tinfl_decompress(inflator, compressedBuf, &inBytes, decompressedBuf, decompressedBuf, &outBytes,
TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF | TINFL_FLAG_PARSE_ZLIB_HEADER);
free(inflator);
if (status != TINFL_STATUS_DONE && status != TINFL_STATUS_HAS_MORE_OUTPUT) {
// Try without zlib header flag
inflator = static_cast<tinfl_decompressor*>(malloc(sizeof(tinfl_decompressor)));
if (inflator) {
tinfl_init(inflator);
inBytes = compressedSize;
outBytes = dzInfo.chunkLength;
tinfl_decompress(inflator, compressedBuf, &inBytes, decompressedBuf, decompressedBuf, &outBytes,
TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF);
free(inflator);
}
tinfl_init(inflator);
inBytes = compressedSize;
outBytes = dzInfo.chunkLength;
tinfl_decompress(inflator, compressedBuf, &inBytes, decompressedBuf, decompressedBuf, &outBytes,
TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF);
}
// Extract the portion we need from this chunk
@@ -342,6 +407,7 @@ bool StarDict::decompressDefinition(uint32_t offset, uint32_t size, std::string&
fileOffset += compressedSize;
}
free(inflator);
free(compressedBuf);
free(decompressedBuf);
file.close();
@@ -349,9 +415,9 @@ bool StarDict::decompressDefinition(uint32_t offset, uint32_t size, std::string&
return true;
}
// StarDict comparison function: case-insensitive first, then case-sensitive as tiebreaker
// StarDict comparison function: case-insensitive matching
int StarDict::stardictStrcmp(const std::string& a, const std::string& b) {
// First: case-insensitive comparison (like g_ascii_strcasecmp)
// Case-insensitive comparison (like g_ascii_strcasecmp)
size_t i = 0;
while (i < a.length() && i < b.length()) {
const int ca = std::tolower(static_cast<unsigned char>(a[i]));
@@ -362,8 +428,8 @@ int StarDict::stardictStrcmp(const std::string& a, const std::string& b) {
if (a.length() != b.length()) {
return static_cast<int>(a.length()) - static_cast<int>(b.length());
}
// If case-insensitive equal, use case-sensitive as tiebreaker
return a.compare(b);
// Case-insensitive match found
return 0;
}
std::string StarDict::normalizeWord(const std::string& word) {
@@ -403,6 +469,9 @@ StarDict::LookupResult StarDict::lookup(const std::string& word) {
return result;
}
Serial.printf("[DICT-DBG] Searching for: '%s' (normalized: '%s')\n",
word.c_str(), normalizedSearch.c_str());
// First try .idx (main entries) - use prefix jump table for fast lookup
const std::string idxPath = basePath + ".idx";
FsFile idxFile;
@@ -418,7 +487,10 @@ StarDict::LookupResult StarDict::lookup(const std::string& word) {
const uint16_t prefixIdx = DictPrefixIndex::prefixToIndex(normalizedSearch[0], normalizedSearch[1]);
position = DictPrefixIndex::dictPrefixOffsets[prefixIdx];
}
Serial.printf("[DICT-DBG] Starting at position %lu (prefix: %c%c)\n",
position, normalizedSearch[0], normalizedSearch[1]);
bool found = false;
uint32_t wordCount = 0;
while (position < info.idxfilesize) {
std::string currentWord;
@@ -427,13 +499,24 @@ StarDict::LookupResult StarDict::lookup(const std::string& word) {
if (!readWordAtPosition(idxFile, position, currentWord, dictOffset, dictSize)) {
break;
}
wordCount++;
if (wordCount % 50000 == 0) {
Serial.printf("[DICT-DBG] Progress: %lu words scanned, pos=%lu, current='%s'\n",
wordCount, position, currentWord.c_str());
}
// Use stardictStrcmp for case-insensitive matching
const int cmp = stardictStrcmp(normalizedSearch, currentWord);
if (cmp == 0) {
Serial.printf("[DICT-DBG] MATCH: '%s' == '%s' (offset=%lu, size=%lu)\n",
normalizedSearch.c_str(), currentWord.c_str(), dictOffset, dictSize);
std::string definition;
if (decompressDefinition(dictOffset, dictSize, definition)) {
const bool loaded = useUncompressed
? readDefinitionDirect(dictOffset, dictSize, definition)
: decompressDefinition(dictOffset, dictSize, definition);
if (loaded) {
Serial.printf("[DICT-DBG] Definition loaded, %u bytes\n", definition.length());
if (!found) {
result.word = currentWord;
result.definition = definition;
@@ -442,14 +525,20 @@ StarDict::LookupResult StarDict::lookup(const std::string& word) {
} else {
result.definition += "</html>" + definition;
}
} else {
Serial.printf("[DICT-DBG] Definition load FAILED!\n");
}
// Continue scanning for additional matches (same word, different case)
} else if (cmp < 0) {
// Passed where target would be (file is sorted)
} else if (found) {
// We had matches but now moved past them - safe to stop
break;
}
// Note: Cannot use early-break before first match because prefix index
// may not land exactly at target position
}
Serial.printf("[DICT-DBG] Search complete: %lu words scanned, found=%s\n",
wordCount, found ? "YES" : "NO");
idxFile.close();
// If not found in main index, try synonym file with prefix jump
@@ -502,7 +591,10 @@ StarDict::LookupResult StarDict::lookup(const std::string& word) {
uint32_t dictOffset, dictSize;
if (readWordAtPosition(idxFile2, pos, mainWord, dictOffset, dictSize)) {
std::string definition;
if (decompressDefinition(dictOffset, dictSize, definition)) {
const bool loaded = useUncompressed
? readDefinitionDirect(dictOffset, dictSize, definition)
: decompressDefinition(dictOffset, dictSize, definition);
if (loaded) {
result.word = synWord;
result.definition = definition;
result.found = true;
@@ -513,10 +605,9 @@ StarDict::LookupResult StarDict::lookup(const std::string& word) {
idxFile2.close();
}
break; // Found a match, stop searching
} else if (cmp < 0) {
// Passed where it would be (file is sorted)
break;
}
// Note: Cannot use early-break optimization here because prefix index
// may not land exactly at target position
}
synFile.close();
}

View File

@@ -6,7 +6,7 @@
#include <string>
// StarDict dictionary lookup library
// Supports .ifo/.idx/.dict.dz format with linear scan lookup
// Supports .ifo/.idx/.dict (uncompressed) and .ifo/.idx/.dict.dz (compressed) formats
class StarDict {
public:
struct DictInfo {
@@ -38,16 +38,22 @@ class StarDict {
};
DictzipInfo dzInfo;
// Whether to use uncompressed .dict file (preferred) or compressed .dict.dz
bool useUncompressed = false;
// Parse .ifo file
bool loadInfo();
// Load dictzip header for random access
// Load dictzip header for random access (only if using compressed)
bool loadDictzipHeader();
// Read word at given index file position, returns word and advances position
bool readWordAtPosition(FsFile& idxFile, uint32_t& position, std::string& word, uint32_t& dictOffset,
uint32_t& dictSize);
// Read definition directly from uncompressed .dict file (no decompression needed)
bool readDefinitionDirect(uint32_t offset, uint32_t size, std::string& definition);
// Decompress a portion of the .dict.dz file
bool decompressDefinition(uint32_t offset, uint32_t size, std::string& definition);