checkpoint: pre list-to-vector refactor, fixes dictionary crash, mostly
- Add uncompressed dictionary (.dict) file support to avoid decompression memory issues
- Implement chunked on-demand parsing for large definitions
- Add backward navigation with re-parse capability
- Limit cached pages to MAX_CACHED_PAGES (4) to prevent memory exhaustion
- Add helper script for extracting/recompressing dictzip files
This commit is contained in:
@@ -205,6 +205,19 @@ bool StarDict::loadDictzipHeader() {
|
||||
|
||||
bool StarDict::begin() {
|
||||
if (!loadInfo()) return false;
|
||||
|
||||
// Try uncompressed .dict file first (preferred - no memory overhead)
|
||||
const std::string dictPath = basePath + ".dict";
|
||||
FsFile testFile;
|
||||
if (SdMan.openFileForRead("DICT", dictPath, testFile)) {
|
||||
testFile.close();
|
||||
useUncompressed = true;
|
||||
Serial.printf("[%lu] [DICT] Using uncompressed .dict file (no decompression needed)\n", millis());
|
||||
return true;
|
||||
}
|
||||
|
||||
// Fall back to compressed .dict.dz
|
||||
useUncompressed = false;
|
||||
if (!loadDictzipHeader()) return false;
|
||||
return true;
|
||||
}
|
||||
@@ -238,12 +251,46 @@ bool StarDict::readWordAtPosition(FsFile& idxFile, uint32_t& position, std::stri
|
||||
return true;
|
||||
}
|
||||
|
||||
bool StarDict::readDefinitionDirect(uint32_t offset, uint32_t size, std::string& definition) {
|
||||
// Read directly from uncompressed .dict file - no decompression needed!
|
||||
const std::string dictPath = basePath + ".dict";
|
||||
FsFile file;
|
||||
if (!SdMan.openFileForRead("DICT", dictPath, file)) {
|
||||
Serial.printf("[DICT-DBG] Failed to open .dict file\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
// Seek to the definition offset
|
||||
if (!file.seek(offset)) {
|
||||
Serial.printf("[DICT-DBG] Failed to seek to offset %lu\n", offset);
|
||||
file.close();
|
||||
return false;
|
||||
}
|
||||
|
||||
// Read the definition directly into the string
|
||||
definition.resize(size);
|
||||
const int bytesRead = file.read(&definition[0], size);
|
||||
file.close();
|
||||
|
||||
if (bytesRead != static_cast<int>(size)) {
|
||||
Serial.printf("[DICT-DBG] Read %d bytes, expected %lu\n", bytesRead, size);
|
||||
definition.clear();
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool StarDict::decompressDefinition(uint32_t offset, uint32_t size, std::string& definition) {
|
||||
if (!dzInfo.loaded) return false;
|
||||
if (!dzInfo.loaded) {
|
||||
Serial.printf("[DICT-DBG] dzInfo not loaded!\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
const std::string dzPath = basePath + ".dict.dz";
|
||||
FsFile file;
|
||||
if (!SdMan.openFileForRead("DICT", dzPath, file)) {
|
||||
Serial.printf("[DICT-DBG] Failed to open dict.dz file\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -252,7 +299,11 @@ bool StarDict::decompressDefinition(uint32_t offset, uint32_t size, std::string&
|
||||
const uint32_t endChunk = (offset + size - 1) / dzInfo.chunkLength;
|
||||
const uint32_t startOffsetInChunk = offset % dzInfo.chunkLength;
|
||||
|
||||
Serial.printf("[DICT-DBG] Chunks: start=%lu, end=%lu, total=%u\n",
|
||||
startChunk, endChunk, dzInfo.chunkCount);
|
||||
|
||||
if (endChunk >= dzInfo.chunkCount) {
|
||||
Serial.printf("[DICT-DBG] endChunk %lu >= chunkCount %u\n", endChunk, dzInfo.chunkCount);
|
||||
file.close();
|
||||
return false;
|
||||
}
|
||||
@@ -263,13 +314,38 @@ bool StarDict::decompressDefinition(uint32_t offset, uint32_t size, std::string&
|
||||
fileOffset += dzInfo.chunkSizes[i];
|
||||
}
|
||||
|
||||
// Allocate buffers
|
||||
const uint32_t maxCompressedSize = 65536; // Max compressed chunk size
|
||||
// Calculate actual max compressed size needed for the chunks we'll process
|
||||
uint32_t maxCompressedSize = 0;
|
||||
for (uint32_t i = startChunk; i <= endChunk; i++) {
|
||||
if (dzInfo.chunkSizes[i] > maxCompressedSize) {
|
||||
maxCompressedSize = dzInfo.chunkSizes[i];
|
||||
}
|
||||
}
|
||||
|
||||
// Allocate buffers - allocate inflator FIRST (smallest) to reduce fragmentation impact
|
||||
// tinfl_decompressor is ~11KB, so total allocations are ~85KB
|
||||
Serial.printf("[DICT-DBG] Allocating inflator=%u, comp=%lu, decomp=%u bytes\n",
|
||||
sizeof(tinfl_decompressor), maxCompressedSize, dzInfo.chunkLength);
|
||||
|
||||
auto* inflator = static_cast<tinfl_decompressor*>(malloc(sizeof(tinfl_decompressor)));
|
||||
if (!inflator) {
|
||||
Serial.printf("[DICT-DBG] inflator alloc failed! (need %u bytes)\n", sizeof(tinfl_decompressor));
|
||||
file.close();
|
||||
return false;
|
||||
}
|
||||
|
||||
auto* compressedBuf = static_cast<uint8_t*>(malloc(maxCompressedSize));
|
||||
if (!compressedBuf) {
|
||||
Serial.printf("[DICT-DBG] compressedBuf alloc failed!\n");
|
||||
free(inflator);
|
||||
file.close();
|
||||
return false;
|
||||
}
|
||||
auto* decompressedBuf = static_cast<uint8_t*>(malloc(dzInfo.chunkLength));
|
||||
if (!compressedBuf || !decompressedBuf) {
|
||||
if (!decompressedBuf) {
|
||||
Serial.printf("[DICT-DBG] decompressedBuf alloc failed!\n");
|
||||
free(inflator);
|
||||
free(compressedBuf);
|
||||
free(decompressedBuf);
|
||||
file.close();
|
||||
return false;
|
||||
}
|
||||
@@ -277,13 +353,15 @@ bool StarDict::decompressDefinition(uint32_t offset, uint32_t size, std::string&
|
||||
definition.clear();
|
||||
definition.reserve(size);
|
||||
|
||||
// Process each needed chunk
|
||||
// Process each needed chunk (reusing inflator allocation)
|
||||
for (uint32_t chunk = startChunk; chunk <= endChunk; chunk++) {
|
||||
const uint16_t compressedSize = dzInfo.chunkSizes[chunk];
|
||||
|
||||
// Seek and read compressed data
|
||||
file.seek(fileOffset);
|
||||
if (file.read(compressedBuf, compressedSize) != compressedSize) {
|
||||
Serial.printf("[DICT-DBG] File read failed at offset %lu, size %u\n", fileOffset, compressedSize);
|
||||
free(inflator);
|
||||
free(compressedBuf);
|
||||
free(decompressedBuf);
|
||||
file.close();
|
||||
@@ -291,13 +369,6 @@ bool StarDict::decompressDefinition(uint32_t offset, uint32_t size, std::string&
|
||||
}
|
||||
|
||||
// Decompress using raw inflate (no zlib header)
|
||||
auto* inflator = static_cast<tinfl_decompressor*>(malloc(sizeof(tinfl_decompressor)));
|
||||
if (!inflator) {
|
||||
free(compressedBuf);
|
||||
free(decompressedBuf);
|
||||
file.close();
|
||||
return false;
|
||||
}
|
||||
tinfl_init(inflator);
|
||||
|
||||
size_t inBytes = compressedSize;
|
||||
@@ -306,19 +377,13 @@ bool StarDict::decompressDefinition(uint32_t offset, uint32_t size, std::string&
|
||||
tinfl_decompress(inflator, compressedBuf, &inBytes, decompressedBuf, decompressedBuf, &outBytes,
|
||||
TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF | TINFL_FLAG_PARSE_ZLIB_HEADER);
|
||||
|
||||
free(inflator);
|
||||
|
||||
if (status != TINFL_STATUS_DONE && status != TINFL_STATUS_HAS_MORE_OUTPUT) {
|
||||
// Try without zlib header flag
|
||||
inflator = static_cast<tinfl_decompressor*>(malloc(sizeof(tinfl_decompressor)));
|
||||
if (inflator) {
|
||||
tinfl_init(inflator);
|
||||
inBytes = compressedSize;
|
||||
outBytes = dzInfo.chunkLength;
|
||||
tinfl_decompress(inflator, compressedBuf, &inBytes, decompressedBuf, decompressedBuf, &outBytes,
|
||||
TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF);
|
||||
free(inflator);
|
||||
}
|
||||
tinfl_init(inflator);
|
||||
inBytes = compressedSize;
|
||||
outBytes = dzInfo.chunkLength;
|
||||
tinfl_decompress(inflator, compressedBuf, &inBytes, decompressedBuf, decompressedBuf, &outBytes,
|
||||
TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF);
|
||||
}
|
||||
|
||||
// Extract the portion we need from this chunk
|
||||
@@ -342,6 +407,7 @@ bool StarDict::decompressDefinition(uint32_t offset, uint32_t size, std::string&
|
||||
fileOffset += compressedSize;
|
||||
}
|
||||
|
||||
free(inflator);
|
||||
free(compressedBuf);
|
||||
free(decompressedBuf);
|
||||
file.close();
|
||||
@@ -349,9 +415,9 @@ bool StarDict::decompressDefinition(uint32_t offset, uint32_t size, std::string&
|
||||
return true;
|
||||
}
|
||||
|
||||
// StarDict comparison function: case-insensitive first, then case-sensitive as tiebreaker
|
||||
// StarDict comparison function: case-insensitive matching
|
||||
int StarDict::stardictStrcmp(const std::string& a, const std::string& b) {
|
||||
// First: case-insensitive comparison (like g_ascii_strcasecmp)
|
||||
// Case-insensitive comparison (like g_ascii_strcasecmp)
|
||||
size_t i = 0;
|
||||
while (i < a.length() && i < b.length()) {
|
||||
const int ca = std::tolower(static_cast<unsigned char>(a[i]));
|
||||
@@ -362,8 +428,8 @@ int StarDict::stardictStrcmp(const std::string& a, const std::string& b) {
|
||||
if (a.length() != b.length()) {
|
||||
return static_cast<int>(a.length()) - static_cast<int>(b.length());
|
||||
}
|
||||
// If case-insensitive equal, use case-sensitive as tiebreaker
|
||||
return a.compare(b);
|
||||
// Case-insensitive match found
|
||||
return 0;
|
||||
}
|
||||
|
||||
std::string StarDict::normalizeWord(const std::string& word) {
|
||||
@@ -403,6 +469,9 @@ StarDict::LookupResult StarDict::lookup(const std::string& word) {
|
||||
return result;
|
||||
}
|
||||
|
||||
Serial.printf("[DICT-DBG] Searching for: '%s' (normalized: '%s')\n",
|
||||
word.c_str(), normalizedSearch.c_str());
|
||||
|
||||
// First try .idx (main entries) - use prefix jump table for fast lookup
|
||||
const std::string idxPath = basePath + ".idx";
|
||||
FsFile idxFile;
|
||||
@@ -418,7 +487,10 @@ StarDict::LookupResult StarDict::lookup(const std::string& word) {
|
||||
const uint16_t prefixIdx = DictPrefixIndex::prefixToIndex(normalizedSearch[0], normalizedSearch[1]);
|
||||
position = DictPrefixIndex::dictPrefixOffsets[prefixIdx];
|
||||
}
|
||||
Serial.printf("[DICT-DBG] Starting at position %lu (prefix: %c%c)\n",
|
||||
position, normalizedSearch[0], normalizedSearch[1]);
|
||||
bool found = false;
|
||||
uint32_t wordCount = 0;
|
||||
|
||||
while (position < info.idxfilesize) {
|
||||
std::string currentWord;
|
||||
@@ -427,13 +499,24 @@ StarDict::LookupResult StarDict::lookup(const std::string& word) {
|
||||
if (!readWordAtPosition(idxFile, position, currentWord, dictOffset, dictSize)) {
|
||||
break;
|
||||
}
|
||||
wordCount++;
|
||||
if (wordCount % 50000 == 0) {
|
||||
Serial.printf("[DICT-DBG] Progress: %lu words scanned, pos=%lu, current='%s'\n",
|
||||
wordCount, position, currentWord.c_str());
|
||||
}
|
||||
|
||||
// Use stardictStrcmp for case-insensitive matching
|
||||
const int cmp = stardictStrcmp(normalizedSearch, currentWord);
|
||||
|
||||
if (cmp == 0) {
|
||||
Serial.printf("[DICT-DBG] MATCH: '%s' == '%s' (offset=%lu, size=%lu)\n",
|
||||
normalizedSearch.c_str(), currentWord.c_str(), dictOffset, dictSize);
|
||||
std::string definition;
|
||||
if (decompressDefinition(dictOffset, dictSize, definition)) {
|
||||
const bool loaded = useUncompressed
|
||||
? readDefinitionDirect(dictOffset, dictSize, definition)
|
||||
: decompressDefinition(dictOffset, dictSize, definition);
|
||||
if (loaded) {
|
||||
Serial.printf("[DICT-DBG] Definition loaded, %u bytes\n", definition.length());
|
||||
if (!found) {
|
||||
result.word = currentWord;
|
||||
result.definition = definition;
|
||||
@@ -442,14 +525,20 @@ StarDict::LookupResult StarDict::lookup(const std::string& word) {
|
||||
} else {
|
||||
result.definition += "</html>" + definition;
|
||||
}
|
||||
} else {
|
||||
Serial.printf("[DICT-DBG] Definition load FAILED!\n");
|
||||
}
|
||||
// Continue scanning for additional matches (same word, different case)
|
||||
} else if (cmp < 0) {
|
||||
// Passed where target would be (file is sorted)
|
||||
} else if (found) {
|
||||
// We had matches but now moved past them - safe to stop
|
||||
break;
|
||||
}
|
||||
// Note: Cannot use early-break before first match because prefix index
|
||||
// may not land exactly at target position
|
||||
}
|
||||
|
||||
Serial.printf("[DICT-DBG] Search complete: %lu words scanned, found=%s\n",
|
||||
wordCount, found ? "YES" : "NO");
|
||||
idxFile.close();
|
||||
|
||||
// If not found in main index, try synonym file with prefix jump
|
||||
@@ -502,7 +591,10 @@ StarDict::LookupResult StarDict::lookup(const std::string& word) {
|
||||
uint32_t dictOffset, dictSize;
|
||||
if (readWordAtPosition(idxFile2, pos, mainWord, dictOffset, dictSize)) {
|
||||
std::string definition;
|
||||
if (decompressDefinition(dictOffset, dictSize, definition)) {
|
||||
const bool loaded = useUncompressed
|
||||
? readDefinitionDirect(dictOffset, dictSize, definition)
|
||||
: decompressDefinition(dictOffset, dictSize, definition);
|
||||
if (loaded) {
|
||||
result.word = synWord;
|
||||
result.definition = definition;
|
||||
result.found = true;
|
||||
@@ -513,10 +605,9 @@ StarDict::LookupResult StarDict::lookup(const std::string& word) {
|
||||
idxFile2.close();
|
||||
}
|
||||
break; // Found a match, stop searching
|
||||
} else if (cmp < 0) {
|
||||
// Passed where it would be (file is sorted)
|
||||
break;
|
||||
}
|
||||
// Note: Cannot use early-break optimization here because prefix index
|
||||
// may not land exactly at target position
|
||||
}
|
||||
synFile.close();
|
||||
}
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
#include <string>
|
||||
|
||||
// StarDict dictionary lookup library
|
||||
// Supports .ifo/.idx/.dict.dz format with linear scan lookup
|
||||
// Supports .ifo/.idx/.dict (uncompressed) and .ifo/.idx/.dict.dz (compressed) formats
|
||||
class StarDict {
|
||||
public:
|
||||
struct DictInfo {
|
||||
@@ -38,16 +38,22 @@ class StarDict {
|
||||
};
|
||||
DictzipInfo dzInfo;
|
||||
|
||||
// Whether to use uncompressed .dict file (preferred) or compressed .dict.dz
|
||||
bool useUncompressed = false;
|
||||
|
||||
// Parse .ifo file
|
||||
bool loadInfo();
|
||||
|
||||
// Load dictzip header for random access
|
||||
// Load dictzip header for random access (only if using compressed)
|
||||
bool loadDictzipHeader();
|
||||
|
||||
// Read word at given index file position, returns word and advances position
|
||||
bool readWordAtPosition(FsFile& idxFile, uint32_t& position, std::string& word, uint32_t& dictOffset,
|
||||
uint32_t& dictSize);
|
||||
|
||||
// Read definition directly from uncompressed .dict file (no decompression needed)
|
||||
bool readDefinitionDirect(uint32_t offset, uint32_t size, std::string& definition);
|
||||
|
||||
// Decompress a portion of the .dict.dz file
|
||||
bool decompressDefinition(uint32_t offset, uint32_t size, std::string& definition);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user