#include "Dictionary.h"

// NOTE(review): the targets of the four includes below, and the template/cast
// arguments in this section, were absent from the reviewed text and have been
// reconstructed from the symbols used here — confirm the storage header name
// and the element type of sparseOffsets against the project headers.
#include <HalStorage.h>

#include <algorithm>
#include <cctype>
#include <cstring>

namespace {
constexpr const char* IDX_PATH = "/.dictionary/dictionary.idx";
constexpr const char* DICT_PATH = "/.dictionary/dictionary.dict";
constexpr const char* CACHE_PATH = "/.dictionary/dictionary.cache";
constexpr uint32_t CACHE_MAGIC = 0x44494358;  // "DICX"

// g_ascii_strcasecmp equivalent: compare lowercasing only ASCII A-Z.
// Bytes are compared as unsigned char; the result is <0, 0 or >0.
int asciiCaseCmp(const char* s1, const char* s2) {
  const auto* p1 = reinterpret_cast<const unsigned char*>(s1);
  const auto* p2 = reinterpret_cast<const unsigned char*>(s2);
  while (*p1 && *p2) {
    unsigned char c1 = *p1, c2 = *p2;
    if (c1 >= 'A' && c1 <= 'Z') c1 += 32;
    if (c2 >= 'A' && c2 <= 'Z') c2 += 32;
    if (c1 != c2) return static_cast<int>(c1) - static_cast<int>(c2);
    ++p1;
    ++p2;
  }
  // At least one string ended: the shorter one sorts first.
  return static_cast<int>(*p1) - static_cast<int>(*p2);
}

// StarDict index comparison: case-insensitive first, then case-sensitive tiebreaker.
// This matches the stardict_strcmp used by StarDict to sort .idx entries.
int stardictCmp(const char* s1, const char* s2) {
  const int ci = asciiCaseCmp(s1, s2);
  if (ci != 0) return ci;
  return std::strcmp(s1, s2);
}
}  // namespace

std::vector<uint32_t> Dictionary::sparseOffsets;
uint32_t Dictionary::totalWords = 0;
bool Dictionary::indexLoaded = false;

// True when the dictionary index file is present on storage.
bool Dictionary::exists() { return Storage.exists(IDX_PATH); }

// True when a persisted sparse-offset cache file is present on storage.
bool Dictionary::cacheExists() { return Storage.exists(CACHE_PATH); }

// Remove the persisted cache and forget the in-memory index.
void Dictionary::deleteCache() {
  Storage.remove(CACHE_PATH);
  // Reset in-memory state so next lookup rebuilds from the .idx file.
  sparseOffsets.clear();
  totalWords = 0;
  indexLoaded = false;
}

// Normalise a token for lookup: strip leading and trailing non-alphanumeric
// bytes, then lowercase what remains. Returns "" when nothing alphanumeric is
// left. Classification and lowercasing are done per byte with std::isalnum /
// std::tolower, so interior characters (e.g. apostrophes, hyphens) are kept.
std::string Dictionary::cleanWord(const std::string& word) {
  if (word.empty()) return "";

  // Find first alphanumeric character
  size_t start = 0;
  while (start < word.size() && !std::isalnum(static_cast<unsigned char>(word[start]))) {
    start++;
  }

  // Find last alphanumeric character
  size_t end = word.size();
  while (end > start && !std::isalnum(static_cast<unsigned char>(word[end - 1]))) {
    end--;
  }

  if (start >= end) return "";

  std::string result = word.substr(start, end - start);
  // Lowercase
  std::transform(result.begin(), result.end(), result.begin(), [](unsigned char c) { return std::tolower(c); });
  return result;
}

// ---------------------------------------------------------------------------
// Cache: persists the sparse offset table to SD card so subsequent boots skip
// the full .idx scan. The cache is invalidated when the .idx file size changes.
//
// Format: [magic 4B][idxFileSize 4B][totalWords 4B][count 4B][offsets N×4B]
// All values are stored in native byte order (little-endian on ESP32).
// --------------------------------------------------------------------------- bool Dictionary::loadCachedIndex() { FsFile idx; if (!Storage.openFileForRead("DICT", IDX_PATH, idx)) return false; const uint32_t idxFileSize = static_cast(idx.fileSize()); idx.close(); FsFile cache; if (!Storage.openFileForRead("DICT", CACHE_PATH, cache)) return false; // Read and validate header uint32_t header[4]; // magic, idxFileSize, totalWords, count if (cache.read(reinterpret_cast(header), 16) != 16) { cache.close(); return false; } if (header[0] != CACHE_MAGIC || header[1] != idxFileSize) { cache.close(); return false; } totalWords = header[2]; const uint32_t count = header[3]; sparseOffsets.resize(count); const int bytesToRead = static_cast(count * sizeof(uint32_t)); if (cache.read(reinterpret_cast(sparseOffsets.data()), bytesToRead) != bytesToRead) { cache.close(); sparseOffsets.clear(); totalWords = 0; return false; } cache.close(); indexLoaded = true; return true; } void Dictionary::saveCachedIndex(uint32_t idxFileSize) { FsFile cache; if (!Storage.openFileForWrite("DICT", CACHE_PATH, cache)) return; const uint32_t count = static_cast(sparseOffsets.size()); uint32_t header[4] = {CACHE_MAGIC, idxFileSize, totalWords, count}; cache.write(reinterpret_cast(header), 16); cache.write(reinterpret_cast(sparseOffsets.data()), count * sizeof(uint32_t)); cache.close(); } // Scan the .idx file to build a sparse offset table for fast lookups. // Records the file offset of every SPARSE_INTERVAL-th entry. 
bool Dictionary::loadIndex(const std::function& onProgress, const std::function& shouldCancel) { // Try loading from cache first (nearly instant) if (loadCachedIndex()) return true; FsFile idx; if (!Storage.openFileForRead("DICT", IDX_PATH, idx)) return false; const uint32_t fileSize = static_cast(idx.fileSize()); sparseOffsets.clear(); totalWords = 0; uint32_t pos = 0; int lastReportedPercent = -1; while (pos < fileSize) { if (shouldCancel && (totalWords % 100 == 0) && shouldCancel()) { idx.close(); sparseOffsets.clear(); totalWords = 0; return false; } if (totalWords % SPARSE_INTERVAL == 0) { sparseOffsets.push_back(pos); } // Skip word (read until null terminator) int ch; do { ch = idx.read(); if (ch < 0) { pos = fileSize; break; } pos++; } while (ch != 0); if (pos >= fileSize) break; // Skip 8 bytes (4-byte offset + 4-byte size) uint8_t skip[8]; if (idx.read(skip, 8) != 8) break; pos += 8; totalWords++; if (onProgress && fileSize > 0) { int percent = static_cast(static_cast(pos) * 90 / fileSize); if (percent > lastReportedPercent + 4) { lastReportedPercent = percent; onProgress(percent); } } } idx.close(); indexLoaded = true; // Persist to cache so next boot is instant if (totalWords > 0) saveCachedIndex(fileSize); return totalWords > 0; } // Read a null-terminated word string from the current file position. std::string Dictionary::readWord(FsFile& file) { std::string word; while (true) { int ch = file.read(); if (ch <= 0) break; // null terminator (0) or error (-1) word += static_cast(ch); } return word; } // Read a definition from the .dict file at the given offset and size. 
std::string Dictionary::readDefinition(uint32_t offset, uint32_t size) { FsFile dict; if (!Storage.openFileForRead("DICT", DICT_PATH, dict)) return ""; dict.seekSet(offset); std::string def(size, '\0'); int bytesRead = dict.read(reinterpret_cast(&def[0]), size); dict.close(); if (bytesRead < 0) return ""; if (static_cast(bytesRead) < size) def.resize(bytesRead); return def; } // Binary search the sparse offset table, then linear scan within the matching segment. // Uses StarDict's sort order: case-insensitive first, then case-sensitive tiebreaker. // The exact match is case-insensitive so e.g. "simple" matches "Simple". std::string Dictionary::searchIndex(const std::string& word, const std::function& shouldCancel) { if (sparseOffsets.empty()) return ""; FsFile idx; if (!Storage.openFileForRead("DICT", IDX_PATH, idx)) return ""; // Binary search the sparse offset table to find the right segment. int lo = 0, hi = static_cast(sparseOffsets.size()) - 1; while (lo < hi) { if (shouldCancel && shouldCancel()) { idx.close(); return ""; } int mid = lo + (hi - lo + 1) / 2; idx.seekSet(sparseOffsets[mid]); std::string key = readWord(idx); if (stardictCmp(key.c_str(), word.c_str()) <= 0) { lo = mid; } else { hi = mid - 1; } } // Linear scan within the segment starting at sparseOffsets[lo]. idx.seekSet(sparseOffsets[lo]); int maxEntries = SPARSE_INTERVAL; if (lo == static_cast(sparseOffsets.size()) - 1) { maxEntries = static_cast(totalWords - static_cast(lo) * SPARSE_INTERVAL); } // Scan entries, preferring an exact case-sensitive match over a case-insensitive one. // In stardict order, all case variants of a word are adjacent (e.g. "Professor" then "professor"), // and they may have different definitions. We want the lowercase entry when the user searched // for a lowercase word, falling back to any case variant. 
uint32_t bestOffset = 0, bestSize = 0; bool found = false; for (int i = 0; i < maxEntries; i++) { if (shouldCancel && shouldCancel()) { idx.close(); return ""; } std::string key = readWord(idx); if (key.empty()) break; // Read offset and size (4 bytes each, big-endian) uint8_t buf[8]; if (idx.read(buf, 8) != 8) break; uint32_t dictOffset = (static_cast(buf[0]) << 24) | (static_cast(buf[1]) << 16) | (static_cast(buf[2]) << 8) | static_cast(buf[3]); uint32_t dictSize = (static_cast(buf[4]) << 24) | (static_cast(buf[5]) << 16) | (static_cast(buf[6]) << 8) | static_cast(buf[7]); if (asciiCaseCmp(key.c_str(), word.c_str()) == 0) { // Case-insensitive match — remember the first one as fallback if (!found) { bestOffset = dictOffset; bestSize = dictSize; found = true; } // Exact case-sensitive match — use immediately if (key == word) { idx.close(); return readDefinition(dictOffset, dictSize); } } else if (found) { // We've moved past all case variants of this word — stop break; } else if (stardictCmp(key.c_str(), word.c_str()) > 0) { // Past the target in StarDict sort order — stop scanning break; } } idx.close(); return found ? readDefinition(bestOffset, bestSize) : ""; } std::string Dictionary::lookup(const std::string& word, const std::function& onProgress, const std::function& shouldCancel) { if (!indexLoaded) { if (!loadIndex(onProgress, shouldCancel)) return ""; } // searchIndex uses StarDict sort order + case-insensitive match, // so a single pass handles all casing variants. std::string result = searchIndex(word, shouldCancel); if (onProgress) onProgress(100); return result; }