Implements StarDict-based dictionary lookup from the reader menu, adapted from upstream PR #857 with /.dictionary/ folder path, std::vector compatibility (PR #802), HTML definition rendering, orientation-aware button hints, side button hints with CCW text rotation, sparse index caching to SD card, pronunciation line filtering, and reorganized reader menu with bookmark stubs. Co-authored-by: Cursor <cursoragent@cursor.com>
329 lines
10 KiB
C++
329 lines
10 KiB
C++
#include "Dictionary.h"
|
||
|
||
#include <HalStorage.h>
|
||
|
||
#include <algorithm>
|
||
#include <cctype>
|
||
#include <cstring>
|
||
|
||
namespace {

// On-card locations of the StarDict files and of the sparse-index cache.
constexpr const char* IDX_PATH = "/.dictionary/dictionary.idx";
constexpr const char* DICT_PATH = "/.dictionary/dictionary.dict";
constexpr const char* CACHE_PATH = "/.dictionary/dictionary.cache";

// Magic number written at the start of the cache file.
constexpr uint32_t CACHE_MAGIC = 0x44494358;  // "DICX"

// Maps ASCII 'A'..'Z' to 'a'..'z'; every other byte is returned unchanged.
inline unsigned char foldAsciiUpper(unsigned char c) {
  return (c >= 'A' && c <= 'Z') ? static_cast<unsigned char>(c + 32) : c;
}

// g_ascii_strcasecmp equivalent: byte-wise comparison that ignores case for
// ASCII letters only. Returns <0, 0 or >0 like strcmp.
int asciiCaseCmp(const char* s1, const char* s2) {
  for (;; ++s1, ++s2) {
    const unsigned char lhs = static_cast<unsigned char>(*s1);
    const unsigned char rhs = static_cast<unsigned char>(*s2);

    // One (or both) strings ended: the raw bytes decide the result.
    if (lhs == 0 || rhs == 0) {
      return static_cast<int>(lhs) - static_cast<int>(rhs);
    }

    const unsigned char foldedLhs = foldAsciiUpper(lhs);
    const unsigned char foldedRhs = foldAsciiUpper(rhs);
    if (foldedLhs != foldedRhs) {
      return static_cast<int>(foldedLhs) - static_cast<int>(foldedRhs);
    }
  }
}

// StarDict index ordering: compare ignoring ASCII case first and fall back to
// a plain strcmp only to break ties between case variants.
// This matches the stardict_strcmp used by StarDict to sort .idx entries.
int stardictCmp(const char* s1, const char* s2) {
  const int caseInsensitive = asciiCaseCmp(s1, s2);
  return caseInsensitive != 0 ? caseInsensitive : std::strcmp(s1, s2);
}

}  // namespace
|
||
|
||
// Definitions of Dictionary's static state. Nothing is loaded at start-up:
// the offset table is empty, the word count is zero and the index is marked
// as not loaded.
std::vector<uint32_t> Dictionary::sparseOffsets{};
uint32_t Dictionary::totalWords{0};
bool Dictionary::indexLoaded{false};
|
||
|
||
// Reports whether Storage finds a file at IDX_PATH (the dictionary index).
bool Dictionary::exists() {
  const bool idxPresent = Storage.exists(IDX_PATH);
  return idxPresent;
}
|
||
|
||
// Reports whether Storage finds a file at CACHE_PATH (the sparse-index cache).
bool Dictionary::cacheExists() {
  const bool cachePresent = Storage.exists(CACHE_PATH);
  return cachePresent;
}
|
||
|
||
// Removes the cache file and forgets the in-memory index, so the next lookup
// has to rebuild the sparse table from the .idx file.
void Dictionary::deleteCache() {
  // In-memory state: back to "nothing loaded".
  indexLoaded = false;
  totalWords = 0;
  sparseOffsets.clear();

  // On-card state: drop the persisted table.
  Storage.remove(CACHE_PATH);
}
|
||
|
||
// Normalises a selected word for lookup: strips leading and trailing bytes
// that are not alphanumeric (per std::isalnum) and lowercases what is left
// with std::tolower. Returns an empty string when no alphanumeric byte exists.
std::string Dictionary::cleanWord(const std::string& word) {
  const auto isWordByte = [](unsigned char c) { return std::isalnum(c) != 0; };

  // First alphanumeric byte; none at all means there is nothing to look up.
  const auto first = std::find_if(word.begin(), word.end(), isWordByte);
  if (first == word.end()) return "";

  // One past the last alphanumeric byte (reverse search, converted back).
  const auto last = std::find_if(word.rbegin(), word.rend(), isWordByte).base();

  std::string cleaned(first, last);
  for (char& c : cleaned) {
    c = static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
  }
  return cleaned;
}
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// Cache: persists the sparse offset table to SD card so subsequent boots skip
|
||
// the full .idx scan. The cache is invalidated when the .idx file size changes.
|
||
//
|
||
// Format: [magic 4B][idxFileSize 4B][totalWords 4B][count 4B][offsets N×4B]
|
||
// All values are stored in native byte order (little-endian on ESP32).
|
||
// ---------------------------------------------------------------------------
|
||
bool Dictionary::loadCachedIndex() {
|
||
FsFile idx;
|
||
if (!Storage.openFileForRead("DICT", IDX_PATH, idx)) return false;
|
||
const uint32_t idxFileSize = static_cast<uint32_t>(idx.fileSize());
|
||
idx.close();
|
||
|
||
FsFile cache;
|
||
if (!Storage.openFileForRead("DICT", CACHE_PATH, cache)) return false;
|
||
|
||
// Read and validate header
|
||
uint32_t header[4]; // magic, idxFileSize, totalWords, count
|
||
if (cache.read(reinterpret_cast<uint8_t*>(header), 16) != 16) {
|
||
cache.close();
|
||
return false;
|
||
}
|
||
|
||
if (header[0] != CACHE_MAGIC || header[1] != idxFileSize) {
|
||
cache.close();
|
||
return false;
|
||
}
|
||
|
||
totalWords = header[2];
|
||
const uint32_t count = header[3];
|
||
|
||
sparseOffsets.resize(count);
|
||
const int bytesToRead = static_cast<int>(count * sizeof(uint32_t));
|
||
if (cache.read(reinterpret_cast<uint8_t*>(sparseOffsets.data()), bytesToRead) != bytesToRead) {
|
||
cache.close();
|
||
sparseOffsets.clear();
|
||
totalWords = 0;
|
||
return false;
|
||
}
|
||
|
||
cache.close();
|
||
indexLoaded = true;
|
||
return true;
|
||
}
|
||
|
||
void Dictionary::saveCachedIndex(uint32_t idxFileSize) {
|
||
FsFile cache;
|
||
if (!Storage.openFileForWrite("DICT", CACHE_PATH, cache)) return;
|
||
|
||
const uint32_t count = static_cast<uint32_t>(sparseOffsets.size());
|
||
uint32_t header[4] = {CACHE_MAGIC, idxFileSize, totalWords, count};
|
||
|
||
cache.write(reinterpret_cast<const uint8_t*>(header), 16);
|
||
cache.write(reinterpret_cast<const uint8_t*>(sparseOffsets.data()), count * sizeof(uint32_t));
|
||
cache.close();
|
||
}
|
||
|
||
// Scan the .idx file to build a sparse offset table for fast lookups.
|
||
// Records the file offset of every SPARSE_INTERVAL-th entry.
|
||
bool Dictionary::loadIndex(const std::function<void(int percent)>& onProgress,
|
||
const std::function<bool()>& shouldCancel) {
|
||
// Try loading from cache first (nearly instant)
|
||
if (loadCachedIndex()) return true;
|
||
|
||
FsFile idx;
|
||
if (!Storage.openFileForRead("DICT", IDX_PATH, idx)) return false;
|
||
|
||
const uint32_t fileSize = static_cast<uint32_t>(idx.fileSize());
|
||
|
||
sparseOffsets.clear();
|
||
totalWords = 0;
|
||
|
||
uint32_t pos = 0;
|
||
int lastReportedPercent = -1;
|
||
|
||
while (pos < fileSize) {
|
||
if (shouldCancel && (totalWords % 100 == 0) && shouldCancel()) {
|
||
idx.close();
|
||
sparseOffsets.clear();
|
||
totalWords = 0;
|
||
return false;
|
||
}
|
||
|
||
if (totalWords % SPARSE_INTERVAL == 0) {
|
||
sparseOffsets.push_back(pos);
|
||
}
|
||
|
||
// Skip word (read until null terminator)
|
||
int ch;
|
||
do {
|
||
ch = idx.read();
|
||
if (ch < 0) {
|
||
pos = fileSize;
|
||
break;
|
||
}
|
||
pos++;
|
||
} while (ch != 0);
|
||
|
||
if (pos >= fileSize) break;
|
||
|
||
// Skip 8 bytes (4-byte offset + 4-byte size)
|
||
uint8_t skip[8];
|
||
if (idx.read(skip, 8) != 8) break;
|
||
pos += 8;
|
||
|
||
totalWords++;
|
||
|
||
if (onProgress && fileSize > 0) {
|
||
int percent = static_cast<int>(static_cast<uint64_t>(pos) * 90 / fileSize);
|
||
if (percent > lastReportedPercent + 4) {
|
||
lastReportedPercent = percent;
|
||
onProgress(percent);
|
||
}
|
||
}
|
||
}
|
||
|
||
idx.close();
|
||
indexLoaded = true;
|
||
|
||
// Persist to cache so next boot is instant
|
||
if (totalWords > 0) saveCachedIndex(fileSize);
|
||
|
||
return totalWords > 0;
|
||
}
|
||
|
||
// Read a null-terminated word string from the current file position.
|
||
std::string Dictionary::readWord(FsFile& file) {
|
||
std::string word;
|
||
while (true) {
|
||
int ch = file.read();
|
||
if (ch <= 0) break; // null terminator (0) or error (-1)
|
||
word += static_cast<char>(ch);
|
||
}
|
||
return word;
|
||
}
|
||
|
||
// Read a definition from the .dict file at the given offset and size.
|
||
std::string Dictionary::readDefinition(uint32_t offset, uint32_t size) {
|
||
FsFile dict;
|
||
if (!Storage.openFileForRead("DICT", DICT_PATH, dict)) return "";
|
||
|
||
dict.seekSet(offset);
|
||
|
||
std::string def(size, '\0');
|
||
int bytesRead = dict.read(reinterpret_cast<uint8_t*>(&def[0]), size);
|
||
dict.close();
|
||
|
||
if (bytesRead < 0) return "";
|
||
if (static_cast<uint32_t>(bytesRead) < size) def.resize(bytesRead);
|
||
return def;
|
||
}
|
||
|
||
// Binary search the sparse offset table, then linear scan within the matching segment.
|
||
// Uses StarDict's sort order: case-insensitive first, then case-sensitive tiebreaker.
|
||
// The exact match is case-insensitive so e.g. "simple" matches "Simple".
|
||
std::string Dictionary::searchIndex(const std::string& word, const std::function<bool()>& shouldCancel) {
|
||
if (sparseOffsets.empty()) return "";
|
||
|
||
FsFile idx;
|
||
if (!Storage.openFileForRead("DICT", IDX_PATH, idx)) return "";
|
||
|
||
// Binary search the sparse offset table to find the right segment.
|
||
int lo = 0, hi = static_cast<int>(sparseOffsets.size()) - 1;
|
||
|
||
while (lo < hi) {
|
||
if (shouldCancel && shouldCancel()) {
|
||
idx.close();
|
||
return "";
|
||
}
|
||
|
||
int mid = lo + (hi - lo + 1) / 2;
|
||
idx.seekSet(sparseOffsets[mid]);
|
||
std::string key = readWord(idx);
|
||
|
||
if (stardictCmp(key.c_str(), word.c_str()) <= 0) {
|
||
lo = mid;
|
||
} else {
|
||
hi = mid - 1;
|
||
}
|
||
}
|
||
|
||
// Linear scan within the segment starting at sparseOffsets[lo].
|
||
idx.seekSet(sparseOffsets[lo]);
|
||
|
||
int maxEntries = SPARSE_INTERVAL;
|
||
if (lo == static_cast<int>(sparseOffsets.size()) - 1) {
|
||
maxEntries = static_cast<int>(totalWords - static_cast<uint32_t>(lo) * SPARSE_INTERVAL);
|
||
}
|
||
|
||
// Scan entries, preferring an exact case-sensitive match over a case-insensitive one.
|
||
// In stardict order, all case variants of a word are adjacent (e.g. "Professor" then "professor"),
|
||
// and they may have different definitions. We want the lowercase entry when the user searched
|
||
// for a lowercase word, falling back to any case variant.
|
||
uint32_t bestOffset = 0, bestSize = 0;
|
||
bool found = false;
|
||
|
||
for (int i = 0; i < maxEntries; i++) {
|
||
if (shouldCancel && shouldCancel()) {
|
||
idx.close();
|
||
return "";
|
||
}
|
||
|
||
std::string key = readWord(idx);
|
||
if (key.empty()) break;
|
||
|
||
// Read offset and size (4 bytes each, big-endian)
|
||
uint8_t buf[8];
|
||
if (idx.read(buf, 8) != 8) break;
|
||
|
||
uint32_t dictOffset = (static_cast<uint32_t>(buf[0]) << 24) | (static_cast<uint32_t>(buf[1]) << 16) |
|
||
(static_cast<uint32_t>(buf[2]) << 8) | static_cast<uint32_t>(buf[3]);
|
||
uint32_t dictSize = (static_cast<uint32_t>(buf[4]) << 24) | (static_cast<uint32_t>(buf[5]) << 16) |
|
||
(static_cast<uint32_t>(buf[6]) << 8) | static_cast<uint32_t>(buf[7]);
|
||
|
||
if (asciiCaseCmp(key.c_str(), word.c_str()) == 0) {
|
||
// Case-insensitive match — remember the first one as fallback
|
||
if (!found) {
|
||
bestOffset = dictOffset;
|
||
bestSize = dictSize;
|
||
found = true;
|
||
}
|
||
// Exact case-sensitive match — use immediately
|
||
if (key == word) {
|
||
idx.close();
|
||
return readDefinition(dictOffset, dictSize);
|
||
}
|
||
} else if (found) {
|
||
// We've moved past all case variants of this word — stop
|
||
break;
|
||
} else if (stardictCmp(key.c_str(), word.c_str()) > 0) {
|
||
// Past the target in StarDict sort order — stop scanning
|
||
break;
|
||
}
|
||
}
|
||
|
||
idx.close();
|
||
return found ? readDefinition(bestOffset, bestSize) : "";
|
||
}
|
||
|
||
std::string Dictionary::lookup(const std::string& word, const std::function<void(int percent)>& onProgress,
|
||
const std::function<bool()>& shouldCancel) {
|
||
if (!indexLoaded) {
|
||
if (!loadIndex(onProgress, shouldCancel)) return "";
|
||
}
|
||
|
||
// searchIndex uses StarDict sort order + case-insensitive match,
|
||
// so a single pass handles all casing variants.
|
||
std::string result = searchIndex(word, shouldCancel);
|
||
if (onProgress) onProgress(100);
|
||
return result;
|
||
}
|