Files
crosspoint-reader-mod/src/util/Dictionary.cpp
cottongin 8d4bbf284d feat: Add dictionary word lookup feature with cached index
Implements StarDict-based dictionary lookup from the reader menu,
adapted from upstream PR #857 with /.dictionary/ folder path,
std::vector compatibility (PR #802), HTML definition rendering,
orientation-aware button hints, side button hints with CCW text
rotation, sparse index caching to SD card, pronunciation line
filtering, and reorganized reader menu with bookmark stubs.

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-12 19:36:14 -05:00

329 lines
10 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#include "Dictionary.h"
#include <HalStorage.h>
#include <algorithm>
#include <cctype>
#include <cstring>
namespace {

// Locations of the StarDict index, the definition data, and the sparse-index cache.
constexpr const char* IDX_PATH = "/.dictionary/dictionary.idx";
constexpr const char* DICT_PATH = "/.dictionary/dictionary.dict";
constexpr const char* CACHE_PATH = "/.dictionary/dictionary.cache";

// First 32-bit word of the cache header; the hex digits spell "DICX".
constexpr uint32_t CACHE_MAGIC = 0x44494358;

// Maps ASCII 'A'..'Z' to 'a'..'z' and returns every other byte unchanged.
inline unsigned char foldAsciiUpper(unsigned char c) {
  return (c >= 'A' && c <= 'Z') ? static_cast<unsigned char>(c + 32) : c;
}

// g_ascii_strcasecmp equivalent: byte-wise comparison in which only ASCII A-Z
// are folded to lowercase. Returns the difference of the first pair of bytes
// that differ after folding, or the difference of the bytes at which the
// shorter string ended (0 when both strings end together).
int asciiCaseCmp(const char* s1, const char* s2) {
  const auto* lhs = reinterpret_cast<const unsigned char*>(s1);
  const auto* rhs = reinterpret_cast<const unsigned char*>(s2);
  for (; *lhs != 0 && *rhs != 0; ++lhs, ++rhs) {
    const int a = foldAsciiUpper(*lhs);
    const int b = foldAsciiUpper(*rhs);
    if (a != b) return a - b;
  }
  // At least one string has ended here, so one of the two operands is 0.
  return static_cast<int>(*lhs) - static_cast<int>(*rhs);
}

// StarDict index ordering: the ASCII case-insensitive comparison decides first;
// strings that are equal under it are ordered by plain strcmp.
// This matches the stardict_strcmp used by StarDict to sort .idx entries.
int stardictCmp(const char* s1, const char* s2) {
  const int folded = asciiCaseCmp(s1, s2);
  return folded != 0 ? folded : std::strcmp(s1, s2);
}

}  // namespace
// Index state shared by every Dictionary call.
// Byte offset within the .idx file of every SPARSE_INTERVAL-th entry.
std::vector<uint32_t> Dictionary::sparseOffsets;
// Number of entries counted in the .idx file (or restored from the cache header).
uint32_t Dictionary::totalWords = 0;
// Set once the sparse table has been filled from the cache or from an .idx scan.
bool Dictionary::indexLoaded = false;
// Reports whether the dictionary index file (IDX_PATH) is present on storage.
bool Dictionary::exists() {
  const bool idxPresent = Storage.exists(IDX_PATH);
  return idxPresent;
}
// Reports whether the sparse-index cache file (CACHE_PATH) is present on storage.
bool Dictionary::cacheExists() {
  const bool cachePresent = Storage.exists(CACHE_PATH);
  return cachePresent;
}
// Removes the cache file and forgets the in-memory sparse table. With
// indexLoaded cleared, the next lookup() goes through loadIndex() again.
void Dictionary::deleteCache() {
  Storage.remove(CACHE_PATH);

  // Drop the in-memory copy as well so it cannot outlive the file it mirrors.
  indexLoaded = false;
  totalWords = 0;
  sparseOffsets.clear();
}
// Normalises a raw token for lookup: strips every leading and trailing byte
// that std::isalnum rejects, then lowercases the remainder with std::tolower.
// Bytes in the middle of the token are kept as they are (apart from case).
// Returns an empty string when the token holds no alphanumeric byte at all.
std::string Dictionary::cleanWord(const std::string& word) {
  const auto isWordByte = [](char c) { return std::isalnum(static_cast<unsigned char>(c)) != 0; };

  // First alphanumeric byte; if there is none, nothing is left to look up.
  const auto first = std::find_if(word.begin(), word.end(), isWordByte);
  if (first == word.end()) return "";

  // One past the last alphanumeric byte (reverse search mapped back to a forward iterator).
  const auto last = std::find_if(word.rbegin(), word.rend(), isWordByte).base();

  std::string result(first, last);
  for (char& c : result) {
    c = static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
  }
  return result;
}
// ---------------------------------------------------------------------------
// Cache: persists the sparse offset table to SD card so subsequent boots skip
// the full .idx scan. The cache is rejected when the .idx file size differs
// from the size recorded in the cache header.
//
// Format: [magic 4B][idxFileSize 4B][totalWords 4B][count 4B][offsets N x 4B]
// All values are stored in native byte order (little-endian on ESP32).
// ---------------------------------------------------------------------------

// Restores sparseOffsets and totalWords from the cache file.
//
// Returns true (and sets indexLoaded) only when the cache header matches the
// current .idx size, the entry counts are plausible, and the whole offset
// table could be read. Returns false otherwise; the caller is expected to fall
// back to scanning the .idx file.
//
// The header is treated as untrusted: `count` is checked against the actual
// cache file size before any memory is reserved, so a corrupt or truncated
// cache cannot trigger an oversized allocation.
bool Dictionary::loadCachedIndex() {
  // The current .idx size is the cache's validity key.
  FsFile idx;
  if (!Storage.openFileForRead("DICT", IDX_PATH, idx)) return false;
  const uint32_t idxFileSize = static_cast<uint32_t>(idx.fileSize());
  idx.close();

  FsFile cache;
  if (!Storage.openFileForRead("DICT", CACHE_PATH, cache)) return false;
  const uint64_t cacheFileSize = static_cast<uint64_t>(cache.fileSize());

  // Read and validate header
  uint32_t header[4];  // magic, idxFileSize, totalWords, count
  if (cache.read(reinterpret_cast<uint8_t*>(header), 16) != 16) {
    cache.close();
    return false;
  }
  if (header[0] != CACHE_MAGIC || header[1] != idxFileSize) {
    cache.close();
    return false;
  }

  const uint32_t words = header[2];
  const uint32_t count = header[3];
  // Largest count whose byte size still fits the int used by read().
  constexpr uint32_t MAX_COUNT = 0x7FFFFFFFu / sizeof(uint32_t);
  const uint64_t payloadBytes = static_cast<uint64_t>(count) * sizeof(uint32_t);
  // saveCachedIndex() only runs when at least one word was indexed, so an
  // empty table is never legitimate; the file must also be large enough to
  // hold every offset the header promises.
  if (words == 0 || count == 0 || count > MAX_COUNT || cacheFileSize < 16 + payloadBytes) {
    cache.close();
    return false;
  }

  sparseOffsets.resize(count);
  const int bytesToRead = static_cast<int>(payloadBytes);
  if (cache.read(reinterpret_cast<uint8_t*>(sparseOffsets.data()), bytesToRead) != bytesToRead) {
    cache.close();
    sparseOffsets.clear();
    totalWords = 0;
    return false;
  }
  cache.close();

  totalWords = words;
  indexLoaded = true;
  return true;
}
// Writes the in-memory sparse table to the cache file (best effort, no return value).
//
// idxFileSize is stored in the header; loadCachedIndex() compares it with the
// current .idx size to decide whether the cache is still valid.
//
// If the header or the table cannot be written completely (e.g. card full),
// the partial file is removed so a truncated cache is never left on the card.
void Dictionary::saveCachedIndex(uint32_t idxFileSize) {
  FsFile cache;
  if (!Storage.openFileForWrite("DICT", CACHE_PATH, cache)) return;

  const uint32_t count = static_cast<uint32_t>(sparseOffsets.size());
  const uint32_t header[4] = {CACHE_MAGIC, idxFileSize, totalWords, count};
  const size_t headerBytes = sizeof(header);
  const size_t tableBytes = static_cast<size_t>(count) * sizeof(uint32_t);

  // A negative error return, once cast to size_t, can never equal the expected size.
  bool ok = static_cast<size_t>(cache.write(reinterpret_cast<const uint8_t*>(header), headerBytes)) == headerBytes;
  if (ok && tableBytes > 0) {
    ok = static_cast<size_t>(cache.write(reinterpret_cast<const uint8_t*>(sparseOffsets.data()), tableBytes)) ==
         tableBytes;
  }
  cache.close();

  if (!ok) Storage.remove(CACHE_PATH);
}
// Makes the sparse offset table available, from the cache when possible,
// otherwise by scanning the whole .idx file and recording the file offset of
// every SPARSE_INTERVAL-th entry.
//
// onProgress   - optional; called with a percentage in 0..90 during the scan,
//                each time the value has advanced by at least 5 points.
// shouldCancel - optional; polled once every 100 entries. When it returns true
//                the scan is abandoned and the in-memory index is left empty.
//
// Returns true when at least one entry is indexed. indexLoaded is set only in
// that case, so a failed, cancelled or empty scan is retried by the next call
// instead of leaving an empty index marked as loaded.
bool Dictionary::loadIndex(const std::function<void(int percent)>& onProgress,
                           const std::function<bool()>& shouldCancel) {
  // Try loading from cache first (nearly instant)
  if (loadCachedIndex()) return true;

  FsFile idx;
  if (!Storage.openFileForRead("DICT", IDX_PATH, idx)) return false;
  const uint32_t fileSize = static_cast<uint32_t>(idx.fileSize());

  // Start from a clean slate; the index is not usable until the scan succeeds.
  sparseOffsets.clear();
  totalWords = 0;
  indexLoaded = false;

  uint32_t pos = 0;
  int lastReportedPercent = -1;
  while (pos < fileSize) {
    if (shouldCancel && (totalWords % 100 == 0) && shouldCancel()) {
      idx.close();
      sparseOffsets.clear();
      totalWords = 0;
      return false;
    }
    if (totalWords % SPARSE_INTERVAL == 0) {
      sparseOffsets.push_back(pos);
    }

    // Skip word (read until null terminator)
    int ch;
    do {
      ch = idx.read();
      if (ch < 0) {
        // Read error or unexpected end of file: treat the index as finished.
        pos = fileSize;
        break;
      }
      pos++;
    } while (ch != 0);
    if (pos >= fileSize) break;

    // Skip 8 bytes (4-byte offset + 4-byte size)
    uint8_t skip[8];
    if (idx.read(skip, 8) != 8) break;
    pos += 8;
    totalWords++;

    if (onProgress && fileSize > 0) {
      // 64-bit intermediate so pos * 90 cannot overflow on large index files.
      int percent = static_cast<int>(static_cast<uint64_t>(pos) * 90 / fileSize);
      if (percent > lastReportedPercent + 4) {
        lastReportedPercent = percent;
        onProgress(percent);
      }
    }
  }
  idx.close();

  if (totalWords == 0) {
    // No complete entry was found: drop any offset recorded for an entry that
    // never finished and report failure without marking the index as loaded.
    sparseOffsets.clear();
    return false;
  }

  indexLoaded = true;
  // Persist to cache so next boot is instant
  saveCachedIndex(fileSize);
  return true;
}
// Collects bytes from the current position of `file` until read() yields a
// zero byte (the terminator, which is consumed but not stored) or a negative
// value (error / end of file). Returns the bytes gathered so far; the result
// is empty when the very first read is a terminator or fails.
std::string Dictionary::readWord(FsFile& file) {
  std::string word;
  for (int ch = file.read(); ch > 0; ch = file.read()) {
    word += static_cast<char>(ch);
  }
  return word;
}
// Reads up to `size` bytes from the .dict file starting at byte `offset`.
//
// `offset` and `size` come from the .idx file and are treated as untrusted:
// an offset at or beyond the end of the file, or a failed seek, yields an
// empty string instead of reading from the wrong position. A size that runs
// past the end of the file is clamped before the buffer is allocated, so a
// corrupt entry cannot request an oversized allocation.
//
// Returns the bytes read (possibly fewer than requested), or an empty string
// when the file cannot be opened, the range is invalid, or the read fails.
std::string Dictionary::readDefinition(uint32_t offset, uint32_t size) {
  FsFile dict;
  if (!Storage.openFileForRead("DICT", DICT_PATH, dict)) return "";

  const uint64_t dictFileSize = static_cast<uint64_t>(dict.fileSize());
  if (size == 0 || offset >= dictFileSize || !dict.seekSet(offset)) {
    dict.close();
    return "";
  }

  // Never allocate more than the file can actually supply from this offset.
  const uint64_t available = dictFileSize - offset;
  if (size > available) size = static_cast<uint32_t>(available);

  std::string def(size, '\0');
  const int bytesRead = dict.read(reinterpret_cast<uint8_t*>(&def[0]), size);
  dict.close();

  if (bytesRead < 0) return "";
  if (static_cast<uint32_t>(bytesRead) < size) def.resize(bytesRead);
  return def;
}
// Looks `word` up in the .idx file and returns its definition, or an empty
// string when it is not found, the index cannot be opened, or shouldCancel
// (optional, polled on every step) returns true.
//
// Step 1: binary search over the sparse offset table, comparing the first key
// of each segment with StarDict's ordering (case-insensitive first, then
// case-sensitive tiebreaker), to find the last segment whose first key is not
// greater than `word`.
// Step 2: linear scan from the start of that segment. The match is
// case-insensitive, so e.g. "simple" matches "Simple", but an exact
// case-sensitive match is preferred when one exists.
//
// The scan is bounded by the number of entries remaining in the index, not by
// the segment length: a case variant that sorts after `word` (e.g. entry
// "apple" for the query "Apple") can be the first key of the following
// segment. The break conditions inside the loop end the scan as soon as the
// keys move past `word`, so at most a few entries beyond the segment are read.
std::string Dictionary::searchIndex(const std::string& word, const std::function<bool()>& shouldCancel) {
  if (sparseOffsets.empty()) return "";

  FsFile idx;
  if (!Storage.openFileForRead("DICT", IDX_PATH, idx)) return "";

  // Binary search the sparse offset table to find the right segment.
  int lo = 0, hi = static_cast<int>(sparseOffsets.size()) - 1;
  while (lo < hi) {
    if (shouldCancel && shouldCancel()) {
      idx.close();
      return "";
    }
    // Upper midpoint, so that `lo = mid` always makes progress.
    int mid = lo + (hi - lo + 1) / 2;
    idx.seekSet(sparseOffsets[mid]);
    std::string key = readWord(idx);
    if (stardictCmp(key.c_str(), word.c_str()) <= 0) {
      lo = mid;
    } else {
      hi = mid - 1;
    }
  }

  // Linear scan starting at sparseOffsets[lo], limited to the entries that
  // remain in the index from that point on.
  idx.seekSet(sparseOffsets[lo]);
  const uint32_t firstEntry = static_cast<uint32_t>(lo) * SPARSE_INTERVAL;
  const uint32_t maxEntries = firstEntry < totalWords ? totalWords - firstEntry : 0;

  // Scan entries, preferring an exact case-sensitive match over a case-insensitive one.
  // In stardict order, all case variants of a word are adjacent (e.g. "Professor" then "professor"),
  // and they may have different definitions. We want the lowercase entry when the user searched
  // for a lowercase word, falling back to any case variant.
  uint32_t bestOffset = 0, bestSize = 0;
  bool found = false;
  for (uint32_t i = 0; i < maxEntries; i++) {
    if (shouldCancel && shouldCancel()) {
      idx.close();
      return "";
    }
    std::string key = readWord(idx);
    if (key.empty()) break;

    // Read offset and size (4 bytes each, big-endian)
    uint8_t buf[8];
    if (idx.read(buf, 8) != 8) break;
    uint32_t dictOffset = (static_cast<uint32_t>(buf[0]) << 24) | (static_cast<uint32_t>(buf[1]) << 16) |
                          (static_cast<uint32_t>(buf[2]) << 8) | static_cast<uint32_t>(buf[3]);
    uint32_t dictSize = (static_cast<uint32_t>(buf[4]) << 24) | (static_cast<uint32_t>(buf[5]) << 16) |
                        (static_cast<uint32_t>(buf[6]) << 8) | static_cast<uint32_t>(buf[7]);

    if (asciiCaseCmp(key.c_str(), word.c_str()) == 0) {
      // Case-insensitive match - remember the first one as fallback
      if (!found) {
        bestOffset = dictOffset;
        bestSize = dictSize;
        found = true;
      }
      // Exact case-sensitive match - use immediately
      if (key == word) {
        idx.close();
        return readDefinition(dictOffset, dictSize);
      }
    } else if (found) {
      // We've moved past all case variants of this word - stop
      break;
    } else if (stardictCmp(key.c_str(), word.c_str()) > 0) {
      // Past the target in StarDict sort order - stop scanning
      break;
    }
  }
  idx.close();
  return found ? readDefinition(bestOffset, bestSize) : "";
}
// Public entry point: returns the definition stored for `word`, or an empty
// string when the index cannot be loaded or the word is not found.
//
// The index is loaded on first use (onProgress / shouldCancel are forwarded to
// loadIndex). Once a search has run, onProgress - when provided - receives a
// final 100.
std::string Dictionary::lookup(const std::string& word, const std::function<void(int percent)>& onProgress,
                               const std::function<bool()>& shouldCancel) {
  // Short-circuit: loadIndex() only runs while no index is in memory.
  const bool ready = indexLoaded || loadIndex(onProgress, shouldCancel);
  if (!ready) return "";

  // searchIndex uses StarDict sort order + case-insensitive match,
  // so a single pass handles all casing variants.
  std::string definition = searchIndex(word, shouldCancel);
  if (onProgress) onProgress(100);
  return definition;
}