feat: Add dictionary word lookup feature with cached index

Implements StarDict-based dictionary lookup from the reader menu,
adapted from upstream PR #857 with /.dictionary/ folder path,
std::vector compatibility (PR #802), HTML definition rendering,
orientation-aware button hints, side button hints with CCW text
rotation, sparse index caching to SD card, pronunciation line
filtering, and reorganized reader menu with bookmark stubs.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
cottongin
2026-02-12 19:36:14 -05:00
parent 905f694576
commit 8d4bbf284d
17 changed files with 2195 additions and 9 deletions

328
src/util/Dictionary.cpp Normal file
View File

@@ -0,0 +1,328 @@
#include "Dictionary.h"
#include <HalStorage.h>
#include <algorithm>
#include <cctype>
#include <cstring>
namespace {

// Locations of the StarDict files on the card, plus the derived sparse-index cache.
constexpr const char* IDX_PATH = "/.dictionary/dictionary.idx";
constexpr const char* DICT_PATH = "/.dictionary/dictionary.dict";
constexpr const char* CACHE_PATH = "/.dictionary/dictionary.cache";
constexpr uint32_t CACHE_MAGIC = 0x44494358;  // "DICX"

// Lower-cases a single byte, touching only ASCII 'A'..'Z'; every other byte
// (including UTF-8 continuation bytes) is returned unchanged.
inline int foldAscii(unsigned char c) { return (c >= 'A' && c <= 'Z') ? c + 32 : c; }

// g_ascii_strcasecmp equivalent.
// Compares two NUL-terminated strings byte by byte after ASCII case folding and
// returns the difference of the first mismatching pair (negative, zero, positive).
int asciiCaseCmp(const char* s1, const char* s2) {
  const auto* a = reinterpret_cast<const unsigned char*>(s1);
  const auto* b = reinterpret_cast<const unsigned char*>(s2);
  for (; *a != 0 && *b != 0; ++a, ++b) {
    const int diff = foldAscii(*a) - foldAscii(*b);
    if (diff != 0) return diff;
  }
  // At least one string ended: the shorter one sorts first.
  return static_cast<int>(*a) - static_cast<int>(*b);
}

// StarDict index ordering (stardict_strcmp): case-insensitive comparison decides
// first; strings that only differ in case are then ordered by plain strcmp.
int stardictCmp(const char* s1, const char* s2) {
  const int folded = asciiCaseCmp(s1, s2);
  return folded != 0 ? folded : std::strcmp(s1, s2);
}

}  // namespace
// Definitions of Dictionary's static data members.
// File offsets into the .idx file, one per SPARSE_INTERVAL-th entry.
std::vector<uint32_t> Dictionary::sparseOffsets;
// Number of entries counted in the .idx file (0 until an index is loaded).
uint32_t Dictionary::totalWords = 0;
// Set once the sparse table has been read from the cache or built by a scan.
bool Dictionary::indexLoaded = false;
// Reports whether the dictionary index file (IDX_PATH) is present on storage.
bool Dictionary::exists() {
  return Storage.exists(IDX_PATH);
}
// Reports whether a sparse-index cache file (CACHE_PATH) is present on storage.
bool Dictionary::cacheExists() {
  return Storage.exists(CACHE_PATH);
}
// Removes the cache file from storage and drops the in-memory sparse index,
// which forces the next lookup to rebuild the table from the .idx file.
void Dictionary::deleteCache() {
  Storage.remove(CACHE_PATH);

  indexLoaded = false;
  totalWords = 0;
  sparseOffsets.clear();
}
// Normalises a word picked from the page before it is looked up:
// strips every non-alphanumeric character from both ends (interior characters
// are kept) and lower-cases what remains.
// Returns "" when the input contains no alphanumeric character at all.
std::string Dictionary::cleanWord(const std::string& word) {
  const auto isWordChar = [](char c) { return std::isalnum(static_cast<unsigned char>(c)) != 0; };

  const auto first = std::find_if(word.begin(), word.end(), isWordChar);
  if (first == word.end()) return "";

  // A word character exists, so the reverse search stops at or after `first`.
  const auto last = std::find_if(word.rbegin(), word.rend(), isWordChar).base();

  std::string cleaned(first, last);
  for (char& c : cleaned) {
    c = static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
  }
  return cleaned;
}
// ---------------------------------------------------------------------------
// Cache: persists the sparse offset table to SD card so subsequent boots skip
// the full .idx scan. The cache is invalidated when the .idx file size changes.
//
// Format: [magic 4B][idxFileSize 4B][totalWords 4B][count 4B][offsets N×4B]
// All values are stored in native byte order (little-endian on ESP32).
// ---------------------------------------------------------------------------
bool Dictionary::loadCachedIndex() {
FsFile idx;
if (!Storage.openFileForRead("DICT", IDX_PATH, idx)) return false;
const uint32_t idxFileSize = static_cast<uint32_t>(idx.fileSize());
idx.close();
FsFile cache;
if (!Storage.openFileForRead("DICT", CACHE_PATH, cache)) return false;
// Read and validate header
uint32_t header[4]; // magic, idxFileSize, totalWords, count
if (cache.read(reinterpret_cast<uint8_t*>(header), 16) != 16) {
cache.close();
return false;
}
if (header[0] != CACHE_MAGIC || header[1] != idxFileSize) {
cache.close();
return false;
}
totalWords = header[2];
const uint32_t count = header[3];
sparseOffsets.resize(count);
const int bytesToRead = static_cast<int>(count * sizeof(uint32_t));
if (cache.read(reinterpret_cast<uint8_t*>(sparseOffsets.data()), bytesToRead) != bytesToRead) {
cache.close();
sparseOffsets.clear();
totalWords = 0;
return false;
}
cache.close();
indexLoaded = true;
return true;
}
// Writes the current sparse offset table to the cache file.
//
// idxFileSize is the size of the .idx file the table was built from; it is
// stored in the header so a later load can detect a replaced dictionary.
// Layout: [magic][idxFileSize][totalWords][count][count x uint32 offsets],
// all in native byte order.
//
// Saving is best effort: if the file cannot be opened nothing happens, and if
// either write comes up short the partial file is removed so a truncated cache
// is not left behind on the card.
void Dictionary::saveCachedIndex(uint32_t idxFileSize) {
  FsFile cache;
  if (!Storage.openFileForWrite("DICT", CACHE_PATH, cache)) return;

  const uint32_t count = static_cast<uint32_t>(sparseOffsets.size());
  const uint32_t header[4] = {CACHE_MAGIC, idxFileSize, totalWords, count};
  const size_t payloadBytes = count * sizeof(uint32_t);

  bool ok = static_cast<size_t>(cache.write(reinterpret_cast<const uint8_t*>(header), sizeof(header))) ==
            sizeof(header);
  ok = ok && static_cast<size_t>(
                 cache.write(reinterpret_cast<const uint8_t*>(sparseOffsets.data()), payloadBytes)) == payloadBytes;
  cache.close();

  if (!ok) Storage.remove(CACHE_PATH);
}
// Makes the sparse offset table available, from the cache if possible and
// otherwise by scanning the whole .idx file.
//
// The scan walks every entry (NUL-terminated headword followed by 8 bytes of
// offset/size) and records the file offset of every SPARSE_INTERVAL-th one.
// An offset is recorded only after its entry has been read completely, so a
// truncated trailing record never ends up in the table.
//
// onProgress   - optional; receives a percentage in the range 0..90 each time
//                the scan has advanced by at least 5 points.
// shouldCancel - optional; polled every 100 entries. When it returns true the
//                partial table is discarded and false is returned.
//
// Returns true when an index with at least one word is available. After a
// completed scan the table is written to the cache if it contains any words.
bool Dictionary::loadIndex(const std::function<void(int percent)>& onProgress,
                           const std::function<bool()>& shouldCancel) {
  // Try loading from cache first (nearly instant)
  if (loadCachedIndex()) return true;

  FsFile idx;
  if (!Storage.openFileForRead("DICT", IDX_PATH, idx)) return false;
  const uint32_t fileSize = static_cast<uint32_t>(idx.fileSize());

  sparseOffsets.clear();
  totalWords = 0;
  uint32_t pos = 0;
  int lastReportedPercent = -1;

  while (pos < fileSize) {
    if (shouldCancel && (totalWords % 100 == 0) && shouldCancel()) {
      idx.close();
      sparseOffsets.clear();
      totalWords = 0;
      return false;
    }

    // Remember where this entry begins; it is only added to the sparse table
    // once the whole record has been read successfully.
    const uint32_t entryStart = pos;

    // Skip word (read until null terminator)
    int ch;
    do {
      ch = idx.read();
      if (ch < 0) {
        pos = fileSize;
        break;
      }
      pos++;
    } while (ch != 0);
    if (pos >= fileSize) break;

    // Skip 8 bytes (4-byte offset + 4-byte size)
    uint8_t skip[8];
    if (idx.read(skip, 8) != 8) break;
    pos += 8;

    if (totalWords % SPARSE_INTERVAL == 0) {
      sparseOffsets.push_back(entryStart);
    }
    totalWords++;

    if (onProgress && fileSize > 0) {
      // Scaled to 90 so the caller can reserve the remainder for the search itself.
      int percent = static_cast<int>(static_cast<uint64_t>(pos) * 90 / fileSize);
      if (percent > lastReportedPercent + 4) {
        lastReportedPercent = percent;
        onProgress(percent);
      }
    }
  }

  idx.close();
  indexLoaded = true;

  // Persist to cache so next boot is instant
  if (totalWords > 0) saveCachedIndex(fileSize);
  return totalWords > 0;
}
// Reads bytes from the file's current position up to (and consuming) the next
// NUL terminator and returns them as a string. A failed read also ends the
// word, so a truncated file yields whatever was collected so far.
std::string Dictionary::readWord(FsFile& file) {
  std::string text;
  // read() gives 0 for the terminator and a negative value on error; both stop the loop.
  for (int byte = file.read(); byte > 0; byte = file.read()) {
    text.push_back(static_cast<char>(byte));
  }
  return text;
}
// Reads a definition from the .dict file.
//
// offset - byte position of the definition inside the .dict file.
// size   - number of bytes to read, as stored in the .idx entry.
//
// Returns the bytes read. The result is shorter than `size` when the file ends
// early, and "" when the file cannot be opened, the offset lies at or beyond
// the end of the file, the seek fails, or the read reports an error.
//
// `offset` and `size` come straight from the index file, so they are checked
// against the real .dict size before any buffer is allocated: a damaged index
// must not cause an oversized allocation or a read from the wrong position.
std::string Dictionary::readDefinition(uint32_t offset, uint32_t size) {
  FsFile dict;
  if (!Storage.openFileForRead("DICT", DICT_PATH, dict)) return "";

  const uint64_t dictBytes = static_cast<uint64_t>(dict.fileSize());
  if (offset >= dictBytes || !dict.seekSet(offset)) {
    dict.close();
    return "";
  }

  // Never allocate more than what is actually left in the file.
  const uint64_t remaining = dictBytes - offset;
  if (size > remaining) size = static_cast<uint32_t>(remaining);

  std::string def(size, '\0');
  const int bytesRead = dict.read(reinterpret_cast<uint8_t*>(&def[0]), size);
  dict.close();

  if (bytesRead < 0) return "";
  if (static_cast<uint32_t>(bytesRead) < size) def.resize(bytesRead);
  return def;
}
// Finds `word` in the .idx file and returns the matching definition text.
//
// Binary search the sparse offset table, then linear scan within the matching segment.
// Uses StarDict's sort order: case-insensitive first, then case-sensitive tiebreaker.
// The exact match is case-insensitive so e.g. "simple" matches "Simple".
//
// word         - headword to look for.
// shouldCancel - optional; polled once per binary-search step and once per
//                scanned entry. When it returns true the search stops and ""
//                is returned.
//
// Returns the definition read via readDefinition(), or "" when the sparse table
// is empty, the .idx file cannot be opened, the search is cancelled, or no
// entry matches.
std::string Dictionary::searchIndex(const std::string& word, const std::function<bool()>& shouldCancel) {
if (sparseOffsets.empty()) return "";
FsFile idx;
if (!Storage.openFileForRead("DICT", IDX_PATH, idx)) return "";
// Binary search the sparse offset table to find the right segment.
// Invariant: the answer is the last sparse entry whose key sorts <= word
// (or segment 0 when every sparse key sorts after it).
int lo = 0, hi = static_cast<int>(sparseOffsets.size()) - 1;
while (lo < hi) {
if (shouldCancel && shouldCancel()) {
idx.close();
return "";
}
// Upper-biased midpoint: with "lo = mid" below, a lower-biased midpoint
// would never shrink a two-element range.
int mid = lo + (hi - lo + 1) / 2;
idx.seekSet(sparseOffsets[mid]);
std::string key = readWord(idx);
if (stardictCmp(key.c_str(), word.c_str()) <= 0) {
lo = mid;
} else {
hi = mid - 1;
}
}
// Linear scan within the segment starting at sparseOffsets[lo].
idx.seekSet(sparseOffsets[lo]);
// Every segment holds SPARSE_INTERVAL entries except the last one, which
// holds whatever remains of totalWords.
int maxEntries = SPARSE_INTERVAL;
if (lo == static_cast<int>(sparseOffsets.size()) - 1) {
maxEntries = static_cast<int>(totalWords - static_cast<uint32_t>(lo) * SPARSE_INTERVAL);
}
// Scan entries, preferring an exact case-sensitive match over a case-insensitive one.
// In stardict order, all case variants of a word are adjacent (e.g. "Professor" then "professor"),
// and they may have different definitions. We want the lowercase entry when the user searched
// for a lowercase word, falling back to any case variant.
uint32_t bestOffset = 0, bestSize = 0;
bool found = false;
for (int i = 0; i < maxEntries; i++) {
if (shouldCancel && shouldCancel()) {
idx.close();
return "";
}
std::string key = readWord(idx);
// An empty key means a read failure or an empty headword; stop scanning.
if (key.empty()) break;
// Read offset and size (4 bytes each, big-endian)
uint8_t buf[8];
if (idx.read(buf, 8) != 8) break;
uint32_t dictOffset = (static_cast<uint32_t>(buf[0]) << 24) | (static_cast<uint32_t>(buf[1]) << 16) |
(static_cast<uint32_t>(buf[2]) << 8) | static_cast<uint32_t>(buf[3]);
uint32_t dictSize = (static_cast<uint32_t>(buf[4]) << 24) | (static_cast<uint32_t>(buf[5]) << 16) |
(static_cast<uint32_t>(buf[6]) << 8) | static_cast<uint32_t>(buf[7]);
if (asciiCaseCmp(key.c_str(), word.c_str()) == 0) {
// Case-insensitive match — remember the first one as fallback
if (!found) {
bestOffset = dictOffset;
bestSize = dictSize;
found = true;
}
// Exact case-sensitive match — use immediately
if (key == word) {
idx.close();
return readDefinition(dictOffset, dictSize);
}
} else if (found) {
// We've moved past all case variants of this word — stop
break;
} else if (stardictCmp(key.c_str(), word.c_str()) > 0) {
// Past the target in StarDict sort order — stop scanning
break;
}
}
idx.close();
// No exact-case entry was seen: fall back to the first case variant, if any.
return found ? readDefinition(bestOffset, bestSize) : "";
}
// Public entry point: returns the definition for `word`, or "" if none is found.
//
// On first use the index is loaded (from cache or by scanning the .idx file);
// if that fails or is cancelled, "" is returned straight away.
// onProgress, when set, is forwarded to the index load and is finally called
// with 100 once the search has finished. shouldCancel, when set, is forwarded
// to both the index load and the search.
std::string Dictionary::lookup(const std::string& word, const std::function<void(int percent)>& onProgress,
                               const std::function<bool()>& shouldCancel) {
  const bool ready = indexLoaded || loadIndex(onProgress, shouldCancel);
  if (!ready) return "";

  // One search pass is enough: searchIndex follows StarDict ordering and
  // accepts any casing of the headword.
  std::string definition = searchIndex(word, shouldCancel);
  if (onProgress) onProgress(100);
  return definition;
}

31
src/util/Dictionary.h Normal file
View File

@@ -0,0 +1,31 @@
#pragma once
#include <cstdint>
#include <functional>
#include <string>
#include <vector>
class FsFile;
// StarDict dictionary lookup backed by files under /.dictionary/ on storage.
// All members are static; the class keeps one process-wide sparse index of the
// .idx file so a lookup only has to scan a small segment of it.
class Dictionary {
public:
// True when the dictionary .idx file is present on storage.
static bool exists();
// True when a saved sparse-index cache file is present on storage.
static bool cacheExists();
// Removes the cache file and resets the in-memory index so it is rebuilt on the next lookup.
static void deleteCache();
// Returns the definition for `word`, or "" when it is not found, the index cannot be
// loaded, or the operation is cancelled.
// onProgress (optional) receives percentages while the index is built and 100 when the search is done.
// shouldCancel (optional) is polled during index building and searching; returning true aborts.
static std::string lookup(const std::string& word, const std::function<void(int percent)>& onProgress = nullptr,
const std::function<bool()>& shouldCancel = nullptr);
// Strips non-alphanumeric characters from both ends of `word` and lower-cases the rest.
static std::string cleanWord(const std::string& word);
private:
// The file offset of every SPARSE_INTERVAL-th .idx entry is kept in sparseOffsets.
static constexpr int SPARSE_INTERVAL = 512;
// Offsets into the .idx file, one per SPARSE_INTERVAL entries.
static std::vector<uint32_t> sparseOffsets;
// Number of entries counted in the .idx file.
static uint32_t totalWords;
// Set once the sparse table has been loaded from cache or built by scanning.
static bool indexLoaded;
// Loads the sparse table from the cache, falling back to a full scan of the .idx file.
static bool loadIndex(const std::function<void(int percent)>& onProgress, const std::function<bool()>& shouldCancel);
// Restores the sparse table from the cache file; false when the cache is missing or does not match.
static bool loadCachedIndex();
// Writes the sparse table to the cache file, tagged with the .idx file size it was built from.
static void saveCachedIndex(uint32_t idxFileSize);
// Locates `word` in the .idx file and returns its definition, or "" when absent or cancelled.
static std::string searchIndex(const std::string& word, const std::function<bool()>& shouldCancel);
// Reads a NUL-terminated string from the file's current position.
static std::string readWord(FsFile& file);
// Reads up to `size` bytes at `offset` from the .dict file.
static std::string readDefinition(uint32_t offset, uint32_t size);
};

View File

@@ -0,0 +1,88 @@
#include "LookupHistory.h"
#include <HalStorage.h>
#include <algorithm>
// Builds the path of the history file that lives inside the given cache directory.
std::string LookupHistory::filePath(const std::string& cachePath) {
  std::string path = cachePath;
  path += "/lookups.txt";
  return path;
}
bool LookupHistory::hasHistory(const std::string& cachePath) {
FsFile f;
if (!Storage.openFileForRead("LKH", filePath(cachePath), f)) {
return false;
}
bool nonEmpty = f.available() > 0;
f.close();
return nonEmpty;
}
std::vector<std::string> LookupHistory::load(const std::string& cachePath) {
std::vector<std::string> words;
FsFile f;
if (!Storage.openFileForRead("LKH", filePath(cachePath), f)) {
return words;
}
std::string line;
while (f.available() && static_cast<int>(words.size()) < MAX_ENTRIES) {
char c;
if (f.read(reinterpret_cast<uint8_t*>(&c), 1) != 1) break;
if (c == '\n') {
if (!line.empty()) {
words.push_back(line);
line.clear();
}
} else {
line += c;
}
}
if (!line.empty() && static_cast<int>(words.size()) < MAX_ENTRIES) {
words.push_back(line);
}
f.close();
return words;
}
// Removes `word` from the history file under cachePath.
//
// The file is rewritten without the matching entries only when the word is
// actually present. An empty word or a word that is not in the history leaves
// the file untouched, which avoids a needless rewrite (and avoids creating an
// empty history file where none existed).
void LookupHistory::removeWord(const std::string& cachePath, const std::string& word) {
  if (word.empty()) return;

  const auto existing = load(cachePath);
  if (std::find(existing.begin(), existing.end(), word) == existing.end()) return;

  FsFile f;
  if (!Storage.openFileForWrite("LKH", filePath(cachePath), f)) {
    return;
  }

  // Write back every entry except the one being removed, one per line.
  for (const auto& w : existing) {
    if (w != word) {
      f.write(reinterpret_cast<const uint8_t*>(w.c_str()), w.size());
      f.write(reinterpret_cast<const uint8_t*>("\n"), 1);
    }
  }
  f.close();
}
void LookupHistory::addWord(const std::string& cachePath, const std::string& word) {
if (word.empty()) return;
// Check if already present
auto existing = load(cachePath);
if (std::any_of(existing.begin(), existing.end(), [&word](const std::string& w) { return w == word; })) return;
// Cap at max entries
if (static_cast<int>(existing.size()) >= MAX_ENTRIES) return;
FsFile f;
if (!Storage.openFileForWrite("LKH", filePath(cachePath), f)) {
return;
}
// Rewrite existing entries plus new one
for (const auto& w : existing) {
f.write(reinterpret_cast<const uint8_t*>(w.c_str()), w.size());
f.write(reinterpret_cast<const uint8_t*>("\n"), 1);
}
f.write(reinterpret_cast<const uint8_t*>(word.c_str()), word.size());
f.write(reinterpret_cast<const uint8_t*>("\n"), 1);
f.close();
}

15
src/util/LookupHistory.h Normal file
View File

@@ -0,0 +1,15 @@
#pragma once
#include <string>
#include <vector>
// History of looked-up words, stored as a plain text file ("lookups.txt", one
// word per line) inside a caller-supplied cache directory. All members are static.
class LookupHistory {
public:
// Returns the stored words in file order (at most MAX_ENTRIES); empty when the file cannot be opened.
static std::vector<std::string> load(const std::string& cachePath);
// Adds `word` unless it is empty, already stored, or the history is full.
static void addWord(const std::string& cachePath, const std::string& word);
// Rewrites the history without `word`; an empty word is ignored.
static void removeWord(const std::string& cachePath, const std::string& word);
// True when the history file can be opened and is not empty.
static bool hasHistory(const std::string& cachePath);
private:
// Path of the history file inside cachePath.
static std::string filePath(const std::string& cachePath);
// Upper bound on the number of words kept in the history.
static constexpr int MAX_ENTRIES = 500;
};