feat: Add dictionary word lookup feature with cached index
Implements StarDict-based dictionary lookup from the reader menu, adapted from upstream PR #857 with /.dictionary/ folder path, std::vector compatibility (PR #802), HTML definition rendering, orientation-aware button hints, side button hints with CCW text rotation, sparse index caching to SD card, pronunciation line filtering, and reorganized reader menu with bookmark stubs. Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
328
src/util/Dictionary.cpp
Normal file
328
src/util/Dictionary.cpp
Normal file
@@ -0,0 +1,328 @@
|
||||
#include "Dictionary.h"
|
||||
|
||||
#include <HalStorage.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cctype>
|
||||
#include <cstring>
|
||||
|
||||
namespace {

// Locations on the storage card of the StarDict index, the StarDict definition
// data, and the sparse-offset cache derived from the index.
constexpr const char* IDX_PATH = "/.dictionary/dictionary.idx";
constexpr const char* DICT_PATH = "/.dictionary/dictionary.dict";
constexpr const char* CACHE_PATH = "/.dictionary/dictionary.cache";

// First header word of the cache file; the hex digits spell "DICX".
constexpr uint32_t CACHE_MAGIC = 0x44494358;

// Maps ASCII 'A'..'Z' onto 'a'..'z'; every other byte value is returned as-is.
inline int foldAsciiUpper(unsigned char c) {
  return (c >= 'A' && c <= 'Z') ? static_cast<int>(c) + ('a' - 'A') : static_cast<int>(c);
}

// Equivalent of g_ascii_strcasecmp: compares two C strings byte by byte,
// ignoring case for ASCII letters only. Returns a negative value, zero, or a
// positive value when s1 sorts before, equal to, or after s2.
int asciiCaseCmp(const char* s1, const char* s2) {
  const auto* lhs = reinterpret_cast<const unsigned char*>(s1);
  const auto* rhs = reinterpret_cast<const unsigned char*>(s2);

  for (; *lhs != 0 && *rhs != 0; ++lhs, ++rhs) {
    const int a = foldAsciiUpper(*lhs);
    const int b = foldAsciiUpper(*rhs);
    if (a != b) return a - b;
  }

  // At least one string has ended: the shorter one sorts first.
  return static_cast<int>(*lhs) - static_cast<int>(*rhs);
}

// Ordering used for StarDict .idx entries (stardict_strcmp): the ASCII
// case-insensitive comparison decides first, and only when that reports
// equality does a plain byte-wise strcmp break the tie.
int stardictCmp(const char* s1, const char* s2) {
  const int caseInsensitive = asciiCaseCmp(s1, s2);
  return caseInsensitive != 0 ? caseInsensitive : std::strcmp(s1, s2);
}

}  // namespace
|
||||
|
||||
// Byte offsets into the .idx file, one per SPARSE_INTERVAL entries; filled by
// loadIndex() or loadCachedIndex() and binary-searched by searchIndex().
std::vector<uint32_t> Dictionary::sparseOffsets;
|
||||
// Number of index entries counted while scanning the .idx file (or restored
// from the cache header); 0 while no index is loaded.
uint32_t Dictionary::totalWords = 0;
|
||||
// Set once the index has been loaded from the cache or a scan has run to
// completion; lookup() skips loadIndex() while this is true.
bool Dictionary::indexLoaded = false;
|
||||
|
||||
// Returns true when the StarDict index file is present at IDX_PATH.
bool Dictionary::exists() { return Storage.exists(IDX_PATH); }
|
||||
|
||||
// Returns true when a sparse-index cache file is present at CACHE_PATH.
bool Dictionary::cacheExists() { return Storage.exists(CACHE_PATH); }
|
||||
|
||||
// Deletes the cache file and drops the in-memory sparse index, so the next
// lookup() has to rebuild the index from the .idx file.
void Dictionary::deleteCache() {
  // Forget everything derived from the index.
  indexLoaded = false;
  totalWords = 0;
  sparseOffsets.clear();

  // Remove the persisted copy as well.
  Storage.remove(CACHE_PATH);
}
|
||||
|
||||
// Normalises a word picked from the text for lookup: strips leading and
// trailing characters that are not alphanumeric, then lowercases the rest.
// Returns an empty string when the input holds no alphanumeric character.
//
// Classification is byte-wise through std::isalnum / std::tolower, so in the
// default "C" locale only ASCII letters and digits count as alphanumeric.
std::string Dictionary::cleanWord(const std::string& word) {
  const auto isWordChar = [](char c) { return std::isalnum(static_cast<unsigned char>(c)) != 0; };

  // First alphanumeric character; none at all means nothing to look up.
  const auto first = std::find_if(word.begin(), word.end(), isWordChar);
  if (first == word.end()) return "";

  // One past the last alphanumeric character.
  const auto last = std::find_if(word.rbegin(), word.rend(), isWordChar).base();

  std::string cleaned(first, last);
  for (char& ch : cleaned) {
    ch = static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
  }
  return cleaned;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Cache: persists the sparse offset table to SD card so subsequent boots skip
|
||||
// the full .idx scan. The cache is invalidated when the .idx file size changes.
|
||||
//
|
||||
// Format: [magic 4B][idxFileSize 4B][totalWords 4B][count 4B][offsets N×4B]
|
||||
// All values are stored in native byte order (little-endian on ESP32).
|
||||
// ---------------------------------------------------------------------------
|
||||
bool Dictionary::loadCachedIndex() {
|
||||
FsFile idx;
|
||||
if (!Storage.openFileForRead("DICT", IDX_PATH, idx)) return false;
|
||||
const uint32_t idxFileSize = static_cast<uint32_t>(idx.fileSize());
|
||||
idx.close();
|
||||
|
||||
FsFile cache;
|
||||
if (!Storage.openFileForRead("DICT", CACHE_PATH, cache)) return false;
|
||||
|
||||
// Read and validate header
|
||||
uint32_t header[4]; // magic, idxFileSize, totalWords, count
|
||||
if (cache.read(reinterpret_cast<uint8_t*>(header), 16) != 16) {
|
||||
cache.close();
|
||||
return false;
|
||||
}
|
||||
|
||||
if (header[0] != CACHE_MAGIC || header[1] != idxFileSize) {
|
||||
cache.close();
|
||||
return false;
|
||||
}
|
||||
|
||||
totalWords = header[2];
|
||||
const uint32_t count = header[3];
|
||||
|
||||
sparseOffsets.resize(count);
|
||||
const int bytesToRead = static_cast<int>(count * sizeof(uint32_t));
|
||||
if (cache.read(reinterpret_cast<uint8_t*>(sparseOffsets.data()), bytesToRead) != bytesToRead) {
|
||||
cache.close();
|
||||
sparseOffsets.clear();
|
||||
totalWords = 0;
|
||||
return false;
|
||||
}
|
||||
|
||||
cache.close();
|
||||
indexLoaded = true;
|
||||
return true;
|
||||
}
|
||||
|
||||
void Dictionary::saveCachedIndex(uint32_t idxFileSize) {
|
||||
FsFile cache;
|
||||
if (!Storage.openFileForWrite("DICT", CACHE_PATH, cache)) return;
|
||||
|
||||
const uint32_t count = static_cast<uint32_t>(sparseOffsets.size());
|
||||
uint32_t header[4] = {CACHE_MAGIC, idxFileSize, totalWords, count};
|
||||
|
||||
cache.write(reinterpret_cast<const uint8_t*>(header), 16);
|
||||
cache.write(reinterpret_cast<const uint8_t*>(sparseOffsets.data()), count * sizeof(uint32_t));
|
||||
cache.close();
|
||||
}
|
||||
|
||||
// Scan the .idx file to build a sparse offset table for fast lookups.
|
||||
// Records the file offset of every SPARSE_INTERVAL-th entry.
|
||||
bool Dictionary::loadIndex(const std::function<void(int percent)>& onProgress,
|
||||
const std::function<bool()>& shouldCancel) {
|
||||
// Try loading from cache first (nearly instant)
|
||||
if (loadCachedIndex()) return true;
|
||||
|
||||
FsFile idx;
|
||||
if (!Storage.openFileForRead("DICT", IDX_PATH, idx)) return false;
|
||||
|
||||
const uint32_t fileSize = static_cast<uint32_t>(idx.fileSize());
|
||||
|
||||
sparseOffsets.clear();
|
||||
totalWords = 0;
|
||||
|
||||
uint32_t pos = 0;
|
||||
int lastReportedPercent = -1;
|
||||
|
||||
while (pos < fileSize) {
|
||||
if (shouldCancel && (totalWords % 100 == 0) && shouldCancel()) {
|
||||
idx.close();
|
||||
sparseOffsets.clear();
|
||||
totalWords = 0;
|
||||
return false;
|
||||
}
|
||||
|
||||
if (totalWords % SPARSE_INTERVAL == 0) {
|
||||
sparseOffsets.push_back(pos);
|
||||
}
|
||||
|
||||
// Skip word (read until null terminator)
|
||||
int ch;
|
||||
do {
|
||||
ch = idx.read();
|
||||
if (ch < 0) {
|
||||
pos = fileSize;
|
||||
break;
|
||||
}
|
||||
pos++;
|
||||
} while (ch != 0);
|
||||
|
||||
if (pos >= fileSize) break;
|
||||
|
||||
// Skip 8 bytes (4-byte offset + 4-byte size)
|
||||
uint8_t skip[8];
|
||||
if (idx.read(skip, 8) != 8) break;
|
||||
pos += 8;
|
||||
|
||||
totalWords++;
|
||||
|
||||
if (onProgress && fileSize > 0) {
|
||||
int percent = static_cast<int>(static_cast<uint64_t>(pos) * 90 / fileSize);
|
||||
if (percent > lastReportedPercent + 4) {
|
||||
lastReportedPercent = percent;
|
||||
onProgress(percent);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
idx.close();
|
||||
indexLoaded = true;
|
||||
|
||||
// Persist to cache so next boot is instant
|
||||
if (totalWords > 0) saveCachedIndex(fileSize);
|
||||
|
||||
return totalWords > 0;
|
||||
}
|
||||
|
||||
// Read a null-terminated word string from the current file position.
|
||||
std::string Dictionary::readWord(FsFile& file) {
|
||||
std::string word;
|
||||
while (true) {
|
||||
int ch = file.read();
|
||||
if (ch <= 0) break; // null terminator (0) or error (-1)
|
||||
word += static_cast<char>(ch);
|
||||
}
|
||||
return word;
|
||||
}
|
||||
|
||||
// Read a definition from the .dict file at the given offset and size.
|
||||
std::string Dictionary::readDefinition(uint32_t offset, uint32_t size) {
|
||||
FsFile dict;
|
||||
if (!Storage.openFileForRead("DICT", DICT_PATH, dict)) return "";
|
||||
|
||||
dict.seekSet(offset);
|
||||
|
||||
std::string def(size, '\0');
|
||||
int bytesRead = dict.read(reinterpret_cast<uint8_t*>(&def[0]), size);
|
||||
dict.close();
|
||||
|
||||
if (bytesRead < 0) return "";
|
||||
if (static_cast<uint32_t>(bytesRead) < size) def.resize(bytesRead);
|
||||
return def;
|
||||
}
|
||||
|
||||
// Binary search the sparse offset table, then linear scan within the matching segment.
|
||||
// Uses StarDict's sort order: case-insensitive first, then case-sensitive tiebreaker.
|
||||
// The exact match is case-insensitive so e.g. "simple" matches "Simple".
|
||||
std::string Dictionary::searchIndex(const std::string& word, const std::function<bool()>& shouldCancel) {
|
||||
if (sparseOffsets.empty()) return "";
|
||||
|
||||
FsFile idx;
|
||||
if (!Storage.openFileForRead("DICT", IDX_PATH, idx)) return "";
|
||||
|
||||
// Binary search the sparse offset table to find the right segment.
|
||||
int lo = 0, hi = static_cast<int>(sparseOffsets.size()) - 1;
|
||||
|
||||
while (lo < hi) {
|
||||
if (shouldCancel && shouldCancel()) {
|
||||
idx.close();
|
||||
return "";
|
||||
}
|
||||
|
||||
int mid = lo + (hi - lo + 1) / 2;
|
||||
idx.seekSet(sparseOffsets[mid]);
|
||||
std::string key = readWord(idx);
|
||||
|
||||
if (stardictCmp(key.c_str(), word.c_str()) <= 0) {
|
||||
lo = mid;
|
||||
} else {
|
||||
hi = mid - 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Linear scan within the segment starting at sparseOffsets[lo].
|
||||
idx.seekSet(sparseOffsets[lo]);
|
||||
|
||||
int maxEntries = SPARSE_INTERVAL;
|
||||
if (lo == static_cast<int>(sparseOffsets.size()) - 1) {
|
||||
maxEntries = static_cast<int>(totalWords - static_cast<uint32_t>(lo) * SPARSE_INTERVAL);
|
||||
}
|
||||
|
||||
// Scan entries, preferring an exact case-sensitive match over a case-insensitive one.
|
||||
// In stardict order, all case variants of a word are adjacent (e.g. "Professor" then "professor"),
|
||||
// and they may have different definitions. We want the lowercase entry when the user searched
|
||||
// for a lowercase word, falling back to any case variant.
|
||||
uint32_t bestOffset = 0, bestSize = 0;
|
||||
bool found = false;
|
||||
|
||||
for (int i = 0; i < maxEntries; i++) {
|
||||
if (shouldCancel && shouldCancel()) {
|
||||
idx.close();
|
||||
return "";
|
||||
}
|
||||
|
||||
std::string key = readWord(idx);
|
||||
if (key.empty()) break;
|
||||
|
||||
// Read offset and size (4 bytes each, big-endian)
|
||||
uint8_t buf[8];
|
||||
if (idx.read(buf, 8) != 8) break;
|
||||
|
||||
uint32_t dictOffset = (static_cast<uint32_t>(buf[0]) << 24) | (static_cast<uint32_t>(buf[1]) << 16) |
|
||||
(static_cast<uint32_t>(buf[2]) << 8) | static_cast<uint32_t>(buf[3]);
|
||||
uint32_t dictSize = (static_cast<uint32_t>(buf[4]) << 24) | (static_cast<uint32_t>(buf[5]) << 16) |
|
||||
(static_cast<uint32_t>(buf[6]) << 8) | static_cast<uint32_t>(buf[7]);
|
||||
|
||||
if (asciiCaseCmp(key.c_str(), word.c_str()) == 0) {
|
||||
// Case-insensitive match — remember the first one as fallback
|
||||
if (!found) {
|
||||
bestOffset = dictOffset;
|
||||
bestSize = dictSize;
|
||||
found = true;
|
||||
}
|
||||
// Exact case-sensitive match — use immediately
|
||||
if (key == word) {
|
||||
idx.close();
|
||||
return readDefinition(dictOffset, dictSize);
|
||||
}
|
||||
} else if (found) {
|
||||
// We've moved past all case variants of this word — stop
|
||||
break;
|
||||
} else if (stardictCmp(key.c_str(), word.c_str()) > 0) {
|
||||
// Past the target in StarDict sort order — stop scanning
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
idx.close();
|
||||
return found ? readDefinition(bestOffset, bestSize) : "";
|
||||
}
|
||||
|
||||
std::string Dictionary::lookup(const std::string& word, const std::function<void(int percent)>& onProgress,
|
||||
const std::function<bool()>& shouldCancel) {
|
||||
if (!indexLoaded) {
|
||||
if (!loadIndex(onProgress, shouldCancel)) return "";
|
||||
}
|
||||
|
||||
// searchIndex uses StarDict sort order + case-insensitive match,
|
||||
// so a single pass handles all casing variants.
|
||||
std::string result = searchIndex(word, shouldCancel);
|
||||
if (onProgress) onProgress(100);
|
||||
return result;
|
||||
}
|
||||
31
src/util/Dictionary.h
Normal file
31
src/util/Dictionary.h
Normal file
@@ -0,0 +1,31 @@
|
||||
#pragma once
|
||||
#include <cstdint>
|
||||
#include <functional>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
class FsFile;

// StarDict dictionary lookup backed by files on the storage card.
//
// All members are static: the class acts as a namespace around one shared
// sparse index over the .idx file, which is built on first lookup and
// persisted to a cache file.
class Dictionary {
public:
  // True when the dictionary index file is present.
  static bool exists();

  // True when a persisted sparse-index cache file is present.
  static bool cacheExists();

  // Removes the cache file and drops the in-memory index.
  static void deleteCache();

  // Strips non-alphanumeric characters from both ends of `word` and
  // lowercases the remainder.
  static std::string cleanWord(const std::string& word);

  // Returns the definition of `word`, or an empty string when it is not found.
  // onProgress receives a percentage while the index is being built and 100
  // once the search has run; shouldCancel is polled to abort early.
  static std::string lookup(const std::string& word, const std::function<void(int percent)>& onProgress = nullptr,
                            const std::function<bool()>& shouldCancel = nullptr);

private:
  // One sparse offset is kept for every SPARSE_INTERVAL index entries.
  static constexpr int SPARSE_INTERVAL = 512;

  // Shared index state.
  static std::vector<uint32_t> sparseOffsets;
  static uint32_t totalWords;
  static bool indexLoaded;

  // Index construction and persistence.
  static bool loadIndex(const std::function<void(int percent)>& onProgress, const std::function<bool()>& shouldCancel);
  static bool loadCachedIndex();
  static void saveCachedIndex(uint32_t idxFileSize);

  // Lookup helpers.
  static std::string searchIndex(const std::string& word, const std::function<bool()>& shouldCancel);
  static std::string readWord(FsFile& file);
  static std::string readDefinition(uint32_t offset, uint32_t size);
};
|
||||
88
src/util/LookupHistory.cpp
Normal file
88
src/util/LookupHistory.cpp
Normal file
@@ -0,0 +1,88 @@
|
||||
#include "LookupHistory.h"
|
||||
|
||||
#include <HalStorage.h>
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
// Builds the path of the history file: "lookups.txt" inside the given cache directory.
std::string LookupHistory::filePath(const std::string& cachePath) { return cachePath + "/lookups.txt"; }
|
||||
|
||||
bool LookupHistory::hasHistory(const std::string& cachePath) {
|
||||
FsFile f;
|
||||
if (!Storage.openFileForRead("LKH", filePath(cachePath), f)) {
|
||||
return false;
|
||||
}
|
||||
bool nonEmpty = f.available() > 0;
|
||||
f.close();
|
||||
return nonEmpty;
|
||||
}
|
||||
|
||||
std::vector<std::string> LookupHistory::load(const std::string& cachePath) {
|
||||
std::vector<std::string> words;
|
||||
FsFile f;
|
||||
if (!Storage.openFileForRead("LKH", filePath(cachePath), f)) {
|
||||
return words;
|
||||
}
|
||||
|
||||
std::string line;
|
||||
while (f.available() && static_cast<int>(words.size()) < MAX_ENTRIES) {
|
||||
char c;
|
||||
if (f.read(reinterpret_cast<uint8_t*>(&c), 1) != 1) break;
|
||||
if (c == '\n') {
|
||||
if (!line.empty()) {
|
||||
words.push_back(line);
|
||||
line.clear();
|
||||
}
|
||||
} else {
|
||||
line += c;
|
||||
}
|
||||
}
|
||||
if (!line.empty() && static_cast<int>(words.size()) < MAX_ENTRIES) {
|
||||
words.push_back(line);
|
||||
}
|
||||
f.close();
|
||||
return words;
|
||||
}
|
||||
|
||||
void LookupHistory::removeWord(const std::string& cachePath, const std::string& word) {
|
||||
if (word.empty()) return;
|
||||
|
||||
auto existing = load(cachePath);
|
||||
|
||||
FsFile f;
|
||||
if (!Storage.openFileForWrite("LKH", filePath(cachePath), f)) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (const auto& w : existing) {
|
||||
if (w != word) {
|
||||
f.write(reinterpret_cast<const uint8_t*>(w.c_str()), w.size());
|
||||
f.write(reinterpret_cast<const uint8_t*>("\n"), 1);
|
||||
}
|
||||
}
|
||||
f.close();
|
||||
}
|
||||
|
||||
void LookupHistory::addWord(const std::string& cachePath, const std::string& word) {
|
||||
if (word.empty()) return;
|
||||
|
||||
// Check if already present
|
||||
auto existing = load(cachePath);
|
||||
if (std::any_of(existing.begin(), existing.end(), [&word](const std::string& w) { return w == word; })) return;
|
||||
|
||||
// Cap at max entries
|
||||
if (static_cast<int>(existing.size()) >= MAX_ENTRIES) return;
|
||||
|
||||
FsFile f;
|
||||
if (!Storage.openFileForWrite("LKH", filePath(cachePath), f)) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Rewrite existing entries plus new one
|
||||
for (const auto& w : existing) {
|
||||
f.write(reinterpret_cast<const uint8_t*>(w.c_str()), w.size());
|
||||
f.write(reinterpret_cast<const uint8_t*>("\n"), 1);
|
||||
}
|
||||
f.write(reinterpret_cast<const uint8_t*>(word.c_str()), word.size());
|
||||
f.write(reinterpret_cast<const uint8_t*>("\n"), 1);
|
||||
f.close();
|
||||
}
|
||||
15
src/util/LookupHistory.h
Normal file
15
src/util/LookupHistory.h
Normal file
@@ -0,0 +1,15 @@
|
||||
#pragma once
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
// History of looked-up words, stored one word per line in "lookups.txt"
// inside a caller-supplied cache directory. All members are static.
class LookupHistory {
public:
  // True when the history file exists and is not empty.
  static bool hasHistory(const std::string& cachePath);

  // Returns the stored words in file order (at most MAX_ENTRIES).
  static std::vector<std::string> load(const std::string& cachePath);

  // Adds `word` unless it is empty, already stored, or the history is full.
  static void addWord(const std::string& cachePath, const std::string& word);

  // Removes `word` from the stored history.
  static void removeWord(const std::string& cachePath, const std::string& word);

private:
  // Upper bound on the number of words kept in the history.
  static constexpr int MAX_ENTRIES = 500;

  // Path of the history file inside cachePath.
  static std::string filePath(const std::string& cachePath);
};
|
||||
Reference in New Issue
Block a user