329 lines
10 KiB
C++
329 lines
10 KiB
C++
|
|
#include "Dictionary.h"
|
|||
|
|
|
|||
|
|
#include <HalStorage.h>
|
|||
|
|
|
|||
|
|
#include <algorithm>
|
|||
|
|
#include <cctype>
|
|||
|
|
#include <cstring>
|
|||
|
|
|
|||
|
|
namespace {

// Locations of the StarDict index, the definition data, and the derived cache file.
constexpr const char* IDX_PATH = "/.dictionary/dictionary.idx";
constexpr const char* DICT_PATH = "/.dictionary/dictionary.dict";
constexpr const char* CACHE_PATH = "/.dictionary/dictionary.cache";

// First word of the cache file; the hex digits spell "DICX" in ASCII.
constexpr uint32_t CACHE_MAGIC = 0x44494358;

// Maps ASCII 'A'..'Z' to 'a'..'z'; every other byte value is returned unchanged.
inline unsigned char foldAsciiUpper(unsigned char c) {
  return (c >= 'A' && c <= 'Z') ? static_cast<unsigned char>(c + 32) : c;
}

// Equivalent of g_ascii_strcasecmp: byte-wise comparison in which only ASCII
// letters are case-folded. Bytes are compared as unsigned values.
// Returns <0, 0 or >0 like strcmp.
int asciiCaseCmp(const char* s1, const char* s2) {
  const auto* lhs = reinterpret_cast<const unsigned char*>(s1);
  const auto* rhs = reinterpret_cast<const unsigned char*>(s2);

  for (; *lhs != 0 && *rhs != 0; ++lhs, ++rhs) {
    const int a = foldAsciiUpper(*lhs);
    const int b = foldAsciiUpper(*rhs);
    if (a != b) return a - b;
  }

  // At least one string has ended; the shorter one orders first.
  return static_cast<int>(*lhs) - static_cast<int>(*rhs);
}

// Ordering used for StarDict .idx entries (stardict_strcmp): the ASCII
// case-insensitive comparison decides first, and a plain strcmp breaks ties
// between strings that differ only in letter case.
int stardictCmp(const char* s1, const char* s2) {
  const int folded = asciiCaseCmp(s1, s2);
  return folded != 0 ? folded : std::strcmp(s1, s2);
}

}  // namespace
|
|||
|
|
|
|||
|
|
// Shared state behind the static Dictionary API.

// Byte offsets into the .idx file, one for every SPARSE_INTERVAL-th entry.
std::vector<uint32_t> Dictionary::sparseOffsets{};

// Number of entries counted in the .idx file; 0 while no index is loaded.
uint32_t Dictionary::totalWords{0};

// Set once the sparse table has been loaded from the cache or built by a scan.
bool Dictionary::indexLoaded{false};
|
|||
|
|
|
|||
|
|
// Reports whether the StarDict index file (IDX_PATH) is present on storage.
bool Dictionary::exists() {
  const bool idxPresent = Storage.exists(IDX_PATH);
  return idxPresent;
}
|
|||
|
|
|
|||
|
|
// Reports whether the sparse-index cache file (CACHE_PATH) is present on storage.
bool Dictionary::cacheExists() {
  const bool cachePresent = Storage.exists(CACHE_PATH);
  return cachePresent;
}
|
|||
|
|
|
|||
|
|
// Removes the cache file and forgets the in-memory sparse index.
void Dictionary::deleteCache() {
  Storage.remove(CACHE_PATH);

  // With indexLoaded cleared, the next lookup() goes through loadIndex()
  // again and, finding no cache, rescans the .idx file.
  indexLoaded = false;
  totalWords = 0;
  sparseOffsets.clear();
}
|
|||
|
|
|
|||
|
|
// Normalizes a token for lookup: trims leading and trailing bytes that are not
// alphanumeric (per std::isalnum) and lowercases what remains (per std::tolower).
// Interior punctuation is kept. Returns an empty string when the input holds
// no alphanumeric byte at all.
std::string Dictionary::cleanWord(const std::string& word) {
  const auto isWordByte = [](char c) { return std::isalnum(static_cast<unsigned char>(c)) != 0; };

  // Position of the first alphanumeric byte, or end() if there is none.
  const auto first = std::find_if(word.begin(), word.end(), isWordByte);
  if (first == word.end()) return "";

  // One past the last alphanumeric byte (reverse search, converted back with base()).
  const auto last = std::find_if(word.rbegin(), word.rend(), isWordByte).base();

  std::string cleaned(first, last);
  for (char& ch : cleaned) {
    ch = static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
  }
  return cleaned;
}
|
|||
|
|
|
|||
|
|
// ---------------------------------------------------------------------------
|
|||
|
|
// Cache: persists the sparse offset table to SD card so subsequent boots skip
|
|||
|
|
// the full .idx scan. The cache is invalidated when the .idx file size changes.
|
|||
|
|
//
|
|||
|
|
// Format: [magic 4B][idxFileSize 4B][totalWords 4B][count 4B][offsets N×4B]
|
|||
|
|
// All values are stored in native byte order (little-endian on ESP32).
|
|||
|
|
// ---------------------------------------------------------------------------
|
|||
|
|
bool Dictionary::loadCachedIndex() {
|
|||
|
|
FsFile idx;
|
|||
|
|
if (!Storage.openFileForRead("DICT", IDX_PATH, idx)) return false;
|
|||
|
|
const uint32_t idxFileSize = static_cast<uint32_t>(idx.fileSize());
|
|||
|
|
idx.close();
|
|||
|
|
|
|||
|
|
FsFile cache;
|
|||
|
|
if (!Storage.openFileForRead("DICT", CACHE_PATH, cache)) return false;
|
|||
|
|
|
|||
|
|
// Read and validate header
|
|||
|
|
uint32_t header[4]; // magic, idxFileSize, totalWords, count
|
|||
|
|
if (cache.read(reinterpret_cast<uint8_t*>(header), 16) != 16) {
|
|||
|
|
cache.close();
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
if (header[0] != CACHE_MAGIC || header[1] != idxFileSize) {
|
|||
|
|
cache.close();
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
totalWords = header[2];
|
|||
|
|
const uint32_t count = header[3];
|
|||
|
|
|
|||
|
|
sparseOffsets.resize(count);
|
|||
|
|
const int bytesToRead = static_cast<int>(count * sizeof(uint32_t));
|
|||
|
|
if (cache.read(reinterpret_cast<uint8_t*>(sparseOffsets.data()), bytesToRead) != bytesToRead) {
|
|||
|
|
cache.close();
|
|||
|
|
sparseOffsets.clear();
|
|||
|
|
totalWords = 0;
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
cache.close();
|
|||
|
|
indexLoaded = true;
|
|||
|
|
return true;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
void Dictionary::saveCachedIndex(uint32_t idxFileSize) {
|
|||
|
|
FsFile cache;
|
|||
|
|
if (!Storage.openFileForWrite("DICT", CACHE_PATH, cache)) return;
|
|||
|
|
|
|||
|
|
const uint32_t count = static_cast<uint32_t>(sparseOffsets.size());
|
|||
|
|
uint32_t header[4] = {CACHE_MAGIC, idxFileSize, totalWords, count};
|
|||
|
|
|
|||
|
|
cache.write(reinterpret_cast<const uint8_t*>(header), 16);
|
|||
|
|
cache.write(reinterpret_cast<const uint8_t*>(sparseOffsets.data()), count * sizeof(uint32_t));
|
|||
|
|
cache.close();
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Scan the .idx file to build a sparse offset table for fast lookups.
|
|||
|
|
// Records the file offset of every SPARSE_INTERVAL-th entry.
|
|||
|
|
bool Dictionary::loadIndex(const std::function<void(int percent)>& onProgress,
|
|||
|
|
const std::function<bool()>& shouldCancel) {
|
|||
|
|
// Try loading from cache first (nearly instant)
|
|||
|
|
if (loadCachedIndex()) return true;
|
|||
|
|
|
|||
|
|
FsFile idx;
|
|||
|
|
if (!Storage.openFileForRead("DICT", IDX_PATH, idx)) return false;
|
|||
|
|
|
|||
|
|
const uint32_t fileSize = static_cast<uint32_t>(idx.fileSize());
|
|||
|
|
|
|||
|
|
sparseOffsets.clear();
|
|||
|
|
totalWords = 0;
|
|||
|
|
|
|||
|
|
uint32_t pos = 0;
|
|||
|
|
int lastReportedPercent = -1;
|
|||
|
|
|
|||
|
|
while (pos < fileSize) {
|
|||
|
|
if (shouldCancel && (totalWords % 100 == 0) && shouldCancel()) {
|
|||
|
|
idx.close();
|
|||
|
|
sparseOffsets.clear();
|
|||
|
|
totalWords = 0;
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
if (totalWords % SPARSE_INTERVAL == 0) {
|
|||
|
|
sparseOffsets.push_back(pos);
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Skip word (read until null terminator)
|
|||
|
|
int ch;
|
|||
|
|
do {
|
|||
|
|
ch = idx.read();
|
|||
|
|
if (ch < 0) {
|
|||
|
|
pos = fileSize;
|
|||
|
|
break;
|
|||
|
|
}
|
|||
|
|
pos++;
|
|||
|
|
} while (ch != 0);
|
|||
|
|
|
|||
|
|
if (pos >= fileSize) break;
|
|||
|
|
|
|||
|
|
// Skip 8 bytes (4-byte offset + 4-byte size)
|
|||
|
|
uint8_t skip[8];
|
|||
|
|
if (idx.read(skip, 8) != 8) break;
|
|||
|
|
pos += 8;
|
|||
|
|
|
|||
|
|
totalWords++;
|
|||
|
|
|
|||
|
|
if (onProgress && fileSize > 0) {
|
|||
|
|
int percent = static_cast<int>(static_cast<uint64_t>(pos) * 90 / fileSize);
|
|||
|
|
if (percent > lastReportedPercent + 4) {
|
|||
|
|
lastReportedPercent = percent;
|
|||
|
|
onProgress(percent);
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
idx.close();
|
|||
|
|
indexLoaded = true;
|
|||
|
|
|
|||
|
|
// Persist to cache so next boot is instant
|
|||
|
|
if (totalWords > 0) saveCachedIndex(fileSize);
|
|||
|
|
|
|||
|
|
return totalWords > 0;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Read a null-terminated word string from the current file position.
|
|||
|
|
std::string Dictionary::readWord(FsFile& file) {
|
|||
|
|
std::string word;
|
|||
|
|
while (true) {
|
|||
|
|
int ch = file.read();
|
|||
|
|
if (ch <= 0) break; // null terminator (0) or error (-1)
|
|||
|
|
word += static_cast<char>(ch);
|
|||
|
|
}
|
|||
|
|
return word;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Read a definition from the .dict file at the given offset and size.
|
|||
|
|
std::string Dictionary::readDefinition(uint32_t offset, uint32_t size) {
|
|||
|
|
FsFile dict;
|
|||
|
|
if (!Storage.openFileForRead("DICT", DICT_PATH, dict)) return "";
|
|||
|
|
|
|||
|
|
dict.seekSet(offset);
|
|||
|
|
|
|||
|
|
std::string def(size, '\0');
|
|||
|
|
int bytesRead = dict.read(reinterpret_cast<uint8_t*>(&def[0]), size);
|
|||
|
|
dict.close();
|
|||
|
|
|
|||
|
|
if (bytesRead < 0) return "";
|
|||
|
|
if (static_cast<uint32_t>(bytesRead) < size) def.resize(bytesRead);
|
|||
|
|
return def;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Binary search the sparse offset table, then linear scan within the matching segment.
|
|||
|
|
// Uses StarDict's sort order: case-insensitive first, then case-sensitive tiebreaker.
|
|||
|
|
// The exact match is case-insensitive so e.g. "simple" matches "Simple".
|
|||
|
|
std::string Dictionary::searchIndex(const std::string& word, const std::function<bool()>& shouldCancel) {
|
|||
|
|
if (sparseOffsets.empty()) return "";
|
|||
|
|
|
|||
|
|
FsFile idx;
|
|||
|
|
if (!Storage.openFileForRead("DICT", IDX_PATH, idx)) return "";
|
|||
|
|
|
|||
|
|
// Binary search the sparse offset table to find the right segment.
|
|||
|
|
int lo = 0, hi = static_cast<int>(sparseOffsets.size()) - 1;
|
|||
|
|
|
|||
|
|
while (lo < hi) {
|
|||
|
|
if (shouldCancel && shouldCancel()) {
|
|||
|
|
idx.close();
|
|||
|
|
return "";
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
int mid = lo + (hi - lo + 1) / 2;
|
|||
|
|
idx.seekSet(sparseOffsets[mid]);
|
|||
|
|
std::string key = readWord(idx);
|
|||
|
|
|
|||
|
|
if (stardictCmp(key.c_str(), word.c_str()) <= 0) {
|
|||
|
|
lo = mid;
|
|||
|
|
} else {
|
|||
|
|
hi = mid - 1;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Linear scan within the segment starting at sparseOffsets[lo].
|
|||
|
|
idx.seekSet(sparseOffsets[lo]);
|
|||
|
|
|
|||
|
|
int maxEntries = SPARSE_INTERVAL;
|
|||
|
|
if (lo == static_cast<int>(sparseOffsets.size()) - 1) {
|
|||
|
|
maxEntries = static_cast<int>(totalWords - static_cast<uint32_t>(lo) * SPARSE_INTERVAL);
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Scan entries, preferring an exact case-sensitive match over a case-insensitive one.
|
|||
|
|
// In stardict order, all case variants of a word are adjacent (e.g. "Professor" then "professor"),
|
|||
|
|
// and they may have different definitions. We want the lowercase entry when the user searched
|
|||
|
|
// for a lowercase word, falling back to any case variant.
|
|||
|
|
uint32_t bestOffset = 0, bestSize = 0;
|
|||
|
|
bool found = false;
|
|||
|
|
|
|||
|
|
for (int i = 0; i < maxEntries; i++) {
|
|||
|
|
if (shouldCancel && shouldCancel()) {
|
|||
|
|
idx.close();
|
|||
|
|
return "";
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
std::string key = readWord(idx);
|
|||
|
|
if (key.empty()) break;
|
|||
|
|
|
|||
|
|
// Read offset and size (4 bytes each, big-endian)
|
|||
|
|
uint8_t buf[8];
|
|||
|
|
if (idx.read(buf, 8) != 8) break;
|
|||
|
|
|
|||
|
|
uint32_t dictOffset = (static_cast<uint32_t>(buf[0]) << 24) | (static_cast<uint32_t>(buf[1]) << 16) |
|
|||
|
|
(static_cast<uint32_t>(buf[2]) << 8) | static_cast<uint32_t>(buf[3]);
|
|||
|
|
uint32_t dictSize = (static_cast<uint32_t>(buf[4]) << 24) | (static_cast<uint32_t>(buf[5]) << 16) |
|
|||
|
|
(static_cast<uint32_t>(buf[6]) << 8) | static_cast<uint32_t>(buf[7]);
|
|||
|
|
|
|||
|
|
if (asciiCaseCmp(key.c_str(), word.c_str()) == 0) {
|
|||
|
|
// Case-insensitive match — remember the first one as fallback
|
|||
|
|
if (!found) {
|
|||
|
|
bestOffset = dictOffset;
|
|||
|
|
bestSize = dictSize;
|
|||
|
|
found = true;
|
|||
|
|
}
|
|||
|
|
// Exact case-sensitive match — use immediately
|
|||
|
|
if (key == word) {
|
|||
|
|
idx.close();
|
|||
|
|
return readDefinition(dictOffset, dictSize);
|
|||
|
|
}
|
|||
|
|
} else if (found) {
|
|||
|
|
// We've moved past all case variants of this word — stop
|
|||
|
|
break;
|
|||
|
|
} else if (stardictCmp(key.c_str(), word.c_str()) > 0) {
|
|||
|
|
// Past the target in StarDict sort order — stop scanning
|
|||
|
|
break;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
idx.close();
|
|||
|
|
return found ? readDefinition(bestOffset, bestSize) : "";
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
std::string Dictionary::lookup(const std::string& word, const std::function<void(int percent)>& onProgress,
|
|||
|
|
const std::function<bool()>& shouldCancel) {
|
|||
|
|
if (!indexLoaded) {
|
|||
|
|
if (!loadIndex(onProgress, shouldCancel)) return "";
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// searchIndex uses StarDict sort order + case-insensitive match,
|
|||
|
|
// so a single pass handles all casing variants.
|
|||
|
|
std::string result = searchIndex(word, shouldCancel);
|
|||
|
|
if (onProgress) onProgress(100);
|
|||
|
|
return result;
|
|||
|
|
}
|