feat: Add dictionary word lookup feature with cached index

Implements StarDict-based dictionary lookup from the reader menu, adapted from upstream PR #857 with /.dictionary/ folder path, std::vector compatibility (PR #802), HTML definition rendering, orientation-aware button hints, side button hints with CCW text rotation, sparse index caching to SD card, pronunciation line filtering, and reorganized reader menu with bookmark stubs. Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-12 19:36:14 -05:00
parent 905f694576
commit 8d4bbf284d
17 changed files with 2195 additions and 9 deletions
--- a/src/util/Dictionary.cpp
+++ b/src/util/Dictionary.cpp
@@ -0,0 +1,328 @@
+#include "Dictionary.h"
+
+#include <HalStorage.h>
+
+#include <algorithm>
+#include <cctype>
+#include <cstring>
+
+namespace {
+// StarDict word index scanned by loadIndex() and searched by searchIndex().
+constexpr const char* IDX_PATH = "/.dictionary/dictionary.idx";
+// Definition data; readDefinition() reads byte ranges out of this file.
+constexpr const char* DICT_PATH = "/.dictionary/dictionary.dict";
+// Sparse offset table written by saveCachedIndex() and reloaded by loadCachedIndex().
+constexpr const char* CACHE_PATH = "/.dictionary/dictionary.cache";
+// First header word of the cache file; loadCachedIndex() rejects the cache on a mismatch.
+constexpr uint32_t CACHE_MAGIC = 0x44494358;  // "DICX"
+
+// ASCII-only case-insensitive comparison (g_ascii_strcasecmp equivalent):
+// bytes 'A'..'Z' are folded to lowercase, every other byte is compared as-is.
+// Returns <0, 0 or >0 in the manner of strcmp.
+int asciiCaseCmp(const char* s1, const char* s2) {
+  const auto foldAscii = [](unsigned char c) -> int { return (c >= 'A' && c <= 'Z') ? c + ('a' - 'A') : c; };
+
+  const auto* lhs = reinterpret_cast<const unsigned char*>(s1);
+  const auto* rhs = reinterpret_cast<const unsigned char*>(s2);
+  for (; *lhs != 0 && *rhs != 0; ++lhs, ++rhs) {
+    const int diff = foldAscii(*lhs) - foldAscii(*rhs);
+    if (diff != 0) return diff;
+  }
+  // At least one string has ended, so one operand is the terminator; the raw bytes decide.
+  return static_cast<int>(*lhs) - static_cast<int>(*rhs);
+}
+
+// Ordering used for StarDict .idx entries (matches stardict_strcmp): compare while
+// ignoring ASCII case; only when that reports equality does byte-wise strcmp decide.
+int stardictCmp(const char* s1, const char* s2) {
+  const int caseInsensitive = asciiCaseCmp(s1, s2);
+  return caseInsensitive != 0 ? caseInsensitive : std::strcmp(s1, s2);
+}
+}  // namespace
+
+// In-memory index state shared by all Dictionary calls.
+// File offset (within the .idx file) of every SPARSE_INTERVAL-th entry.
+std::vector<uint32_t> Dictionary::sparseOffsets;
+// Number of complete entries counted in the .idx file.
+uint32_t Dictionary::totalWords = 0;
+// Set once loadIndex()/loadCachedIndex() has populated the state above; cleared by deleteCache().
+bool Dictionary::indexLoaded = false;
+
+// Reports whether the StarDict .idx file is present on storage.
+bool Dictionary::exists() {
+  const bool idxPresent = Storage.exists(IDX_PATH);
+  return idxPresent;
+}
+
+// Reports whether the sparse-index cache file is present on storage.
+bool Dictionary::cacheExists() {
+  const bool cachePresent = Storage.exists(CACHE_PATH);
+  return cachePresent;
+}
+
+// Removes the on-disk cache and forgets the in-memory index, so the next lookup()
+// has to rebuild the sparse table from the .idx file.
+void Dictionary::deleteCache() {
+  Storage.remove(CACHE_PATH);
+
+  indexLoaded = false;
+  totalWords = 0;
+  sparseOffsets.clear();
+}
+
+// Normalises a word for lookup: strips leading and trailing characters for which
+// std::isalnum is false, then lowercases what remains with std::tolower.
+// Characters inside the word (e.g. an apostrophe or hyphen) are kept.
+// Returns an empty string when no alphanumeric character is present.
+std::string Dictionary::cleanWord(const std::string& word) {
+  const auto isWordChar = [](char c) { return std::isalnum(static_cast<unsigned char>(c)) != 0; };
+
+  // First alphanumeric character from the front...
+  const auto first = std::find_if(word.begin(), word.end(), isWordChar);
+  if (first == word.end()) return "";
+
+  // ...and one past the last alphanumeric character, found by searching from the back.
+  const auto last = std::find_if(word.rbegin(), word.rend(), isWordChar).base();
+
+  std::string cleaned(first, last);
+  for (char& c : cleaned) {
+    c = static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
+  }
+  return cleaned;
+}
+
+// ---------------------------------------------------------------------------
+// Cache: persists the sparse offset table to SD card so subsequent boots skip
+// the full .idx scan.  The cache is invalidated when the .idx file size changes.
+//
+// Format: [magic 4B][idxFileSize 4B][totalWords 4B][count 4B][offsets N×4B]
+// All values are stored in native byte order (little-endian on ESP32).
+// ---------------------------------------------------------------------------
+
+// Loads the sparse offset table from the cache file.
+//
+// Returns true, and sets sparseOffsets / totalWords / indexLoaded, only when the
+// cache matches the current .idx file size and its header is self-consistent.
+// Returns false otherwise; the caller (loadIndex) then rebuilds from the .idx file.
+//
+// The header is validated before anything is allocated: the cache lives on
+// removable storage, so `count` must not be trusted to size a vector until it
+// has been checked against the real file length and against totalWords.
+bool Dictionary::loadCachedIndex() {
+  FsFile idx;
+  if (!Storage.openFileForRead("DICT", IDX_PATH, idx)) return false;
+  const uint32_t idxFileSize = static_cast<uint32_t>(idx.fileSize());
+  idx.close();
+
+  FsFile cache;
+  if (!Storage.openFileForRead("DICT", CACHE_PATH, cache)) return false;
+
+  constexpr int HEADER_BYTES = 16;
+  const uint64_t cacheFileSize = cache.fileSize();
+
+  // Read and validate header
+  uint32_t header[4];  // magic, idxFileSize, totalWords, count
+  if (cache.read(reinterpret_cast<uint8_t*>(header), HEADER_BYTES) != HEADER_BYTES) {
+    cache.close();
+    return false;
+  }
+
+  if (header[0] != CACHE_MAGIC || header[1] != idxFileSize) {
+    cache.close();
+    return false;
+  }
+
+  const uint32_t cachedWords = header[2];
+  const uint32_t count = header[3];
+  const uint64_t payloadBytes = static_cast<uint64_t>(count) * sizeof(uint32_t);
+  const uint64_t interval = SPARSE_INTERVAL;
+
+  // The offsets must actually be present in the file (bounds the allocation below).
+  const bool payloadPresent = cacheFileSize >= HEADER_BYTES + payloadBytes;
+  // saveCachedIndex() is only called with totalWords > 0, and one offset is kept per
+  // SPARSE_INTERVAL entries, so count and totalWords must agree with each other.
+  // This also keeps searchIndex()'s remaining-entry arithmetic from underflowing.
+  const bool countPlausible = count > 0 && cachedWords > 0 && (count - 1) * interval <= cachedWords &&
+                              cachedWords <= count * interval;
+  if (!payloadPresent || !countPlausible) {
+    cache.close();
+    return false;
+  }
+
+  sparseOffsets.resize(count);
+  const int bytesToRead = static_cast<int>(payloadBytes);
+  if (cache.read(reinterpret_cast<uint8_t*>(sparseOffsets.data()), bytesToRead) != bytesToRead) {
+    cache.close();
+    sparseOffsets.clear();
+    totalWords = 0;
+    return false;
+  }
+
+  cache.close();
+  totalWords = cachedWords;
+  indexLoaded = true;
+  return true;
+}
+
+// Persists the current sparse offset table to CACHE_PATH.
+//
+// Layout: four uint32 header words {CACHE_MAGIC, idxFileSize, totalWords, count}
+// followed by `count` uint32 offsets, all in native byte order.
+//
+// idxFileSize – size of the .idx file the table was built from; loadCachedIndex()
+//               compares it against the current file to detect a stale cache.
+//
+// Best effort: failures are not reported to the caller, but a short write removes
+// the partial file so a truncated cache is not left on the card.
+void Dictionary::saveCachedIndex(uint32_t idxFileSize) {
+  FsFile cache;
+  if (!Storage.openFileForWrite("DICT", CACHE_PATH, cache)) return;
+
+  const uint32_t count = static_cast<uint32_t>(sparseOffsets.size());
+  const uint32_t header[4] = {CACHE_MAGIC, idxFileSize, totalWords, count};
+  const size_t headerBytes = sizeof(header);
+  const size_t payloadBytes = count * sizeof(uint32_t);
+
+  const bool written =
+      cache.write(reinterpret_cast<const uint8_t*>(header), headerBytes) == headerBytes &&
+      cache.write(reinterpret_cast<const uint8_t*>(sparseOffsets.data()), payloadBytes) == payloadBytes;
+  cache.close();
+
+  if (!written) Storage.remove(CACHE_PATH);
+}
+
+// Builds the sparse offset table used by searchIndex().
+//
+// Tries loadCachedIndex() first.  Otherwise scans the whole .idx file, where each
+// entry is a null-terminated word followed by 8 bytes (4-byte offset + 4-byte size),
+// records the file offset of every SPARSE_INTERVAL-th entry, and finally persists
+// the table via saveCachedIndex().
+//
+// onProgress   – optional; called with a percentage in 0..90 while scanning,
+//                throttled to roughly every 5 percentage points.
+// shouldCancel – optional; polled every 100 entries; returning true aborts the scan
+//                and resets the in-memory state.
+// Returns true when the cache was loaded or at least one entry was indexed;
+// false when the file cannot be opened, the scan was cancelled, or no entry was found.
+bool Dictionary::loadIndex(const std::function<void(int percent)>& onProgress,
+                           const std::function<bool()>& shouldCancel) {
+  // Try loading from cache first (nearly instant)
+  if (loadCachedIndex()) return true;
+
+  FsFile idx;
+  if (!Storage.openFileForRead("DICT", IDX_PATH, idx)) return false;
+
+  const uint32_t fileSize = static_cast<uint32_t>(idx.fileSize());
+
+  sparseOffsets.clear();
+  totalWords = 0;
+
+  uint32_t pos = 0;
+  int lastReportedPercent = -1;
+
+  while (pos < fileSize) {
+    if (shouldCancel && (totalWords % 100 == 0) && shouldCancel()) {
+      idx.close();
+      sparseOffsets.clear();
+      totalWords = 0;
+      return false;
+    }
+
+    const uint32_t entryStart = pos;
+
+    // Skip word (read until null terminator)
+    int ch;
+    do {
+      ch = idx.read();
+      if (ch < 0) {
+        pos = fileSize;
+        break;
+      }
+      pos++;
+    } while (ch != 0);
+
+    if (pos >= fileSize) break;
+
+    // Skip 8 bytes (4-byte offset + 4-byte size)
+    uint8_t skip[8];
+    if (idx.read(skip, 8) != 8) break;
+    pos += 8;
+
+    // Record the sample only now that the entry is known to be complete.  Recording it
+    // up front would leave a sampled offset with no entries behind it when the file
+    // ends in a truncated entry on a segment boundary, and searchIndex() would then
+    // scan zero entries for any word that sorts into that phantom segment.
+    if (totalWords % SPARSE_INTERVAL == 0) {
+      sparseOffsets.push_back(entryStart);
+    }
+    totalWords++;
+
+    if (onProgress && fileSize > 0) {
+      int percent = static_cast<int>(static_cast<uint64_t>(pos) * 90 / fileSize);
+      if (percent > lastReportedPercent + 4) {
+        lastReportedPercent = percent;
+        onProgress(percent);
+      }
+    }
+  }
+
+  idx.close();
+  // Marked loaded even when nothing was indexed, so lookup() does not rescan the
+  // file on every call; searchIndex() returns "" for an empty table.
+  indexLoaded = true;
+
+  // Persist to cache so next boot is instant
+  if (totalWords > 0) saveCachedIndex(fileSize);
+
+  return totalWords > 0;
+}
+
+// Reads a null-terminated word string from the current file position, consuming
+// bytes up to and including the terminator (or until read() reports an error/EOF).
+//
+// At most MAX_WORD_BYTES are kept.  The StarDict format requires word strings to be
+// shorter than 256 bytes, so anything longer means a malformed index; the excess is
+// still consumed, so the file position stays in step, but it is dropped so a file
+// with a missing terminator cannot grow the string without bound.
+std::string Dictionary::readWord(FsFile& file) {
+  constexpr size_t MAX_WORD_BYTES = 255;
+
+  std::string word;
+  while (true) {
+    const int ch = file.read();
+    if (ch <= 0) break;  // null terminator (0) or error (-1)
+    if (word.size() < MAX_WORD_BYTES) word += static_cast<char>(ch);
+  }
+  return word;
+}
+
+// Reads a definition from the .dict file.
+//
+// offset – byte position of the definition inside the .dict file
+// size   – number of bytes to read
+// Returns the bytes read (possibly fewer than `size` if the file is shorter), or ""
+// when the file cannot be opened, the offset lies outside the file, or the read fails.
+//
+// Both values come straight from the .idx file, so they are checked against the
+// real .dict size: a failed seek must not fall through to reading from whatever
+// position the file happens to be at, and the buffer never exceeds what the file
+// can actually supply.
+std::string Dictionary::readDefinition(uint32_t offset, uint32_t size) {
+  if (size == 0) return "";
+
+  FsFile dict;
+  if (!Storage.openFileForRead("DICT", DICT_PATH, dict)) return "";
+
+  const uint64_t dictFileSize = dict.fileSize();
+  if (offset >= dictFileSize || !dict.seekSet(offset)) {
+    dict.close();
+    return "";
+  }
+
+  const uint64_t available = dictFileSize - offset;
+  const uint32_t toRead = static_cast<uint32_t>(std::min<uint64_t>(size, available));
+
+  std::string def(toRead, '\0');
+  const int bytesRead = dict.read(reinterpret_cast<uint8_t*>(&def[0]), toRead);
+  dict.close();
+
+  if (bytesRead < 0) return "";
+  if (static_cast<uint32_t>(bytesRead) < toRead) def.resize(bytesRead);
+  return def;
+}
+
+// Looks `word` up in the .idx file and returns its definition, or "" when it is not
+// found, the index is empty, the file cannot be opened, or shouldCancel() returns true.
+//
+// 1. Binary search over sparseOffsets for the last sampled entry whose key is <= word
+//    in StarDict order (ASCII-case-insensitive first, byte-wise strcmp as tiebreaker).
+// 2. Linear scan forward from that entry.  A case-sensitive match is returned at once;
+//    otherwise the first ASCII-case-insensitive match is used as a fallback.
+//
+// The scan is bounded by the entries remaining in the file, not by one segment: a
+// case variant that sorts after `word` can be the first entry of the next segment
+// (e.g. searching "APPLE" when "apple" starts that segment).  In a correctly sorted
+// index the scan still stops at the first key that sorts past the word.
+std::string Dictionary::searchIndex(const std::string& word, const std::function<bool()>& shouldCancel) {
+  if (word.empty() || sparseOffsets.empty()) return "";
+
+  FsFile idx;
+  if (!Storage.openFileForRead("DICT", IDX_PATH, idx)) return "";
+
+  // Binary search the sparse offset table to find the right segment.
+  int lo = 0, hi = static_cast<int>(sparseOffsets.size()) - 1;
+
+  while (lo < hi) {
+    if (shouldCancel && shouldCancel()) {
+      idx.close();
+      return "";
+    }
+
+    int mid = lo + (hi - lo + 1) / 2;
+    idx.seekSet(sparseOffsets[mid]);
+    std::string key = readWord(idx);
+
+    if (stardictCmp(key.c_str(), word.c_str()) <= 0) {
+      lo = mid;
+    } else {
+      hi = mid - 1;
+    }
+  }
+
+  // A sampled offset with no complete entries behind it (inconsistent table, e.g. a
+  // truncated file tail) cannot contain the word; step back to a segment that has entries.
+  while (lo > 0 && static_cast<uint32_t>(lo) * SPARSE_INTERVAL >= totalWords) {
+    --lo;
+  }
+
+  // Linear scan starting at sparseOffsets[lo].
+  idx.seekSet(sparseOffsets[lo]);
+
+  const uint32_t firstEntry = static_cast<uint32_t>(lo) * SPARSE_INTERVAL;
+  const uint32_t maxEntries = totalWords > firstEntry ? totalWords - firstEntry : 0;  // no underflow
+
+  // Decodes a 4-byte big-endian value as stored in the .idx file.
+  const auto readBE32 = [](const uint8_t* p) {
+    return (static_cast<uint32_t>(p[0]) << 24) | (static_cast<uint32_t>(p[1]) << 16) |
+           (static_cast<uint32_t>(p[2]) << 8) | static_cast<uint32_t>(p[3]);
+  };
+
+  // In StarDict order all case variants of a word are adjacent (e.g. "Professor" then
+  // "professor") and may carry different definitions, so prefer the exact spelling
+  // and fall back to the first variant seen.
+  uint32_t bestOffset = 0, bestSize = 0;
+  bool found = false;
+
+  for (uint32_t i = 0; i < maxEntries; i++) {
+    if (shouldCancel && shouldCancel()) {
+      idx.close();
+      return "";
+    }
+
+    std::string key = readWord(idx);
+    if (key.empty()) break;
+
+    // Read offset and size (4 bytes each, big-endian)
+    uint8_t buf[8];
+    if (idx.read(buf, 8) != 8) break;
+
+    const uint32_t dictOffset = readBE32(buf);
+    const uint32_t dictSize = readBE32(buf + 4);
+
+    const int order = asciiCaseCmp(key.c_str(), word.c_str());
+    if (order == 0) {
+      // Exact case-sensitive match — use immediately
+      if (key == word) {
+        idx.close();
+        return readDefinition(dictOffset, dictSize);
+      }
+      // Case-insensitive match — remember the first one as fallback
+      if (!found) {
+        bestOffset = dictOffset;
+        bestSize = dictSize;
+        found = true;
+      }
+    } else if (found || order > 0) {
+      // Either we moved past all case variants of the word, or the key already sorts
+      // after the word (with order != 0 StarDict order equals the case-insensitive order).
+      break;
+    }
+  }
+
+  idx.close();
+  return found ? readDefinition(bestOffset, bestSize) : "";
+}
+
+// Public entry point: returns the definition for `word`, or "" when the index cannot
+// be loaded or the word is not found.  The index is loaded on first use.
+// onProgress (optional) is forwarded to loadIndex() and receives 100 after the search;
+// shouldCancel (optional) is forwarded to both the index load and the search.
+std::string Dictionary::lookup(const std::string& word, const std::function<void(int percent)>& onProgress,
+                               const std::function<bool()>& shouldCancel) {
+  const bool ready = indexLoaded || loadIndex(onProgress, shouldCancel);
+  if (!ready) return "";
+
+  // One pass is enough: searchIndex follows StarDict sort order and also accepts
+  // case-insensitive matches, so every casing variant is covered.
+  std::string definition = searchIndex(word, shouldCancel);
+  if (onProgress) onProgress(100);
+  return definition;
+}
--- a/src/util/Dictionary.h
+++ b/src/util/Dictionary.h
@@ -0,0 +1,31 @@
+#pragma once
+#include <cstdint>
+#include <functional>
+#include <string>
+#include <vector>
+
+class FsFile;
+
+// Static-only reader for a StarDict dictionary stored under /.dictionary/.
+// Keeps a sparse table of .idx file offsets in memory (one per SPARSE_INTERVAL entries)
+// and persists that table to a cache file so it does not have to be rebuilt each time.
+class Dictionary {
+ public:
+  // True when the .idx file exists on storage.
+  static bool exists();
+  // True when the sparse-index cache file exists on storage.
+  static bool cacheExists();
+  // Removes the cache file and resets the in-memory index state.
+  static void deleteCache();
+  // Returns the definition for `word`, or "" when not found or the index cannot be loaded.
+  // Loads the index on first use.  onProgress (optional) receives percentages while the
+  // index is scanned and 100 once the search has finished; shouldCancel (optional) is
+  // polled during the scan and the search, and returning true aborts them.
+  static std::string lookup(const std::string& word, const std::function<void(int percent)>& onProgress = nullptr,
+                            const std::function<bool()>& shouldCancel = nullptr);
+  // Strips non-alphanumeric characters from both ends of `word` and lowercases the rest.
+  static std::string cleanWord(const std::string& word);
+
+ private:
+  // The file offset of every SPARSE_INTERVAL-th .idx entry is kept in sparseOffsets.
+  static constexpr int SPARSE_INTERVAL = 512;
+
+  // Sampled .idx file offsets, in file order.
+  static std::vector<uint32_t> sparseOffsets;
+  // Number of entries counted in the .idx file.
+  static uint32_t totalWords;
+  // True once the index state has been populated (from cache or by scanning).
+  static bool indexLoaded;
+
+  // Loads the index from cache, or scans the .idx file and then writes the cache.
+  static bool loadIndex(const std::function<void(int percent)>& onProgress, const std::function<bool()>& shouldCancel);
+  // Restores sparseOffsets/totalWords from the cache file; false when missing or stale.
+  static bool loadCachedIndex();
+  // Writes sparseOffsets/totalWords to the cache file, tagged with the .idx file size.
+  static void saveCachedIndex(uint32_t idxFileSize);
+  // Binary search over sparseOffsets followed by a linear scan of the .idx file.
+  static std::string searchIndex(const std::string& word, const std::function<bool()>& shouldCancel);
+  // Reads a null-terminated string from the file's current position.
+  static std::string readWord(FsFile& file);
+  // Reads `size` bytes at `offset` from the .dict file.
+  static std::string readDefinition(uint32_t offset, uint32_t size);
+};
--- a/src/util/LookupHistory.cpp
+++ b/src/util/LookupHistory.cpp
@@ -0,0 +1,88 @@
+#include "LookupHistory.h"
+
+#include <HalStorage.h>
+
+#include <algorithm>
+
+// Path of the lookup-history file inside the given cache directory.
+std::string LookupHistory::filePath(const std::string& cachePath) {
+  std::string path = cachePath;
+  path += "/lookups.txt";
+  return path;
+}
+
+// True when the history file can be opened and contains at least one byte.
+bool LookupHistory::hasHistory(const std::string& cachePath) {
+  FsFile file;
+  if (Storage.openFileForRead("LKH", filePath(cachePath), file)) {
+    const bool hasData = file.available() > 0;
+    file.close();
+    return hasData;
+  }
+  return false;
+}
+
+// Reads the history file and returns its non-empty lines in file order, stopping
+// once MAX_ENTRIES words have been collected.  Lines are separated by '\n'; a final
+// line without a trailing newline is included.  Returns an empty vector when the
+// file cannot be opened.
+std::vector<std::string> LookupHistory::load(const std::string& cachePath) {
+  std::vector<std::string> entries;
+  FsFile file;
+  if (!Storage.openFileForRead("LKH", filePath(cachePath), file)) {
+    return entries;
+  }
+
+  const auto hasRoom = [&entries]() { return static_cast<int>(entries.size()) < MAX_ENTRIES; };
+
+  std::string current;
+  while (file.available() && hasRoom()) {
+    char ch;
+    if (file.read(reinterpret_cast<uint8_t*>(&ch), 1) != 1) break;
+
+    if (ch != '\n') {
+      current += ch;
+      continue;
+    }
+    // End of line: blank lines are skipped.
+    if (current.empty()) continue;
+    entries.push_back(current);
+    current.clear();
+  }
+
+  // Unterminated last line
+  if (hasRoom() && !current.empty()) {
+    entries.push_back(current);
+  }
+  file.close();
+  return entries;
+}
+
+// Removes `word` from the history file by rewriting the file without it.
+// Does nothing when `word` is empty or is not in the history, so the file is not
+// rewritten (or created) when there is nothing to remove.
+void LookupHistory::removeWord(const std::string& cachePath, const std::string& word) {
+  if (word.empty()) return;
+
+  const auto existing = load(cachePath);
+  if (std::find(existing.begin(), existing.end(), word) == existing.end()) return;
+
+  FsFile f;
+  if (!Storage.openFileForWrite("LKH", filePath(cachePath), f)) {
+    return;
+  }
+
+  for (const auto& w : existing) {
+    if (w != word) {
+      f.write(reinterpret_cast<const uint8_t*>(w.c_str()), w.size());
+      f.write(reinterpret_cast<const uint8_t*>("\n"), 1);
+    }
+  }
+  f.close();
+}
+
+// Appends `word` to the history file unless it is empty, already present, or the
+// history already holds MAX_ENTRIES words.  The file is rewritten in full: all
+// existing entries followed by the new one, one word per line.
+void LookupHistory::addWord(const std::string& cachePath, const std::string& word) {
+  if (word.empty()) return;
+
+  // Entries are newline-delimited on disk, so a word containing '\n' cannot round-trip:
+  // load() would read it back as two entries and the duplicate check below would never
+  // match it.  Refuse it instead of corrupting the list.
+  if (word.find('\n') != std::string::npos) return;
+
+  // Check if already present
+  const auto existing = load(cachePath);
+  if (std::find(existing.begin(), existing.end(), word) != existing.end()) return;
+
+  // Cap at max entries
+  if (static_cast<int>(existing.size()) >= MAX_ENTRIES) return;
+
+  FsFile f;
+  if (!Storage.openFileForWrite("LKH", filePath(cachePath), f)) {
+    return;
+  }
+
+  // Rewrite existing entries plus new one
+  for (const auto& w : existing) {
+    f.write(reinterpret_cast<const uint8_t*>(w.c_str()), w.size());
+    f.write(reinterpret_cast<const uint8_t*>("\n"), 1);
+  }
+  f.write(reinterpret_cast<const uint8_t*>(word.c_str()), word.size());
+  f.write(reinterpret_cast<const uint8_t*>("\n"), 1);
+  f.close();
+}
--- a/src/util/LookupHistory.h
+++ b/src/util/LookupHistory.h
@@ -0,0 +1,15 @@
+#pragma once
+#include <string>
+#include <vector>
+
+// Static-only helper that keeps a list of looked-up words in "<cachePath>/lookups.txt",
+// one word per line, capped at MAX_ENTRIES.
+class LookupHistory {
+ public:
+  // Returns the stored words in file order (at most MAX_ENTRIES); empty if the file cannot be opened.
+  static std::vector<std::string> load(const std::string& cachePath);
+  // Adds `word` unless it is empty, already stored, or the history is full.
+  static void addWord(const std::string& cachePath, const std::string& word);
+  // Rewrites the history without `word`.
+  static void removeWord(const std::string& cachePath, const std::string& word);
+  // True when the history file exists and is not empty.
+  static bool hasHistory(const std::string& cachePath);
+
+ private:
+  // Builds the history file path from the cache directory.
+  static std::string filePath(const std::string& cachePath);
+  // Maximum number of words kept in the history.
+  static constexpr int MAX_ENTRIES = 500;
+};