feat: Integrate PR #857 dictionary intelligence and sub-activity refactor

Pull in the full feature update from PR #857 while preserving fork advantages (HTML parsing, custom drawHints, PageForward/PageBack, cache management, stardictCmp, /.dictionary/ paths).

- Add morphological stemming (getStemVariants), Levenshtein edit distance, and fuzzy matching (findSimilar) to Dictionary
- Create DictionarySuggestionsActivity for "Did you mean?" flow
- Add onDone callback to DictionaryDefinitionActivity for direct exit-to-reader via "Done" button
- Refactor DictionaryWordSelectActivity to ActivityWithSubactivity with cascading lookup (exact → stems → suggestions → not found), en-dash/em-dash splitting, and cross-page hyphenation
- Refactor LookedUpWordsActivity with reverse-chronological order, inline cascading lookup, UITheme-aware rendering, and sub-activities
- Simplify EpubReaderActivity LOOKUP/LOOKED_UP_WORDS handlers

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-14 20:50:03 -05:00
parent c1dfe92ea3
commit 5dc9d21bdb
12 changed files with 746 additions and 105 deletions
--- a/src/util/Dictionary.cpp
+++ b/src/util/Dictionary.cpp
@@ -326,3 +326,264 @@ std::string Dictionary::lookup(const std::string& word, const std::function<void
  if (onProgress) onProgress(100);
  return result;
 }
+
+// Builds a list of candidate base forms for `word` using English spelling heuristics:
+// plural, past-tense, progressive, adverb, comparative/superlative and derivational
+// suffixes are stripped or rewritten, then a few common prefixes are removed.
+//
+// The rules compare raw bytes against lowercase ASCII suffixes; no lowercasing happens here.
+// Candidates are guesses and need not be real words. Words shorter than 3 bytes yield an
+// empty list, and candidates shorter than 2 bytes are dropped. The result contains no
+// duplicates and keeps rule order (inflectional stems first, prefix removals last), so
+// callers can try the candidates in the order returned.
+std::vector<std::string> Dictionary::getStemVariants(const std::string& word) {
+  std::vector<std::string> variants;
+  const size_t len = word.size();
+  if (len < 3) return variants;
+
+  auto endsWith = [&word, len](const char* suffix) {
+    size_t slen = strlen(suffix);
+    return len >= slen && word.compare(len - slen, slen, suffix) == 0;
+  };
+
+  // Appends a candidate unless it is too short or already present; skipping repeats here keeps
+  // first-occurrence order without a separate de-duplication pass.
+  auto add = [&variants](const std::string& s) {
+    if (s.size() >= 2 && std::find(variants.begin(), variants.end(), s) == variants.end()) variants.push_back(s);
+  };
+
+  // Plurals (longer suffixes first to avoid partial matches)
+  if (endsWith("sses")) add(word.substr(0, len - 2));        // classes -> class
+  if (endsWith("ses")) add(word.substr(0, len - 2) + "is");  // analyses -> analysis
+  if (endsWith("ies")) {
+    add(word.substr(0, len - 3) + "y");  // stories -> story
+    add(word.substr(0, len - 2));        // dies -> die, ties -> tie
+  }
+  if (endsWith("ves")) {
+    add(word.substr(0, len - 3) + "f");   // wolves -> wolf
+    add(word.substr(0, len - 3) + "fe");  // knives -> knife
+    add(word.substr(0, len - 1));         // misgives -> misgive
+  }
+  if (endsWith("men")) add(word.substr(0, len - 3) + "man");  // firemen -> fireman
+  if (endsWith("es") && !endsWith("sses") && !endsWith("ies") && !endsWith("ves")) {
+    add(word.substr(0, len - 2));  // boxes -> box
+    add(word.substr(0, len - 1));  // houses -> house
+  }
+  if (endsWith("s") && !endsWith("ss") && !endsWith("us") && !endsWith("es")) {
+    add(word.substr(0, len - 1));  // cats -> cat
+  }
+
+  // Past tense
+  if (endsWith("ied")) {
+    add(word.substr(0, len - 3) + "y");  // carried -> carry
+    add(word.substr(0, len - 1));        // died -> die
+  }
+  if (endsWith("ed") && !endsWith("ied")) {
+    add(word.substr(0, len - 2));  // walked -> walk
+    add(word.substr(0, len - 1));  // baked -> bake
+    if (len > 4 && word[len - 3] == word[len - 4]) {
+      add(word.substr(0, len - 3));  // doubled final consonant: stopped -> stop
+    }
+  }
+
+  // Progressive
+  if (endsWith("ying")) {
+    add(word.substr(0, len - 4) + "ie");  // dying -> die
+    add(word.substr(0, len - 3));         // playing -> play; the plain -ing rule below skips -ying words
+  }
+  if (endsWith("ing") && !endsWith("ying")) {
+    add(word.substr(0, len - 3));        // walking -> walk
+    add(word.substr(0, len - 3) + "e");  // baking -> bake
+    if (len > 5 && word[len - 4] == word[len - 5]) {
+      add(word.substr(0, len - 4));  // doubled final consonant: running -> run
+    }
+  }
+
+  // Adverb
+  if (endsWith("ically")) {
+    add(word.substr(0, len - 4));  // historically -> historic, basically -> basic
+    add(word.substr(0, len - 2));  // historically -> historical (the -ally and -ly rules skip -ically words)
+  }
+  if (endsWith("ally") && !endsWith("ically")) add(word.substr(0, len - 2));  // accidentally -> accidental
+  if (endsWith("ily")) add(word.substr(0, len - 3) + "y");                    // happily -> happy
+  if (endsWith("ly") && !endsWith("ily") && !endsWith("ally")) {
+    add(word.substr(0, len - 2));  // quickly -> quick
+  }
+
+  // Comparative / superlative
+  if (endsWith("ier")) {
+    add(word.substr(0, len - 3) + "y");  // happier -> happy
+  }
+  if (endsWith("er") && !endsWith("ier")) {
+    add(word.substr(0, len - 2));  // taller -> tall
+    add(word.substr(0, len - 1));  // nicer -> nice
+    if (len > 4 && word[len - 3] == word[len - 4]) {
+      add(word.substr(0, len - 3));  // doubled final consonant: bigger -> big
+    }
+  }
+  if (endsWith("iest")) {
+    add(word.substr(0, len - 4) + "y");  // happiest -> happy
+  }
+  if (endsWith("est") && !endsWith("iest")) {
+    add(word.substr(0, len - 3));  // tallest -> tall
+    add(word.substr(0, len - 2));  // nicest -> nice
+    if (len > 5 && word[len - 4] == word[len - 5]) {
+      add(word.substr(0, len - 4));  // doubled final consonant: biggest -> big
+    }
+  }
+
+  // Derivational suffixes
+  if (endsWith("ness")) add(word.substr(0, len - 4));
+  if (endsWith("ment")) add(word.substr(0, len - 4));
+  if (endsWith("ful")) add(word.substr(0, len - 3));
+  if (endsWith("less")) add(word.substr(0, len - 4));
+  if (endsWith("able")) {
+    add(word.substr(0, len - 4));
+    add(word.substr(0, len - 4) + "e");
+  }
+  if (endsWith("ible")) {
+    add(word.substr(0, len - 4));
+    add(word.substr(0, len - 4) + "e");
+  }
+  if (endsWith("ation")) {
+    add(word.substr(0, len - 5));          // information -> inform
+    add(word.substr(0, len - 5) + "e");    // exploration -> explore
+    add(word.substr(0, len - 5) + "ate");  // donation -> donate
+  }
+  if (endsWith("tion") && !endsWith("ation")) {
+    add(word.substr(0, len - 4) + "te");                         // completion -> complete
+    add(word.substr(0, len - 3));                                // action -> act
+    if (endsWith("ction")) add(word.substr(0, len - 5) + "ce");  // reduction -> reduce
+  }
+  if (endsWith("ion") && !endsWith("tion")) {
+    add(word.substr(0, len - 3));        // revision -> revis
+    add(word.substr(0, len - 3) + "e");  // revision -> revise
+  }
+  if (endsWith("al") && !endsWith("ial")) {
+    add(word.substr(0, len - 2));
+    add(word.substr(0, len - 2) + "e");
+  }
+  if (endsWith("ial")) {
+    add(word.substr(0, len - 3));
+    add(word.substr(0, len - 3) + "e");
+  }
+  if (endsWith("ous")) {
+    add(word.substr(0, len - 3));        // dangerous -> danger
+    add(word.substr(0, len - 3) + "e");  // famous -> fame
+  }
+  if (endsWith("ive")) {
+    add(word.substr(0, len - 3));        // active -> act
+    add(word.substr(0, len - 3) + "e");  // creative -> create
+  }
+  if (endsWith("ize")) {
+    add(word.substr(0, len - 3));  // modernize -> modern
+    add(word.substr(0, len - 3) + "e");
+  }
+  if (endsWith("ise")) {
+    add(word.substr(0, len - 3));  // advertise -> advert
+    add(word.substr(0, len - 3) + "e");
+  }
+  if (endsWith("en")) {
+    add(word.substr(0, len - 2));        // darken -> dark
+    add(word.substr(0, len - 2) + "e");  // widen -> wide
+  }
+
+  // Prefix removal
+  if (len > 5 && word.compare(0, 2, "un") == 0) add(word.substr(2));
+  if (len > 6 && word.compare(0, 3, "dis") == 0) add(word.substr(3));
+  if (len > 6 && word.compare(0, 3, "mis") == 0) add(word.substr(3));
+  if (len > 6 && word.compare(0, 3, "pre") == 0) add(word.substr(3));
+  if (len > 7 && word.compare(0, 4, "over") == 0) add(word.substr(4));
+  if (len > 5 && word.compare(0, 2, "re") == 0) add(word.substr(2));
+
+  return variants;
+}
+
+// Levenshtein distance between `a` and `b`, compared byte by byte, using one rolling row.
+// Gives up with maxDist + 1 once the result must exceed maxDist: when the lengths differ by more
+// than maxDist, or when every cell of a finished row is above it. Otherwise returns the exact distance.
+int Dictionary::editDistance(const std::string& a, const std::string& b, int maxDist) {
+  const int rows = static_cast<int>(a.size());
+  const int cols = static_cast<int>(b.size());
+  if (std::abs(rows - cols) > maxDist) return maxDist + 1;
+
+  // row[c]: distance from the bytes of `a` consumed so far to the first c bytes of `b`.
+  std::vector<int> row(cols + 1);
+  for (int c = 0; c <= cols; c++) row[c] = c;
+
+  for (int r = 1; r <= rows; r++) {
+    int diag = row[0];  // previous row, one column to the left of the cell being filled
+    row[0] = r;
+    int best = r;
+    for (int c = 1; c <= cols; c++) {
+      const int above = row[c];
+      row[c] = (a[r - 1] == b[c - 1]) ? diag : 1 + std::min({diag, above, row[c - 1]});
+      diag = above;
+      best = std::min(best, row[c]);
+    }
+    if (best > maxDist) return maxDist + 1;
+  }
+  return row[cols];
+}
+
+// Returns up to `maxResults` index headwords whose edit distance to `word` is small, closest first.
+// Only the sparse-index segment where `word` would sort and its two neighbours are scanned, so
+// matches outside that window are not considered. Returns an empty list if `maxResults` is not
+// positive, the index is not loaded, or the index file cannot be opened.
+std::vector<std::string> Dictionary::findSimilar(const std::string& word, int maxResults) {
+  if (maxResults <= 0 || !indexLoaded || sparseOffsets.empty()) return {};
+
+  FsFile idx;
+  if (!Storage.openFileForRead("DICT", IDX_PATH, idx)) return {};
+
+  // Binary search the sparse offsets for the last sampled key that sorts at or before `word`
+  int lo = 0, hi = static_cast<int>(sparseOffsets.size()) - 1;
+  while (lo < hi) {
+    const int mid = lo + (hi - lo + 1) / 2;  // upper midpoint, so `lo = mid` always makes progress
+    idx.seekSet(sparseOffsets[mid]);
+    const std::string key = readWord(idx);
+    if (stardictCmp(key.c_str(), word.c_str()) <= 0) {
+      lo = mid;
+    } else {
+      hi = mid - 1;
+    }
+  }
+
+  // Scan entries from the segment before through the segment after the target
+  const int startSeg = std::max(0, lo - 1);
+  const int endSeg = std::min(static_cast<int>(sparseOffsets.size()) - 1, lo + 1);
+  idx.seekSet(sparseOffsets[startSeg]);
+
+  // Cap the scan at the number of entries left in the index from the start segment onwards
+  const int remaining = static_cast<int>(totalWords) - startSeg * SPARSE_INTERVAL;
+  const int totalToScan = std::min((endSeg - startSeg + 1) * SPARSE_INTERVAL, remaining);
+  const int maxDist = std::max(2, static_cast<int>(word.size()) / 3 + 1);  // longer words tolerate more edits
+
+  struct Candidate {
+    std::string text;
+    int distance;
+  };
+  std::vector<Candidate> candidates;
+
+  for (int i = 0; i < totalToScan; i++) {
+    const std::string key = readWord(idx);
+    if (key.empty()) break;
+    uint8_t skip[8];  // 8 bytes after each key (presumably definition offset and size) are not needed here
+    if (idx.read(skip, 8) != 8) break;
+    if (key == word) continue;
+    const int dist = editDistance(key, word, maxDist);
+    if (dist <= maxDist) candidates.push_back({key, dist});
+  }
+
+  idx.close();
+
+  // Stable sort: candidates at the same distance stay in the order they were read from the index
+  std::stable_sort(candidates.begin(), candidates.end(),
+                   [](const Candidate& a, const Candidate& b) { return a.distance < b.distance; });
+
+  std::vector<std::string> results;
+  for (size_t i = 0; i < candidates.size() && static_cast<int>(results.size()) < maxResults; i++) {
+    results.push_back(candidates[i].text);
+  }
+  return results;
+}
--- a/src/util/Dictionary.h
+++ b/src/util/Dictionary.h
@@ -14,6 +14,8 @@ class Dictionary {
  static std::string lookup(const std::string& word, const std::function<void(int percent)>& onProgress = nullptr,
                            const std::function<bool()>& shouldCancel = nullptr);
  static std::string cleanWord(const std::string& word);
+  static std::vector<std::string> getStemVariants(const std::string& word);
+  static std::vector<std::string> findSimilar(const std::string& word, int maxResults = 6);

 private:
  static constexpr int SPARSE_INTERVAL = 512;
@@ -28,4 +30,5 @@ class Dictionary {
  static std::string searchIndex(const std::string& word, const std::function<bool()>& shouldCancel);
  static std::string readWord(FsFile& file);
  static std::string readDefinition(uint32_t offset, uint32_t size);
+  static int editDistance(const std::string& a, const std::string& b, int maxDist);
 };