feat: Integrate PR #857 dictionary intelligence and sub-activity refactor

Pull in the full feature update from PR #857 while preserving fork
advantages (HTML parsing, custom drawHints, PageForward/PageBack,
cache management, stardictCmp, /.dictionary/ paths).

- Add morphological stemming (getStemVariants), Levenshtein edit
  distance, and fuzzy matching (findSimilar) to Dictionary
- Create DictionarySuggestionsActivity for "Did you mean?" flow
- Add onDone callback to DictionaryDefinitionActivity for direct
  exit-to-reader via "Done" button
- Refactor DictionaryWordSelectActivity to ActivityWithSubactivity
  with cascading lookup (exact → stems → suggestions → not found),
  en-dash/em-dash splitting, and cross-page hyphenation
- Refactor LookedUpWordsActivity with reverse-chronological order,
  inline cascading lookup, UITheme-aware rendering, and sub-activities
- Simplify EpubReaderActivity LOOKUP/LOOKED_UP_WORDS handlers

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
cottongin
2026-02-14 20:50:03 -05:00
parent c1dfe92ea3
commit 5dc9d21bdb
12 changed files with 746 additions and 105 deletions

View File

@@ -326,3 +326,264 @@ std::string Dictionary::lookup(const std::string& word, const std::function<void
if (onProgress) onProgress(100);
return result;
}
/**
 * Generate candidate base forms of an inflected or derived English word.
 *
 * The rules are plain suffix/prefix rewrites; nothing is checked against the
 * dictionary here, so many candidates are not real words. Rules are not
 * mutually exclusive: a word may match several of them.
 *
 * @param word Word to analyse. Affixes are compared byte-for-byte against
 *             lowercase ASCII literals, so uppercase input matches no rule.
 * @return Distinct candidates of at least two characters, in the order the
 *         rules produce them (inflectional suffixes first, then derivational
 *         suffixes, then prefixes). Empty when `word` has fewer than three
 *         characters or no rule applies.
 */
std::vector<std::string> Dictionary::getStemVariants(const std::string& word) {
  std::vector<std::string> variants;
  const size_t len = word.size();
  if (len < 3) return variants;

  auto endsWith = [&word, len](const char* suffix) {
    const size_t slen = strlen(suffix);
    return len >= slen && word.compare(len - slen, slen, suffix) == 0;
  };
  // `word` without its last `count` characters. Every call site is guarded by an
  // endsWith()/length check that guarantees count <= len.
  auto dropLast = [&word, len](size_t count) { return word.substr(0, len - count); };
  // Record a candidate once; the first occurrence fixes its position in the result.
  auto add = [&variants](const std::string& s) {
    if (s.size() < 2) return;
    if (std::find(variants.begin(), variants.end(), s) != variants.end()) return;
    variants.push_back(s);
  };

  // Plurals (longer suffixes first to avoid partial matches)
  if (endsWith("sses")) add(dropLast(2));        // classes -> class
  if (endsWith("ses")) add(dropLast(2) + "is");  // analyses -> analysis
  if (endsWith("ies")) {
    add(dropLast(3) + "y");  // parties -> party
    add(dropLast(2));        // dies -> die, ties -> tie
  }
  if (endsWith("ves")) {
    add(dropLast(3) + "f");   // wolves -> wolf
    add(dropLast(3) + "fe");  // knives -> knife
    add(dropLast(1));         // misgives -> misgive
  }
  if (endsWith("men")) add(dropLast(3) + "man");  // firemen -> fireman
  if (endsWith("es") && !endsWith("sses") && !endsWith("ies") && !endsWith("ves")) {
    add(dropLast(2));  // boxes -> box
    add(dropLast(1));  // houses -> house
  }
  if (endsWith("s") && !endsWith("ss") && !endsWith("us") && !endsWith("es")) {
    add(dropLast(1));  // cats -> cat
  }

  // Past tense
  if (endsWith("ied")) {
    add(dropLast(3) + "y");  // carried -> carry
    add(dropLast(1));        // tied -> tie
  }
  if (endsWith("ed") && !endsWith("ied")) {
    add(dropLast(2));  // walked -> walk
    add(dropLast(1));  // baked -> bake
    // Doubled final consonant: stopped -> stop
    if (len > 4 && word[len - 3] == word[len - 4]) {
      add(dropLast(3));
    }
  }

  // Progressive
  if (endsWith("ying")) {
    add(dropLast(4) + "ie");  // lying -> lie
  }
  if (endsWith("ing") && !endsWith("ying")) {
    add(dropLast(3));        // walking -> walk
    add(dropLast(3) + "e");  // making -> make
    // Doubled final consonant: running -> run
    if (len > 5 && word[len - 4] == word[len - 5]) {
      add(dropLast(4));
    }
  }

  // Adverb
  if (endsWith("ically")) {
    add(dropLast(4));  // historically -> historic, basically -> basic
    add(dropLast(2));  // historically -> historical
  }
  if (endsWith("ally") && !endsWith("ically")) {
    add(dropLast(2));  // accidentally -> accidental, naturally -> natural
  }
  // A word ending in "ily" can never end in "ally", so no extra exclusion is needed.
  if (endsWith("ily")) {
    add(dropLast(3) + "y");  // happily -> happy
  }
  if (endsWith("ly") && !endsWith("ily") && !endsWith("ally")) {
    add(dropLast(2));  // quickly -> quick
  }

  // Comparative / superlative
  if (endsWith("ier")) {
    add(dropLast(3) + "y");  // happier -> happy
  }
  if (endsWith("er") && !endsWith("ier")) {
    add(dropLast(2));  // taller -> tall
    add(dropLast(1));  // nicer -> nice
    // Doubled final consonant: bigger -> big
    if (len > 4 && word[len - 3] == word[len - 4]) {
      add(dropLast(3));
    }
  }
  if (endsWith("iest")) {
    add(dropLast(4) + "y");  // happiest -> happy
  }
  if (endsWith("est") && !endsWith("iest")) {
    add(dropLast(3));  // tallest -> tall
    add(dropLast(2));  // nicest -> nice
    // Doubled final consonant: biggest -> big
    if (len > 5 && word[len - 4] == word[len - 5]) {
      add(dropLast(4));
    }
  }

  // Derivational suffixes
  if (endsWith("ness")) add(dropLast(4));  // kindness -> kind
  if (endsWith("ment")) add(dropLast(4));  // movement -> move
  if (endsWith("ful")) add(dropLast(3));   // hopeful -> hope
  if (endsWith("less")) add(dropLast(4));  // hopeless -> hope
  if (endsWith("able")) {
    add(dropLast(4));        // readable -> read
    add(dropLast(4) + "e");  // lovable -> love
  }
  if (endsWith("ible")) {
    add(dropLast(4));        // collapsible -> collaps
    add(dropLast(4) + "e");  // collapsible -> collapse
  }
  if (endsWith("ation")) {
    add(dropLast(5));          // information -> inform
    add(dropLast(5) + "e");    // exploration -> explore
    add(dropLast(5) + "ate");  // donation -> donate
  }
  if (endsWith("tion") && !endsWith("ation")) {
    add(dropLast(4) + "te");  // completion -> complete
    add(dropLast(3));         // action -> act
    add(dropLast(4) + "e");   // reduction -> reduce
  }
  if (endsWith("ion") && !endsWith("tion")) {
    add(dropLast(3));        // revision -> revis
    add(dropLast(3) + "e");  // revision -> revise
  }
  if (endsWith("al") && !endsWith("ial")) {
    add(dropLast(2));        // natural -> natur
    add(dropLast(2) + "e");  // natural -> nature
  }
  if (endsWith("ial")) {
    add(dropLast(3));        // facial -> fac
    add(dropLast(3) + "e");  // facial -> face
  }
  if (endsWith("ous")) {
    add(dropLast(3));        // dangerous -> danger
    add(dropLast(3) + "e");  // famous -> fame
  }
  if (endsWith("ive")) {
    add(dropLast(3));        // active -> act
    add(dropLast(3) + "e");  // creative -> create
  }
  if (endsWith("ize")) {
    add(dropLast(3));  // modernize -> modern
    add(dropLast(3) + "e");
  }
  if (endsWith("ise")) {
    add(dropLast(3));  // advertise -> advert
    add(dropLast(3) + "e");
  }
  if (endsWith("en")) {
    add(dropLast(2));        // darken -> dark
    add(dropLast(2) + "e");  // widen -> wide
  }

  // Prefix removal. The length guards keep at least four characters after the prefix.
  if (len > 5 && word.compare(0, 2, "un") == 0) add(word.substr(2));
  if (len > 6 && word.compare(0, 3, "dis") == 0) add(word.substr(3));
  if (len > 6 && word.compare(0, 3, "mis") == 0) add(word.substr(3));
  if (len > 6 && word.compare(0, 3, "pre") == 0) add(word.substr(3));
  if (len > 7 && word.compare(0, 4, "over") == 0) add(word.substr(4));
  if (len > 5 && word.compare(0, 2, "re") == 0) add(word.substr(2));

  return variants;
}
/**
 * Byte-wise Levenshtein distance between `a` and `b`, with an early cutoff.
 *
 * Only one row of the distance table is kept in memory. The computation stops
 * as soon as the lengths alone, or a whole row of the table, show that the
 * distance must exceed `maxDist`.
 *
 * @param a       First string.
 * @param b       Second string.
 * @param maxDist Largest distance the caller is interested in.
 * @return The exact distance when it is <= maxDist. Otherwise some value
 *         greater than maxDist (maxDist + 1 on early exit); callers should
 *         treat any result above maxDist as "too far".
 */
int Dictionary::editDistance(const std::string& a, const std::string& b, int maxDist) {
  const int lenA = static_cast<int>(a.size());
  const int lenB = static_cast<int>(b.size());
  const int tooFar = maxDist + 1;

  // The length difference is a lower bound on the distance.
  if (std::abs(lenA - lenB) > maxDist) return tooFar;

  // row[c] holds the distance between the processed prefix of `a` and the first c bytes of `b`.
  std::vector<int> row(lenB + 1);
  for (int c = 0; c <= lenB; c++) row[c] = c;

  for (int r = 1; r <= lenA; r++) {
    int diagonal = row[0];  // value of the previous row, one column to the left
    row[0] = r;
    int best = r;  // smallest value seen in the current row
    const char current = a[r - 1];

    for (int c = 1; c <= lenB; c++) {
      const int above = row[c];  // previous row, same column
      int cell;
      if (current == b[c - 1]) {
        cell = diagonal;
      } else {
        // substitution, deletion, insertion
        cell = 1 + std::min(diagonal, std::min(above, row[c - 1]));
      }
      row[c] = cell;
      diagonal = above;
      best = std::min(best, cell);
    }

    // Later rows can never go below the minimum of this one.
    if (best > maxDist) return tooFar;
  }

  return row[lenB];
}
/**
 * Suggest index keys that are spelled similarly to `word`.
 *
 * Only the neighbourhood of `word` in the sorted index is examined: the sparse
 * segment where `word` would sort, plus the segment before and the segment
 * after it. Keys within the allowed edit distance are collected and returned
 * closest first; keys at the same distance keep their index order.
 *
 * @param word       Word that was not found. A key identical to `word` is never returned.
 * @param maxResults Maximum number of suggestions; a non-positive value yields an empty list.
 * @return Up to `maxResults` keys. Empty when the index is not loaded, the sparse
 *         table is empty, or the index file cannot be opened.
 */
std::vector<std::string> Dictionary::findSimilar(const std::string& word, int maxResults) {
  // Nothing can be returned, so skip the file I/O entirely.
  if (maxResults <= 0) return {};
  if (!indexLoaded || sparseOffsets.empty()) return {};

  FsFile idx;
  if (!Storage.openFileForRead("DICT", IDX_PATH, idx)) return {};

  // Binary search for the last segment whose first key sorts at or before the word.
  // The midpoint is rounded up so that `lo = mid` always makes progress.
  int lo = 0, hi = static_cast<int>(sparseOffsets.size()) - 1;
  while (lo < hi) {
    int mid = lo + (hi - lo + 1) / 2;
    idx.seekSet(sparseOffsets[mid]);
    std::string key = readWord(idx);
    if (stardictCmp(key.c_str(), word.c_str()) <= 0) {
      lo = mid;
    } else {
      hi = mid - 1;
    }
  }

  // Scan entries from the segment before through the segment after the target
  int startSeg = std::max(0, lo - 1);
  int endSeg = std::min(static_cast<int>(sparseOffsets.size()) - 1, lo + 1);
  idx.seekSet(sparseOffsets[startSeg]);

  // Never try to read past the last entry of the index.
  int totalToScan = (endSeg - startSeg + 1) * SPARSE_INTERVAL;
  int remaining = static_cast<int>(totalWords) - startSeg * SPARSE_INTERVAL;
  if (totalToScan > remaining) totalToScan = remaining;

  // Longer words tolerate more edits, but at least two are always allowed.
  int maxDist = std::max(2, static_cast<int>(word.size()) / 3 + 1);

  struct Candidate {
    std::string text;
    int distance;
  };
  std::vector<Candidate> candidates;

  for (int i = 0; i < totalToScan; i++) {
    std::string key = readWord(idx);
    if (key.empty()) break;
    // Each key is followed by 8 bytes that are not needed here
    // (presumably the definition's offset and size; confirm against the index format).
    uint8_t skip[8];
    if (idx.read(skip, 8) != 8) break;
    if (key == word) continue;
    int dist = editDistance(key, word, maxDist);
    if (dist <= maxDist) {
      candidates.push_back({key, dist});
    }
  }
  idx.close();

  // Stable sort: ties keep the order in which they were read from the index, so the
  // suggestions (and which ties survive the maxResults cut) are deterministic.
  std::stable_sort(candidates.begin(), candidates.end(),
                   [](const Candidate& a, const Candidate& b) { return a.distance < b.distance; });

  std::vector<std::string> results;
  for (size_t i = 0; i < candidates.size() && static_cast<int>(results.size()) < maxResults; i++) {
    results.push_back(candidates[i].text);
  }
  return results;
}

View File

@@ -14,6 +14,8 @@ class Dictionary {
static std::string lookup(const std::string& word, const std::function<void(int percent)>& onProgress = nullptr,
const std::function<bool()>& shouldCancel = nullptr);
static std::string cleanWord(const std::string& word);
// Returns rule-based candidate base forms of `word` (English suffix and prefix stripping),
// without duplicates and in the order the rules produce them. Candidates are not checked
// against the dictionary. Returns an empty list for words shorter than three characters.
static std::vector<std::string> getStemVariants(const std::string& word);
// Returns up to `maxResults` index keys within a small edit distance of `word`, closest first.
// Only index entries near the position where `word` would sort are examined, and a key equal
// to `word` is never returned. Returns an empty list when the index is not loaded or the
// index file cannot be opened.
static std::vector<std::string> findSimilar(const std::string& word, int maxResults = 6);
private:
// Number of index entries covered by one sparseOffsets segment; findSimilar() multiplies
// segment counts by this value to get entry counts. Presumably also the sampling interval
// used when the sparse index is built (confirm against the index loader).
static constexpr int SPARSE_INTERVAL = 512;
@@ -28,4 +30,5 @@ class Dictionary {
static std::string searchIndex(const std::string& word, const std::function<bool()>& shouldCancel);
static std::string readWord(FsFile& file);
static std::string readDefinition(uint32_t offset, uint32_t size);
// Byte-wise Levenshtein distance between `a` and `b`. The result is exact when it is
// <= maxDist; the function gives up early and returns maxDist + 1 once the distance is
// known to exceed maxDist, so any result above maxDist only means "too far".
static int editDistance(const std::string& a, const std::string& b, int maxDist);
};