feat: Integrate PR #857 dictionary intelligence and sub-activity refactor
Pull in the full feature update from PR #857 while preserving fork advantages (HTML parsing, custom drawHints, PageForward/PageBack, cache management, stardictCmp, /.dictionary/ paths).

- Add morphological stemming (getStemVariants), Levenshtein edit distance, and fuzzy matching (findSimilar) to Dictionary
- Create DictionarySuggestionsActivity for "Did you mean?" flow
- Add onDone callback to DictionaryDefinitionActivity for direct exit-to-reader via "Done" button
- Refactor DictionaryWordSelectActivity to ActivityWithSubactivity with cascading lookup (exact → stems → suggestions → not found), en-dash/em-dash splitting, and cross-page hyphenation
- Refactor LookedUpWordsActivity with reverse-chronological order, inline cascading lookup, UITheme-aware rendering, and sub-activities
- Simplify EpubReaderActivity LOOKUP/LOOKED_UP_WORDS handlers

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
@@ -326,3 +326,264 @@ std::string Dictionary::lookup(const std::string& word, const std::function<void
|
||||
if (onProgress) onProgress(100);
|
||||
return result;
|
||||
}
|
||||
|
||||
std::vector<std::string> Dictionary::getStemVariants(const std::string& word) {
    // Heuristic English stemmer: generates candidate base forms for an
    // inflected word (plurals, past tense, progressive, adverbs,
    // comparative/superlative, common derivational suffixes, and a few
    // prefixes) so a failed exact dictionary lookup can be retried against
    // likely headwords.
    //
    // The returned list is ordered — inflectional stems first, prefix-stripped
    // forms last — with duplicates removed while preserving first-seen order.
    // Callers try the variants in sequence, so this ordering is part of the
    // contract.  Variants are guesses, not guaranteed to be real words.
    //
    // NOTE(review): suffix matching is byte-wise; assumes the input is
    // lowercase ASCII — confirm callers normalize case before calling.
    std::vector<std::string> variants;
    size_t len = word.size();
    // Too short to carry a meaningful suffix; nothing to try.
    if (len < 3) return variants;

    // True when `word` ends with the C-string `suffix`.
    auto endsWith = [&word, len](const char* suffix) {
        size_t slen = strlen(suffix);
        return len >= slen && word.compare(len - slen, slen, suffix) == 0;
    };

    // Record a candidate, discarding degenerate stems shorter than 2 chars.
    auto add = [&variants](const std::string& s) {
        if (s.size() >= 2) variants.push_back(s);
    };

    // Plurals (longer suffixes first to avoid partial matches)
    if (endsWith("sses")) add(word.substr(0, len - 2)); // misses -> miss
    if (endsWith("ses")) add(word.substr(0, len - 2) + "is"); // analyses -> analysis
    if (endsWith("ies")) {
        add(word.substr(0, len - 3) + "y"); // parties -> party
        add(word.substr(0, len - 2)); // dies -> die, ties -> tie
    }
    if (endsWith("ves")) {
        add(word.substr(0, len - 3) + "f"); // wolves -> wolf
        add(word.substr(0, len - 3) + "fe"); // knives -> knife
        add(word.substr(0, len - 1)); // misgives -> misgive
    }
    if (endsWith("men")) add(word.substr(0, len - 3) + "man"); // firemen -> fireman
    if (endsWith("es") && !endsWith("sses") && !endsWith("ies") && !endsWith("ves")) {
        add(word.substr(0, len - 2)); // boxes -> box
        add(word.substr(0, len - 1)); // houses -> house
    }
    if (endsWith("s") && !endsWith("ss") && !endsWith("us") && !endsWith("es")) {
        add(word.substr(0, len - 1)); // cats -> cat
    }

    // Past tense
    if (endsWith("ied")) {
        add(word.substr(0, len - 3) + "y"); // tried -> try
        add(word.substr(0, len - 1)); // died -> die
    }
    if (endsWith("ed") && !endsWith("ied")) {
        add(word.substr(0, len - 2)); // walked -> walk
        add(word.substr(0, len - 1)); // baked -> bake
        // Doubled final consonant: stopped -> stop
        if (len > 4 && word[len - 3] == word[len - 4]) {
            add(word.substr(0, len - 3));
        }
    }

    // Progressive
    if (endsWith("ying")) {
        add(word.substr(0, len - 4) + "ie"); // lying -> lie
    }
    if (endsWith("ing") && !endsWith("ying")) {
        add(word.substr(0, len - 3)); // walking -> walk
        add(word.substr(0, len - 3) + "e"); // baking -> bake
        // Doubled final consonant: running -> run
        if (len > 5 && word[len - 4] == word[len - 5]) {
            add(word.substr(0, len - 4));
        }
    }

    // Adverb
    if (endsWith("ically")) {
        add(word.substr(0, len - 6) + "ic"); // historically -> historic
        add(word.substr(0, len - 4)); // basically -> basic
    }
    if (endsWith("ally") && !endsWith("ically")) {
        add(word.substr(0, len - 4) + "al"); // accidentally -> accidental
        add(word.substr(0, len - 2)); // naturally -> natur... (fallback to -ly strip)
    }
    if (endsWith("ily") && !endsWith("ally")) {
        add(word.substr(0, len - 3) + "y"); // happily -> happy
    }
    if (endsWith("ly") && !endsWith("ily") && !endsWith("ally")) {
        add(word.substr(0, len - 2)); // quickly -> quick
    }

    // Comparative / superlative
    if (endsWith("ier")) {
        add(word.substr(0, len - 3) + "y"); // happier -> happy
    }
    if (endsWith("er") && !endsWith("ier")) {
        add(word.substr(0, len - 2)); // taller -> tall
        add(word.substr(0, len - 1)); // larger -> large
        // Doubled final consonant: bigger -> big
        if (len > 4 && word[len - 3] == word[len - 4]) {
            add(word.substr(0, len - 3));
        }
    }
    if (endsWith("iest")) {
        add(word.substr(0, len - 4) + "y"); // happiest -> happy
    }
    if (endsWith("est") && !endsWith("iest")) {
        add(word.substr(0, len - 3)); // tallest -> tall
        add(word.substr(0, len - 2)); // largest -> large
        // Doubled final consonant: biggest -> big
        if (len > 5 && word[len - 4] == word[len - 5]) {
            add(word.substr(0, len - 4));
        }
    }

    // Derivational suffixes
    if (endsWith("ness")) add(word.substr(0, len - 4)); // darkness -> dark
    if (endsWith("ment")) add(word.substr(0, len - 4)); // payment -> pay
    if (endsWith("ful")) add(word.substr(0, len - 3)); // hopeful -> hope
    if (endsWith("less")) add(word.substr(0, len - 4)); // hopeless -> hope
    if (endsWith("able")) {
        add(word.substr(0, len - 4)); // readable -> read
        add(word.substr(0, len - 4) + "e"); // usable -> use
    }
    if (endsWith("ible")) {
        add(word.substr(0, len - 4));
        add(word.substr(0, len - 4) + "e");
    }
    if (endsWith("ation")) {
        add(word.substr(0, len - 5)); // information -> inform
        add(word.substr(0, len - 5) + "e"); // exploration -> explore
        add(word.substr(0, len - 5) + "ate"); // donation -> donate
    }
    if (endsWith("tion") && !endsWith("ation")) {
        add(word.substr(0, len - 4) + "te"); // completion -> complete
        add(word.substr(0, len - 3)); // action -> act
        add(word.substr(0, len - 3) + "e"); // reduction -> reduce
    }
    if (endsWith("ion") && !endsWith("tion")) {
        add(word.substr(0, len - 3)); // revision -> revis (-> revise via +e)
        add(word.substr(0, len - 3) + "e"); // revision -> revise
    }
    if (endsWith("al") && !endsWith("ial")) {
        add(word.substr(0, len - 2)); // seasonal -> season
        add(word.substr(0, len - 2) + "e"); // natural -> nature
    }
    if (endsWith("ial")) {
        add(word.substr(0, len - 3));
        add(word.substr(0, len - 3) + "e");
    }
    if (endsWith("ous")) {
        add(word.substr(0, len - 3)); // dangerous -> danger
        add(word.substr(0, len - 3) + "e"); // famous -> fame
    }
    if (endsWith("ive")) {
        add(word.substr(0, len - 3)); // active -> act
        add(word.substr(0, len - 3) + "e"); // creative -> create
    }
    if (endsWith("ize")) {
        add(word.substr(0, len - 3)); // modernize -> modern
        add(word.substr(0, len - 3) + "e");
    }
    if (endsWith("ise")) {
        add(word.substr(0, len - 3)); // advertise -> advert
        add(word.substr(0, len - 3) + "e");
    }
    if (endsWith("en")) {
        add(word.substr(0, len - 2)); // darken -> dark
        add(word.substr(0, len - 2) + "e"); // widen -> wide
    }

    // Prefix removal (length guards keep the remainder a plausible word)
    if (len > 5 && word.compare(0, 2, "un") == 0) add(word.substr(2));
    if (len > 6 && word.compare(0, 3, "dis") == 0) add(word.substr(3));
    if (len > 6 && word.compare(0, 3, "mis") == 0) add(word.substr(3));
    if (len > 6 && word.compare(0, 3, "pre") == 0) add(word.substr(3));
    if (len > 7 && word.compare(0, 4, "over") == 0) add(word.substr(4));
    if (len > 5 && word.compare(0, 2, "re") == 0) add(word.substr(2));

    // Deduplicate while preserving insertion order (inflectional stems first, prefixes last)
    std::vector<std::string> deduped;
    for (const auto& v : variants) {
        if (std::find(deduped.begin(), deduped.end(), v) != deduped.end()) continue;
        // cppcheck-suppress useStlAlgorithm
        deduped.push_back(v);
    }
    return deduped;
}
|
||||
|
||||
int Dictionary::editDistance(const std::string& a, const std::string& b, int maxDist) {
    // Banded Levenshtein distance between `a` and `b` using a single rolling
    // row of the DP table.  Returns the exact distance when it is <= maxDist,
    // otherwise any value > maxDist (callers only compare against maxDist).
    const int lenA = static_cast<int>(a.size());
    const int lenB = static_cast<int>(b.size());
    // The length difference alone is a lower bound on the distance.
    if (std::abs(lenA - lenB) > maxDist) return maxDist + 1;

    // row[j] holds dp[i][j] as rows are processed; initialized to row 0
    // (distance from the empty prefix of `a`).
    std::vector<int> row(lenB + 1);
    for (int j = 0; j <= lenB; ++j) row[j] = j;

    for (int i = 1; i <= lenA; ++i) {
        int diagonal = row[0];  // dp[i-1][j-1] for the upcoming column
        row[0] = i;
        int bestInRow = row[0];
        for (int j = 1; j <= lenB; ++j) {
            const int above = row[j];  // dp[i-1][j], saved before overwrite
            row[j] = (a[i - 1] == b[j - 1])
                         ? diagonal
                         : 1 + std::min(diagonal, std::min(above, row[j - 1]));
            diagonal = above;
            if (row[j] < bestInRow) bestInRow = row[j];
        }
        // Every cell in this row exceeds the band — distance can only grow.
        if (bestInRow > maxDist) return maxDist + 1;
    }
    return row[lenB];
}
|
||||
|
||||
std::vector<std::string> Dictionary::findSimilar(const std::string& word, int maxResults) {
|
||||
if (!indexLoaded || sparseOffsets.empty()) return {};
|
||||
|
||||
FsFile idx;
|
||||
if (!Storage.openFileForRead("DICT", IDX_PATH, idx)) return {};
|
||||
|
||||
// Binary search to find the segment containing or nearest to the word
|
||||
int lo = 0, hi = static_cast<int>(sparseOffsets.size()) - 1;
|
||||
while (lo < hi) {
|
||||
int mid = lo + (hi - lo + 1) / 2;
|
||||
idx.seekSet(sparseOffsets[mid]);
|
||||
std::string key = readWord(idx);
|
||||
if (stardictCmp(key.c_str(), word.c_str()) <= 0) {
|
||||
lo = mid;
|
||||
} else {
|
||||
hi = mid - 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Scan entries from the segment before through the segment after the target
|
||||
int startSeg = std::max(0, lo - 1);
|
||||
int endSeg = std::min(static_cast<int>(sparseOffsets.size()) - 1, lo + 1);
|
||||
idx.seekSet(sparseOffsets[startSeg]);
|
||||
|
||||
int totalToScan = (endSeg - startSeg + 1) * SPARSE_INTERVAL;
|
||||
int remaining = static_cast<int>(totalWords) - startSeg * SPARSE_INTERVAL;
|
||||
if (totalToScan > remaining) totalToScan = remaining;
|
||||
|
||||
int maxDist = std::max(2, static_cast<int>(word.size()) / 3 + 1);
|
||||
|
||||
struct Candidate {
|
||||
std::string text;
|
||||
int distance;
|
||||
};
|
||||
std::vector<Candidate> candidates;
|
||||
|
||||
for (int i = 0; i < totalToScan; i++) {
|
||||
std::string key = readWord(idx);
|
||||
if (key.empty()) break;
|
||||
|
||||
uint8_t skip[8];
|
||||
if (idx.read(skip, 8) != 8) break;
|
||||
|
||||
if (key == word) continue;
|
||||
int dist = editDistance(key, word, maxDist);
|
||||
if (dist <= maxDist) {
|
||||
candidates.push_back({key, dist});
|
||||
}
|
||||
}
|
||||
|
||||
idx.close();
|
||||
|
||||
std::sort(candidates.begin(), candidates.end(),
|
||||
[](const Candidate& a, const Candidate& b) { return a.distance < b.distance; });
|
||||
|
||||
std::vector<std::string> results;
|
||||
for (size_t i = 0; i < candidates.size() && static_cast<int>(results.size()) < maxResults; i++) {
|
||||
results.push_back(candidates[i].text);
|
||||
}
|
||||
return results;
|
||||
}
|
||||
|
||||
@@ -14,6 +14,8 @@ class Dictionary {
|
||||
static std::string lookup(const std::string& word, const std::function<void(int percent)>& onProgress = nullptr,
|
||||
const std::function<bool()>& shouldCancel = nullptr);
|
||||
static std::string cleanWord(const std::string& word);
|
||||
static std::vector<std::string> getStemVariants(const std::string& word);
|
||||
static std::vector<std::string> findSimilar(const std::string& word, int maxResults = 6);
|
||||
|
||||
private:
|
||||
static constexpr int SPARSE_INTERVAL = 512;
|
||||
@@ -28,4 +30,5 @@ class Dictionary {
|
||||
static std::string searchIndex(const std::string& word, const std::function<bool()>& shouldCancel);
|
||||
static std::string readWord(FsFile& file);
|
||||
static std::string readDefinition(uint32_t offset, uint32_t size);
|
||||
static int editDistance(const std::string& a, const std::string& b, int maxDist);
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user