From 5d00e5ac0fc2797ae705fc4778f28d3c124c1779 Mon Sep 17 00:00:00 2001
From: Arthur Tazhitdinov
Date: Fri, 26 Dec 2025 04:36:19 +0500
Subject: [PATCH] Enhance hyphenation logic: add morphology break handling and improve vowel detection
---
 .../Epub/hyphenation/EnglishHyphenator.cpp | 104 ++++++++++++++++++
 1 file changed, 104 insertions(+)

diff --git a/lib/Epub/Epub/hyphenation/EnglishHyphenator.cpp b/lib/Epub/Epub/hyphenation/EnglishHyphenator.cpp
index 1e01b05..cef341d 100644
--- a/lib/Epub/Epub/hyphenation/EnglishHyphenator.cpp
+++ b/lib/Epub/Epub/hyphenation/EnglishHyphenator.cpp
@@ -1,7 +1,9 @@
 #include "EnglishHyphenator.h"
 
 #include
+#include
 #include
+#include
 #include
 
 namespace {
@@ -45,6 +47,105 @@ bool isEnglishFricativeChar(const char c) {
   }
 }
 
+struct LatinLiteral {  /* A pattern string and its length, used by the PREFIXES/SUFFIXES tables. */
+  const char* text;  /* Pattern characters; compared against the lowercased word. */
+  size_t length;  /* Number of characters of text to compare. */
+};
+
+bool nextToApostrophe(const std::vector& cps, size_t index);  /* Forward declaration; the definition is not part of the hunks shown here. NOTE(review): std::vector / std::array carry no template arguments in this copy of the patch - they look stripped; confirm against the repository. */
+
+std::string lowercaseLatinWord(const std::vector& cps) {  /* Returns a string with one char per element of cps, each taken from lowerLatinChar(info.value). */
+  std::string lower;
+  lower.reserve(cps.size());
+  for (const auto& info : cps) {
+    lower.push_back(lowerLatinChar(info.value));  /* NOTE(review): stored as a char; if lowerLatinChar can return values outside the char range they would be narrowed - confirm. */
+  }
+  return lower;
+}
+
+bool matchesPatternAt(const std::string& lowerWord, const size_t start, const LatinLiteral& pattern) {  /* True when lowerWord contains pattern.text (pattern.length chars) starting at index start. */
+  if (!pattern.text || pattern.length == 0) {  /* Null or empty patterns never match. */
+    return false;
+  }
+  if (start + pattern.length > lowerWord.size()) {  /* Pattern would run past the end of the word. */
+    return false;
+  }
+  for (size_t i = 0; i < pattern.length; ++i) {
+    if (lowerWord[start + i] != pattern.text[i]) {
+      return false;
+    }
+  }
+  return true;
+}
+
+bool englishSegmentHasVowel(const std::vector& cps, const size_t start, const size_t end) {  /* True when some cps[i].value with start <= i < min(end, cps.size()) satisfies isLatinVowel. */
+  if (start >= end || start >= cps.size()) {  /* Empty or out-of-range segment. */
+    return false;
+  }
+  const size_t clampedEnd = std::min(end, cps.size());
+  for (size_t i = start; i < clampedEnd; ++i) {
+    if (isLatinVowel(cps[i].value)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+void appendMorphologyBreaks(const std::vector& cps, const std::string& lowerWord,
+                            std::vector& indexes) {  /* Appends to indexes a break after each matching PREFIXES entry and before each matching SUFFIXES entry that passes tryPush. */
+  static constexpr std::array PREFIXES = {{{"anti", 4}, {"auto", 4}, {"counter", 7}, {"de", 2},  /* Word-initial patterns; the number is each pattern's length. */
+                                           {"dis", 3}, {"hyper", 5}, {"inter", 5}, {"micro", 5},
+                                           {"mis", 3}, {"mono", 4}, {"multi", 5}, {"non", 3},
+                                           {"over", 4}, {"post", 4}, {"pre", 3}, {"pro", 3},
+                                           {"re", 2}, {"sub", 3}, {"super", 5}, {"trans", 5}}};
+
+  static constexpr std::array SUFFIXES = {{{"able", 4}, {"ible", 4}, {"ing", 3}, {"ings", 4},  /* Word-final patterns; matched at length - suffix.length. */
+                                           {"ed", 2}, {"er", 2}, {"ers", 3}, {"est", 3},
+                                           {"ful", 3}, {"hood", 4}, {"less", 4}, {"lessly", 6},
+                                           {"ly", 2}, {"ment", 4}, {"ments", 5}, {"ness", 4},
+                                           {"ous", 3}, {"tion", 4}, {"sion", 4}, {"ward", 4},
+                                           {"wards", 5}, {"ship", 4}, {"ships", 5}, {"y", 1}}};  /* NOTE(review): "y" has length 1, so tryPush only accepts it if MIN_SUFFIX_CP <= 1 - confirm the constant's value. */
+
+  const size_t length = cps.size();
+  if (length < MIN_PREFIX_CP + MIN_SUFFIX_CP) {  /* Word too short to satisfy both minimum segment lengths. */
+    return;
+  }
+
+  const auto tryPush = [&](const size_t breakIndex) {  /* Pushes breakIndex onto indexes unless one of the checks below rejects it. */
+    if (breakIndex < MIN_PREFIX_CP || length - breakIndex < MIN_SUFFIX_CP) {  /* One side of the break would be shorter than its minimum. */
+      return;
+    }
+    if (!englishSegmentHasVowel(cps, 0, breakIndex) || !englishSegmentHasVowel(cps, breakIndex, length)) {  /* Each side of the break must contain a vowel. */
+      return;
+    }
+    if (nextToApostrophe(cps, breakIndex)) {  /* Skip positions that nextToApostrophe flags. */
+      return;
+    }
+    indexes.push_back(breakIndex);
+  };
+
+  for (const auto& prefix : PREFIXES) {
+    if (prefix.length == 0 || prefix.length >= length) {  /* Prefix must be non-empty and strictly shorter than the word. */
+      continue;
+    }
+    if (!matchesPatternAt(lowerWord, 0, prefix)) {
+      continue;
+    }
+    tryPush(prefix.length);  /* Break falls immediately after the prefix. */
+  }
+
+  for (const auto& suffix : SUFFIXES) {
+    if (suffix.length == 0 || suffix.length >= length) {  /* Suffix must be non-empty and strictly shorter than the word. */
+      continue;
+    }
+    const size_t breakIndex = length - suffix.length;  /* Break falls immediately before the suffix. */
+    if (!matchesPatternAt(lowerWord, breakIndex, suffix)) {
+      continue;
+    }
+    tryPush(breakIndex);
+  }
+}
+
 struct CharPair {
   char first;
   char second;
@@ -225,6 +326,7 @@ std::vector englishBreakIndexes(const std::vector& cps) {
     return indexes;
   }
 
+  const auto lowerWord = lowercaseLatinWord(cps);  /* Lowercased copy of the word, passed to appendMorphologyBreaks. */
   std::vector vowelPositions;
   vowelPositions.reserve(cps.size());
   for (size_t i = 0; i < cps.size(); ++i) {
@@ -263,6 +365,8 @@ std::vector englishBreakIndexes(const std::vector& cps) {
     indexes.push_back(breakIndex);
   }
 
+  appendMorphologyBreaks(cps, lowerWord, indexes);  /* Runs before the sort/unique that follows, so repeated break positions are removed. */
+
   std::sort(indexes.begin(), indexes.end());
   indexes.erase(std::unique(indexes.begin(), indexes.end()), indexes.end());
   return indexes;