diff --git a/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp b/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp
index 34b1f3c..3b03936 100644
--- a/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp
+++ b/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp
@@ -57,10 +57,60 @@ bool isCyrillicVowel(uint32_t cp) {
 
 bool isCyrillicConsonant(const uint32_t cp) { return isCyrillicLetter(cp) && !isCyrillicVowel(cp); }
 
-bool isAlphabetic(const uint32_t cp) { return isLatinLetter(cp) || isCyrillicLetter(cp); }
+// Letters of either script, plus every code point isPunctuation() accepts.
+// NOTE(review): counting punctuation as alphabetic looks deliberate - confirm with the callers.
+bool isAlphabetic(const uint32_t cp) { return isLatinLetter(cp) || isCyrillicLetter(cp) || isPunctuation(cp); }
 
 bool isVowel(const uint32_t cp) { return isLatinVowel(cp) || isCyrillicVowel(cp); }
 
+// Returns true when cp is one of the punctuation code points listed below.
+// Non-ASCII entries are spelled as numeric code points on purpose: in a UTF-8
+// source file a character literal such as '«' holds several bytes, which makes
+// it a multi-character constant with an implementation-defined value that does
+// not equal the code point stored in cp.
+bool isPunctuation(const uint32_t cp) {
+  switch (cp) {
+    case '.':
+    case ',':
+    case '!':
+    case '?':
+    case ';':
+    case ':':
+    case '"':
+    case '\'':
+    case ')':
+    case '(':
+    case '-':
+    case '[':
+    case ']':
+    case '{':
+    case '}':
+    case '/':
+    case 0x00AB:  // «
+    case 0x00BB:  // »
+    case 0x2013:  // –
+    case 0x2014:  // —
+    case 0x2015:  // ―
+    case 0x2018:  // ‘
+    case 0x2019:  // ’
+    case 0x201C:  // “
+    case 0x201D:  // ”
+    case 0x203A:  // ›
+    case 0x2026:  // …
+      return true;
+    default:
+      return false;
+  }
+}
+
+// Pops entries off the back of cps for as long as the last one is punctuation.
+// The vector may end up empty if every entry is punctuation.
+void trimTrailingPunctuation(std::vector& cps) {
+  while (!cps.empty() && isPunctuation(cps.back().value)) {
+    cps.pop_back();
+  }
+}
+
 Script detectScript(const std::vector& cps) {
   bool hasLatin = false;
   bool hasCyrillic = false;
diff --git a/lib/Epub/Epub/hyphenation/HyphenationCommon.h b/lib/Epub/Epub/hyphenation/HyphenationCommon.h
index d3f95a4..d60af80 100644
--- a/lib/Epub/Epub/hyphenation/HyphenationCommon.h
+++ b/lib/Epub/Epub/hyphenation/HyphenationCommon.h
@@ -27,5 +27,9 @@ bool isCyrillicConsonant(uint32_t cp);
 
 bool isAlphabetic(uint32_t cp);
 bool isVowel(uint32_t cp);
+// True when cp is one of the punctuation code points listed in isPunctuation().
+bool isPunctuation(uint32_t cp);
+// Removes every trailing punctuation entry from cps, in place.
+void trimTrailingPunctuation(std::vector& cps);
 
 Script detectScript(const std::vector& cps);
diff --git a/lib/Epub/Epub/hyphenation/Hyphenator.cpp b/lib/Epub/Epub/hyphenation/Hyphenator.cpp
index de8cd83..888de0c 100644
--- a/lib/Epub/Epub/hyphenation/Hyphenator.cpp
+++ b/lib/Epub/Epub/hyphenation/Hyphenator.cpp
@@ -94,6 +94,8 @@ std::vector Hyphenator::breakOffsets(const std::string& word, const bool
   }
 
   auto cps = collectCodepoints(word);
+  // Drop trailing punctuation first so the minimum-length check below ignores it.
+  trimTrailingPunctuation(cps);
   if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) {
     return byteOffsets;
   }