From 3dabd30287fd164282fccca3f6a04aa736e05d4a Mon Sep 17 00:00:00 2001 From: jpirnay Date: Thu, 12 Mar 2026 00:35:23 +0100 Subject: [PATCH] fix: Add special handling for apostrophe hyphenation (#1318) ## Summary * **What is the goal of this PR?** Fixing / extending the hyphenation logic to deal with words containing an apostrophe, as raised in #1186 * **What changes are included?** Adds an `isApostrophe` helper, runs the language patterns on each alphabetic segment around apostrophes, allows a break directly after an apostrophe when the left segment has at least 3 letters and the right segment has at least 2, and sorts/de-duplicates the merged break points. ## Additional Context --- ### AI Usage While CrossPoint doesn't have restrictions on AI tools in contributing, please be transparent about their usage as it helps set the right context for reviewers. Did you use AI tools to help write this code? _**PARTIALLY**_ (as the user provided a thorough analysis that I followed) --- .../Epub/hyphenation/HyphenationCommon.cpp | 11 ++ lib/Epub/Epub/hyphenation/HyphenationCommon.h | 1 + lib/Epub/Epub/hyphenation/Hyphenator.cpp | 141 +++++++++++++++--- lib/Epub/Epub/hyphenation/Hyphenator.h | 14 +- 4 files changed, 142 insertions(+), 25 deletions(-) diff --git a/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp b/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp index b402d5b9..4b2ac4a1 100644 --- a/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp +++ b/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp @@ -107,6 +107,17 @@ bool isPunctuation(const uint32_t cp) { bool isAsciiDigit(const uint32_t cp) { return cp >= '0' && cp <= '9'; } +bool isApostrophe(const uint32_t cp) { + switch (cp) { + case '\'': + case 0x2018: // left single quotation mark + case 0x2019: // right single quotation mark + return true; + default: + return false; + } +} + bool isExplicitHyphen(const uint32_t cp) { switch (cp) { case '-': diff --git a/lib/Epub/Epub/hyphenation/HyphenationCommon.h b/lib/Epub/Epub/hyphenation/HyphenationCommon.h index 522a4673..1639c257 100644 --- a/lib/Epub/Epub/hyphenation/HyphenationCommon.h +++ b/lib/Epub/Epub/hyphenation/HyphenationCommon.h @@ -19,6 +19,7 @@ bool isCyrillicLetter(uint32_t cp); bool isAlphabetic(uint32_t cp); bool isPunctuation(uint32_t cp); 
bool isAsciiDigit(uint32_t cp); +bool isApostrophe(uint32_t cp); bool isExplicitHyphen(uint32_t cp); bool isSoftHyphen(uint32_t cp); void trimSurroundingPunctuationAndFootnote(std::vector& cps); diff --git a/lib/Epub/Epub/hyphenation/Hyphenator.cpp b/lib/Epub/Epub/hyphenation/Hyphenator.cpp index 4d86febe..eb3bdec2 100644 --- a/lib/Epub/Epub/hyphenation/Hyphenator.cpp +++ b/lib/Epub/Epub/hyphenation/Hyphenator.cpp @@ -1,6 +1,7 @@ #include "Hyphenator.h" #include +#include #include #include "HyphenationCommon.h" @@ -59,6 +60,94 @@ std::vector buildExplicitBreakInfos(const std::vector& cps, const LanguageHyphenator& hyphenator, + const bool includeFallback, std::vector& outBreaks) { + size_t segStart = 0; + + for (size_t i = 0; i <= cps.size(); ++i) { + const bool atEnd = i == cps.size(); + const bool atSeparator = !atEnd && isSegmentSeparator(cps[i].value); + if (!atEnd && !atSeparator) { + continue; + } + + if (i > segStart) { + std::vector segment(cps.begin() + segStart, cps.begin() + i); + auto segIndexes = hyphenator.breakIndexes(segment); + + if (includeFallback && segIndexes.empty()) { + const size_t minPrefix = hyphenator.minPrefix(); + const size_t minSuffix = hyphenator.minSuffix(); + for (size_t idx = minPrefix; idx + minSuffix <= segment.size(); ++idx) { + segIndexes.push_back(idx); + } + } + + for (const size_t idx : segIndexes) { + assert(idx > 0 && idx < segment.size()); + if (idx == 0 || idx >= segment.size()) continue; + const size_t cpIdx = segStart + idx; + if (cpIdx < cps.size()) { + outBreaks.push_back({cps[cpIdx].byteOffset, true}); + } + } + } + + segStart = i + 1; + } +} + +void appendApostropheContractionBreaks(const std::vector& cps, + std::vector& outBreaks) { + constexpr size_t kMinLeftSegmentLen = 3; + constexpr size_t kMinRightSegmentLen = 2; + size_t segmentStart = 0; + + for (size_t i = 0; i < cps.size(); ++i) { + if (isSegmentSeparator(cps[i].value)) { + if (isApostrophe(cps[i].value) && i > 0 && i + 1 < cps.size() && 
isAlphabetic(cps[i - 1].value) && + isAlphabetic(cps[i + 1].value)) { + size_t leftPrefixLen = 0; + for (size_t j = segmentStart; j < i; ++j) { + if (isAlphabetic(cps[j].value)) { + ++leftPrefixLen; + } + } + + size_t rightSuffixLen = 0; + for (size_t j = i + 1; j < cps.size() && !isSegmentSeparator(cps[j].value); ++j) { + if (isAlphabetic(cps[j].value)) { + ++rightSuffixLen; + } + } + + // Avoid stranding short clitics like "l'"/"d'" or tiny suffixes like "'t". + if (leftPrefixLen >= kMinLeftSegmentLen && rightSuffixLen >= kMinRightSegmentLen) { + outBreaks.push_back({cps[i + 1].byteOffset, false}); + } + } + segmentStart = i + 1; + } + } +} + +void sortAndDedupeBreakInfos(std::vector& infos) { + std::sort(infos.begin(), infos.end(), [](const Hyphenator::BreakInfo& a, const Hyphenator::BreakInfo& b) { + if (a.byteOffset != b.byteOffset) { + return a.byteOffset < b.byteOffset; + } + return a.requiresInsertedHyphen < b.requiresInsertedHyphen; + }); + + infos.erase(std::unique(infos.begin(), infos.end(), + [](const Hyphenator::BreakInfo& a, const Hyphenator::BreakInfo& b) { + return a.byteOffset == b.byteOffset; + }), + infos.end()); +} + } // namespace std::vector Hyphenator::breakOffsets(const std::string& word, const bool includeFallback) { @@ -71,6 +160,15 @@ std::vector Hyphenator::breakOffsets(const std::string& w trimSurroundingPunctuationAndFootnote(cps); const auto* hyphenator = cachedHyphenator_; + // Detect apostrophe-like separators early; used by both branches below. + bool hasApostropheLikeSeparator = false; + for (const auto& cp : cps) { + if (isApostrophe(cp.value)) { + hasApostropheLikeSeparator = true; + break; + } + } + // Explicit hyphen markers (soft or hard) take precedence over language breaks. 
auto explicitBreakInfos = buildExplicitBreakInfos(cps); if (!explicitBreakInfos.empty()) { @@ -89,31 +187,32 @@ std::vector Hyphenator::breakOffsets(const std::string& w // @16 Satellitensys|tems (+hyphen) // Result: 6 sorted break points; the line-breaker picks the widest prefix that fits. if (hyphenator) { - size_t segStart = 0; - for (size_t i = 0; i <= cps.size(); ++i) { - const bool atEnd = (i == cps.size()); - const bool atHyphen = !atEnd && isExplicitHyphen(cps[i].value); - if (atEnd || atHyphen) { - if (i > segStart) { - std::vector segment(cps.begin() + segStart, cps.begin() + i); - auto segIndexes = hyphenator->breakIndexes(segment); - for (const size_t idx : segIndexes) { - const size_t cpIdx = segStart + idx; - if (cpIdx < cps.size()) { - explicitBreakInfos.push_back({cps[cpIdx].byteOffset, true}); - } - } - } - segStart = i + 1; - } - } - // Merge explicit and pattern breaks into ascending byte-offset order. - std::sort(explicitBreakInfos.begin(), explicitBreakInfos.end(), - [](const BreakInfo& a, const BreakInfo& b) { return a.byteOffset < b.byteOffset; }); + appendSegmentPatternBreaks(cps, *hyphenator, /*includeFallback=*/false, explicitBreakInfos); } + // Also add apostrophe contraction breaks when present (e.g. "l'état-major" + // has both an explicit hyphen and an apostrophe that can independently break). + if (hasApostropheLikeSeparator) { + appendApostropheContractionBreaks(cps, explicitBreakInfos); + } + // Merge all break points into ascending byte-offset order. + sortAndDedupeBreakInfos(explicitBreakInfos); return explicitBreakInfos; } + // Apostrophe-like separators split compounds into alphabetic segments; run Liang on each segment. + // This allows words like "all'improvviso" to hyphenate within "improvviso" instead of becoming + // completely unsplittable due to the apostrophe punctuation. Apostrophe contraction breaks are + // applied regardless of whether a language hyphenator is available. 
+ if (hasApostropheLikeSeparator) { + std::vector segmentedBreaks; + if (hyphenator) { + appendSegmentPatternBreaks(cps, *hyphenator, includeFallback, segmentedBreaks); + } + appendApostropheContractionBreaks(cps, segmentedBreaks); + sortAndDedupeBreakInfos(segmentedBreaks); + return segmentedBreaks; + } + // Ask language hyphenator for legal break points. std::vector indexes; if (hyphenator) { diff --git a/lib/Epub/Epub/hyphenation/Hyphenator.h b/lib/Epub/Epub/hyphenation/Hyphenator.h index 4447f9cc..74886ba6 100644 --- a/lib/Epub/Epub/hyphenation/Hyphenator.h +++ b/lib/Epub/Epub/hyphenation/Hyphenator.h @@ -11,7 +11,8 @@ class Hyphenator { struct BreakInfo { size_t byteOffset; // Byte position inside the UTF-8 word where a break may occur. bool requiresInsertedHyphen; // true = a visible '-' must be rendered at the break (pattern/fallback breaks). - // false = the word already contains a hyphen at this position (explicit '-'). + // false = break occurs at an existing visible separator boundary + // (explicit '-' or eligible apostrophe contraction boundary). }; // Returns byte offsets where the word may be hyphenated. @@ -19,12 +20,17 @@ class Hyphenator { // Break sources (in priority order): // 1. Explicit hyphens already present in the word (e.g. '-' or soft-hyphen U+00AD). // When found, language patterns are additionally run on each alphabetic segment - // between hyphens so compound words can break within their parts. + // between separators so compound words can break within their parts. // Example: "US-Satellitensystems" yields breaks after "US-" (no inserted hyphen) // plus pattern breaks inside "Satellitensystems" (Sa|tel|li|ten|sys|tems). - // 2. Language-specific Liang patterns (e.g. German de_patterns). + // 2. Apostrophe contractions between letters (e.g. all'improvviso). + // Liang patterns are run per alphabetic segment around apostrophes. 
+ // A direct break at the apostrophe boundary is allowed only when the left + // segment has at least 3 letters and the right segment has at least 2 letters, + // avoiding short clitics (e.g. l', d') and short contraction tails (e.g. can't). + // 3. Language-specific Liang patterns (e.g. German de_patterns). // Example: "Quadratkilometer" -> Qua|drat|ki|lo|me|ter. - // 3. Fallback every-N-chars splitting (only when includeFallback is true AND no + // 4. Fallback every-N-chars splitting (only when includeFallback is true AND no // pattern breaks were found). Used as a last resort to prevent a single oversized // word from overflowing the page width. static std::vector breakOffsets(const std::string& word, bool includeFallback);