diff --git a/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp b/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp index b402d5b9..4b2ac4a1 100644 --- a/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp +++ b/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp @@ -107,6 +107,17 @@ bool isPunctuation(const uint32_t cp) { bool isAsciiDigit(const uint32_t cp) { return cp >= '0' && cp <= '9'; } +bool isApostrophe(const uint32_t cp) { + switch (cp) { + case '\'': + case 0x2018: // left single quotation mark + case 0x2019: // right single quotation mark + return true; + default: + return false; + } +} + bool isExplicitHyphen(const uint32_t cp) { switch (cp) { case '-': diff --git a/lib/Epub/Epub/hyphenation/HyphenationCommon.h b/lib/Epub/Epub/hyphenation/HyphenationCommon.h index 522a4673..1639c257 100644 --- a/lib/Epub/Epub/hyphenation/HyphenationCommon.h +++ b/lib/Epub/Epub/hyphenation/HyphenationCommon.h @@ -19,6 +19,7 @@ bool isCyrillicLetter(uint32_t cp); bool isAlphabetic(uint32_t cp); bool isPunctuation(uint32_t cp); bool isAsciiDigit(uint32_t cp); +bool isApostrophe(uint32_t cp); bool isExplicitHyphen(uint32_t cp); bool isSoftHyphen(uint32_t cp); void trimSurroundingPunctuationAndFootnote(std::vector& cps); diff --git a/lib/Epub/Epub/hyphenation/Hyphenator.cpp b/lib/Epub/Epub/hyphenation/Hyphenator.cpp index 4d86febe..eb3bdec2 100644 --- a/lib/Epub/Epub/hyphenation/Hyphenator.cpp +++ b/lib/Epub/Epub/hyphenation/Hyphenator.cpp @@ -1,6 +1,7 @@ #include "Hyphenator.h" #include +#include #include #include "HyphenationCommon.h" @@ -59,6 +60,94 @@ std::vector buildExplicitBreakInfos(const std::vector& cps, const LanguageHyphenator& hyphenator, + const bool includeFallback, std::vector& outBreaks) { + size_t segStart = 0; + + for (size_t i = 0; i <= cps.size(); ++i) { + const bool atEnd = i == cps.size(); + const bool atSeparator = !atEnd && isSegmentSeparator(cps[i].value); + if (!atEnd && !atSeparator) { + continue; + } + + if (i > segStart) { + std::vector segment(cps.begin() + segStart, cps.begin() + i); + auto segIndexes = hyphenator.breakIndexes(segment); + + if (includeFallback && segIndexes.empty()) { + const size_t minPrefix = hyphenator.minPrefix(); + const size_t minSuffix = hyphenator.minSuffix(); + for (size_t idx = minPrefix; idx + minSuffix <= segment.size(); ++idx) { + segIndexes.push_back(idx); + } + } + + for (const size_t idx : segIndexes) { + assert(idx > 0 && idx < segment.size()); + if (idx == 0 || idx >= segment.size()) continue; + const size_t cpIdx = segStart + idx; + if (cpIdx < cps.size()) { + outBreaks.push_back({cps[cpIdx].byteOffset, true}); + } + } + } + + segStart = i + 1; + } +} + +void appendApostropheContractionBreaks(const std::vector& cps, + std::vector& outBreaks) { + constexpr size_t kMinLeftSegmentLen = 3; + constexpr size_t kMinRightSegmentLen = 2; + size_t segmentStart = 0; + + for (size_t i = 0; i < cps.size(); ++i) { + if (isSegmentSeparator(cps[i].value)) { + if (isApostrophe(cps[i].value) && i > 0 && i + 1 < cps.size() && isAlphabetic(cps[i - 1].value) && + isAlphabetic(cps[i + 1].value)) { + size_t leftPrefixLen = 0; + for (size_t j = segmentStart; j < i; ++j) { + if (isAlphabetic(cps[j].value)) { + ++leftPrefixLen; + } + } + + size_t rightSuffixLen = 0; + for (size_t j = i + 1; j < cps.size() && !isSegmentSeparator(cps[j].value); ++j) { + if (isAlphabetic(cps[j].value)) { + ++rightSuffixLen; + } + } + + // Avoid stranding short clitics like "l'"/"d'" or tiny suffixes like "'t". + if (leftPrefixLen >= kMinLeftSegmentLen && rightSuffixLen >= kMinRightSegmentLen) { + outBreaks.push_back({cps[i + 1].byteOffset, false}); + } + } + segmentStart = i + 1; + } + } +} + +void sortAndDedupeBreakInfos(std::vector& infos) { + std::sort(infos.begin(), infos.end(), [](const Hyphenator::BreakInfo& a, const Hyphenator::BreakInfo& b) { + if (a.byteOffset != b.byteOffset) { + return a.byteOffset < b.byteOffset; + } + return a.requiresInsertedHyphen < b.requiresInsertedHyphen; + }); + + infos.erase(std::unique(infos.begin(), infos.end(), + [](const Hyphenator::BreakInfo& a, const Hyphenator::BreakInfo& b) { + return a.byteOffset == b.byteOffset; + }), + infos.end()); +} + } // namespace std::vector Hyphenator::breakOffsets(const std::string& word, const bool includeFallback) { @@ -71,6 +160,15 @@ std::vector Hyphenator::breakOffsets(const std::string& w trimSurroundingPunctuationAndFootnote(cps); const auto* hyphenator = cachedHyphenator_; + // Detect apostrophe-like separators early; used by both branches below. + bool hasApostropheLikeSeparator = false; + for (const auto& cp : cps) { + if (isApostrophe(cp.value)) { + hasApostropheLikeSeparator = true; + break; + } + } + // Explicit hyphen markers (soft or hard) take precedence over language breaks. auto explicitBreakInfos = buildExplicitBreakInfos(cps); if (!explicitBreakInfos.empty()) { @@ -89,31 +187,32 @@ std::vector Hyphenator::breakOffsets(const std::string& w // @16 Satellitensys|tems (+hyphen) // Result: 6 sorted break points; the line-breaker picks the widest prefix that fits. if (hyphenator) { - size_t segStart = 0; - for (size_t i = 0; i <= cps.size(); ++i) { - const bool atEnd = (i == cps.size()); - const bool atHyphen = !atEnd && isExplicitHyphen(cps[i].value); - if (atEnd || atHyphen) { - if (i > segStart) { - std::vector segment(cps.begin() + segStart, cps.begin() + i); - auto segIndexes = hyphenator->breakIndexes(segment); - for (const size_t idx : segIndexes) { - const size_t cpIdx = segStart + idx; - if (cpIdx < cps.size()) { - explicitBreakInfos.push_back({cps[cpIdx].byteOffset, true}); - } - } - } - segStart = i + 1; - } - } - // Merge explicit and pattern breaks into ascending byte-offset order. - std::sort(explicitBreakInfos.begin(), explicitBreakInfos.end(), - [](const BreakInfo& a, const BreakInfo& b) { return a.byteOffset < b.byteOffset; }); + appendSegmentPatternBreaks(cps, *hyphenator, /*includeFallback=*/false, explicitBreakInfos); } + // Also add apostrophe contraction breaks when present (e.g. "l'état-major" + // has both an explicit hyphen and an apostrophe that can independently break). + if (hasApostropheLikeSeparator) { + appendApostropheContractionBreaks(cps, explicitBreakInfos); + } + // Merge all break points into ascending byte-offset order. + sortAndDedupeBreakInfos(explicitBreakInfos); return explicitBreakInfos; } + // Apostrophe-like separators split compounds into alphabetic segments; run Liang on each segment. + // This allows words like "all'improvviso" to hyphenate within "improvviso" instead of becoming + // completely unsplittable due to the apostrophe punctuation. Apostrophe contraction breaks are + // applied regardless of whether a language hyphenator is available. + if (hasApostropheLikeSeparator) { + std::vector segmentedBreaks; + if (hyphenator) { + appendSegmentPatternBreaks(cps, *hyphenator, includeFallback, segmentedBreaks); + } + appendApostropheContractionBreaks(cps, segmentedBreaks); + sortAndDedupeBreakInfos(segmentedBreaks); + return segmentedBreaks; + } + // Ask language hyphenator for legal break points. std::vector indexes; if (hyphenator) { diff --git a/lib/Epub/Epub/hyphenation/Hyphenator.h b/lib/Epub/Epub/hyphenation/Hyphenator.h index 4447f9cc..74886ba6 100644 --- a/lib/Epub/Epub/hyphenation/Hyphenator.h +++ b/lib/Epub/Epub/hyphenation/Hyphenator.h @@ -11,7 +11,8 @@ class Hyphenator { struct BreakInfo { size_t byteOffset; // Byte position inside the UTF-8 word where a break may occur. bool requiresInsertedHyphen; // true = a visible '-' must be rendered at the break (pattern/fallback breaks). - // false = the word already contains a hyphen at this position (explicit '-'). + // false = break occurs at an existing visible separator boundary + // (explicit '-' or eligible apostrophe contraction boundary). }; // Returns byte offsets where the word may be hyphenated. @@ -19,12 +20,17 @@ class Hyphenator { // Break sources (in priority order): // 1. Explicit hyphens already present in the word (e.g. '-' or soft-hyphen U+00AD). // When found, language patterns are additionally run on each alphabetic segment - // between hyphens so compound words can break within their parts. + // between separators so compound words can break within their parts. // Example: "US-Satellitensystems" yields breaks after "US-" (no inserted hyphen) // plus pattern breaks inside "Satellitensystems" (Sa|tel|li|ten|sys|tems). - // 2. Language-specific Liang patterns (e.g. German de_patterns). + // 2. Apostrophe contractions between letters (e.g. all'improvviso). + // Liang patterns are run per alphabetic segment around apostrophes. + // A direct break at the apostrophe boundary is allowed only when the left + // segment has at least 3 letters and the right segment has at least 2 letters, + // avoiding short clitics (e.g. l', d') and short contraction tails (e.g. can't). + // 3. Language-specific Liang patterns (e.g. German de_patterns). // Example: "Quadratkilometer" -> Qua|drat|ki|lo|me|ter. - // 3. Fallback every-N-chars splitting (only when includeFallback is true AND no + // 4. Fallback every-N-chars splitting (only when includeFallback is true AND no // pattern breaks were found). Used as a last resort to prevent a single oversized // word from overflowing the page width. static std::vector breakOffsets(const std::string& word, bool includeFallback);