fix: Prevent line breaks on common English contractions (#1405)

This commit is contained in:
Zach Nelson
2026-03-16 19:04:06 -05:00
committed by GitHub
parent b5df6cb2b5
commit dc39480349
2 changed files with 4 additions and 4 deletions

View File

@@ -102,7 +102,7 @@ void appendSegmentPatternBreaks(const std::vector<CodepointInfo>& cps, const Lan
void appendApostropheContractionBreaks(const std::vector<CodepointInfo>& cps, void appendApostropheContractionBreaks(const std::vector<CodepointInfo>& cps,
std::vector<Hyphenator::BreakInfo>& outBreaks) { std::vector<Hyphenator::BreakInfo>& outBreaks) {
constexpr size_t kMinLeftSegmentLen = 3; constexpr size_t kMinLeftSegmentLen = 3;
constexpr size_t kMinRightSegmentLen = 2; constexpr size_t kMinRightSegmentLen = 3;
size_t segmentStart = 0; size_t segmentStart = 0;
for (size_t i = 0; i < cps.size(); ++i) { for (size_t i = 0; i < cps.size(); ++i) {
@@ -123,7 +123,7 @@ void appendApostropheContractionBreaks(const std::vector<CodepointInfo>& cps,
} }
} }
// Avoid stranding short clitics like "l'"/"d'" or tiny suffixes like "'t". // Avoid stranding short clitics like "l'"/"d'" or contraction tails like "'ve"/"'re"/"'ll".
if (leftPrefixLen >= kMinLeftSegmentLen && rightSuffixLen >= kMinRightSegmentLen) { if (leftPrefixLen >= kMinLeftSegmentLen && rightSuffixLen >= kMinRightSegmentLen) {
outBreaks.push_back({cps[i + 1].byteOffset, false}); outBreaks.push_back({cps[i + 1].byteOffset, false});
} }

View File

@@ -26,8 +26,8 @@ class Hyphenator {
// 2. Apostrophe contractions between letters (e.g. all'improvviso). // 2. Apostrophe contractions between letters (e.g. all'improvviso).
// Liang patterns are run per alphabetic segment around apostrophes. // Liang patterns are run per alphabetic segment around apostrophes.
// A direct break at the apostrophe boundary is allowed only when the left // A direct break at the apostrophe boundary is allowed only when the left
// segment has at least 3 letters and the right segment has at least 2 letters, // segment has at least 3 letters and the right segment has at least 3 letters,
// avoiding short clitics (e.g. l', d') and short contraction tails (e.g. can't). // avoiding short clitics (e.g. l', d') and contraction tails (e.g. 've, 're, 'll).
// 3. Language-specific Liang patterns (e.g. German de_patterns). // 3. Language-specific Liang patterns (e.g. German de_patterns).
// Example: "Quadratkilometer" -> Qua|drat|ki|lo|me|ter. // Example: "Quadratkilometer" -> Qua|drat|ki|lo|me|ter.
// 4. Fallback every-N-chars splitting (only when includeFallback is true AND no // 4. Fallback every-N-chars splitting (only when includeFallback is true AND no