fix: Add special handling for apostrophe hyphenation (#1318)

## Summary * **What is the goal of this PR?** Fixing / extending the hyphenation logic to deal with words containing an apostrophe as raised in #1186 * **What changes are included?** ## Additional Context --- ### AI Usage While CrossPoint doesn't have restrictions on AI tools in contributing, please be transparent about their usage as it helps set the right context for reviewers. Did you use AI tools to help write this code? _**PARTIALLY**_ (as the user provided a thorough analysis that I followed)
2026-03-12 00:35:23 +01:00
parent f1e9dc7f30
commit 3dabd30287
4 changed files with 142 additions and 25 deletions
--- a/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp
+++ b/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp
@@ -107,6 +107,17 @@ bool isPunctuation(const uint32_t cp) {
 // True only for the ASCII digits '0' through '9'; digits from other scripts are not matched.
 bool isAsciiDigit(const uint32_t cp) {
  if (cp < '0') {
    return false;
  }
  return cp <= '9';
 }
 // Reports whether cp is one of the code points treated as an apostrophe:
 // ASCII ' (U+0027), U+2018 (left single quotation mark) or U+2019 (right single quotation mark).
 bool isApostrophe(const uint32_t cp) {
  constexpr uint32_t kAsciiApostrophe = '\'';
  constexpr uint32_t kLeftSingleQuote = 0x2018;
  constexpr uint32_t kRightSingleQuote = 0x2019;
  return cp == kAsciiApostrophe || cp == kLeftSingleQuote || cp == kRightSingleQuote;
 }
 bool isExplicitHyphen(const uint32_t cp) {
  switch (cp) {
    case '-':
--- a/lib/Epub/Epub/hyphenation/HyphenationCommon.h
+++ b/lib/Epub/Epub/hyphenation/HyphenationCommon.h
@@ -19,6 +19,7 @@ bool isCyrillicLetter(uint32_t cp);
 bool isAlphabetic(uint32_t cp);
 bool isPunctuation(uint32_t cp);
 // True only for the ASCII digits '0' through '9'.
 bool isAsciiDigit(uint32_t cp);
 // True for ASCII ' (U+0027) and the single quotation marks U+2018 / U+2019.
 bool isApostrophe(uint32_t cp);
 bool isExplicitHyphen(uint32_t cp);
 bool isSoftHyphen(uint32_t cp);
 void trimSurroundingPunctuationAndFootnote(std::vector<CodepointInfo>& cps);
--- a/lib/Epub/Epub/hyphenation/Hyphenator.cpp
+++ b/lib/Epub/Epub/hyphenation/Hyphenator.cpp
@@ -1,6 +1,7 @@
 #include "Hyphenator.h"
 #include <algorithm>
 #include <cassert>
 #include <vector>
 #include "HyphenationCommon.h"
@@ -59,6 +60,94 @@ std::vector<Hyphenator::BreakInfo> buildExplicitBreakInfos(const std::vector<Cod
  return breaks;
 }
 // A segment separator is any explicit hyphen or apostrophe; words are split into
 // independently hyphenated segments at these code points.
 bool isSegmentSeparator(const uint32_t cp) {
  if (isExplicitHyphen(cp)) {
    return true;
  }
  return isApostrophe(cp);
 }
 // Appends language-pattern break points for each run of code points between segment separators.
 //
 // The word is split at every code point for which isSegmentSeparator() is true, and each
 // non-empty segment is passed to hyphenator.breakIndexes() on its own. When includeFallback is
 // set and a segment yields no pattern breaks, every interior position that leaves at least
 // minPrefix() code points before it and minSuffix() code points after it is used instead.
 // Each accepted index is mapped back to a position in cps and pushed onto outBreaks as
 // {byteOffset, true}. Entries already in outBreaks are left untouched; this function neither
 // sorts nor deduplicates.
 void appendSegmentPatternBreaks(const std::vector<CodepointInfo>& cps, const LanguageHyphenator& hyphenator,
                                 const bool includeFallback, std::vector<Hyphenator::BreakInfo>& outBreaks) {
  size_t segStart = 0;
  for (size_t i = 0; i <= cps.size(); ++i) {
    // i == cps.size() acts as a virtual separator so the final segment is flushed too.
    const bool atEnd = i == cps.size();
    const bool atSeparator = !atEnd && isSegmentSeparator(cps[i].value);
    if (!atEnd && !atSeparator) {
      continue;
    }
    if (i > segStart) {
      std::vector<CodepointInfo> segment(cps.begin() + segStart, cps.begin() + i);
      auto segIndexes = hyphenator.breakIndexes(segment);
      if (includeFallback && segIndexes.empty()) {
        // Only interior positions are valid breaks. Clamp both minimums to 1 so that a
        // hyphenator reporting a zero minimum cannot make the fallback emit index 0 or
        // segment.size(), which would violate the contract asserted below.
        const size_t minPrefix = std::max<size_t>(hyphenator.minPrefix(), 1);
        const size_t minSuffix = std::max<size_t>(hyphenator.minSuffix(), 1);
        for (size_t idx = minPrefix; idx + minSuffix <= segment.size(); ++idx) {
          segIndexes.push_back(idx);
        }
      }
      for (const size_t idx : segIndexes) {
        // The assert documents the contract in debug builds; the explicit check keeps builds
        // with assertions disabled from indexing outside the segment.
        assert(idx > 0 && idx < segment.size());
        if (idx == 0 || idx >= segment.size()) continue;
        // idx < segment.size() guarantees segStart + idx < i <= cps.size(), so the lookup is in range.
        outBreaks.push_back({cps[segStart + idx].byteOffset, true});
      }
    }
    segStart = i + 1;
  }
 }
 // Adds a break directly after an apostrophe that sits between two letters, provided the
 // segment on its left contains at least three letters and the segment on its right at least
 // two. Segments are delimited by segment separators (or the ends of the word). The break is
 // recorded at the byte offset of the code point following the apostrophe, with
 // requiresInsertedHyphen == false.
 void appendApostropheContractionBreaks(const std::vector<CodepointInfo>& cps,
                                        std::vector<Hyphenator::BreakInfo>& outBreaks) {
  constexpr size_t kMinLettersBefore = 3;
  constexpr size_t kMinLettersAfter = 2;

  size_t nextRunBegin = 0;
  for (size_t pos = 0; pos < cps.size(); ++pos) {
    if (!isSegmentSeparator(cps[pos].value)) {
      continue;
    }

    // Every separator starts a new segment, whether or not it produces a break.
    const size_t runBegin = nextRunBegin;
    nextRunBegin = pos + 1;

    if (!isApostrophe(cps[pos].value)) {
      continue;
    }
    const bool hasNeighbours = pos > 0 && pos + 1 < cps.size();
    if (!hasNeighbours) {
      continue;
    }
    if (!isAlphabetic(cps[pos - 1].value) || !isAlphabetic(cps[pos + 1].value)) {
      continue;
    }

    size_t lettersBefore = 0;
    for (size_t k = runBegin; k < pos; ++k) {
      lettersBefore += isAlphabetic(cps[k].value) ? 1 : 0;
    }

    size_t lettersAfter = 0;
    for (size_t k = pos + 1; k < cps.size(); ++k) {
      if (isSegmentSeparator(cps[k].value)) {
        break;
      }
      lettersAfter += isAlphabetic(cps[k].value) ? 1 : 0;
    }

    // Short clitics such as "l'"/"d'" and tiny tails such as "'t" must not be stranded.
    const bool longEnough = lettersBefore >= kMinLettersBefore && lettersAfter >= kMinLettersAfter;
    if (longEnough) {
      outBreaks.push_back({cps[pos + 1].byteOffset, false});
    }
  }
 }
 // Orders the break points by ascending byte offset and keeps a single entry per offset.
 // Ties are ordered with requiresInsertedHyphen == false first, so when two breaks share an
 // offset the one that needs no inserted hyphen is the one that survives.
 void sortAndDedupeBreakInfos(std::vector<Hyphenator::BreakInfo>& infos) {
  const auto byOffsetThenFlag = [](const Hyphenator::BreakInfo& lhs, const Hyphenator::BreakInfo& rhs) {
    return lhs.byteOffset == rhs.byteOffset ? lhs.requiresInsertedHyphen < rhs.requiresInsertedHyphen
                                            : lhs.byteOffset < rhs.byteOffset;
  };
  const auto sameOffset = [](const Hyphenator::BreakInfo& lhs, const Hyphenator::BreakInfo& rhs) {
    return lhs.byteOffset == rhs.byteOffset;
  };

  std::sort(infos.begin(), infos.end(), byOffsetThenFlag);
  const auto duplicatesBegin = std::unique(infos.begin(), infos.end(), sameOffset);
  infos.erase(duplicatesBegin, infos.end());
 }
 }  // namespace
 std::vector<Hyphenator::BreakInfo> Hyphenator::breakOffsets(const std::string& word, const bool includeFallback) {
@@ -71,6 +160,15 @@ std::vector<Hyphenator::BreakInfo> Hyphenator::breakOffsets(const std::string& w
  trimSurroundingPunctuationAndFootnote(cps);
  const auto* hyphenator = cachedHyphenator_;
  // Detect apostrophe-like separators early; used by both branches below.
  bool hasApostropheLikeSeparator = false;
  for (const auto& cp : cps) {
    if (isApostrophe(cp.value)) {
      hasApostropheLikeSeparator = true;
      break;
    }
  }
  // Explicit hyphen markers (soft or hard) take precedence over language breaks.
  auto explicitBreakInfos = buildExplicitBreakInfos(cps);
  if (!explicitBreakInfos.empty()) {
@@ -89,31 +187,32 @@ std::vector<Hyphenator::BreakInfo> Hyphenator::breakOffsets(const std::string& w
    //                                            @16 Satellitensys|tems  (+hyphen)
    //   Result: 6 sorted break points; the line-breaker picks the widest prefix that fits.
    if (hyphenator) {
-      size_t segStart = 0;
+      appendSegmentPatternBreaks(cps, *hyphenator, /*includeFallback=*/false, explicitBreakInfos);
      for (size_t i = 0; i <= cps.size(); ++i) {
        const bool atEnd = (i == cps.size());
        const bool atHyphen = !atEnd && isExplicitHyphen(cps[i].value);
        if (atEnd || atHyphen) {
          if (i > segStart) {
            std::vector<CodepointInfo> segment(cps.begin() + segStart, cps.begin() + i);
            auto segIndexes = hyphenator->breakIndexes(segment);
            for (const size_t idx : segIndexes) {
              const size_t cpIdx = segStart + idx;
              if (cpIdx < cps.size()) {
                explicitBreakInfos.push_back({cps[cpIdx].byteOffset, true});
    }
    // Also add apostrophe contraction breaks when present (e.g. "l'état-major"
    // has both an explicit hyphen and an apostrophe that can independently break).
    if (hasApostropheLikeSeparator) {
      appendApostropheContractionBreaks(cps, explicitBreakInfos);
    }
-          }
+    // Merge all break points into ascending byte-offset order.
-          segStart = i + 1;
+    sortAndDedupeBreakInfos(explicitBreakInfos);
        }
      }
      // Merge explicit and pattern breaks into ascending byte-offset order.
      std::sort(explicitBreakInfos.begin(), explicitBreakInfos.end(),
                [](const BreakInfo& a, const BreakInfo& b) { return a.byteOffset < b.byteOffset; });
    }
    return explicitBreakInfos;
  }
  // Apostrophe-like separators split compounds into alphabetic segments; run Liang on each segment.
  // This allows words like "all'improvviso" to hyphenate within "improvviso" instead of becoming
  // completely unsplittable due to the apostrophe punctuation. Apostrophe contraction breaks are
  // applied regardless of whether a language hyphenator is available.
  if (hasApostropheLikeSeparator) {
    std::vector<BreakInfo> segmentedBreaks;
    if (hyphenator) {
      appendSegmentPatternBreaks(cps, *hyphenator, includeFallback, segmentedBreaks);
    }
    appendApostropheContractionBreaks(cps, segmentedBreaks);
    sortAndDedupeBreakInfos(segmentedBreaks);
    return segmentedBreaks;
  }
  // Ask language hyphenator for legal break points.
  std::vector<size_t> indexes;
  if (hyphenator) {
--- a/lib/Epub/Epub/hyphenation/Hyphenator.h
+++ b/lib/Epub/Epub/hyphenation/Hyphenator.h
@@ -11,7 +11,8 @@ class Hyphenator {
  struct BreakInfo {
    size_t byteOffset;            // Byte position inside the UTF-8 word where a break may occur.
    bool requiresInsertedHyphen;  // true = a visible '-' must be rendered at the break (pattern/fallback breaks).
-                                  // false = the word already contains a hyphen at this position (explicit '-').
+                                  // false = break occurs at an existing visible separator boundary
                                  //         (explicit '-' or eligible apostrophe contraction boundary).
  };
  // Returns byte offsets where the word may be hyphenated.
@@ -19,12 +20,17 @@ class Hyphenator {
  // Break sources (in priority order):
  //   1. Explicit hyphens already present in the word (e.g. '-' or soft-hyphen U+00AD).
  //      When found, language patterns are additionally run on each alphabetic segment
-  //      between hyphens so compound words can break within their parts.
+  //      between separators so compound words can break within their parts.
  //      Example: "US-Satellitensystems" yields breaks after "US-" (no inserted hyphen)
  //               plus pattern breaks inside "Satellitensystems" (Sa|tel|li|ten|sys|tems).
-  //   2. Language-specific Liang patterns (e.g. German de_patterns).
+  //   2. Apostrophe contractions between letters (e.g. all'improvviso).
  //      Liang patterns are run per alphabetic segment around apostrophes.
  //      A direct break at the apostrophe boundary is allowed only when the left
  //      segment has at least 3 letters and the right segment has at least 2 letters,
  //      avoiding short clitics (e.g. l', d') and short contraction tails (e.g. can't).
  //   3. Language-specific Liang patterns (e.g. German de_patterns).
  //      Example: "Quadratkilometer" -> Qua|drat|ki|lo|me|ter.
-  //   3. Fallback every-N-chars splitting (only when includeFallback is true AND no
+  //   4. Fallback every-N-chars splitting (only when includeFallback is true AND no
  //      pattern breaks were found). Used as a last resort to prevent a single oversized
  //      word from overflowing the page width.
  static std::vector<BreakInfo> breakOffsets(const std::string& word, bool includeFallback);