Rename trimTrailingPunctuation to trimSurroundingPunctuation and make it strip leading as well as trailing punctuation; add explicit-hyphen handling to Hyphenator::breakOffsets.

2025-12-26 06:03:38 +05:00 · 2025-12-26 06:03:38 +05:00 · f6767c857f
commit f6767c857f
parent 23183a6270
3 changed files with 43 additions and 3 deletions
--- a/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp
+++ b/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp
@ -92,7 +92,10 @@ bool isPunctuation(const uint32_t cp) {
  }
 }

-void trimTrailingPunctuation(std::vector<CodepointInfo>& cps) {
+void trimSurroundingPunctuation(std::vector<CodepointInfo>& cps) {
+  while (!cps.empty() && isPunctuation(cps.front().value)) {
+    cps.erase(cps.begin());
+  }
  while (!cps.empty() && isPunctuation(cps.back().value)) {
    cps.pop_back();
  }
--- a/lib/Epub/Epub/hyphenation/HyphenationCommon.h
+++ b/lib/Epub/Epub/hyphenation/HyphenationCommon.h
@ -28,6 +28,6 @@ bool isCyrillicConsonant(uint32_t cp);
 bool isAlphabetic(uint32_t cp);
 bool isVowel(uint32_t cp);
 bool isPunctuation(uint32_t cp);
-void trimTrailingPunctuation(std::vector<CodepointInfo>& cps);
+void trimSurroundingPunctuation(std::vector<CodepointInfo>& cps);

 Script detectScript(const std::vector<CodepointInfo>& cps);
--- a/lib/Epub/Epub/hyphenation/Hyphenator.cpp
+++ b/lib/Epub/Epub/hyphenation/Hyphenator.cpp
@ -48,6 +48,32 @@ std::vector<CodepointInfo> collectCodepoints(const std::string& word) {
  return cps;
 }

+bool isExplicitHyphen(const uint32_t cp) { return cp == '-' || cp == 0x2010; }  // ASCII hyphen-minus or U+2010 HYPHEN
+
+// Collects break positions for hyphens that are already present in the word.
+//
+// A codepoint counts when isExplicitHyphen() accepts it ('-' or U+2010), it is
+// neither the first nor the last element of `cps`, and isAlphabetic() accepts
+// both of its neighbours. For each such hyphen at index i the value i + 1 (the
+// position right after the hyphen) is appended, so the result is strictly
+// ascending. Returns an empty vector when nothing qualifies.
+std::vector<size_t> collectExplicitHyphenIndexes(const std::vector<CodepointInfo>& cps) {
+  std::vector<size_t> indexes;
+  // Scanning only interior positions keeps cps[i - 1] and cps[i + 1] in range
+  // and makes separate checks on the resulting break index unnecessary.
+  for (size_t i = 1; i + 1 < cps.size(); ++i) {
+    if (!isExplicitHyphen(cps[i].value)) {
+      continue;
+    }
+    if (!isAlphabetic(cps[i - 1].value) || !isAlphabetic(cps[i + 1].value)) {
+      continue;
+    }
+    // Break after the hyphen: i + 1 < cps.size() is guaranteed by the loop bound.
+    indexes.push_back(i + 1);
+  }
+  return indexes;
+}
+
 // Rejects words containing punctuation or digits unless forced.
 bool hasOnlyAlphabetic(const std::vector<CodepointInfo>& cps) {
  if (cps.empty()) {
@ -93,11 +119,22 @@ std::vector<size_t> Hyphenator::breakOffsets(const std::string& word, const bool
  }

  auto cps = collectCodepoints(word);
-  trimTrailingPunctuation(cps);
+  trimSurroundingPunctuation(cps);
  if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) {
    return {};
  }

+  if (auto explicitIndexes = collectExplicitHyphenIndexes(cps); !explicitIndexes.empty()) {
+    std::sort(explicitIndexes.begin(), explicitIndexes.end());
+    explicitIndexes.erase(std::unique(explicitIndexes.begin(), explicitIndexes.end()), explicitIndexes.end());
+    std::vector<size_t> byteOffsets;
+    byteOffsets.reserve(explicitIndexes.size());
+    for (const size_t idx : explicitIndexes) {
+      byteOffsets.push_back(byteOffsetForIndex(cps, idx));
+    }
+    return byteOffsets;
+  }
+
  std::vector<size_t> indexes = hasOnlyAlphabetic(cps) ? collectBreakIndexes(cps) : std::vector<size_t>();
  if (includeFallback) {
    for (size_t idx = MIN_PREFIX_CP; idx + MIN_SUFFIX_CP <= cps.size(); ++idx) {