Add explicit hyphen handling and improve hyphenation logic in ParsedText and Hyphenator

2026-01-03 15:20:53 +05:00
parent f6767c857f
commit cb1ecdb505
4 changed files with 109 additions and 15 deletions
--- a/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp
+++ b/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp
@@ -79,8 +79,6 @@ bool isPunctuation(const uint32_t cp) {
    case 0x2019:  // ’
    case 0x201C:  // “
    case 0x201D:  // ”
-    case '[':
-    case ']':
    case '{':
    case '}':
    case '/':
@@ -92,6 +90,33 @@ bool isPunctuation(const uint32_t cp) {
  }
 }

+// Reports whether `cp` is one of the hyphen, dash or minus-like code points that the
+// hyphenator treats as a hyphen already present in the text (as opposed to one it would
+// insert itself).
+//
+// U+2011 NON-BREAKING HYPHEN is deliberately not listed: Unicode defines it as a hyphen
+// that forbids a line break, so it must never be reported as a place where a word may be
+// split.
+bool isExplicitHyphen(const uint32_t cp) {
+  switch (cp) {
+    case '-':
+    case 0x00AD:  // soft hyphen
+    case 0x058A:  // Armenian hyphen
+    case 0x2010:  // hyphen
+    case 0x2012:  // figure dash
+    case 0x2013:  // en dash
+    case 0x2014:  // em dash
+    case 0x2015:  // horizontal bar
+    case 0x2043:  // hyphen bullet
+    case 0x207B:  // superscript minus
+    case 0x208B:  // subscript minus
+    case 0x2212:  // minus sign
+    case 0x2E17:  // double oblique hyphen
+    case 0x2E3A:  // two-em dash
+    case 0x2E3B:  // three-em dash
+    case 0xFE58:  // small em dash
+    case 0xFE63:  // small hyphen-minus
+    case 0xFF0D:  // fullwidth hyphen-minus
+      return true;
+    default:
+      return false;
+  }
+}
+
 void trimSurroundingPunctuation(std::vector<CodepointInfo>& cps) {
  while (!cps.empty() && isPunctuation(cps.front().value)) {
    cps.erase(cps.begin());
--- a/lib/Epub/Epub/hyphenation/HyphenationCommon.h
+++ b/lib/Epub/Epub/hyphenation/HyphenationCommon.h
@@ -28,6 +28,7 @@ bool isCyrillicConsonant(uint32_t cp);
 bool isAlphabetic(uint32_t cp);
 bool isVowel(uint32_t cp);
 bool isPunctuation(uint32_t cp);
+bool isExplicitHyphen(uint32_t cp);
 void trimSurroundingPunctuation(std::vector<CodepointInfo>& cps);

 Script detectScript(const std::vector<CodepointInfo>& cps);
--- a/lib/Epub/Epub/hyphenation/Hyphenator.cpp
+++ b/lib/Epub/Epub/hyphenation/Hyphenator.cpp
@@ -48,8 +48,6 @@ std::vector<CodepointInfo> collectCodepoints(const std::string& word) {
  return cps;
 }

-bool isExplicitHyphen(const uint32_t cp) { return cp == '-' || cp == 0x2010; }
-
 std::vector<size_t> collectExplicitHyphenIndexes(const std::vector<CodepointInfo>& cps) {
  std::vector<size_t> indexes;
  for (size_t i = 0; i < cps.size(); ++i) {
@@ -74,6 +72,32 @@ std::vector<size_t> collectExplicitHyphenIndexes(const std::vector<CodepointInfo
  return indexes;
 }

+// True only for the ten ASCII decimal digits '0'..'9'; digits from other scripts do not match.
+bool isAsciiDigit(const uint32_t cp) {
+  // Unsigned subtraction wraps for anything below '0', so one comparison covers both bounds.
+  return cp - static_cast<uint32_t>('0') < 10u;
+}
+
+// Removes a trailing bracketed number such as "[12]" (a footnote marker glued to the end of
+// a word) from `cps`.
+//
+// The list is modified only when it ends with '[' + one or more ASCII digits + ']'; the
+// whole marker, brackets included, is erased. Anything else (e.g. "[]", "[a]", "12]", or a
+// list shorter than three code points) is left untouched.
+void trimTrailingFootnoteReference(std::vector<CodepointInfo>& cps) {
+  // "[d]" is the shortest possible marker, and it has to end in a closing bracket.
+  if (cps.size() < 3 || cps.back().value != ']') {
+    return;
+  }
+  const auto closing = cps.end() - 1;
+  // Step left across the digit run; `digits` ends up on the first digit of the marker.
+  auto digits = closing;
+  while (digits != cps.begin() && isAsciiDigit((digits - 1)->value)) {
+    --digits;
+  }
+  if (digits == closing) {
+    return;  // nothing numeric directly before ']'
+  }
+  if (digits == cps.begin() || (digits - 1)->value != '[') {
+    return;  // digit run is not preceded by an opening bracket
+  }
+  cps.erase(digits - 1, cps.end());
+}
+
 // Rejects words containing punctuation or digits unless forced.
 bool hasOnlyAlphabetic(const std::vector<CodepointInfo>& cps) {
  if (cps.empty()) {
@@ -120,11 +144,13 @@ std::vector<size_t> Hyphenator::breakOffsets(const std::string& word, const bool

  auto cps = collectCodepoints(word);
  trimSurroundingPunctuation(cps);
+  trimTrailingFootnoteReference(cps);
  if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) {
    return {};
  }

-  if (auto explicitIndexes = collectExplicitHyphenIndexes(cps); !explicitIndexes.empty()) {
+  auto explicitIndexes = collectExplicitHyphenIndexes(cps);
+  if (!explicitIndexes.empty()) {
    std::sort(explicitIndexes.begin(), explicitIndexes.end());
    explicitIndexes.erase(std::unique(explicitIndexes.begin(), explicitIndexes.end()), explicitIndexes.end());
    std::vector<size_t> byteOffsets;