From 5d00e5ac0fc2797ae705fc4778f28d3c124c1779 Mon Sep 17 00:00:00 2001
From: Arthur Tazhitdinov <lisnake@gmail.com>
Date: Fri, 26 Dec 2025 04:36:19 +0500
Subject: [PATCH] Enhance hyphenation logic: add morphology break handling and
 improve vowel detection

---
 .../Epub/hyphenation/EnglishHyphenator.cpp    | 104 ++++++++++++++++++
 1 file changed, 104 insertions(+)
diff --git a/lib/Epub/Epub/hyphenation/EnglishHyphenator.cpp b/lib/Epub/Epub/hyphenation/EnglishHyphenator.cpp
index 1e01b05..cef341d 100644
--- a/lib/Epub/Epub/hyphenation/EnglishHyphenator.cpp
+++ b/lib/Epub/Epub/hyphenation/EnglishHyphenator.cpp
@@ -1,7 +1,9 @@
 #include "EnglishHyphenator.h"
 
 #include <algorithm>
+#include <array>
 #include <initializer_list>
+#include <string>
 #include <vector>
 
 namespace {
@@ -45,6 +47,105 @@ bool isEnglishFricativeChar(const char c) {
   }
 }
 
+struct LatinLiteral {  // A fixed character pattern together with its explicit length
+  const char* text;    // pattern characters; matchesPatternAt treats a null pointer as "never matches"
+  size_t length;       // number of characters of `text` to compare; 0 is also treated as "never matches"
+};
+
+bool nextToApostrophe(const std::vector<CodepointInfo>& cps, size_t index);  // forward declaration for use below
+
+std::string lowercaseLatinWord(const std::vector<CodepointInfo>& cps) {
+  // Produces one char per codepoint, so offsets into the result line up with offsets into `cps`.
+  std::string folded(cps.size(), '\0');
+  for (size_t pos = 0; pos < cps.size(); ++pos) {
+    folded[pos] = lowerLatinChar(cps[pos].value);
+  }
+  return folded;
+}
+
+bool matchesPatternAt(const std::string& lowerWord, const size_t start, const LatinLiteral& pattern) {
+  // Reports whether `pattern` occurs in `lowerWord` beginning exactly at offset `start`.
+  // A null or zero-length pattern never matches.
+  const bool usable = pattern.text != nullptr && pattern.length != 0;
+  if (!usable) {
+    return false;
+  }
+  // The whole pattern has to fit inside the word before any character is compared.
+  const size_t stop = start + pattern.length;
+  if (stop > lowerWord.size()) {
+    return false;
+  }
+  return std::equal(pattern.text, pattern.text + pattern.length, lowerWord.data() + start);
+}
+
+bool englishSegmentHasVowel(const std::vector<CodepointInfo>& cps, const size_t start, const size_t end) {
+  // Looks for any Latin vowel in the half-open range [start, end), clipped to the size of `cps`.
+  const size_t limit = std::min(end, cps.size());
+  if (start >= limit) {
+    return false;  // empty, inverted, or entirely out-of-range segment
+  }
+  size_t pos = start;
+  while (pos < limit && !isLatinVowel(cps[pos].value)) {
+    ++pos;
+  }
+  return pos < limit;
+}
+
+// Adds hyphenation candidates derived from a fixed list of English prefixes and suffixes.
+// `cps` holds the word's codepoints; `lowerWord` is expected to be its char-per-codepoint form
+// from lowercaseLatinWord, so both share the same indexing. Accepted break positions are appended
+// to `indexes` unsorted and possibly duplicated; englishBreakIndexes sorts and dedupes afterwards.
+void appendMorphologyBreaks(const std::vector<CodepointInfo>& cps, const std::string& lowerWord,
+                            std::vector<size_t>& indexes) {
+  // Word-initial patterns; a break is proposed right after a matching entry.
+  static constexpr std::array<LatinLiteral, 20> PREFIXES = {{{"anti", 4},   {"auto", 4},   {"counter", 7}, {"de", 2},
+                                                             {"dis", 3},    {"hyper", 5},  {"inter", 5},   {"micro", 5},
+                                                             {"mis", 3},    {"mono", 4},   {"multi", 5},   {"non", 3},
+                                                             {"over", 4},   {"post", 4},   {"pre", 3},     {"pro", 3},
+                                                             {"re", 2},     {"sub", 3},    {"super", 5},   {"trans", 5}}};
+
+  // Word-final patterns; a break is proposed right before a matching entry.
+  static constexpr std::array<LatinLiteral, 24> SUFFIXES = {{{"able", 4},   {"ible", 4},   {"ing", 3},    {"ings", 4},
+                                                             {"ed", 2},     {"er", 2},     {"ers", 3},    {"est", 3},
+                                                             {"ful", 3},    {"hood", 4},   {"less", 4},   {"lessly", 6},
+                                                             {"ly", 2},     {"ment", 4},   {"ments", 5},  {"ness", 4},
+                                                             {"ous", 3},    {"tion", 4},   {"sion", 4},   {"ward", 4},
+                                                             {"wards", 5},  {"ship", 4},   {"ships", 5},  {"y", 1}}};
+
+  // Words too short to leave both a legal head and a legal tail get no morphology breaks.
+  const size_t wordLen = cps.size();
+  if (wordLen < MIN_PREFIX_CP + MIN_SUFFIX_CP) {
+    return;
+  }
+
+  // A candidate is usable when both halves meet the minimum lengths, each half holds a vowel,
+  // and the split does not sit beside an apostrophe.
+  const auto isUsableBreak = [&](const size_t at) {
+    const bool longEnough = at >= MIN_PREFIX_CP && wordLen - at >= MIN_SUFFIX_CP;
+    return longEnough && englishSegmentHasVowel(cps, 0, at) && englishSegmentHasVowel(cps, at, wordLen) &&
+           !nextToApostrophe(cps, at);
+  };
+
+  // Known prefixes: split immediately after the prefix.
+  for (const LatinLiteral& head : PREFIXES) {
+    const bool applies = head.length != 0 && head.length < wordLen && matchesPatternAt(lowerWord, 0, head);
+    if (applies && isUsableBreak(head.length)) {
+      indexes.push_back(head.length);
+    }
+  }
+
+  // Known suffixes: split immediately before the suffix.
+  for (const LatinLiteral& tail : SUFFIXES) {
+    if (tail.length == 0 || tail.length >= wordLen) {
+      continue;
+    }
+    const size_t at = wordLen - tail.length;
+    if (matchesPatternAt(lowerWord, at, tail) && isUsableBreak(at)) {
+      indexes.push_back(at);
+    }
+  }
+}
+
 struct CharPair {
   char first;
   char second;
@@ -225,6 +326,7 @@ std::vector<size_t> englishBreakIndexes(const std::vector<CodepointInfo>& cps) {
     return indexes;
   }
 
+  const auto lowerWord = lowercaseLatinWord(cps);
   std::vector<size_t> vowelPositions;
   vowelPositions.reserve(cps.size());
   for (size_t i = 0; i < cps.size(); ++i) {
@@ -263,6 +365,8 @@ std::vector<size_t> englishBreakIndexes(const std::vector<CodepointInfo>& cps) {
     indexes.push_back(breakIndex);
   }
 
+  appendMorphologyBreaks(cps, lowerWord, indexes);
+
   std::sort(indexes.begin(), indexes.end());
   indexes.erase(std::unique(indexes.begin(), indexes.end()), indexes.end());
   return indexes;