Implement hyphenation support in text layout by enhancing word splitting and line breaking logic

2025-12-26 00:39:09 +05:00
parent 54d7a9437e
commit e7edcb6467
4 changed files with 255 additions and 131 deletions
--- a/lib/Epub/Epub/hyphenation/Hyphenator.cpp
+++ b/lib/Epub/Epub/hyphenation/Hyphenator.cpp
@@ -1,11 +1,9 @@
 #include "Hyphenator.h"

-#include <GfxRenderer.h>
 #include <Utf8.h>

 #include <algorithm>
 #include <array>
-#include <limits>
 #include <vector>

 #include "EnglishHyphenator.h"
@@ -87,84 +85,44 @@ size_t byteOffsetForIndex(const std::vector<CodepointInfo>& cps, const size_t in
  return cps[index].byteOffset;
 }

-// Safely slices a UTF-8 string without splitting multibyte sequences.
-std::string slice(const std::string& word, const size_t startByte, const size_t endByte) {
-  if (startByte >= endByte || startByte >= word.size()) {
-    return std::string();
-  }
-  const size_t boundedEnd = std::min(endByte, word.size());
-  return word.substr(startByte, boundedEnd - startByte);
-}
-
 }  // namespace

-bool Hyphenator::splitWord(const GfxRenderer& renderer, const int fontId, const std::string& word,
-                           const EpdFontStyle style, const int availableWidth, HyphenationResult* result,
-                           const bool force) {
-  if (!result || word.empty()) {
-    return false;
+std::vector<size_t> Hyphenator::breakOffsets(const std::string& word, const bool includeFallback) {
+  std::vector<size_t> byteOffsets;
+  if (word.empty()) {
+    return byteOffsets;
  }

  auto cps = collectCodepoints(word);
  if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) {
-    return false;
+    return byteOffsets;
  }

-  // Skip mixed tokens (e.g., "v2.0") unless the caller forces a split due to overflow.
-  if (!force && !hasOnlyAlphabetic(cps)) {
-    return false;
+  std::vector<size_t> indexes;
+  indexes.reserve(cps.size());
+
+  if (hasOnlyAlphabetic(cps)) {
+    auto dictBreaks = collectBreakIndexes(cps);
+    indexes.insert(indexes.end(), dictBreaks.begin(), dictBreaks.end());
  }

-  const auto breakIndexes = collectBreakIndexes(cps);
-  // Budget for a trailing hyphen so rendered width matches the layout test.
-  const int hyphenWidth = renderer.getTextWidth(fontId, "-", style);
-  const int adjustedWidth = availableWidth - hyphenWidth;
-
-  size_t chosenIndex = std::numeric_limits<size_t>::max();
-
-  // Prefer dictionary-style break points emitted by language hyphenators.
-  if (adjustedWidth > 0) {
-    for (const size_t idx : breakIndexes) {
-      const size_t byteOffset = byteOffsetForIndex(cps, idx);
-      const std::string prefix = word.substr(0, byteOffset);
-      const int prefixWidth = renderer.getTextWidth(fontId, prefix.c_str(), style);
-      if (prefixWidth <= adjustedWidth) {
-        chosenIndex = idx;
-      } else {
-        break;
-      }
-    }
-  }
-
-  if (chosenIndex == std::numeric_limits<size_t>::max() && force) {
-    // Emergency fallback: brute-force through codepoints to avoid overflow when no legal breaks fit.
+  if (includeFallback) {
    for (size_t idx = MIN_PREFIX_CP; idx + MIN_SUFFIX_CP <= cps.size(); ++idx) {
-      const size_t byteOffset = byteOffsetForIndex(cps, idx);
-      const std::string prefix = word.substr(0, byteOffset);
-      const int prefixWidth = renderer.getTextWidth(fontId, prefix.c_str(), style);
-      if (adjustedWidth <= 0 || prefixWidth <= adjustedWidth) {
-        chosenIndex = idx;
-        if (adjustedWidth > 0 && prefixWidth > adjustedWidth) {
-          break;
-        }
-      }
+      indexes.push_back(idx);
    }
  }

-  if (chosenIndex == std::numeric_limits<size_t>::max()) {
-    return false;
+  if (indexes.empty()) {
+    return byteOffsets;
  }

-  const size_t splitByte = byteOffsetForIndex(cps, chosenIndex);
-  const std::string head = word.substr(0, splitByte);
-  const std::string tail = slice(word, splitByte, word.size());
+  std::sort(indexes.begin(), indexes.end());
+  indexes.erase(std::unique(indexes.begin(), indexes.end()), indexes.end());

-  if (head.empty() || tail.empty()) {
-    return false;
+  byteOffsets.reserve(indexes.size());
+  for (const size_t idx : indexes) {
+    byteOffsets.push_back(byteOffsetForIndex(cps, idx));
  }

-  // Append the printed hyphen to the prefix while leaving the tail untouched.
-  result->head = head + "-";
-  result->tail = tail;
-  return true;
+  return byteOffsets;
 }
--- a/lib/Epub/Epub/hyphenation/Hyphenator.h
+++ b/lib/Epub/Epub/hyphenation/Hyphenator.h
@@ -1,20 +1,12 @@
 #pragma once

-#include <EpdFontFamily.h>
-
+#include <cstddef>
 #include <string>
-
-class GfxRenderer;
-
-// Holds the split portions of a hyphenated word.
-struct HyphenationResult {
-  std::string head;
-  std::string tail;
-};
+#include <vector>

 class Hyphenator {
 public:
-  // Splits a word so it fits within availableWidth, appending a hyphen to the head when needed.
-  static bool splitWord(const GfxRenderer& renderer, int fontId, const std::string& word, EpdFontStyle style,
-                        int availableWidth, HyphenationResult* result, bool force);
+  // Returns byte offsets where the word may be hyphenated. When includeFallback is true, all positions obeying the
+  // minimum prefix/suffix constraints are returned even if no language-specific rule matches.
+  static std::vector<size_t> breakOffsets(const std::string& word, bool includeFallback);
 };