Implement hyphenation support in text layout by enhancing word splitting and line breaking logic

2025-12-26 00:39:09 +05:00 · 2025-12-26 00:39:09 +05:00 · e7edcb6467
commit e7edcb6467
parent 54d7a9437e
4 changed files with 255 additions and 131 deletions
--- a/lib/Epub/Epub/ParsedText.cpp
+++ b/lib/Epub/Epub/ParsedText.cpp
@ -1,15 +1,72 @@
 #include "ParsedText.h"

 #include <GfxRenderer.h>
+#include "hyphenation/Hyphenator.h"

 #include <algorithm>
 #include <cmath>
 #include <functional>
+#include <iterator>
 #include <limits>
 #include <vector>

 constexpr int MAX_COST = std::numeric_limits<int>::max();

+namespace {
+
+// Outcome of choosing a hyphenation point inside a word. Members are zero-initialised so a default-constructed
+// decision never carries indeterminate values when the chooser reports failure.
+struct HyphenSplitDecision {
+  // Byte offset into the word at which the tail (the part after the hyphen) begins.
+  size_t byteOffset = 0;
+  // Rendered width of the prefix including its trailing hyphen.
+  uint16_t prefixWidth = 0;
+};
+
+// Remembers one hyphen split made while computing line breaks, as positions in the word list. The line-breaking DP
+// rejects any candidate line that would contain both positions.
+struct HyphenationGuard {
+  size_t prefixIndex;  // index of the word holding the hyphenated prefix
+  size_t tailIndex;    // index of the word holding the remainder of the split word
+};
+
+// Picks the longest hyphenation prefix of `word` that, together with a trailing "-", fits in availableWidth.
+//
+// Candidate positions come from Hyphenator::breakOffsets(word, includeFallback). On success the chosen byte offset
+// and the width of the prefix plus hyphen are written to *decision and true is returned; *decision is left untouched
+// on failure. Returns false when decision is null, when availableWidth leaves no room beyond the hyphen, or when no
+// candidate prefix fits. A successful decision always leaves a non-empty prefix and a non-empty tail.
+bool chooseSplitForWidth(const GfxRenderer& renderer, const int fontId, const std::string& word,
+                         const EpdFontStyle style, const int availableWidth, const bool includeFallback,
+                         HyphenSplitDecision* decision) {
+  if (!decision || availableWidth <= 0) {
+    return false;
+  }
+
+  // Reserve room for the hyphen that will be appended to the prefix.
+  const int hyphenWidth = renderer.getTextWidth(fontId, "-", style);
+  const int adjustedWidth = availableWidth - hyphenWidth;
+  if (adjustedWidth <= 0) {
+    return false;
+  }
+
+  const auto offsets = Hyphenator::breakOffsets(word, includeFallback);
+
+  bool found = false;
+  for (const size_t offset : offsets) {
+    // A split must leave text on both sides of the hyphen. Callers loop on the tail, so an empty tail (or an empty
+    // prefix, which leaves the tail as long as the original word) would prevent them from making progress.
+    if (offset == 0 || offset >= word.size()) {
+      continue;
+    }
+
+    const std::string prefix = word.substr(0, offset);
+    const int prefixWidth = renderer.getTextWidth(fontId, prefix.c_str(), style);
+    if (prefixWidth > adjustedWidth) {
+      // Offsets are expected in ascending order, so every later prefix is at least as wide.
+      break;
+    }
+
+    // Keep overwriting so the last (longest) fitting prefix wins.
+    decision->byteOffset = offset;
+    decision->prefixWidth = static_cast<uint16_t>(prefixWidth + hyphenWidth);
+    found = true;
+  }
+
+  return found;
+}
+
+}  // namespace
+
 void ParsedText::addWord(std::string word, const EpdFontStyle fontStyle) {
  if (word.empty()) return;

@ -27,8 +84,9 @@ void ParsedText::layoutAndExtractLines(const GfxRenderer& renderer, const int fo

  const int pageWidth = renderer.getScreenWidth() - horizontalMargin;
  const int spaceWidth = renderer.getSpaceWidth(fontId);
-  const auto wordWidths = calculateWordWidths(renderer, fontId);
-  const auto lineBreakIndices = computeLineBreaks(pageWidth, spaceWidth, wordWidths);
+  // Pre-split oversized tokens so the DP step always has feasible line candidates.
+  auto wordWidths = calculateWordWidths(renderer, fontId, pageWidth);
+  auto lineBreakIndices = computeLineBreaks(renderer, fontId, pageWidth, spaceWidth, wordWidths);
  const size_t lineCount = includeLastLine ? lineBreakIndices.size() : lineBreakIndices.size() - 1;

  for (size_t i = 0; i < lineCount; ++i) {
@ -36,7 +94,8 @@ void ParsedText::layoutAndExtractLines(const GfxRenderer& renderer, const int fo
  }
 }

-std::vector<uint16_t> ParsedText::calculateWordWidths(const GfxRenderer& renderer, const int fontId) {
+std::vector<uint16_t> ParsedText::calculateWordWidths(const GfxRenderer& renderer, const int fontId,
+                                                      const int pageWidth) {
  const size_t totalWordCount = words.size();

  std::vector<uint16_t> wordWidths;
@ -52,7 +111,32 @@ std::vector<uint16_t> ParsedText::calculateWordWidths(const GfxRenderer& rendere
  auto wordStylesIt = wordStyles.begin();

  while (wordsIt != words.end()) {
-    wordWidths.push_back(renderer.getTextWidth(fontId, wordsIt->c_str(), *wordStylesIt));
+    uint16_t width = renderer.getTextWidth(fontId, wordsIt->c_str(), *wordStylesIt);
+
+    if (width > pageWidth) {
+      HyphenSplitDecision decision;
+      if (chooseSplitForWidth(renderer, fontId, *wordsIt, *wordStylesIt, pageWidth, true, &decision)) {
+        const std::string originalWord = *wordsIt;
+        const std::string tail = originalWord.substr(decision.byteOffset);
+        if (tail.empty()) {
+          continue;
+        }
+        const std::string prefix = originalWord.substr(0, decision.byteOffset) + "-";
+
+        *wordsIt = prefix;
+        auto nextWordIt = words.insert(std::next(wordsIt), tail);
+        auto nextStyleIt = wordStyles.insert(std::next(wordStylesIt), *wordStylesIt);
+        // Continue processing the freshly inserted tail so cascading splits still respect the limit.
+
+        wordWidths.push_back(decision.prefixWidth);
+
+        wordsIt = nextWordIt;
+        wordStylesIt = nextStyleIt;
+        continue;
+      }
+    }
+
+    wordWidths.push_back(width);

    std::advance(wordsIt, 1);
    std::advance(wordStylesIt, 1);
@ -61,70 +145,159 @@ std::vector<uint16_t> ParsedText::calculateWordWidths(const GfxRenderer& rendere
  return wordWidths;
 }

-std::vector<size_t> ParsedText::computeLineBreaks(const int pageWidth, const int spaceWidth,
-                                                  const std::vector<uint16_t>& wordWidths) const {
-  const size_t totalWordCount = words.size();
+// Chooses line breaks for the paragraph and, when hyphenation is enabled, splits words to fill leftover space.
+//
+// Returns one entry per line: the index of the word that starts the following line. wordWidths must hold one width
+// per entry of `words`; both it and the word/style lists are modified in place whenever a word is hyphenated.
+//
+// The breaks come from a dynamic program that minimises the sum of squared leftover space over all lines but the
+// last. With hyphenation enabled, the first line that has room for a hyphenated prefix of the word starting the next
+// line gets that word split, a guard is recorded so prefix and tail never share a line, and the DP is run again.
+// This repeats until no further split is possible.
+std::vector<size_t> ParsedText::computeLineBreaks(const GfxRenderer& renderer, const int fontId, const int pageWidth,
+                                                  const int spaceWidth, std::vector<uint16_t>& wordWidths) {
+  if (words.empty() || wordWidths.empty()) {
+    return {};
+  }

-  // DP table to store the minimum badness (cost) of lines starting at index i
-  std::vector<int> dp(totalWordCount);
-  // 'ans[i]' stores the index 'j' of the *last word* in the optimal line starting at 'i'
-  std::vector<size_t> ans(totalWordCount);
+  // One guard per hyphen split performed below.
+  std::vector<HyphenationGuard> guards;

-  // Base Case
-  dp[totalWordCount - 1] = 0;
-  ans[totalWordCount - 1] = totalWordCount - 1;
+  // Keeps existing guards pointing at the same words after a new word has been inserted at insertPos.
+  auto shiftGuardIndices = [&](size_t insertPos) {
+    for (auto& guard : guards) {
+      if (guard.prefixIndex >= insertPos) {
+        guard.prefixIndex++;
+      }
+      if (guard.tailIndex >= insertPos) {
+        guard.tailIndex++;
+      }
+    }
+  };

-  for (int i = totalWordCount - 2; i >= 0; --i) {
-    int currlen = -spaceWidth;
-    dp[i] = MAX_COST;
+  // Runs the line-breaking DP over the current wordWidths and guards, writing the result into lineBreaks.
+  auto runDp = [&](std::vector<size_t>& lineBreaks) {
+    const size_t totalWordCount = wordWidths.size();

-    for (size_t j = i; j < totalWordCount; ++j) {
-      // Current line length: previous width + space + current word width
-      currlen += wordWidths[j] + spaceWidth;
+    // dp[i]: minimum cost of laying out words i..end; ans[i]: last word of the best line starting at word i.
+    std::vector<int> dp(totalWordCount);
+    std::vector<size_t> ans(totalWordCount);

-      if (currlen > pageWidth) {
+    dp[totalWordCount - 1] = 0;
+    ans[totalWordCount - 1] = totalWordCount - 1;
+
+    for (int i = static_cast<int>(totalWordCount) - 2; i >= 0; --i) {
+      const size_t lineStart = static_cast<size_t>(i);
+      int currlen = -spaceWidth;
+      dp[i] = MAX_COST;
+      // Default to a one-word line so backtracking always advances, even when every candidate cost saturates.
+      ans[i] = lineStart;
+
+      for (size_t j = lineStart; j < totalWordCount; ++j) {
+        currlen += wordWidths[j] + spaceWidth;
+
+        // A line that no longer fits ends the scan. The one exception is a single word wider than the page: it is
+        // still accepted as its own line, otherwise no line starting here would be feasible.
+        const bool loneOversizedWord = currlen > pageWidth && j == lineStart;
+        if (currlen > pageWidth && !loneOversizedWord) {
+          break;
+        }
+
+        // A hyphenated prefix and its tail must never end up on the same line.
+        bool violatesGuard = false;
+        for (const auto& guard : guards) {
+          if (lineStart <= guard.prefixIndex && j >= guard.tailIndex) {
+            violatesGuard = true;
+            break;
+          }
+        }
+        if (violatesGuard) {
+          continue;
+        }
+
+        int cost;
+        if (j == totalWordCount - 1) {
+          // The last line is not penalised for leftover space.
+          cost = 0;
+        } else {
+          const int remainingSpace = pageWidth - currlen;
+          // Square in long long and clamp so the sum cannot overflow int.
+          const long long cost_ll = static_cast<long long>(remainingSpace) * remainingSpace + dp[j + 1];
+          cost = cost_ll > MAX_COST ? MAX_COST : static_cast<int>(cost_ll);
+        }
+
+        if (cost < dp[i]) {
+          dp[i] = cost;
+          ans[i] = j;
+        }
+      }
+    }
+
+    // Walk the ans[] chain from the first word, recording the index that starts each following line.
+    lineBreaks.clear();
+    size_t currentWordIndex = 0;
+    constexpr size_t MAX_LINES = 1000;
+
+    while (currentWordIndex < totalWordCount && lineBreaks.size() < MAX_LINES) {
+      const size_t nextBreakIndex = ans[currentWordIndex] + 1;
+      lineBreaks.push_back(nextBreakIndex);
+      currentWordIndex = nextBreakIndex;
+    }
+  };
+
+  std::vector<size_t> lineBreakIndices;
+
+  while (true) {
+    runDp(lineBreakIndices);
+
+    if (!hyphenationEnabled) {
+      return lineBreakIndices;
+    }
+
+    bool insertedSplit = false;
+    size_t lastBreakAt = 0;
+
+    for (size_t lineIdx = 0; lineIdx < lineBreakIndices.size(); ++lineIdx) {
+      const size_t lineBreak = lineBreakIndices[lineIdx];
+      const bool isLastLine = lineIdx == lineBreakIndices.size() - 1;
+      const size_t lineWordCount = lineBreak - lastBreakAt;
+
+      int lineWordWidthSum = 0;
+      for (size_t idx = lastBreakAt; idx < lineBreak; ++idx) {
+        lineWordWidthSum += wordWidths[idx];
+      }
+      lastBreakAt = lineBreak;
+
+      // Nothing follows the last line, so there is no word to borrow a prefix from.
+      if (isLastLine || lineBreak >= wordWidths.size()) {
+        continue;
+      }
+
+      // Space left on this line once the separating space before the prefix is accounted for.
+      const size_t spacingCount = lineWordCount > 0 ? lineWordCount - 1 : 0;
+      const int usedSpace = lineWordWidthSum + static_cast<int>(spacingCount) * spaceWidth;
+      const int unusedWidth = pageWidth - usedSpace;
+      const int spaceNeeded = lineWordCount == 0 ? 0 : spaceWidth;
+      const int budgetForPrefix = unusedWidth - spaceNeeded;
+      if (budgetForPrefix <= 0) {
+        continue;
+      }
+
+      // If the word opening the next line is already the tail of an earlier split, this line ends with its
+      // hyphenated prefix. Splitting the tail again would place two hyphenated fragments on the same line.
+      const bool nextWordIsSplitTail =
+          std::any_of(guards.begin(), guards.end(),
+                      [lineBreak](const HyphenationGuard& guard) { return guard.tailIndex == lineBreak; });
+      if (nextWordIsSplitTail) {
+        continue;
+      }
+
+      auto nextWordIt = words.begin();
+      auto nextStyleIt = wordStyles.begin();
+      std::advance(nextWordIt, lineBreak);
+      std::advance(nextStyleIt, lineBreak);
+
+      if (nextWordIt == words.end()) {
         break;
       }

-      int cost;
-      if (j == totalWordCount - 1) {
-        cost = 0;  // Last line
-      } else {
-        const int remainingSpace = pageWidth - currlen;
-        // Use long long for the square to prevent overflow
-        const long long cost_ll = static_cast<long long>(remainingSpace) * remainingSpace + dp[j + 1];
-
-        if (cost_ll > MAX_COST) {
-          cost = MAX_COST;
-        } else {
-          cost = static_cast<int>(cost_ll);
-        }
+      // Only language-specific break points are used here (no fallback positions).
+      HyphenSplitDecision decision;
+      if (!chooseSplitForWidth(renderer, fontId, *nextWordIt, *nextStyleIt, budgetForPrefix, false, &decision)) {
+        continue;
       }

-      if (cost < dp[i]) {
-        dp[i] = cost;
-        ans[i] = j;  // j is the index of the last word in this optimal line
+      const EpdFontStyle styleForSplit = *nextStyleIt;
+      const std::string originalWord = *nextWordIt;
+      const std::string prefix = originalWord.substr(0, decision.byteOffset) + "-";
+      const std::string tail = originalWord.substr(decision.byteOffset);
+      if (tail.empty()) {
+        continue;
       }
-    }
-  }

-  // Stores the index of the word that starts the next line (last_word_index + 1)
-  std::vector<size_t> lineBreakIndices;
-  size_t currentWordIndex = 0;
-  constexpr size_t MAX_LINES = 1000;
+      // Replace the word with its tail and insert the hyphenated prefix (with the same style) in front of it.
+      *nextWordIt = tail;
+      words.insert(nextWordIt, prefix);
+      wordStyles.insert(nextStyleIt, styleForSplit);

-  while (currentWordIndex < totalWordCount) {
-    if (lineBreakIndices.size() >= MAX_LINES) {
+      // Mirror the insertion in the width table, then guard the new pair so the DP keeps them on separate lines.
+      const uint16_t tailWidth = renderer.getTextWidth(fontId, tail.c_str(), styleForSplit);
+      wordWidths.insert(wordWidths.begin() + lineBreak, decision.prefixWidth);
+      wordWidths[lineBreak + 1] = tailWidth;
+
+      shiftGuardIndices(lineBreak);
+      guards.push_back({lineBreak, lineBreak + 1});
+      insertedSplit = true;
+      // Indices after the insertion are stale; recompute all breaks before looking for another split.
       break;
     }

-    size_t nextBreakIndex = ans[currentWordIndex] + 1;
-    lineBreakIndices.push_back(nextBreakIndex);
-
-    currentWordIndex = nextBreakIndex;
+    if (!insertedSplit) {
+      return lineBreakIndices;
+    }
   }
-
-  return lineBreakIndices;
 }

 void ParsedText::extractLine(const size_t breakIndex, const int pageWidth, const int spaceWidth,
@ -136,8 +309,8 @@ void ParsedText::extractLine(const size_t breakIndex, const int pageWidth, const

  // Calculate total word width for this line
  int lineWordWidthSum = 0;
-  for (size_t i = lastBreakAt; i < lineBreak; i++) {
-    lineWordWidthSum += wordWidths[i];
+  for (size_t idx = lastBreakAt; idx < lineBreak; ++idx) {
+    lineWordWidthSum += wordWidths[idx];
  }

  // Calculate spacing
--- a/lib/Epub/Epub/ParsedText.h
+++ b/lib/Epub/Epub/ParsedText.h
@ -19,11 +19,12 @@ class ParsedText {
  bool extraParagraphSpacing;
  bool hyphenationEnabled;

-  std::vector<size_t> computeLineBreaks(int pageWidth, int spaceWidth, const std::vector<uint16_t>& wordWidths) const;
+  std::vector<size_t> computeLineBreaks(const GfxRenderer& renderer, int fontId, int pageWidth, int spaceWidth,
+                                        std::vector<uint16_t>& wordWidths);
  void extractLine(size_t breakIndex, int pageWidth, int spaceWidth, const std::vector<uint16_t>& wordWidths,
                   const std::vector<size_t>& lineBreakIndices,
                   const std::function<void(std::shared_ptr<TextBlock>)>& processLine);
-  std::vector<uint16_t> calculateWordWidths(const GfxRenderer& renderer, int fontId);
+  std::vector<uint16_t> calculateWordWidths(const GfxRenderer& renderer, int fontId, int pageWidth);

 public:
  explicit ParsedText(const TextBlock::BLOCK_STYLE style, const bool extraParagraphSpacing,
--- a/lib/Epub/Epub/hyphenation/Hyphenator.cpp
+++ b/lib/Epub/Epub/hyphenation/Hyphenator.cpp
@ -1,11 +1,9 @@
 #include "Hyphenator.h"

-#include <GfxRenderer.h>
 #include <Utf8.h>

 #include <algorithm>
 #include <array>
-#include <limits>
 #include <vector>

 #include "EnglishHyphenator.h"
@ -87,84 +85,44 @@ size_t byteOffsetForIndex(const std::vector<CodepointInfo>& cps, const size_t in
  return cps[index].byteOffset;
 }

-// Safely slices a UTF-8 string without splitting multibyte sequences.
-std::string slice(const std::string& word, const size_t startByte, const size_t endByte) {
-  if (startByte >= endByte || startByte >= word.size()) {
-    return std::string();
-  }
-  const size_t boundedEnd = std::min(endByte, word.size());
-  return word.substr(startByte, boundedEnd - startByte);
-}
-
 }  // namespace

-bool Hyphenator::splitWord(const GfxRenderer& renderer, const int fontId, const std::string& word,
-                           const EpdFontStyle style, const int availableWidth, HyphenationResult* result,
-                           const bool force) {
-  if (!result || word.empty()) {
-    return false;
+// Collects the byte offsets at which `word` may be hyphenated.
+//
+// Break positions are gathered as codepoint indexes, sorted, de-duplicated and then converted to byte offsets.
+// Returns an empty vector for an empty word, for a word with fewer than MIN_PREFIX_CP + MIN_SUFFIX_CP codepoints,
+// or when no break position was found. Measuring text and choosing which offset fits is left to the caller.
+std::vector<size_t> Hyphenator::breakOffsets(const std::string& word, const bool includeFallback) {
+  std::vector<size_t> byteOffsets;
+  if (word.empty()) {
+    return byteOffsets;
   }

+  // Words too short to leave the minimum prefix and suffix on either side of a break are never split.
   auto cps = collectCodepoints(word);
   if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) {
-    return false;
+    return byteOffsets;
   }

-  // Skip mixed tokens (e.g., "v2.0") unless the caller forces a split due to overflow.
-  if (!force && !hasOnlyAlphabetic(cps)) {
-    return false;
+  std::vector<size_t> indexes;
+  indexes.reserve(cps.size());
+
+  // Break points from collectBreakIndexes are only gathered for purely alphabetic words.
+  if (hasOnlyAlphabetic(cps)) {
+    auto dictBreaks = collectBreakIndexes(cps);
+    indexes.insert(indexes.end(), dictBreaks.begin(), dictBreaks.end());
   }

-  const auto breakIndexes = collectBreakIndexes(cps);
-  // Budget for a trailing hyphen so rendered width matches the layout test.
-  const int hyphenWidth = renderer.getTextWidth(fontId, "-", style);
-  const int adjustedWidth = availableWidth - hyphenWidth;
-
-  size_t chosenIndex = std::numeric_limits<size_t>::max();
-
-  // Prefer dictionary-style break points emitted by language hyphenators.
-  if (adjustedWidth > 0) {
-    for (const size_t idx : breakIndexes) {
-      const size_t byteOffset = byteOffsetForIndex(cps, idx);
-      const std::string prefix = word.substr(0, byteOffset);
-      const int prefixWidth = renderer.getTextWidth(fontId, prefix.c_str(), style);
-      if (prefixWidth <= adjustedWidth) {
-        chosenIndex = idx;
-      } else {
-        break;
-      }
-    }
-  }
-
-  if (chosenIndex == std::numeric_limits<size_t>::max() && force) {
-    // Emergency fallback: brute-force through codepoints to avoid overflow when no legal breaks fit.
+  // Fallback: additionally offer every codepoint position that respects the minimum prefix and suffix lengths.
+  if (includeFallback) {
     for (size_t idx = MIN_PREFIX_CP; idx + MIN_SUFFIX_CP <= cps.size(); ++idx) {
-      const size_t byteOffset = byteOffsetForIndex(cps, idx);
-      const std::string prefix = word.substr(0, byteOffset);
-      const int prefixWidth = renderer.getTextWidth(fontId, prefix.c_str(), style);
-      if (adjustedWidth <= 0 || prefixWidth <= adjustedWidth) {
-        chosenIndex = idx;
-        if (adjustedWidth > 0 && prefixWidth > adjustedWidth) {
-          break;
-        }
-      }
+      indexes.push_back(idx);
     }
   }

-  if (chosenIndex == std::numeric_limits<size_t>::max()) {
-    return false;
+  if (indexes.empty()) {
+    return byteOffsets;
   }

-  const size_t splitByte = byteOffsetForIndex(cps, chosenIndex);
-  const std::string head = word.substr(0, splitByte);
-  const std::string tail = slice(word, splitByte, word.size());
+  // The two sources can overlap: sort ascending, then drop duplicates.
+  std::sort(indexes.begin(), indexes.end());
+  indexes.erase(std::unique(indexes.begin(), indexes.end()), indexes.end());

-  if (head.empty() || tail.empty()) {
-    return false;
+  // Convert codepoint indexes into byte offsets within `word`.
+  byteOffsets.reserve(indexes.size());
+  for (const size_t idx : indexes) {
+    byteOffsets.push_back(byteOffsetForIndex(cps, idx));
   }

-  // Append the printed hyphen to the prefix while leaving the tail untouched.
-  result->head = head + "-";
-  result->tail = tail;
-  return true;
+  return byteOffsets;
 }
--- a/lib/Epub/Epub/hyphenation/Hyphenator.h
+++ b/lib/Epub/Epub/hyphenation/Hyphenator.h
@ -1,20 +1,12 @@
 #pragma once

-#include <EpdFontFamily.h>
-
+#include <cstddef>
 #include <string>
-
-class GfxRenderer;
-
-// Holds the split portions of a hyphenated word.
-struct HyphenationResult {
-  std::string head;
-  std::string tail;
-};
+#include <vector>

 class Hyphenator {
 public:
-  // Splits a word so it fits within availableWidth, appending a hyphen to the head when needed.
-  static bool splitWord(const GfxRenderer& renderer, int fontId, const std::string& word, EpdFontStyle style,
-                        int availableWidth, HyphenationResult* result, bool force);
+  // Returns byte offsets where the word may be hyphenated, in ascending break-position order with duplicates
+  // removed. The result is empty when the word is empty or too short to satisfy the minimum prefix/suffix lengths.
+  // Language-specific break points are only produced for purely alphabetic words. When includeFallback is true, all
+  // positions obeying the minimum prefix/suffix constraints are returned even if no language-specific rule matches.
+  // No text measurement happens here; callers decide which offset fits their available width.
+  static std::vector<size_t> breakOffsets(const std::string& word, bool includeFallback);
 };