feat: dict based Hyphenation (#305)

## Summary * Adds (optional) Hyphenation for English, French, German, Russian languages ## Additional Context * Included hyphenation dictionaries add approximately 280kb to the flash usage (German alone takes 200kb) * Trie encoded dictionaries are adopted from hypher project (https://github.com/typst/hypher) * Soft hyphens (and other explicit hyphens) take precedence over dict-based hyphenation. Overall, the hyphenation rules are quite aggressive, as I believe it makes more sense on our smaller screen. --------- Co-authored-by: Dave Allie <dave@daveallie.com>
2026-01-19 17:56:26 +05:00
parent 5fef99c641
commit 8824c87490
40 changed files with 36465 additions and 52 deletions
--- a/lib/Epub/Epub/ParsedText.cpp
+++ b/lib/Epub/Epub/ParsedText.cpp
@@ -5,11 +5,50 @@
 #include <algorithm>
 #include <cmath>
 #include <functional>
+#include <iterator>
 #include <limits>
 #include <vector>

+#include "hyphenation/Hyphenator.h"
+
 constexpr int MAX_COST = std::numeric_limits<int>::max();

+namespace {
+
+// Soft hyphen byte pattern used throughout EPUBs (UTF-8 for U+00AD).
+constexpr char SOFT_HYPHEN_UTF8[] = "\xC2\xAD";
+constexpr size_t SOFT_HYPHEN_BYTES = 2;
+
+bool containsSoftHyphen(const std::string& word) { return word.find(SOFT_HYPHEN_UTF8) != std::string::npos; }
+
+// Removes every soft hyphen in-place so rendered glyphs match measured widths.
+void stripSoftHyphensInPlace(std::string& word) {
+  size_t pos = 0;
+  while ((pos = word.find(SOFT_HYPHEN_UTF8, pos)) != std::string::npos) {
+    word.erase(pos, SOFT_HYPHEN_BYTES);
+  }
+}
+
+// Returns the rendered width for a word while ignoring soft hyphen glyphs and optionally appending a visible hyphen.
+uint16_t measureWordWidth(const GfxRenderer& renderer, const int fontId, const std::string& word,
+                          const EpdFontFamily::Style style, const bool appendHyphen = false) {
+  const bool hasSoftHyphen = containsSoftHyphen(word);
+  if (!hasSoftHyphen && !appendHyphen) {
+    return renderer.getTextWidth(fontId, word.c_str(), style);
+  }
+
+  std::string sanitized = word;
+  if (hasSoftHyphen) {
+    stripSoftHyphensInPlace(sanitized);
+  }
+  if (appendHyphen) {
+    sanitized.push_back('-');
+  }
+  return renderer.getTextWidth(fontId, sanitized.c_str(), style);
+}
+
+}  // namespace
+
 void ParsedText::addWord(std::string word, const EpdFontFamily::Style fontStyle) {
  if (word.empty()) return;

@@ -25,10 +64,19 @@ void ParsedText::layoutAndExtractLines(const GfxRenderer& renderer, const int fo
    return;
  }

+  // Apply fixed transforms before any per-line layout work.
+  applyParagraphIndent();
+
  const int pageWidth = viewportWidth;
  const int spaceWidth = renderer.getSpaceWidth(fontId);
-  const auto wordWidths = calculateWordWidths(renderer, fontId);
-  const auto lineBreakIndices = computeLineBreaks(pageWidth, spaceWidth, wordWidths);
+  auto wordWidths = calculateWordWidths(renderer, fontId);
+  std::vector<size_t> lineBreakIndices;
+  if (hyphenationEnabled) {
+    // Use greedy layout that can split words mid-loop when a hyphenated prefix fits.
+    lineBreakIndices = computeHyphenatedLineBreaks(renderer, fontId, pageWidth, spaceWidth, wordWidths);
+  } else {
+    lineBreakIndices = computeLineBreaks(renderer, fontId, pageWidth, spaceWidth, wordWidths);
+  }
  const size_t lineCount = includeLastLine ? lineBreakIndices.size() : lineBreakIndices.size() - 1;

  for (size_t i = 0; i < lineCount; ++i) {
@@ -42,17 +90,11 @@ std::vector<uint16_t> ParsedText::calculateWordWidths(const GfxRenderer& rendere
  std::vector<uint16_t> wordWidths;
  wordWidths.reserve(totalWordCount);

-  // add em-space at the beginning of first word in paragraph to indent
-  if ((style == TextBlock::JUSTIFIED || style == TextBlock::LEFT_ALIGN) && !extraParagraphSpacing) {
-    std::string& first_word = words.front();
-    first_word.insert(0, "\xe2\x80\x83");
-  }
-
  auto wordsIt = words.begin();
  auto wordStylesIt = wordStyles.begin();

  while (wordsIt != words.end()) {
-    wordWidths.push_back(renderer.getTextWidth(fontId, wordsIt->c_str(), *wordStylesIt));
+    wordWidths.push_back(measureWordWidth(renderer, fontId, *wordsIt, *wordStylesIt));

    std::advance(wordsIt, 1);
    std::advance(wordStylesIt, 1);
@@ -61,8 +103,21 @@ std::vector<uint16_t> ParsedText::calculateWordWidths(const GfxRenderer& rendere
  return wordWidths;
 }

-std::vector<size_t> ParsedText::computeLineBreaks(const int pageWidth, const int spaceWidth,
-                                                  const std::vector<uint16_t>& wordWidths) const {
+std::vector<size_t> ParsedText::computeLineBreaks(const GfxRenderer& renderer, const int fontId, const int pageWidth,
+                                                  const int spaceWidth, std::vector<uint16_t>& wordWidths) {
+  if (words.empty()) {
+    return {};
+  }
+
+  // Ensure any word that would overflow even as the first entry on a line is split using fallback hyphenation.
+  for (size_t i = 0; i < wordWidths.size(); ++i) {
+    while (wordWidths[i] > pageWidth) {
+      if (!hyphenateWordAtIndex(i, pageWidth, renderer, fontId, wordWidths, /*allowFallbackBreaks=*/true)) {
+        break;
+      }
+    }
+  }
+
  const size_t totalWordCount = words.size();

  // DP table to store the minimum badness (cost) of lines starting at index i
@@ -140,6 +195,138 @@ std::vector<size_t> ParsedText::computeLineBreaks(const int pageWidth, const int
  return lineBreakIndices;
 }

+void ParsedText::applyParagraphIndent() {
+  if (extraParagraphSpacing || words.empty()) {
+    return;
+  }
+
+  if (style == TextBlock::JUSTIFIED || style == TextBlock::LEFT_ALIGN) {
+    words.front().insert(0, "\xe2\x80\x83");
+  }
+}
+
+// Builds break indices while opportunistically splitting the word that would overflow the current line.
+std::vector<size_t> ParsedText::computeHyphenatedLineBreaks(const GfxRenderer& renderer, const int fontId,
+                                                            const int pageWidth, const int spaceWidth,
+                                                            std::vector<uint16_t>& wordWidths) {
+  std::vector<size_t> lineBreakIndices;
+  size_t currentIndex = 0;
+
+  while (currentIndex < wordWidths.size()) {
+    const size_t lineStart = currentIndex;
+    int lineWidth = 0;
+
+    // Consume as many words as possible for current line, splitting when prefixes fit
+    while (currentIndex < wordWidths.size()) {
+      const bool isFirstWord = currentIndex == lineStart;
+      const int spacing = isFirstWord ? 0 : spaceWidth;
+      const int candidateWidth = spacing + wordWidths[currentIndex];
+
+      // Word fits on current line
+      if (lineWidth + candidateWidth <= pageWidth) {
+        lineWidth += candidateWidth;
+        ++currentIndex;
+        continue;
+      }
+
+      // Word would overflow — try to split based on hyphenation points
+      const int availableWidth = pageWidth - lineWidth - spacing;
+      const bool allowFallbackBreaks = isFirstWord;  // Only for first word on line
+
+      if (availableWidth > 0 &&
+          hyphenateWordAtIndex(currentIndex, availableWidth, renderer, fontId, wordWidths, allowFallbackBreaks)) {
+        // Prefix now fits; append it to this line and move to next line
+        lineWidth += spacing + wordWidths[currentIndex];
+        ++currentIndex;
+        break;
+      }
+
+      // Could not split: force at least one word per line to avoid infinite loop
+      if (currentIndex == lineStart) {
+        lineWidth += candidateWidth;
+        ++currentIndex;
+      }
+      break;
+    }
+
+    lineBreakIndices.push_back(currentIndex);
+  }
+
+  return lineBreakIndices;
+}
+
+// Splits words[wordIndex] into prefix (adding a hyphen only when needed) and remainder when a legal breakpoint fits the
+// available width.
+bool ParsedText::hyphenateWordAtIndex(const size_t wordIndex, const int availableWidth, const GfxRenderer& renderer,
+                                      const int fontId, std::vector<uint16_t>& wordWidths,
+                                      const bool allowFallbackBreaks) {
+  // Guard against invalid indices or zero available width before attempting to split.
+  if (availableWidth <= 0 || wordIndex >= words.size()) {
+    return false;
+  }
+
+  // Get iterators to target word and style.
+  auto wordIt = words.begin();
+  auto styleIt = wordStyles.begin();
+  std::advance(wordIt, wordIndex);
+  std::advance(styleIt, wordIndex);
+
+  const std::string& word = *wordIt;
+  const auto style = *styleIt;
+
+  // Collect candidate breakpoints (byte offsets and hyphen requirements).
+  auto breakInfos = Hyphenator::breakOffsets(word, allowFallbackBreaks);
+  if (breakInfos.empty()) {
+    return false;
+  }
+
+  size_t chosenOffset = 0;
+  int chosenWidth = -1;
+  bool chosenNeedsHyphen = true;
+
+  // Iterate over each legal breakpoint and retain the widest prefix that still fits.
+  for (const auto& info : breakInfos) {
+    const size_t offset = info.byteOffset;
+    if (offset == 0 || offset >= word.size()) {
+      continue;
+    }
+
+    const bool needsHyphen = info.requiresInsertedHyphen;
+    const int prefixWidth = measureWordWidth(renderer, fontId, word.substr(0, offset), style, needsHyphen);
+    if (prefixWidth > availableWidth || prefixWidth <= chosenWidth) {
+      continue;  // Skip if too wide or not an improvement
+    }
+
+    chosenWidth = prefixWidth;
+    chosenOffset = offset;
+    chosenNeedsHyphen = needsHyphen;
+  }
+
+  if (chosenWidth < 0) {
+    // No hyphenation point produced a prefix that fits in the remaining space.
+    return false;
+  }
+
+  // Split the word at the selected breakpoint and append a hyphen if required.
+  std::string remainder = word.substr(chosenOffset);
+  wordIt->resize(chosenOffset);
+  if (chosenNeedsHyphen) {
+    wordIt->push_back('-');
+  }
+
+  // Insert the remainder word (with matching style) directly after the prefix.
+  auto insertWordIt = std::next(wordIt);
+  auto insertStyleIt = std::next(styleIt);
+  words.insert(insertWordIt, remainder);
+  wordStyles.insert(insertStyleIt, style);
+
+  // Update cached widths to reflect the new prefix/remainder pairing.
+  wordWidths[wordIndex] = static_cast<uint16_t>(chosenWidth);
+  const uint16_t remainderWidth = measureWordWidth(renderer, fontId, remainder, style);
+  wordWidths.insert(wordWidths.begin() + wordIndex + 1, remainderWidth);
+  return true;
+}
+
 void ParsedText::extractLine(const size_t breakIndex, const int pageWidth, const int spaceWidth,
                             const std::vector<uint16_t>& wordWidths, const std::vector<size_t>& lineBreakIndices,
                             const std::function<void(std::shared_ptr<TextBlock>)>& processLine) {
@@ -191,5 +378,11 @@ void ParsedText::extractLine(const size_t breakIndex, const int pageWidth, const
  std::list<EpdFontFamily::Style> lineWordStyles;
  lineWordStyles.splice(lineWordStyles.begin(), wordStyles, wordStyles.begin(), wordStyleEndIt);

+  for (auto& word : lineWords) {
+    if (containsSoftHyphen(word)) {
+      stripSoftHyphensInPlace(word);
+    }
+  }
+
  processLine(std::make_shared<TextBlock>(std::move(lineWords), std::move(lineXPos), std::move(lineWordStyles), style));
-}
+}