feat: dict based Hyphenation (#305)

## Summary

* Adds (optional) Hyphenation for English, French, German, Russian
languages

## Additional Context

* Included hyphenation dictionaries add approximately 280kb to the flash
usage (German alone takes 200kb)
* Trie encoded dictionaries are adopted from hypher project
(https://github.com/typst/hypher)
* Soft hyphens (and other explicit hyphens) take precedence over
dict-based hyphenation. Overall, the hyphenation rules are quite
aggressive, as I believe it makes more sense on our smaller screen.

---------

Co-authored-by: Dave Allie <dave@daveallie.com>
This commit is contained in:
Arthur Tazhitdinov
2026-01-19 17:56:26 +05:00
committed by GitHub
parent 5fef99c641
commit 8824c87490
40 changed files with 36465 additions and 52 deletions

View File

@@ -5,11 +5,50 @@
#include <algorithm>
#include <cmath>
#include <functional>
#include <iterator>
#include <limits>
#include <vector>
#include "hyphenation/Hyphenator.h"
constexpr int MAX_COST = std::numeric_limits<int>::max();
namespace {
// Soft hyphen byte pattern used throughout EPUBs (UTF-8 for U+00AD).
constexpr char SOFT_HYPHEN_UTF8[] = "\xC2\xAD";
constexpr size_t SOFT_HYPHEN_BYTES = 2;
bool containsSoftHyphen(const std::string& word) { return word.find(SOFT_HYPHEN_UTF8) != std::string::npos; }
// Removes every soft hyphen in-place so rendered glyphs match measured widths.
void stripSoftHyphensInPlace(std::string& word) {
size_t pos = 0;
while ((pos = word.find(SOFT_HYPHEN_UTF8, pos)) != std::string::npos) {
word.erase(pos, SOFT_HYPHEN_BYTES);
}
}
// Returns the rendered width for a word while ignoring soft hyphen glyphs and optionally appending a visible hyphen.
uint16_t measureWordWidth(const GfxRenderer& renderer, const int fontId, const std::string& word,
const EpdFontFamily::Style style, const bool appendHyphen = false) {
const bool hasSoftHyphen = containsSoftHyphen(word);
if (!hasSoftHyphen && !appendHyphen) {
return renderer.getTextWidth(fontId, word.c_str(), style);
}
std::string sanitized = word;
if (hasSoftHyphen) {
stripSoftHyphensInPlace(sanitized);
}
if (appendHyphen) {
sanitized.push_back('-');
}
return renderer.getTextWidth(fontId, sanitized.c_str(), style);
}
} // namespace
void ParsedText::addWord(std::string word, const EpdFontFamily::Style fontStyle) {
if (word.empty()) return;
@@ -25,10 +64,19 @@ void ParsedText::layoutAndExtractLines(const GfxRenderer& renderer, const int fo
return;
}
// Apply fixed transforms before any per-line layout work.
applyParagraphIndent();
const int pageWidth = viewportWidth;
const int spaceWidth = renderer.getSpaceWidth(fontId);
const auto wordWidths = calculateWordWidths(renderer, fontId);
const auto lineBreakIndices = computeLineBreaks(pageWidth, spaceWidth, wordWidths);
auto wordWidths = calculateWordWidths(renderer, fontId);
std::vector<size_t> lineBreakIndices;
if (hyphenationEnabled) {
// Use greedy layout that can split words mid-loop when a hyphenated prefix fits.
lineBreakIndices = computeHyphenatedLineBreaks(renderer, fontId, pageWidth, spaceWidth, wordWidths);
} else {
lineBreakIndices = computeLineBreaks(renderer, fontId, pageWidth, spaceWidth, wordWidths);
}
const size_t lineCount = includeLastLine ? lineBreakIndices.size() : lineBreakIndices.size() - 1;
for (size_t i = 0; i < lineCount; ++i) {
@@ -42,17 +90,11 @@ std::vector<uint16_t> ParsedText::calculateWordWidths(const GfxRenderer& rendere
std::vector<uint16_t> wordWidths;
wordWidths.reserve(totalWordCount);
// add em-space at the beginning of first word in paragraph to indent
if ((style == TextBlock::JUSTIFIED || style == TextBlock::LEFT_ALIGN) && !extraParagraphSpacing) {
std::string& first_word = words.front();
first_word.insert(0, "\xe2\x80\x83");
}
auto wordsIt = words.begin();
auto wordStylesIt = wordStyles.begin();
while (wordsIt != words.end()) {
wordWidths.push_back(renderer.getTextWidth(fontId, wordsIt->c_str(), *wordStylesIt));
wordWidths.push_back(measureWordWidth(renderer, fontId, *wordsIt, *wordStylesIt));
std::advance(wordsIt, 1);
std::advance(wordStylesIt, 1);
@@ -61,8 +103,21 @@ std::vector<uint16_t> ParsedText::calculateWordWidths(const GfxRenderer& rendere
return wordWidths;
}
std::vector<size_t> ParsedText::computeLineBreaks(const int pageWidth, const int spaceWidth,
const std::vector<uint16_t>& wordWidths) const {
std::vector<size_t> ParsedText::computeLineBreaks(const GfxRenderer& renderer, const int fontId, const int pageWidth,
const int spaceWidth, std::vector<uint16_t>& wordWidths) {
if (words.empty()) {
return {};
}
// Ensure any word that would overflow even as the first entry on a line is split using fallback hyphenation.
for (size_t i = 0; i < wordWidths.size(); ++i) {
while (wordWidths[i] > pageWidth) {
if (!hyphenateWordAtIndex(i, pageWidth, renderer, fontId, wordWidths, /*allowFallbackBreaks=*/true)) {
break;
}
}
}
const size_t totalWordCount = words.size();
// DP table to store the minimum badness (cost) of lines starting at index i
@@ -140,6 +195,138 @@ std::vector<size_t> ParsedText::computeLineBreaks(const int pageWidth, const int
return lineBreakIndices;
}
void ParsedText::applyParagraphIndent() {
if (extraParagraphSpacing || words.empty()) {
return;
}
if (style == TextBlock::JUSTIFIED || style == TextBlock::LEFT_ALIGN) {
words.front().insert(0, "\xe2\x80\x83");
}
}
// Builds break indices while opportunistically splitting the word that would overflow the current line.
std::vector<size_t> ParsedText::computeHyphenatedLineBreaks(const GfxRenderer& renderer, const int fontId,
const int pageWidth, const int spaceWidth,
std::vector<uint16_t>& wordWidths) {
std::vector<size_t> lineBreakIndices;
size_t currentIndex = 0;
while (currentIndex < wordWidths.size()) {
const size_t lineStart = currentIndex;
int lineWidth = 0;
// Consume as many words as possible for current line, splitting when prefixes fit
while (currentIndex < wordWidths.size()) {
const bool isFirstWord = currentIndex == lineStart;
const int spacing = isFirstWord ? 0 : spaceWidth;
const int candidateWidth = spacing + wordWidths[currentIndex];
// Word fits on current line
if (lineWidth + candidateWidth <= pageWidth) {
lineWidth += candidateWidth;
++currentIndex;
continue;
}
// Word would overflow — try to split based on hyphenation points
const int availableWidth = pageWidth - lineWidth - spacing;
const bool allowFallbackBreaks = isFirstWord; // Only for first word on line
if (availableWidth > 0 &&
hyphenateWordAtIndex(currentIndex, availableWidth, renderer, fontId, wordWidths, allowFallbackBreaks)) {
// Prefix now fits; append it to this line and move to next line
lineWidth += spacing + wordWidths[currentIndex];
++currentIndex;
break;
}
// Could not split: force at least one word per line to avoid infinite loop
if (currentIndex == lineStart) {
lineWidth += candidateWidth;
++currentIndex;
}
break;
}
lineBreakIndices.push_back(currentIndex);
}
return lineBreakIndices;
}
// Splits words[wordIndex] into prefix (adding a hyphen only when needed) and remainder when a legal breakpoint fits the
// available width.
bool ParsedText::hyphenateWordAtIndex(const size_t wordIndex, const int availableWidth, const GfxRenderer& renderer,
const int fontId, std::vector<uint16_t>& wordWidths,
const bool allowFallbackBreaks) {
// Guard against invalid indices or zero available width before attempting to split.
if (availableWidth <= 0 || wordIndex >= words.size()) {
return false;
}
// Get iterators to target word and style.
auto wordIt = words.begin();
auto styleIt = wordStyles.begin();
std::advance(wordIt, wordIndex);
std::advance(styleIt, wordIndex);
const std::string& word = *wordIt;
const auto style = *styleIt;
// Collect candidate breakpoints (byte offsets and hyphen requirements).
auto breakInfos = Hyphenator::breakOffsets(word, allowFallbackBreaks);
if (breakInfos.empty()) {
return false;
}
size_t chosenOffset = 0;
int chosenWidth = -1;
bool chosenNeedsHyphen = true;
// Iterate over each legal breakpoint and retain the widest prefix that still fits.
for (const auto& info : breakInfos) {
const size_t offset = info.byteOffset;
if (offset == 0 || offset >= word.size()) {
continue;
}
const bool needsHyphen = info.requiresInsertedHyphen;
const int prefixWidth = measureWordWidth(renderer, fontId, word.substr(0, offset), style, needsHyphen);
if (prefixWidth > availableWidth || prefixWidth <= chosenWidth) {
continue; // Skip if too wide or not an improvement
}
chosenWidth = prefixWidth;
chosenOffset = offset;
chosenNeedsHyphen = needsHyphen;
}
if (chosenWidth < 0) {
// No hyphenation point produced a prefix that fits in the remaining space.
return false;
}
// Split the word at the selected breakpoint and append a hyphen if required.
std::string remainder = word.substr(chosenOffset);
wordIt->resize(chosenOffset);
if (chosenNeedsHyphen) {
wordIt->push_back('-');
}
// Insert the remainder word (with matching style) directly after the prefix.
auto insertWordIt = std::next(wordIt);
auto insertStyleIt = std::next(styleIt);
words.insert(insertWordIt, remainder);
wordStyles.insert(insertStyleIt, style);
// Update cached widths to reflect the new prefix/remainder pairing.
wordWidths[wordIndex] = static_cast<uint16_t>(chosenWidth);
const uint16_t remainderWidth = measureWordWidth(renderer, fontId, remainder, style);
wordWidths.insert(wordWidths.begin() + wordIndex + 1, remainderWidth);
return true;
}
void ParsedText::extractLine(const size_t breakIndex, const int pageWidth, const int spaceWidth,
const std::vector<uint16_t>& wordWidths, const std::vector<size_t>& lineBreakIndices,
const std::function<void(std::shared_ptr<TextBlock>)>& processLine) {
@@ -191,5 +378,11 @@ void ParsedText::extractLine(const size_t breakIndex, const int pageWidth, const
std::list<EpdFontFamily::Style> lineWordStyles;
lineWordStyles.splice(lineWordStyles.begin(), wordStyles, wordStyles.begin(), wordStyleEndIt);
for (auto& word : lineWords) {
if (containsSoftHyphen(word)) {
stripSoftHyphensInPlace(word);
}
}
processLine(std::make_shared<TextBlock>(std::move(lineWords), std::move(lineXPos), std::move(lineWordStyles), style));
}
}