feat: dict based Hyphenation (#305)
## Summary * Adds (optional) Hyphenation for English, French, German, Russian languages ## Additional Context * Included hyphenation dictionaries add approximately 280kb to the flash usage (German alone takes 200kb) * Trie encoded dictionaries are adopted from hypher project (https://github.com/typst/hypher) * Soft hyphens (and other explicit hyphens) take precedence over dict-based hyphenation. Overall, the hyphenation rules are quite aggressive, as I believe it makes more sense on our smaller screen. --------- Co-authored-by: Dave Allie <dave@daveallie.com>
This commit is contained in:
committed by
GitHub
parent
5fef99c641
commit
8824c87490
@@ -5,11 +5,50 @@
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <functional>
|
||||
#include <iterator>
|
||||
#include <limits>
|
||||
#include <vector>
|
||||
|
||||
#include "hyphenation/Hyphenator.h"
|
||||
|
||||
constexpr int MAX_COST = std::numeric_limits<int>::max();
|
||||
|
||||
namespace {
|
||||
|
||||
// Soft hyphen byte pattern used throughout EPUBs (UTF-8 for U+00AD).
|
||||
constexpr char SOFT_HYPHEN_UTF8[] = "\xC2\xAD";
|
||||
constexpr size_t SOFT_HYPHEN_BYTES = 2;
|
||||
|
||||
bool containsSoftHyphen(const std::string& word) { return word.find(SOFT_HYPHEN_UTF8) != std::string::npos; }
|
||||
|
||||
// Removes every soft hyphen in-place so rendered glyphs match measured widths.
|
||||
void stripSoftHyphensInPlace(std::string& word) {
|
||||
size_t pos = 0;
|
||||
while ((pos = word.find(SOFT_HYPHEN_UTF8, pos)) != std::string::npos) {
|
||||
word.erase(pos, SOFT_HYPHEN_BYTES);
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the rendered width for a word while ignoring soft hyphen glyphs and optionally appending a visible hyphen.
|
||||
uint16_t measureWordWidth(const GfxRenderer& renderer, const int fontId, const std::string& word,
|
||||
const EpdFontFamily::Style style, const bool appendHyphen = false) {
|
||||
const bool hasSoftHyphen = containsSoftHyphen(word);
|
||||
if (!hasSoftHyphen && !appendHyphen) {
|
||||
return renderer.getTextWidth(fontId, word.c_str(), style);
|
||||
}
|
||||
|
||||
std::string sanitized = word;
|
||||
if (hasSoftHyphen) {
|
||||
stripSoftHyphensInPlace(sanitized);
|
||||
}
|
||||
if (appendHyphen) {
|
||||
sanitized.push_back('-');
|
||||
}
|
||||
return renderer.getTextWidth(fontId, sanitized.c_str(), style);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
void ParsedText::addWord(std::string word, const EpdFontFamily::Style fontStyle) {
|
||||
if (word.empty()) return;
|
||||
|
||||
@@ -25,10 +64,19 @@ void ParsedText::layoutAndExtractLines(const GfxRenderer& renderer, const int fo
|
||||
return;
|
||||
}
|
||||
|
||||
// Apply fixed transforms before any per-line layout work.
|
||||
applyParagraphIndent();
|
||||
|
||||
const int pageWidth = viewportWidth;
|
||||
const int spaceWidth = renderer.getSpaceWidth(fontId);
|
||||
const auto wordWidths = calculateWordWidths(renderer, fontId);
|
||||
const auto lineBreakIndices = computeLineBreaks(pageWidth, spaceWidth, wordWidths);
|
||||
auto wordWidths = calculateWordWidths(renderer, fontId);
|
||||
std::vector<size_t> lineBreakIndices;
|
||||
if (hyphenationEnabled) {
|
||||
// Use greedy layout that can split words mid-loop when a hyphenated prefix fits.
|
||||
lineBreakIndices = computeHyphenatedLineBreaks(renderer, fontId, pageWidth, spaceWidth, wordWidths);
|
||||
} else {
|
||||
lineBreakIndices = computeLineBreaks(renderer, fontId, pageWidth, spaceWidth, wordWidths);
|
||||
}
|
||||
const size_t lineCount = includeLastLine ? lineBreakIndices.size() : lineBreakIndices.size() - 1;
|
||||
|
||||
for (size_t i = 0; i < lineCount; ++i) {
|
||||
@@ -42,17 +90,11 @@ std::vector<uint16_t> ParsedText::calculateWordWidths(const GfxRenderer& rendere
|
||||
std::vector<uint16_t> wordWidths;
|
||||
wordWidths.reserve(totalWordCount);
|
||||
|
||||
// add em-space at the beginning of first word in paragraph to indent
|
||||
if ((style == TextBlock::JUSTIFIED || style == TextBlock::LEFT_ALIGN) && !extraParagraphSpacing) {
|
||||
std::string& first_word = words.front();
|
||||
first_word.insert(0, "\xe2\x80\x83");
|
||||
}
|
||||
|
||||
auto wordsIt = words.begin();
|
||||
auto wordStylesIt = wordStyles.begin();
|
||||
|
||||
while (wordsIt != words.end()) {
|
||||
wordWidths.push_back(renderer.getTextWidth(fontId, wordsIt->c_str(), *wordStylesIt));
|
||||
wordWidths.push_back(measureWordWidth(renderer, fontId, *wordsIt, *wordStylesIt));
|
||||
|
||||
std::advance(wordsIt, 1);
|
||||
std::advance(wordStylesIt, 1);
|
||||
@@ -61,8 +103,21 @@ std::vector<uint16_t> ParsedText::calculateWordWidths(const GfxRenderer& rendere
|
||||
return wordWidths;
|
||||
}
|
||||
|
||||
std::vector<size_t> ParsedText::computeLineBreaks(const int pageWidth, const int spaceWidth,
|
||||
const std::vector<uint16_t>& wordWidths) const {
|
||||
std::vector<size_t> ParsedText::computeLineBreaks(const GfxRenderer& renderer, const int fontId, const int pageWidth,
|
||||
const int spaceWidth, std::vector<uint16_t>& wordWidths) {
|
||||
if (words.empty()) {
|
||||
return {};
|
||||
}
|
||||
|
||||
// Ensure any word that would overflow even as the first entry on a line is split using fallback hyphenation.
|
||||
for (size_t i = 0; i < wordWidths.size(); ++i) {
|
||||
while (wordWidths[i] > pageWidth) {
|
||||
if (!hyphenateWordAtIndex(i, pageWidth, renderer, fontId, wordWidths, /*allowFallbackBreaks=*/true)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const size_t totalWordCount = words.size();
|
||||
|
||||
// DP table to store the minimum badness (cost) of lines starting at index i
|
||||
@@ -140,6 +195,138 @@ std::vector<size_t> ParsedText::computeLineBreaks(const int pageWidth, const int
|
||||
return lineBreakIndices;
|
||||
}
|
||||
|
||||
void ParsedText::applyParagraphIndent() {
|
||||
if (extraParagraphSpacing || words.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (style == TextBlock::JUSTIFIED || style == TextBlock::LEFT_ALIGN) {
|
||||
words.front().insert(0, "\xe2\x80\x83");
|
||||
}
|
||||
}
|
||||
|
||||
// Builds break indices while opportunistically splitting the word that would overflow the current line.
|
||||
std::vector<size_t> ParsedText::computeHyphenatedLineBreaks(const GfxRenderer& renderer, const int fontId,
|
||||
const int pageWidth, const int spaceWidth,
|
||||
std::vector<uint16_t>& wordWidths) {
|
||||
std::vector<size_t> lineBreakIndices;
|
||||
size_t currentIndex = 0;
|
||||
|
||||
while (currentIndex < wordWidths.size()) {
|
||||
const size_t lineStart = currentIndex;
|
||||
int lineWidth = 0;
|
||||
|
||||
// Consume as many words as possible for current line, splitting when prefixes fit
|
||||
while (currentIndex < wordWidths.size()) {
|
||||
const bool isFirstWord = currentIndex == lineStart;
|
||||
const int spacing = isFirstWord ? 0 : spaceWidth;
|
||||
const int candidateWidth = spacing + wordWidths[currentIndex];
|
||||
|
||||
// Word fits on current line
|
||||
if (lineWidth + candidateWidth <= pageWidth) {
|
||||
lineWidth += candidateWidth;
|
||||
++currentIndex;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Word would overflow — try to split based on hyphenation points
|
||||
const int availableWidth = pageWidth - lineWidth - spacing;
|
||||
const bool allowFallbackBreaks = isFirstWord; // Only for first word on line
|
||||
|
||||
if (availableWidth > 0 &&
|
||||
hyphenateWordAtIndex(currentIndex, availableWidth, renderer, fontId, wordWidths, allowFallbackBreaks)) {
|
||||
// Prefix now fits; append it to this line and move to next line
|
||||
lineWidth += spacing + wordWidths[currentIndex];
|
||||
++currentIndex;
|
||||
break;
|
||||
}
|
||||
|
||||
// Could not split: force at least one word per line to avoid infinite loop
|
||||
if (currentIndex == lineStart) {
|
||||
lineWidth += candidateWidth;
|
||||
++currentIndex;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
lineBreakIndices.push_back(currentIndex);
|
||||
}
|
||||
|
||||
return lineBreakIndices;
|
||||
}
|
||||
|
||||
// Splits words[wordIndex] into prefix (adding a hyphen only when needed) and remainder when a legal breakpoint fits the
|
||||
// available width.
|
||||
bool ParsedText::hyphenateWordAtIndex(const size_t wordIndex, const int availableWidth, const GfxRenderer& renderer,
|
||||
const int fontId, std::vector<uint16_t>& wordWidths,
|
||||
const bool allowFallbackBreaks) {
|
||||
// Guard against invalid indices or zero available width before attempting to split.
|
||||
if (availableWidth <= 0 || wordIndex >= words.size()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Get iterators to target word and style.
|
||||
auto wordIt = words.begin();
|
||||
auto styleIt = wordStyles.begin();
|
||||
std::advance(wordIt, wordIndex);
|
||||
std::advance(styleIt, wordIndex);
|
||||
|
||||
const std::string& word = *wordIt;
|
||||
const auto style = *styleIt;
|
||||
|
||||
// Collect candidate breakpoints (byte offsets and hyphen requirements).
|
||||
auto breakInfos = Hyphenator::breakOffsets(word, allowFallbackBreaks);
|
||||
if (breakInfos.empty()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
size_t chosenOffset = 0;
|
||||
int chosenWidth = -1;
|
||||
bool chosenNeedsHyphen = true;
|
||||
|
||||
// Iterate over each legal breakpoint and retain the widest prefix that still fits.
|
||||
for (const auto& info : breakInfos) {
|
||||
const size_t offset = info.byteOffset;
|
||||
if (offset == 0 || offset >= word.size()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const bool needsHyphen = info.requiresInsertedHyphen;
|
||||
const int prefixWidth = measureWordWidth(renderer, fontId, word.substr(0, offset), style, needsHyphen);
|
||||
if (prefixWidth > availableWidth || prefixWidth <= chosenWidth) {
|
||||
continue; // Skip if too wide or not an improvement
|
||||
}
|
||||
|
||||
chosenWidth = prefixWidth;
|
||||
chosenOffset = offset;
|
||||
chosenNeedsHyphen = needsHyphen;
|
||||
}
|
||||
|
||||
if (chosenWidth < 0) {
|
||||
// No hyphenation point produced a prefix that fits in the remaining space.
|
||||
return false;
|
||||
}
|
||||
|
||||
// Split the word at the selected breakpoint and append a hyphen if required.
|
||||
std::string remainder = word.substr(chosenOffset);
|
||||
wordIt->resize(chosenOffset);
|
||||
if (chosenNeedsHyphen) {
|
||||
wordIt->push_back('-');
|
||||
}
|
||||
|
||||
// Insert the remainder word (with matching style) directly after the prefix.
|
||||
auto insertWordIt = std::next(wordIt);
|
||||
auto insertStyleIt = std::next(styleIt);
|
||||
words.insert(insertWordIt, remainder);
|
||||
wordStyles.insert(insertStyleIt, style);
|
||||
|
||||
// Update cached widths to reflect the new prefix/remainder pairing.
|
||||
wordWidths[wordIndex] = static_cast<uint16_t>(chosenWidth);
|
||||
const uint16_t remainderWidth = measureWordWidth(renderer, fontId, remainder, style);
|
||||
wordWidths.insert(wordWidths.begin() + wordIndex + 1, remainderWidth);
|
||||
return true;
|
||||
}
|
||||
|
||||
void ParsedText::extractLine(const size_t breakIndex, const int pageWidth, const int spaceWidth,
|
||||
const std::vector<uint16_t>& wordWidths, const std::vector<size_t>& lineBreakIndices,
|
||||
const std::function<void(std::shared_ptr<TextBlock>)>& processLine) {
|
||||
@@ -191,5 +378,11 @@ void ParsedText::extractLine(const size_t breakIndex, const int pageWidth, const
|
||||
std::list<EpdFontFamily::Style> lineWordStyles;
|
||||
lineWordStyles.splice(lineWordStyles.begin(), wordStyles, wordStyles.begin(), wordStyleEndIt);
|
||||
|
||||
for (auto& word : lineWords) {
|
||||
if (containsSoftHyphen(word)) {
|
||||
stripSoftHyphensInPlace(word);
|
||||
}
|
||||
}
|
||||
|
||||
processLine(std::make_shared<TextBlock>(std::move(lineWords), std::move(lineXPos), std::move(lineWordStyles), style));
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user