Implement hyphenation support in text layout by enhancing word splitting and line breaking logic
This commit is contained in:
parent
54d7a9437e
commit
e7edcb6467
@ -1,15 +1,72 @@
|
||||
#include "ParsedText.h"
|
||||
|
||||
#include <GfxRenderer.h>
|
||||
#include "hyphenation/Hyphenator.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <functional>
|
||||
#include <iterator>
|
||||
#include <limits>
|
||||
#include <vector>
|
||||
|
||||
constexpr int MAX_COST = std::numeric_limits<int>::max();
|
||||
|
||||
namespace {
|
||||
|
||||
// Result of choosing a hyphenation point for a single word.
// Members are default-initialised so a decision that was declared but never filled in
// (e.g. when chooseSplitForWidth returns false) still holds determinate values.
struct HyphenSplitDecision {
  // Byte offset into the original word at which the tail begins.
  size_t byteOffset = 0;
  // Rendered width of the prefix including the trailing hyphen.
  uint16_t prefixWidth = 0;
};
|
||||
|
||||
// Records the two word slots produced by one hyphenation split so the line breaker can
// refuse any line that would contain both halves (a line starting at or before the prefix
// and ending at or after the tail).
struct HyphenationGuard {
  // Word index of the hyphenated prefix ("foo-").
  size_t prefixIndex;
  // Word index of the remainder that follows the prefix.
  size_t tailIndex;
};
|
||||
|
||||
bool chooseSplitForWidth(const GfxRenderer& renderer, const int fontId, const std::string& word,
|
||||
const EpdFontStyle style, const int availableWidth, const bool includeFallback,
|
||||
HyphenSplitDecision* decision) {
|
||||
if (!decision || availableWidth <= 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const int hyphenWidth = renderer.getTextWidth(fontId, "-", style);
|
||||
const int adjustedWidth = availableWidth - hyphenWidth;
|
||||
if (adjustedWidth <= 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
auto offsets = Hyphenator::breakOffsets(word, includeFallback);
|
||||
if (offsets.empty()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
size_t chosenOffset = std::numeric_limits<size_t>::max();
|
||||
uint16_t chosenWidth = 0;
|
||||
|
||||
for (const size_t offset : offsets) {
|
||||
const std::string prefix = word.substr(0, offset);
|
||||
const int prefixWidth = renderer.getTextWidth(fontId, prefix.c_str(), style);
|
||||
if (prefixWidth <= adjustedWidth) {
|
||||
chosenOffset = offset;
|
||||
chosenWidth = static_cast<uint16_t>(prefixWidth + hyphenWidth);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (chosenOffset == std::numeric_limits<size_t>::max()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
decision->byteOffset = chosenOffset;
|
||||
decision->prefixWidth = chosenWidth;
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
void ParsedText::addWord(std::string word, const EpdFontStyle fontStyle) {
|
||||
if (word.empty()) return;
|
||||
|
||||
@ -27,8 +84,9 @@ void ParsedText::layoutAndExtractLines(const GfxRenderer& renderer, const int fo
|
||||
|
||||
const int pageWidth = renderer.getScreenWidth() - horizontalMargin;
|
||||
const int spaceWidth = renderer.getSpaceWidth(fontId);
|
||||
const auto wordWidths = calculateWordWidths(renderer, fontId);
|
||||
const auto lineBreakIndices = computeLineBreaks(pageWidth, spaceWidth, wordWidths);
|
||||
// Pre-split oversized tokens so the DP step always has feasible line candidates.
|
||||
auto wordWidths = calculateWordWidths(renderer, fontId, pageWidth);
|
||||
auto lineBreakIndices = computeLineBreaks(renderer, fontId, pageWidth, spaceWidth, wordWidths);
|
||||
const size_t lineCount = includeLastLine ? lineBreakIndices.size() : lineBreakIndices.size() - 1;
|
||||
|
||||
for (size_t i = 0; i < lineCount; ++i) {
|
||||
@ -36,7 +94,8 @@ void ParsedText::layoutAndExtractLines(const GfxRenderer& renderer, const int fo
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<uint16_t> ParsedText::calculateWordWidths(const GfxRenderer& renderer, const int fontId) {
|
||||
std::vector<uint16_t> ParsedText::calculateWordWidths(const GfxRenderer& renderer, const int fontId,
|
||||
const int pageWidth) {
|
||||
const size_t totalWordCount = words.size();
|
||||
|
||||
std::vector<uint16_t> wordWidths;
|
||||
@ -52,7 +111,32 @@ std::vector<uint16_t> ParsedText::calculateWordWidths(const GfxRenderer& rendere
|
||||
auto wordStylesIt = wordStyles.begin();
|
||||
|
||||
while (wordsIt != words.end()) {
|
||||
wordWidths.push_back(renderer.getTextWidth(fontId, wordsIt->c_str(), *wordStylesIt));
|
||||
uint16_t width = renderer.getTextWidth(fontId, wordsIt->c_str(), *wordStylesIt);
|
||||
|
||||
if (width > pageWidth) {
|
||||
HyphenSplitDecision decision;
|
||||
if (chooseSplitForWidth(renderer, fontId, *wordsIt, *wordStylesIt, pageWidth, true, &decision)) {
|
||||
const std::string originalWord = *wordsIt;
|
||||
const std::string tail = originalWord.substr(decision.byteOffset);
|
||||
if (tail.empty()) {
|
||||
continue;
|
||||
}
|
||||
const std::string prefix = originalWord.substr(0, decision.byteOffset) + "-";
|
||||
|
||||
*wordsIt = prefix;
|
||||
auto nextWordIt = words.insert(std::next(wordsIt), tail);
|
||||
auto nextStyleIt = wordStyles.insert(std::next(wordStylesIt), *wordStylesIt);
|
||||
// Continue processing the freshly inserted tail so cascading splits still respect the limit.
|
||||
|
||||
wordWidths.push_back(decision.prefixWidth);
|
||||
|
||||
wordsIt = nextWordIt;
|
||||
wordStylesIt = nextStyleIt;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
wordWidths.push_back(width);
|
||||
|
||||
std::advance(wordsIt, 1);
|
||||
std::advance(wordStylesIt, 1);
|
||||
@ -61,70 +145,159 @@ std::vector<uint16_t> ParsedText::calculateWordWidths(const GfxRenderer& rendere
|
||||
return wordWidths;
|
||||
}
|
||||
|
||||
std::vector<size_t> ParsedText::computeLineBreaks(const int pageWidth, const int spaceWidth,
|
||||
const std::vector<uint16_t>& wordWidths) const {
|
||||
const size_t totalWordCount = words.size();
|
||||
std::vector<size_t> ParsedText::computeLineBreaks(const GfxRenderer& renderer, const int fontId, const int pageWidth,
|
||||
const int spaceWidth, std::vector<uint16_t>& wordWidths) {
|
||||
if (words.empty()) {
|
||||
return {};
|
||||
}
|
||||
|
||||
// DP table to store the minimum badness (cost) of lines starting at index i
|
||||
std::vector<int> dp(totalWordCount);
|
||||
// 'ans[i]' stores the index 'j' of the *last word* in the optimal line starting at 'i'
|
||||
std::vector<size_t> ans(totalWordCount);
|
||||
std::vector<HyphenationGuard> guards;
|
||||
|
||||
// Base Case
|
||||
dp[totalWordCount - 1] = 0;
|
||||
ans[totalWordCount - 1] = totalWordCount - 1;
|
||||
auto shiftGuardIndices = [&](size_t insertPos) {
|
||||
for (auto& guard : guards) {
|
||||
if (guard.prefixIndex >= insertPos) {
|
||||
guard.prefixIndex++;
|
||||
}
|
||||
if (guard.tailIndex >= insertPos) {
|
||||
guard.tailIndex++;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
for (int i = totalWordCount - 2; i >= 0; --i) {
|
||||
int currlen = -spaceWidth;
|
||||
dp[i] = MAX_COST;
|
||||
auto runDp = [&](std::vector<size_t>& lineBreaks) {
|
||||
const size_t totalWordCount = wordWidths.size();
|
||||
|
||||
for (size_t j = i; j < totalWordCount; ++j) {
|
||||
// Current line length: previous width + space + current word width
|
||||
currlen += wordWidths[j] + spaceWidth;
|
||||
std::vector<int> dp(totalWordCount);
|
||||
std::vector<size_t> ans(totalWordCount);
|
||||
|
||||
if (currlen > pageWidth) {
|
||||
dp[totalWordCount - 1] = 0;
|
||||
ans[totalWordCount - 1] = totalWordCount - 1;
|
||||
|
||||
for (int i = static_cast<int>(totalWordCount) - 2; i >= 0; --i) {
|
||||
int currlen = -spaceWidth;
|
||||
dp[i] = MAX_COST;
|
||||
|
||||
for (size_t j = i; j < totalWordCount; ++j) {
|
||||
currlen += wordWidths[j] + spaceWidth;
|
||||
|
||||
if (currlen > pageWidth) {
|
||||
break;
|
||||
}
|
||||
|
||||
bool violatesGuard = false;
|
||||
for (const auto& guard : guards) {
|
||||
if (i <= guard.prefixIndex && j >= guard.tailIndex) {
|
||||
violatesGuard = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (violatesGuard) {
|
||||
continue;
|
||||
}
|
||||
|
||||
int cost;
|
||||
if (j == totalWordCount - 1) {
|
||||
cost = 0;
|
||||
} else {
|
||||
const int remainingSpace = pageWidth - currlen;
|
||||
const long long cost_ll = static_cast<long long>(remainingSpace) * remainingSpace + dp[j + 1];
|
||||
cost = cost_ll > MAX_COST ? MAX_COST : static_cast<int>(cost_ll);
|
||||
}
|
||||
|
||||
if (cost < dp[i]) {
|
||||
dp[i] = cost;
|
||||
ans[i] = j;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
lineBreaks.clear();
|
||||
size_t currentWordIndex = 0;
|
||||
constexpr size_t MAX_LINES = 1000;
|
||||
|
||||
while (currentWordIndex < totalWordCount && lineBreaks.size() < MAX_LINES) {
|
||||
const size_t nextBreakIndex = ans[currentWordIndex] + 1;
|
||||
lineBreaks.push_back(nextBreakIndex);
|
||||
currentWordIndex = nextBreakIndex;
|
||||
}
|
||||
};
|
||||
|
||||
std::vector<size_t> lineBreakIndices;
|
||||
|
||||
while (true) {
|
||||
runDp(lineBreakIndices);
|
||||
|
||||
if (!hyphenationEnabled) {
|
||||
return lineBreakIndices;
|
||||
}
|
||||
|
||||
bool insertedSplit = false;
|
||||
size_t lastBreakAt = 0;
|
||||
|
||||
for (size_t lineIdx = 0; lineIdx < lineBreakIndices.size(); ++lineIdx) {
|
||||
const size_t lineBreak = lineBreakIndices[lineIdx];
|
||||
const bool isLastLine = lineIdx == lineBreakIndices.size() - 1;
|
||||
const size_t lineWordCount = lineBreak - lastBreakAt;
|
||||
|
||||
int lineWordWidthSum = 0;
|
||||
for (size_t idx = lastBreakAt; idx < lineBreak; ++idx) {
|
||||
lineWordWidthSum += wordWidths[idx];
|
||||
}
|
||||
lastBreakAt = lineBreak;
|
||||
|
||||
if (isLastLine || lineBreak >= wordWidths.size()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const size_t spacingCount = lineWordCount > 0 ? lineWordCount - 1 : 0;
|
||||
const int usedSpace = lineWordWidthSum + static_cast<int>(spacingCount) * spaceWidth;
|
||||
const int unusedWidth = pageWidth - usedSpace;
|
||||
const int spaceNeeded = lineWordCount == 0 ? 0 : spaceWidth;
|
||||
const int budgetForPrefix = unusedWidth - spaceNeeded;
|
||||
if (budgetForPrefix <= 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto nextWordIt = words.begin();
|
||||
auto nextStyleIt = wordStyles.begin();
|
||||
std::advance(nextWordIt, lineBreak);
|
||||
std::advance(nextStyleIt, lineBreak);
|
||||
|
||||
if (nextWordIt == words.end()) {
|
||||
break;
|
||||
}
|
||||
|
||||
int cost;
|
||||
if (j == totalWordCount - 1) {
|
||||
cost = 0; // Last line
|
||||
} else {
|
||||
const int remainingSpace = pageWidth - currlen;
|
||||
// Use long long for the square to prevent overflow
|
||||
const long long cost_ll = static_cast<long long>(remainingSpace) * remainingSpace + dp[j + 1];
|
||||
|
||||
if (cost_ll > MAX_COST) {
|
||||
cost = MAX_COST;
|
||||
} else {
|
||||
cost = static_cast<int>(cost_ll);
|
||||
}
|
||||
HyphenSplitDecision decision;
|
||||
if (!chooseSplitForWidth(renderer, fontId, *nextWordIt, *nextStyleIt, budgetForPrefix, false, &decision)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (cost < dp[i]) {
|
||||
dp[i] = cost;
|
||||
ans[i] = j; // j is the index of the last word in this optimal line
|
||||
const EpdFontStyle styleForSplit = *nextStyleIt;
|
||||
const std::string originalWord = *nextWordIt;
|
||||
const std::string prefix = originalWord.substr(0, decision.byteOffset) + "-";
|
||||
const std::string tail = originalWord.substr(decision.byteOffset);
|
||||
if (tail.empty()) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Stores the index of the word that starts the next line (last_word_index + 1)
|
||||
std::vector<size_t> lineBreakIndices;
|
||||
size_t currentWordIndex = 0;
|
||||
constexpr size_t MAX_LINES = 1000;
|
||||
*nextWordIt = tail;
|
||||
words.insert(nextWordIt, prefix);
|
||||
wordStyles.insert(nextStyleIt, styleForSplit);
|
||||
|
||||
while (currentWordIndex < totalWordCount) {
|
||||
if (lineBreakIndices.size() >= MAX_LINES) {
|
||||
const uint16_t tailWidth = renderer.getTextWidth(fontId, tail.c_str(), styleForSplit);
|
||||
wordWidths.insert(wordWidths.begin() + lineBreak, decision.prefixWidth);
|
||||
wordWidths[lineBreak + 1] = tailWidth;
|
||||
|
||||
shiftGuardIndices(lineBreak);
|
||||
guards.push_back({lineBreak, lineBreak + 1});
|
||||
insertedSplit = true;
|
||||
break;
|
||||
}
|
||||
|
||||
size_t nextBreakIndex = ans[currentWordIndex] + 1;
|
||||
lineBreakIndices.push_back(nextBreakIndex);
|
||||
|
||||
currentWordIndex = nextBreakIndex;
|
||||
if (!insertedSplit) {
|
||||
return lineBreakIndices;
|
||||
}
|
||||
}
|
||||
|
||||
return lineBreakIndices;
|
||||
}
|
||||
|
||||
void ParsedText::extractLine(const size_t breakIndex, const int pageWidth, const int spaceWidth,
|
||||
@ -136,8 +309,8 @@ void ParsedText::extractLine(const size_t breakIndex, const int pageWidth, const
|
||||
|
||||
// Calculate total word width for this line
|
||||
int lineWordWidthSum = 0;
|
||||
for (size_t i = lastBreakAt; i < lineBreak; i++) {
|
||||
lineWordWidthSum += wordWidths[i];
|
||||
for (size_t idx = lastBreakAt; idx < lineBreak; ++idx) {
|
||||
lineWordWidthSum += wordWidths[idx];
|
||||
}
|
||||
|
||||
// Calculate spacing
|
||||
|
||||
@ -19,11 +19,12 @@ class ParsedText {
|
||||
bool extraParagraphSpacing;
|
||||
bool hyphenationEnabled;
|
||||
|
||||
std::vector<size_t> computeLineBreaks(int pageWidth, int spaceWidth, const std::vector<uint16_t>& wordWidths) const;
|
||||
std::vector<size_t> computeLineBreaks(const GfxRenderer& renderer, int fontId, int pageWidth, int spaceWidth,
|
||||
std::vector<uint16_t>& wordWidths);
|
||||
void extractLine(size_t breakIndex, int pageWidth, int spaceWidth, const std::vector<uint16_t>& wordWidths,
|
||||
const std::vector<size_t>& lineBreakIndices,
|
||||
const std::function<void(std::shared_ptr<TextBlock>)>& processLine);
|
||||
std::vector<uint16_t> calculateWordWidths(const GfxRenderer& renderer, int fontId);
|
||||
std::vector<uint16_t> calculateWordWidths(const GfxRenderer& renderer, int fontId, int pageWidth);
|
||||
|
||||
public:
|
||||
explicit ParsedText(const TextBlock::BLOCK_STYLE style, const bool extraParagraphSpacing,
|
||||
|
||||
@ -1,11 +1,9 @@
|
||||
#include "Hyphenator.h"
|
||||
|
||||
#include <GfxRenderer.h>
|
||||
#include <Utf8.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <limits>
|
||||
#include <vector>
|
||||
|
||||
#include "EnglishHyphenator.h"
|
||||
@ -87,84 +85,44 @@ size_t byteOffsetForIndex(const std::vector<CodepointInfo>& cps, const size_t in
|
||||
return cps[index].byteOffset;
|
||||
}
|
||||
|
||||
// Returns the bytes of `word` in [startByte, endByte) without ever cutting a multibyte
// UTF-8 sequence in half.
//
// Both positions are clamped to the string length and, if they land on a UTF-8
// continuation byte (10xxxxxx), moved back to the lead byte of that code point. Snapping
// both ends in the same direction keeps adjacent slices contiguous:
// slice(w, 0, k) + slice(w, k, w.size()) == w for every k.
// An empty string is returned when the (adjusted) range is empty or starts past the end.
std::string slice(const std::string& word, const size_t startByte, const size_t endByte) {
  const auto snapToBoundary = [&word](size_t pos) {
    pos = std::min(pos, word.size());
    // pos == word.size() is already a valid boundary (one past the last byte).
    while (pos > 0 && pos < word.size() && (static_cast<unsigned char>(word[pos]) & 0xC0) == 0x80) {
      --pos;
    }
    return pos;
  };

  const size_t first = snapToBoundary(startByte);
  const size_t last = snapToBoundary(endByte);
  if (first >= last) {
    return std::string();
  }
  return word.substr(first, last - first);
}
|
||||
|
||||
} // namespace
|
||||
|
||||
bool Hyphenator::splitWord(const GfxRenderer& renderer, const int fontId, const std::string& word,
|
||||
const EpdFontStyle style, const int availableWidth, HyphenationResult* result,
|
||||
const bool force) {
|
||||
if (!result || word.empty()) {
|
||||
return false;
|
||||
std::vector<size_t> Hyphenator::breakOffsets(const std::string& word, const bool includeFallback) {
|
||||
std::vector<size_t> byteOffsets;
|
||||
if (word.empty()) {
|
||||
return byteOffsets;
|
||||
}
|
||||
|
||||
auto cps = collectCodepoints(word);
|
||||
if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) {
|
||||
return false;
|
||||
return byteOffsets;
|
||||
}
|
||||
|
||||
// Skip mixed tokens (e.g., "v2.0") unless the caller forces a split due to overflow.
|
||||
if (!force && !hasOnlyAlphabetic(cps)) {
|
||||
return false;
|
||||
std::vector<size_t> indexes;
|
||||
indexes.reserve(cps.size());
|
||||
|
||||
if (hasOnlyAlphabetic(cps)) {
|
||||
auto dictBreaks = collectBreakIndexes(cps);
|
||||
indexes.insert(indexes.end(), dictBreaks.begin(), dictBreaks.end());
|
||||
}
|
||||
|
||||
const auto breakIndexes = collectBreakIndexes(cps);
|
||||
// Budget for a trailing hyphen so rendered width matches the layout test.
|
||||
const int hyphenWidth = renderer.getTextWidth(fontId, "-", style);
|
||||
const int adjustedWidth = availableWidth - hyphenWidth;
|
||||
|
||||
size_t chosenIndex = std::numeric_limits<size_t>::max();
|
||||
|
||||
// Prefer dictionary-style break points emitted by language hyphenators.
|
||||
if (adjustedWidth > 0) {
|
||||
for (const size_t idx : breakIndexes) {
|
||||
const size_t byteOffset = byteOffsetForIndex(cps, idx);
|
||||
const std::string prefix = word.substr(0, byteOffset);
|
||||
const int prefixWidth = renderer.getTextWidth(fontId, prefix.c_str(), style);
|
||||
if (prefixWidth <= adjustedWidth) {
|
||||
chosenIndex = idx;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (chosenIndex == std::numeric_limits<size_t>::max() && force) {
|
||||
// Emergency fallback: brute-force through codepoints to avoid overflow when no legal breaks fit.
|
||||
if (includeFallback) {
|
||||
for (size_t idx = MIN_PREFIX_CP; idx + MIN_SUFFIX_CP <= cps.size(); ++idx) {
|
||||
const size_t byteOffset = byteOffsetForIndex(cps, idx);
|
||||
const std::string prefix = word.substr(0, byteOffset);
|
||||
const int prefixWidth = renderer.getTextWidth(fontId, prefix.c_str(), style);
|
||||
if (adjustedWidth <= 0 || prefixWidth <= adjustedWidth) {
|
||||
chosenIndex = idx;
|
||||
if (adjustedWidth > 0 && prefixWidth > adjustedWidth) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
indexes.push_back(idx);
|
||||
}
|
||||
}
|
||||
|
||||
if (chosenIndex == std::numeric_limits<size_t>::max()) {
|
||||
return false;
|
||||
if (indexes.empty()) {
|
||||
return byteOffsets;
|
||||
}
|
||||
|
||||
const size_t splitByte = byteOffsetForIndex(cps, chosenIndex);
|
||||
const std::string head = word.substr(0, splitByte);
|
||||
const std::string tail = slice(word, splitByte, word.size());
|
||||
std::sort(indexes.begin(), indexes.end());
|
||||
indexes.erase(std::unique(indexes.begin(), indexes.end()), indexes.end());
|
||||
|
||||
if (head.empty() || tail.empty()) {
|
||||
return false;
|
||||
byteOffsets.reserve(indexes.size());
|
||||
for (const size_t idx : indexes) {
|
||||
byteOffsets.push_back(byteOffsetForIndex(cps, idx));
|
||||
}
|
||||
|
||||
// Append the printed hyphen to the prefix while leaving the tail untouched.
|
||||
result->head = head + "-";
|
||||
result->tail = tail;
|
||||
return true;
|
||||
return byteOffsets;
|
||||
}
|
||||
|
||||
@ -1,20 +1,12 @@
|
||||
#pragma once
|
||||
|
||||
#include <EpdFontFamily.h>
|
||||
|
||||
#include <cstddef>
|
||||
#include <string>
|
||||
|
||||
class GfxRenderer;
|
||||
|
||||
// The two pieces produced when a word is hyphenated.
struct HyphenationResult {
  // Leading part of the word; splitWord stores it with the printed "-" already appended.
  std::string head;
  // Remainder of the word, stored unchanged.
  std::string tail;
};
|
||||
#include <vector>
|
||||
|
||||
class Hyphenator {
|
||||
public:
|
||||
// Splits a word so it fits within availableWidth, appending a hyphen to the head when needed.
|
||||
static bool splitWord(const GfxRenderer& renderer, int fontId, const std::string& word, EpdFontStyle style,
|
||||
int availableWidth, HyphenationResult* result, bool force);
|
||||
// Returns byte offsets where the word may be hyphenated. When includeFallback is true, all positions obeying the
|
||||
// minimum prefix/suffix constraints are returned even if no language-specific rule matches.
|
||||
static std::vector<size_t> breakOffsets(const std::string& word, bool includeFallback);
|
||||
};
|
||||
Loading…
x
Reference in New Issue
Block a user