Replace std::list with std::vector for the words, wordStyles, wordXpos, and wordContinues containers in TextBlock and ParsedText. Vectors provide contiguous memory layout for better cache locality and O(1) random access, eliminating per-node heap allocation and the 16-byte prev/next pointer overhead of doubly-linked list nodes. The indexed access also removes the need for a separate continuesVec copy that was previously built from the list for O(1) layout access.
447 lines
17 KiB
C++
447 lines
17 KiB
C++
#include "ParsedText.h"
|
|
|
|
#include <GfxRenderer.h>
|
|
|
|
#include <algorithm>
|
|
#include <cmath>
|
|
#include <functional>
|
|
#include <limits>
|
|
#include <vector>
|
|
|
|
#include "hyphenation/Hyphenator.h"
|
|
|
|
// Sentinel "no valid layout yet" cost for the line-breaking DP; accumulated costs are also clamped to this value.
constexpr int MAX_COST = std::numeric_limits<int>::max();
|
|
|
|
namespace {
|
|
|
|
// Soft hyphen byte pattern used throughout EPUBs (UTF-8 for U+00AD).
constexpr char SOFT_HYPHEN_UTF8[] = "\xC2\xAD";
// Byte length of the pattern, derived from the literal (minus its NUL terminator) so the two cannot drift apart.
constexpr size_t SOFT_HYPHEN_BYTES = sizeof(SOFT_HYPHEN_UTF8) - 1;

// Returns true when the word contains at least one UTF-8 encoded soft hyphen.
bool containsSoftHyphen(const std::string& word) { return word.find(SOFT_HYPHEN_UTF8) != std::string::npos; }

// Removes every soft hyphen in-place so rendered glyphs match measured widths.
// Works as a single left-to-right compaction pass: bytes are copied down over the removed patterns and the
// string is truncated once at the end, instead of shifting the tail once per soft hyphen.
void stripSoftHyphensInPlace(std::string& word) {
  size_t readPos = word.find(SOFT_HYPHEN_UTF8);
  if (readPos == std::string::npos) {
    return;  // Nothing to strip; leave the string untouched.
  }

  size_t writePos = readPos;
  while (readPos < word.size()) {
    // compare() clamps the length at the end of the string, so a lone trailing 0xC2 never matches.
    if (word.compare(readPos, SOFT_HYPHEN_BYTES, SOFT_HYPHEN_UTF8) == 0) {
      readPos += SOFT_HYPHEN_BYTES;
    } else {
      word[writePos++] = word[readPos++];
    }
  }
  word.resize(writePos);
}
|
|
|
|
// Returns the rendered width for a word while ignoring soft hyphen glyphs and optionally appending a visible hyphen.
|
|
uint16_t measureWordWidth(const GfxRenderer& renderer, const int fontId, const std::string& word,
|
|
const EpdFontFamily::Style style, const bool appendHyphen = false) {
|
|
const bool hasSoftHyphen = containsSoftHyphen(word);
|
|
if (!hasSoftHyphen && !appendHyphen) {
|
|
return renderer.getTextWidth(fontId, word.c_str(), style);
|
|
}
|
|
|
|
std::string sanitized = word;
|
|
if (hasSoftHyphen) {
|
|
stripSoftHyphensInPlace(sanitized);
|
|
}
|
|
if (appendHyphen) {
|
|
sanitized.push_back('-');
|
|
}
|
|
return renderer.getTextWidth(fontId, sanitized.c_str(), style);
|
|
}
|
|
|
|
} // namespace
|
|
|
|
// Appends one word to the paragraph, recording its style and whether it attaches to the previous word
// (no gap, no line break in between). Empty words are ignored. The underline request is folded into the
// stored style as the EpdFontFamily::UNDERLINE bit.
void ParsedText::addWord(std::string word, const EpdFontFamily::Style fontStyle, const bool underline,
                         const bool attachToPrevious) {
  if (word.empty()) {
    return;
  }

  const EpdFontFamily::Style storedStyle =
      underline ? static_cast<EpdFontFamily::Style>(fontStyle | EpdFontFamily::UNDERLINE) : fontStyle;

  // The three containers are parallel: index i describes the same word in each of them.
  words.push_back(std::move(word));
  wordStyles.push_back(storedStyle);
  wordContinues.push_back(attachToPrevious);
}
|
|
|
|
// Consumes data to minimize memory usage
|
|
void ParsedText::layoutAndExtractLines(const GfxRenderer& renderer, const int fontId, const uint16_t viewportWidth,
|
|
const std::function<void(std::shared_ptr<TextBlock>)>& processLine,
|
|
const bool includeLastLine) {
|
|
if (words.empty()) {
|
|
return;
|
|
}
|
|
|
|
// Apply fixed transforms before any per-line layout work.
|
|
applyParagraphIndent();
|
|
|
|
const int pageWidth = viewportWidth;
|
|
const int spaceWidth = renderer.getSpaceWidth(fontId);
|
|
auto wordWidths = calculateWordWidths(renderer, fontId);
|
|
|
|
std::vector<size_t> lineBreakIndices;
|
|
if (hyphenationEnabled) {
|
|
// Use greedy layout that can split words mid-loop when a hyphenated prefix fits.
|
|
lineBreakIndices = computeHyphenatedLineBreaks(renderer, fontId, pageWidth, spaceWidth, wordWidths, wordContinues);
|
|
} else {
|
|
lineBreakIndices = computeLineBreaks(renderer, fontId, pageWidth, spaceWidth, wordWidths, wordContinues);
|
|
}
|
|
const size_t lineCount = includeLastLine ? lineBreakIndices.size() : lineBreakIndices.size() - 1;
|
|
|
|
for (size_t i = 0; i < lineCount; ++i) {
|
|
extractLine(i, pageWidth, spaceWidth, wordWidths, wordContinues, lineBreakIndices, processLine);
|
|
}
|
|
}
|
|
|
|
std::vector<uint16_t> ParsedText::calculateWordWidths(const GfxRenderer& renderer, const int fontId) {
|
|
std::vector<uint16_t> wordWidths;
|
|
wordWidths.reserve(words.size());
|
|
|
|
for (size_t i = 0; i < words.size(); ++i) {
|
|
wordWidths.push_back(measureWordWidth(renderer, fontId, words[i], wordStyles[i]));
|
|
}
|
|
|
|
return wordWidths;
|
|
}
|
|
|
|
std::vector<size_t> ParsedText::computeLineBreaks(const GfxRenderer& renderer, const int fontId, const int pageWidth,
|
|
const int spaceWidth, std::vector<uint16_t>& wordWidths,
|
|
std::vector<bool>& continuesVec) {
|
|
if (words.empty()) {
|
|
return {};
|
|
}
|
|
|
|
// Calculate first line indent (only for left/justified text without extra paragraph spacing)
|
|
const int firstLineIndent =
|
|
blockStyle.textIndent > 0 && !extraParagraphSpacing &&
|
|
(blockStyle.alignment == CssTextAlign::Justify || blockStyle.alignment == CssTextAlign::Left)
|
|
? blockStyle.textIndent
|
|
: 0;
|
|
|
|
// Ensure any word that would overflow even as the first entry on a line is split using fallback hyphenation.
|
|
for (size_t i = 0; i < wordWidths.size(); ++i) {
|
|
// First word needs to fit in reduced width if there's an indent
|
|
const int effectiveWidth = i == 0 ? pageWidth - firstLineIndent : pageWidth;
|
|
while (wordWidths[i] > effectiveWidth) {
|
|
if (!hyphenateWordAtIndex(i, effectiveWidth, renderer, fontId, wordWidths, /*allowFallbackBreaks=*/true)) {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
const size_t totalWordCount = words.size();
|
|
|
|
// DP table to store the minimum badness (cost) of lines starting at index i
|
|
std::vector<int> dp(totalWordCount);
|
|
// 'ans[i]' stores the index 'j' of the *last word* in the optimal line starting at 'i'
|
|
std::vector<size_t> ans(totalWordCount);
|
|
|
|
// Base Case
|
|
dp[totalWordCount - 1] = 0;
|
|
ans[totalWordCount - 1] = totalWordCount - 1;
|
|
|
|
for (int i = totalWordCount - 2; i >= 0; --i) {
|
|
int currlen = 0;
|
|
dp[i] = MAX_COST;
|
|
|
|
// First line has reduced width due to text-indent
|
|
const int effectivePageWidth = i == 0 ? pageWidth - firstLineIndent : pageWidth;
|
|
|
|
for (size_t j = i; j < totalWordCount; ++j) {
|
|
// Add space before word j, unless it's the first word on the line or a continuation
|
|
const int gap = j > static_cast<size_t>(i) && !continuesVec[j] ? spaceWidth : 0;
|
|
currlen += wordWidths[j] + gap;
|
|
|
|
if (currlen > effectivePageWidth) {
|
|
break;
|
|
}
|
|
|
|
// Cannot break after word j if the next word attaches to it (continuation group)
|
|
if (j + 1 < totalWordCount && continuesVec[j + 1]) {
|
|
continue;
|
|
}
|
|
|
|
int cost;
|
|
if (j == totalWordCount - 1) {
|
|
cost = 0; // Last line
|
|
} else {
|
|
const int remainingSpace = effectivePageWidth - currlen;
|
|
// Use long long for the square to prevent overflow
|
|
const long long cost_ll = static_cast<long long>(remainingSpace) * remainingSpace + dp[j + 1];
|
|
|
|
if (cost_ll > MAX_COST) {
|
|
cost = MAX_COST;
|
|
} else {
|
|
cost = static_cast<int>(cost_ll);
|
|
}
|
|
}
|
|
|
|
if (cost < dp[i]) {
|
|
dp[i] = cost;
|
|
ans[i] = j; // j is the index of the last word in this optimal line
|
|
}
|
|
}
|
|
|
|
// Handle oversized word: if no valid configuration found, force single-word line
|
|
// This prevents cascade failure where one oversized word breaks all preceding words
|
|
if (dp[i] == MAX_COST) {
|
|
ans[i] = i; // Just this word on its own line
|
|
// Inherit cost from next word to allow subsequent words to find valid configurations
|
|
if (i + 1 < static_cast<int>(totalWordCount)) {
|
|
dp[i] = dp[i + 1];
|
|
} else {
|
|
dp[i] = 0;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Stores the index of the word that starts the next line (last_word_index + 1)
|
|
std::vector<size_t> lineBreakIndices;
|
|
size_t currentWordIndex = 0;
|
|
|
|
while (currentWordIndex < totalWordCount) {
|
|
size_t nextBreakIndex = ans[currentWordIndex] + 1;
|
|
|
|
// Safety check: prevent infinite loop if nextBreakIndex doesn't advance
|
|
if (nextBreakIndex <= currentWordIndex) {
|
|
// Force advance by at least one word to avoid infinite loop
|
|
nextBreakIndex = currentWordIndex + 1;
|
|
}
|
|
|
|
lineBreakIndices.push_back(nextBreakIndex);
|
|
currentWordIndex = nextBreakIndex;
|
|
}
|
|
|
|
return lineBreakIndices;
|
|
}
|
|
|
|
void ParsedText::applyParagraphIndent() {
|
|
if (extraParagraphSpacing || words.empty()) {
|
|
return;
|
|
}
|
|
|
|
if (blockStyle.textIndentDefined) {
|
|
// CSS text-indent is explicitly set (even if 0) - don't use fallback EmSpace
|
|
// The actual indent positioning is handled in extractLine()
|
|
} else if (blockStyle.alignment == CssTextAlign::Justify || blockStyle.alignment == CssTextAlign::Left) {
|
|
// No CSS text-indent defined - use EmSpace fallback for visual indent
|
|
words.front().insert(0, "\xe2\x80\x83");
|
|
}
|
|
}
|
|
|
|
// Builds break indices while opportunistically splitting the word that would overflow the current line.
|
|
std::vector<size_t> ParsedText::computeHyphenatedLineBreaks(const GfxRenderer& renderer, const int fontId,
|
|
const int pageWidth, const int spaceWidth,
|
|
std::vector<uint16_t>& wordWidths,
|
|
std::vector<bool>& continuesVec) {
|
|
// Calculate first line indent (only for left/justified text without extra paragraph spacing)
|
|
const int firstLineIndent =
|
|
blockStyle.textIndent > 0 && !extraParagraphSpacing &&
|
|
(blockStyle.alignment == CssTextAlign::Justify || blockStyle.alignment == CssTextAlign::Left)
|
|
? blockStyle.textIndent
|
|
: 0;
|
|
|
|
std::vector<size_t> lineBreakIndices;
|
|
size_t currentIndex = 0;
|
|
bool isFirstLine = true;
|
|
|
|
while (currentIndex < wordWidths.size()) {
|
|
const size_t lineStart = currentIndex;
|
|
int lineWidth = 0;
|
|
|
|
// First line has reduced width due to text-indent
|
|
const int effectivePageWidth = isFirstLine ? pageWidth - firstLineIndent : pageWidth;
|
|
|
|
// Consume as many words as possible for current line, splitting when prefixes fit
|
|
while (currentIndex < wordWidths.size()) {
|
|
const bool isFirstWord = currentIndex == lineStart;
|
|
const int spacing = isFirstWord || continuesVec[currentIndex] ? 0 : spaceWidth;
|
|
const int candidateWidth = spacing + wordWidths[currentIndex];
|
|
|
|
// Word fits on current line
|
|
if (lineWidth + candidateWidth <= effectivePageWidth) {
|
|
lineWidth += candidateWidth;
|
|
++currentIndex;
|
|
continue;
|
|
}
|
|
|
|
// Word would overflow — try to split based on hyphenation points
|
|
const int availableWidth = effectivePageWidth - lineWidth - spacing;
|
|
const bool allowFallbackBreaks = isFirstWord; // Only for first word on line
|
|
|
|
if (availableWidth > 0 &&
|
|
hyphenateWordAtIndex(currentIndex, availableWidth, renderer, fontId, wordWidths, allowFallbackBreaks)) {
|
|
// Prefix now fits; append it to this line and move to next line
|
|
lineWidth += spacing + wordWidths[currentIndex];
|
|
++currentIndex;
|
|
break;
|
|
}
|
|
|
|
// Could not split: force at least one word per line to avoid infinite loop
|
|
if (currentIndex == lineStart) {
|
|
lineWidth += candidateWidth;
|
|
++currentIndex;
|
|
}
|
|
break;
|
|
}
|
|
|
|
// Don't break before a continuation word (e.g., orphaned "?" after "question").
|
|
// Backtrack to the start of the continuation group so the whole group moves to the next line.
|
|
while (currentIndex > lineStart + 1 && currentIndex < wordWidths.size() && continuesVec[currentIndex]) {
|
|
--currentIndex;
|
|
}
|
|
|
|
lineBreakIndices.push_back(currentIndex);
|
|
isFirstLine = false;
|
|
}
|
|
|
|
return lineBreakIndices;
|
|
}
|
|
|
|
// Splits words[wordIndex] into prefix (adding a hyphen only when needed) and remainder when a legal breakpoint fits the
|
|
// available width.
|
|
bool ParsedText::hyphenateWordAtIndex(const size_t wordIndex, const int availableWidth, const GfxRenderer& renderer,
|
|
const int fontId, std::vector<uint16_t>& wordWidths,
|
|
const bool allowFallbackBreaks) {
|
|
// Guard against invalid indices or zero available width before attempting to split.
|
|
if (availableWidth <= 0 || wordIndex >= words.size()) {
|
|
return false;
|
|
}
|
|
|
|
const std::string& word = words[wordIndex];
|
|
const auto style = wordStyles[wordIndex];
|
|
|
|
// Collect candidate breakpoints (byte offsets and hyphen requirements).
|
|
auto breakInfos = Hyphenator::breakOffsets(word, allowFallbackBreaks);
|
|
if (breakInfos.empty()) {
|
|
return false;
|
|
}
|
|
|
|
size_t chosenOffset = 0;
|
|
int chosenWidth = -1;
|
|
bool chosenNeedsHyphen = true;
|
|
|
|
// Iterate over each legal breakpoint and retain the widest prefix that still fits.
|
|
for (const auto& info : breakInfos) {
|
|
const size_t offset = info.byteOffset;
|
|
if (offset == 0 || offset >= word.size()) {
|
|
continue;
|
|
}
|
|
|
|
const bool needsHyphen = info.requiresInsertedHyphen;
|
|
const int prefixWidth = measureWordWidth(renderer, fontId, word.substr(0, offset), style, needsHyphen);
|
|
if (prefixWidth > availableWidth || prefixWidth <= chosenWidth) {
|
|
continue; // Skip if too wide or not an improvement
|
|
}
|
|
|
|
chosenWidth = prefixWidth;
|
|
chosenOffset = offset;
|
|
chosenNeedsHyphen = needsHyphen;
|
|
}
|
|
|
|
if (chosenWidth < 0) {
|
|
// No hyphenation point produced a prefix that fits in the remaining space.
|
|
return false;
|
|
}
|
|
|
|
// Split the word at the selected breakpoint and append a hyphen if required.
|
|
std::string remainder = word.substr(chosenOffset);
|
|
words[wordIndex].resize(chosenOffset);
|
|
if (chosenNeedsHyphen) {
|
|
words[wordIndex].push_back('-');
|
|
}
|
|
|
|
// Insert the remainder word (with matching style and continuation flag) directly after the prefix.
|
|
words.insert(words.begin() + wordIndex + 1, remainder);
|
|
wordStyles.insert(wordStyles.begin() + wordIndex + 1, style);
|
|
|
|
// The remainder inherits whatever continuation status the original word had with the word after it.
|
|
const bool originalContinuedToNext = wordContinues[wordIndex];
|
|
// The original word (now prefix) does NOT continue to remainder (hyphen separates them)
|
|
wordContinues[wordIndex] = false;
|
|
wordContinues.insert(wordContinues.begin() + wordIndex + 1, originalContinuedToNext);
|
|
|
|
// Update cached widths to reflect the new prefix/remainder pairing.
|
|
wordWidths[wordIndex] = static_cast<uint16_t>(chosenWidth);
|
|
const uint16_t remainderWidth = measureWordWidth(renderer, fontId, remainder, style);
|
|
wordWidths.insert(wordWidths.begin() + wordIndex + 1, remainderWidth);
|
|
return true;
|
|
}
|
|
|
|
void ParsedText::extractLine(const size_t breakIndex, const int pageWidth, const int spaceWidth,
|
|
const std::vector<uint16_t>& wordWidths, const std::vector<bool>& continuesVec,
|
|
const std::vector<size_t>& lineBreakIndices,
|
|
const std::function<void(std::shared_ptr<TextBlock>)>& processLine) {
|
|
const size_t lineBreak = lineBreakIndices[breakIndex];
|
|
const size_t lastBreakAt = breakIndex > 0 ? lineBreakIndices[breakIndex - 1] : 0;
|
|
const size_t lineWordCount = lineBreak - lastBreakAt;
|
|
|
|
// Calculate first line indent (only for left/justified text without extra paragraph spacing)
|
|
const bool isFirstLine = breakIndex == 0;
|
|
const int firstLineIndent =
|
|
isFirstLine && blockStyle.textIndent > 0 && !extraParagraphSpacing &&
|
|
(blockStyle.alignment == CssTextAlign::Justify || blockStyle.alignment == CssTextAlign::Left)
|
|
? blockStyle.textIndent
|
|
: 0;
|
|
|
|
// Calculate total word width for this line and count actual word gaps
|
|
// (continuation words attach to previous word with no gap)
|
|
int lineWordWidthSum = 0;
|
|
size_t actualGapCount = 0;
|
|
|
|
for (size_t wordIdx = 0; wordIdx < lineWordCount; wordIdx++) {
|
|
lineWordWidthSum += wordWidths[lastBreakAt + wordIdx];
|
|
// Count gaps: each word after the first creates a gap, unless it's a continuation
|
|
if (wordIdx > 0 && !continuesVec[lastBreakAt + wordIdx]) {
|
|
actualGapCount++;
|
|
}
|
|
}
|
|
|
|
// Calculate spacing (account for indent reducing effective page width on first line)
|
|
const int effectivePageWidth = pageWidth - firstLineIndent;
|
|
const int spareSpace = effectivePageWidth - lineWordWidthSum;
|
|
|
|
int spacing = spaceWidth;
|
|
const bool isLastLine = breakIndex == lineBreakIndices.size() - 1;
|
|
|
|
// For justified text, calculate spacing based on actual gap count
|
|
if (blockStyle.alignment == CssTextAlign::Justify && !isLastLine && actualGapCount >= 1) {
|
|
spacing = spareSpace / static_cast<int>(actualGapCount);
|
|
}
|
|
|
|
// Calculate initial x position (first line starts at indent for left/justified text)
|
|
auto xpos = static_cast<uint16_t>(firstLineIndent);
|
|
if (blockStyle.alignment == CssTextAlign::Right) {
|
|
xpos = spareSpace - static_cast<int>(actualGapCount) * spaceWidth;
|
|
} else if (blockStyle.alignment == CssTextAlign::Center) {
|
|
xpos = (spareSpace - static_cast<int>(actualGapCount) * spaceWidth) / 2;
|
|
}
|
|
|
|
// Pre-calculate X positions for words
|
|
// Continuation words attach to the previous word with no space before them
|
|
std::vector<uint16_t> lineXPos;
|
|
lineXPos.reserve(lineWordCount);
|
|
|
|
for (size_t wordIdx = 0; wordIdx < lineWordCount; wordIdx++) {
|
|
const uint16_t currentWordWidth = wordWidths[lastBreakAt + wordIdx];
|
|
|
|
lineXPos.push_back(xpos);
|
|
|
|
// Add spacing after this word, unless the next word is a continuation
|
|
const bool nextIsContinuation = wordIdx + 1 < lineWordCount && continuesVec[lastBreakAt + wordIdx + 1];
|
|
|
|
xpos += currentWordWidth + (nextIsContinuation ? 0 : spacing);
|
|
}
|
|
|
|
// Build line data by moving from the original vectors using index range
|
|
std::vector<std::string> lineWords(std::make_move_iterator(words.begin() + lastBreakAt),
|
|
std::make_move_iterator(words.begin() + lineBreak));
|
|
std::vector<EpdFontFamily::Style> lineWordStyles(wordStyles.begin() + lastBreakAt, wordStyles.begin() + lineBreak);
|
|
|
|
for (auto& word : lineWords) {
|
|
if (containsSoftHyphen(word)) {
|
|
stripSoftHyphensInPlace(word);
|
|
}
|
|
}
|
|
|
|
processLine(
|
|
std::make_shared<TextBlock>(std::move(lineWords), std::move(lineXPos), std::move(lineWordStyles), blockStyle));
|
|
}
|