Ports upstream PR #1342 (feat: Add Book Info screen, richer metadata, and safer file-browser controls) with mod-specific adaptations: - Parse and cache series, seriesIndex, description from EPUB OPF - Bump book.bin cache version to 6 for new metadata fields - Add BookInfoActivity (new screen) accessible via Right button in FileBrowser - Add ManageBook menu via Left button in FileBrowser (replaces upstream hidden delete) - Guard all delete/archive actions with ConfirmationActivity (10 call sites) - Add inputArmed gating to ConfirmationActivity to prevent accidental confirmation - Safe deserialization: readString now returns bool with MAX_STRING_LENGTH guard - Add series field to RecentBooksStore with JSON and binary serialization - Add i18n keys: STR_BOOK_INFO, STR_AUTHOR, STR_SERIES, STR_FILE_SIZE, etc. Made-with: Cursor
624 lines
26 KiB
C++
624 lines
26 KiB
C++
#include "ParsedText.h"
|
||
|
||
#include <GfxRenderer.h>
|
||
#include <Utf8.h>
|
||
|
||
#include <algorithm>
|
||
#include <cmath>
|
||
#include <cstring>
|
||
#include <functional>
|
||
#include <limits>
|
||
#include <vector>
|
||
|
||
#include "hyphenation/Hyphenator.h"
|
||
|
||
// Sentinel cost used by the line-breaking DP: marks "no feasible break found yet" and is the clamp ceiling for costs.
constexpr int MAX_COST = std::numeric_limits<int>::max();
|
||
|
||
namespace {
|
||
|
||
// UTF-8 encoding of U+00AD (soft hyphen), the discretionary break marker found throughout EPUB text.
constexpr char SOFT_HYPHEN_UTF8[] = "\xC2\xAD";
// Encoded length in bytes, derived from the literal itself (the array's trailing NUL is excluded).
constexpr size_t SOFT_HYPHEN_BYTES = sizeof(SOFT_HYPHEN_UTF8) - 1;
|
||
|
||
// Returns the first rendered codepoint of a word (skipping leading soft hyphens).
|
||
uint32_t firstCodepoint(const std::string& word) {
|
||
const auto* ptr = reinterpret_cast<const unsigned char*>(word.c_str());
|
||
while (true) {
|
||
const uint32_t cp = utf8NextCodepoint(&ptr);
|
||
if (cp == 0) return 0;
|
||
if (cp != 0x00AD) return cp; // skip soft hyphens
|
||
}
|
||
}
|
||
|
||
// Returns the last codepoint of a word by scanning backward for the start of the last UTF-8 sequence.
|
||
uint32_t lastCodepoint(const std::string& word) {
|
||
if (word.empty()) return 0;
|
||
// UTF-8 continuation bytes start with 10xxxxxx; scan backward to find the leading byte.
|
||
size_t i = word.size() - 1;
|
||
while (i > 0 && (static_cast<uint8_t>(word[i]) & 0xC0) == 0x80) {
|
||
--i;
|
||
}
|
||
const auto* ptr = reinterpret_cast<const unsigned char*>(word.c_str() + i);
|
||
return utf8NextCodepoint(&ptr);
|
||
}
|
||
|
||
// True when `word` contains at least one UTF-8 encoded soft hyphen (the SOFT_HYPHEN_UTF8 byte sequence).
bool containsSoftHyphen(const std::string& word) {
  const size_t firstHit = word.find(SOFT_HYPHEN_UTF8);
  return firstHit != std::string::npos;
}
|
||
|
||
// Removes every soft hyphen in-place so rendered glyphs match measured widths.
|
||
void stripSoftHyphensInPlace(std::string& word) {
|
||
size_t pos = 0;
|
||
while ((pos = word.find(SOFT_HYPHEN_UTF8, pos)) != std::string::npos) {
|
||
word.erase(pos, SOFT_HYPHEN_BYTES);
|
||
}
|
||
}
|
||
|
||
// Returns the advance width for a word while ignoring soft hyphen glyphs and optionally appending a visible hyphen.
|
||
// Uses advance width (sum of glyph advances + kerning) rather than bounding box width so that italic glyph overhangs
|
||
// don't inflate inter-word spacing.
|
||
uint16_t measureWordWidth(const GfxRenderer& renderer, const int fontId, const std::string& word,
|
||
const EpdFontFamily::Style style, const bool appendHyphen = false) {
|
||
if (word.size() == 1 && word[0] == ' ' && !appendHyphen) {
|
||
return renderer.getSpaceWidth(fontId, style);
|
||
}
|
||
const bool hasSoftHyphen = containsSoftHyphen(word);
|
||
if (!hasSoftHyphen && !appendHyphen) {
|
||
return renderer.getTextAdvanceX(fontId, word.c_str(), style);
|
||
}
|
||
|
||
std::string sanitized = word;
|
||
if (hasSoftHyphen) {
|
||
stripSoftHyphensInPlace(sanitized);
|
||
}
|
||
if (appendHyphen) {
|
||
sanitized.push_back('-');
|
||
}
|
||
return renderer.getTextAdvanceX(fontId, sanitized.c_str(), style);
|
||
}
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// Direct-mapped word-width cache
|
||
//
|
||
// Avoids redundant getTextAdvanceX calls when the same (word, style, fontId)
|
||
// triple appears across paragraphs. A fixed-size static array is used so
|
||
// that heap allocation and fragmentation are both zero.
|
||
//
|
||
// Eviction policy: hash-direct mapping — a word always occupies the single
|
||
// slot determined by its hash; a collision simply overwrites that slot.
|
||
// This gives O(1) lookup (one hash + one memcmp) regardless of how full the
|
||
// cache is, avoiding the O(n) linear-scan overhead that causes a regression
|
||
// on corpora with many unique words (e.g. German compound-heavy text).
|
||
//
|
||
// Words longer than 23 bytes bypass the cache entirely — they are uncommon,
|
||
// unlikely to repeat verbatim, and exceed the fixed-width key buffer.
|
||
// ---------------------------------------------------------------------------
|
||
|
||
// One slot of the word-width cache: the lookup key (word bytes, fontId, style) plus the measured width.
struct WordWidthCacheEntry {
  char word[24];   // key text including its NUL terminator, i.e. at most 23 bytes of word data
  int fontId;      // key: font the width was measured with
  uint16_t width;  // cached measurement
  uint8_t style;   // key: EpdFontFamily::Style narrowed to a single byte
  bool valid;      // set once the slot is populated; static storage starts zeroed, so slots begin empty
};

// The slot count is a power of two so that a hash is reduced to a slot index with a single AND.
// 128 entries × 32 bytes = 4 KB of static storage; covers typical paragraph vocabulary with a low
// collision rate even for German compound-heavy prose.
static constexpr uint32_t WORD_WIDTH_CACHE_SIZE = 1u << 7;
static constexpr uint32_t WORD_WIDTH_CACHE_MASK = WORD_WIDTH_CACHE_SIZE - 1u;
static WordWidthCacheEntry s_wordWidthCache[WORD_WIDTH_CACHE_SIZE];
|
||
|
||
// Hashes a cache key: 32-bit FNV-1a over the first `len` bytes of `str`, followed by one more
// xor-multiply round that mixes in `fontId`, and a final xor with `style`.
static uint32_t wordWidthCacheHash(const char* str, const size_t len, const int fontId, const uint8_t style) {
  constexpr uint32_t kFnvOffsetBasis = 2166136261u;
  constexpr uint32_t kFnvPrime = 16777619u;

  uint32_t hash = kFnvOffsetBasis;
  const char* const end = str + len;
  for (const char* p = str; p != end; ++p) {
    hash = (hash ^ static_cast<uint8_t>(*p)) * kFnvPrime;
  }
  hash = (hash ^ static_cast<uint32_t>(fontId)) * kFnvPrime;
  return hash ^ style;
}
|
||
|
||
// Returns the cached width for (word, style, fontId), measuring and caching
|
||
// on a miss. Appending a hyphen is not supported — those measurements are
|
||
// word-fragment lookups that will not repeat and must not pollute the cache.
|
||
static uint16_t cachedMeasureWordWidth(const GfxRenderer& renderer, const int fontId, const std::string& word,
|
||
const EpdFontFamily::Style style) {
|
||
const size_t len = word.size();
|
||
if (len >= 24) {
|
||
return measureWordWidth(renderer, fontId, word, style);
|
||
}
|
||
|
||
const uint8_t styleByte = static_cast<uint8_t>(style);
|
||
const char* const wordCStr = word.c_str();
|
||
|
||
const uint32_t slot = wordWidthCacheHash(wordCStr, len, fontId, styleByte) & WORD_WIDTH_CACHE_MASK;
|
||
auto& e = s_wordWidthCache[slot];
|
||
|
||
if (e.valid && e.fontId == fontId && e.style == styleByte && memcmp(e.word, wordCStr, len + 1) == 0) {
|
||
return e.width; // O(1) cache hit
|
||
}
|
||
|
||
const uint16_t w = measureWordWidth(renderer, fontId, word, style);
|
||
memcpy(e.word, wordCStr, len + 1);
|
||
e.fontId = fontId;
|
||
e.width = w;
|
||
e.style = styleByte;
|
||
e.valid = true;
|
||
return w;
|
||
}
|
||
|
||
} // namespace
|
||
|
||
// Appends one word together with its style and continuation flag; empty words are ignored.
// `underline` is folded into the stored style as the EpdFontFamily::UNDERLINE bit.
// `attachToPrevious` is recorded in wordContinues for this word.
void ParsedText::addWord(std::string word, const EpdFontFamily::Style fontStyle, const bool underline,
                         const bool attachToPrevious) {
  if (word.empty()) {
    return;
  }

  const EpdFontFamily::Style storedStyle =
      underline ? static_cast<EpdFontFamily::Style>(fontStyle | EpdFontFamily::UNDERLINE) : fontStyle;

  // The three vectors are parallel: index i describes the same word in each of them.
  words.push_back(std::move(word));
  wordStyles.push_back(storedStyle);
  wordContinues.push_back(attachToPrevious);
}
|
||
|
||
// Consumes data to minimize memory usage
|
||
void ParsedText::layoutAndExtractLines(const GfxRenderer& renderer, const int fontId, const uint16_t viewportWidth,
|
||
const std::function<void(std::shared_ptr<TextBlock>)>& processLine,
|
||
const bool includeLastLine) {
|
||
if (words.empty()) {
|
||
return;
|
||
}
|
||
|
||
// Apply fixed transforms before any per-line layout work.
|
||
applyParagraphIndent();
|
||
|
||
const int pageWidth = viewportWidth;
|
||
auto wordWidths = calculateWordWidths(renderer, fontId);
|
||
|
||
std::vector<size_t> lineBreakIndices;
|
||
if (hyphenationEnabled) {
|
||
lineBreakIndices = computeHyphenatedLineBreaks(renderer, fontId, pageWidth, wordWidths, wordContinues);
|
||
} else {
|
||
lineBreakIndices = computeLineBreaks(renderer, fontId, pageWidth, wordWidths, wordContinues);
|
||
}
|
||
const size_t lineCount = includeLastLine ? lineBreakIndices.size() : lineBreakIndices.size() - 1;
|
||
|
||
for (size_t i = 0; i < lineCount; ++i) {
|
||
extractLine(i, pageWidth, wordWidths, wordContinues, lineBreakIndices, processLine, renderer, fontId);
|
||
}
|
||
|
||
// Remove consumed words so size() reflects only remaining words
|
||
if (lineCount > 0) {
|
||
const size_t consumed = lineBreakIndices[lineCount - 1];
|
||
words.erase(words.begin(), words.begin() + consumed);
|
||
wordStyles.erase(wordStyles.begin(), wordStyles.begin() + consumed);
|
||
wordContinues.erase(wordContinues.begin(), wordContinues.begin() + consumed);
|
||
}
|
||
}
|
||
|
||
std::vector<uint16_t> ParsedText::calculateWordWidths(const GfxRenderer& renderer, const int fontId) {
|
||
std::vector<uint16_t> wordWidths;
|
||
wordWidths.reserve(words.size());
|
||
|
||
for (size_t i = 0; i < words.size(); ++i) {
|
||
wordWidths.push_back(cachedMeasureWordWidth(renderer, fontId, words[i], wordStyles[i]));
|
||
}
|
||
|
||
return wordWidths;
|
||
}
|
||
|
||
std::vector<size_t> ParsedText::computeLineBreaks(const GfxRenderer& renderer, const int fontId, const int pageWidth,
|
||
std::vector<uint16_t>& wordWidths, std::vector<bool>& continuesVec) {
|
||
if (words.empty()) {
|
||
return {};
|
||
}
|
||
|
||
// Calculate first line indent (only for left/justified text).
|
||
// Positive text-indent (paragraph indent) is suppressed when extraParagraphSpacing is on.
|
||
// Negative text-indent (hanging indent, e.g. margin-left:3em; text-indent:-1em) always applies —
|
||
// it is structural (positions the bullet/marker), not decorative.
|
||
const int firstLineIndent =
|
||
blockStyle.textIndentDefined && (blockStyle.textIndent < 0 || !extraParagraphSpacing) &&
|
||
(blockStyle.alignment == CssTextAlign::Justify || blockStyle.alignment == CssTextAlign::Left)
|
||
? blockStyle.textIndent
|
||
: 0;
|
||
|
||
// Ensure any word that would overflow even as the first entry on a line is split using fallback hyphenation.
|
||
for (size_t i = 0; i < wordWidths.size(); ++i) {
|
||
// First word needs to fit in reduced width if there's an indent
|
||
const int effectiveWidth = i == 0 ? pageWidth - firstLineIndent : pageWidth;
|
||
while (wordWidths[i] > effectiveWidth) {
|
||
if (!hyphenateWordAtIndex(i, effectiveWidth, renderer, fontId, wordWidths, /*allowFallbackBreaks=*/true)) {
|
||
break;
|
||
}
|
||
}
|
||
}
|
||
|
||
const size_t totalWordCount = words.size();
|
||
|
||
// DP table to store the minimum badness (cost) of lines starting at index i
|
||
std::vector<int> dp(totalWordCount);
|
||
// 'ans[i]' stores the index 'j' of the *last word* in the optimal line starting at 'i'
|
||
std::vector<size_t> ans(totalWordCount);
|
||
|
||
// Base Case
|
||
dp[totalWordCount - 1] = 0;
|
||
ans[totalWordCount - 1] = totalWordCount - 1;
|
||
|
||
for (int i = totalWordCount - 2; i >= 0; --i) {
|
||
int currlen = 0;
|
||
dp[i] = MAX_COST;
|
||
|
||
// First line has reduced width due to text-indent
|
||
const int effectivePageWidth = i == 0 ? pageWidth - firstLineIndent : pageWidth;
|
||
|
||
for (size_t j = i; j < totalWordCount; ++j) {
|
||
// Add space before word j, unless it's the first word on the line or a continuation
|
||
int gap = 0;
|
||
if (j > static_cast<size_t>(i) && !continuesVec[j]) {
|
||
gap =
|
||
renderer.getSpaceAdvance(fontId, lastCodepoint(words[j - 1]), firstCodepoint(words[j]), wordStyles[j - 1]);
|
||
} else if (j > static_cast<size_t>(i) && continuesVec[j]) {
|
||
// Cross-boundary kerning for continuation words (e.g. nonbreaking spaces, attached punctuation)
|
||
gap = renderer.getKerning(fontId, lastCodepoint(words[j - 1]), firstCodepoint(words[j]), wordStyles[j - 1]);
|
||
}
|
||
currlen += wordWidths[j] + gap;
|
||
|
||
if (currlen > effectivePageWidth) {
|
||
break;
|
||
}
|
||
|
||
// Cannot break after word j if the next word attaches to it (continuation group)
|
||
if (j + 1 < totalWordCount && continuesVec[j + 1]) {
|
||
continue;
|
||
}
|
||
|
||
int cost;
|
||
if (j == totalWordCount - 1) {
|
||
cost = 0; // Last line
|
||
} else {
|
||
const int remainingSpace = effectivePageWidth - currlen;
|
||
// Use long long for the square to prevent overflow
|
||
const long long cost_ll = static_cast<long long>(remainingSpace) * remainingSpace + dp[j + 1];
|
||
|
||
if (cost_ll > MAX_COST) {
|
||
cost = MAX_COST;
|
||
} else {
|
||
cost = static_cast<int>(cost_ll);
|
||
}
|
||
}
|
||
|
||
if (cost < dp[i]) {
|
||
dp[i] = cost;
|
||
ans[i] = j; // j is the index of the last word in this optimal line
|
||
}
|
||
}
|
||
|
||
// Handle oversized word: if no valid configuration found, force single-word line
|
||
// This prevents cascade failure where one oversized word breaks all preceding words
|
||
if (dp[i] == MAX_COST) {
|
||
ans[i] = i; // Just this word on its own line
|
||
// Inherit cost from next word to allow subsequent words to find valid configurations
|
||
if (i + 1 < static_cast<int>(totalWordCount)) {
|
||
dp[i] = dp[i + 1];
|
||
} else {
|
||
dp[i] = 0;
|
||
}
|
||
}
|
||
}
|
||
|
||
// Stores the index of the word that starts the next line (last_word_index + 1)
|
||
std::vector<size_t> lineBreakIndices;
|
||
lineBreakIndices.reserve(totalWordCount / 8 + 1);
|
||
size_t currentWordIndex = 0;
|
||
|
||
while (currentWordIndex < totalWordCount) {
|
||
size_t nextBreakIndex = ans[currentWordIndex] + 1;
|
||
|
||
// Safety check: prevent infinite loop if nextBreakIndex doesn't advance
|
||
if (nextBreakIndex <= currentWordIndex) {
|
||
// Force advance by at least one word to avoid infinite loop
|
||
nextBreakIndex = currentWordIndex + 1;
|
||
}
|
||
|
||
lineBreakIndices.push_back(nextBreakIndex);
|
||
currentWordIndex = nextBreakIndex;
|
||
}
|
||
|
||
return lineBreakIndices;
|
||
}
|
||
|
||
void ParsedText::applyParagraphIndent() {
|
||
if (extraParagraphSpacing || words.empty()) {
|
||
return;
|
||
}
|
||
|
||
if (blockStyle.textIndentDefined) {
|
||
// CSS text-indent is explicitly set (even if 0) - don't use fallback EmSpace
|
||
// The actual indent positioning is handled in extractLine()
|
||
} else if (blockStyle.alignment == CssTextAlign::Justify || blockStyle.alignment == CssTextAlign::Left) {
|
||
// No CSS text-indent defined - use EmSpace fallback for visual indent
|
||
words.front().insert(0, "\xe2\x80\x83");
|
||
}
|
||
}
|
||
|
||
// Builds break indices while opportunistically splitting the word that would overflow the current line.
|
||
std::vector<size_t> ParsedText::computeHyphenatedLineBreaks(const GfxRenderer& renderer, const int fontId,
|
||
const int pageWidth, std::vector<uint16_t>& wordWidths,
|
||
std::vector<bool>& continuesVec) {
|
||
// Calculate first line indent (only for left/justified text).
|
||
// Positive text-indent (paragraph indent) is suppressed when extraParagraphSpacing is on.
|
||
// Negative text-indent (hanging indent, e.g. margin-left:3em; text-indent:-1em) always applies —
|
||
// it is structural (positions the bullet/marker), not decorative.
|
||
const int firstLineIndent =
|
||
blockStyle.textIndentDefined && (blockStyle.textIndent < 0 || !extraParagraphSpacing) &&
|
||
(blockStyle.alignment == CssTextAlign::Justify || blockStyle.alignment == CssTextAlign::Left)
|
||
? blockStyle.textIndent
|
||
: 0;
|
||
|
||
std::vector<size_t> lineBreakIndices;
|
||
size_t currentIndex = 0;
|
||
bool isFirstLine = true;
|
||
|
||
while (currentIndex < wordWidths.size()) {
|
||
const size_t lineStart = currentIndex;
|
||
int lineWidth = 0;
|
||
|
||
// First line has reduced width due to text-indent
|
||
const int effectivePageWidth = isFirstLine ? pageWidth - firstLineIndent : pageWidth;
|
||
|
||
// Consume as many words as possible for current line, splitting when prefixes fit
|
||
while (currentIndex < wordWidths.size()) {
|
||
const bool isFirstWord = currentIndex == lineStart;
|
||
int spacing = 0;
|
||
if (!isFirstWord && !continuesVec[currentIndex]) {
|
||
spacing = renderer.getSpaceAdvance(fontId, lastCodepoint(words[currentIndex - 1]),
|
||
firstCodepoint(words[currentIndex]), wordStyles[currentIndex - 1]);
|
||
} else if (!isFirstWord && continuesVec[currentIndex]) {
|
||
// Cross-boundary kerning for continuation words (e.g. nonbreaking spaces, attached punctuation)
|
||
spacing = renderer.getKerning(fontId, lastCodepoint(words[currentIndex - 1]),
|
||
firstCodepoint(words[currentIndex]), wordStyles[currentIndex - 1]);
|
||
}
|
||
const int candidateWidth = spacing + wordWidths[currentIndex];
|
||
|
||
// Word fits on current line
|
||
if (lineWidth + candidateWidth <= effectivePageWidth) {
|
||
lineWidth += candidateWidth;
|
||
++currentIndex;
|
||
continue;
|
||
}
|
||
|
||
// Word would overflow — try to split based on hyphenation points
|
||
const int availableWidth = effectivePageWidth - lineWidth - spacing;
|
||
const bool allowFallbackBreaks = isFirstWord; // Only for first word on line
|
||
|
||
if (availableWidth > 0 &&
|
||
hyphenateWordAtIndex(currentIndex, availableWidth, renderer, fontId, wordWidths, allowFallbackBreaks)) {
|
||
// Prefix now fits; append it to this line and move to next line
|
||
lineWidth += spacing + wordWidths[currentIndex];
|
||
++currentIndex;
|
||
break;
|
||
}
|
||
|
||
// Could not split: force at least one word per line to avoid infinite loop
|
||
if (currentIndex == lineStart) {
|
||
lineWidth += candidateWidth;
|
||
++currentIndex;
|
||
}
|
||
break;
|
||
}
|
||
|
||
// Don't break before a continuation word (e.g., orphaned "?" after "question").
|
||
// Backtrack to the start of the continuation group so the whole group moves to the next line.
|
||
while (currentIndex > lineStart + 1 && currentIndex < wordWidths.size() && continuesVec[currentIndex]) {
|
||
--currentIndex;
|
||
}
|
||
|
||
lineBreakIndices.push_back(currentIndex);
|
||
isFirstLine = false;
|
||
}
|
||
|
||
return lineBreakIndices;
|
||
}
|
||
|
||
// Splits words[wordIndex] into prefix (adding a hyphen only when needed) and remainder when a legal breakpoint fits the
|
||
// available width.
|
||
bool ParsedText::hyphenateWordAtIndex(const size_t wordIndex, const int availableWidth, const GfxRenderer& renderer,
|
||
const int fontId, std::vector<uint16_t>& wordWidths,
|
||
const bool allowFallbackBreaks) {
|
||
// Guard against invalid indices or zero available width before attempting to split.
|
||
if (availableWidth <= 0 || wordIndex >= words.size()) {
|
||
return false;
|
||
}
|
||
|
||
const std::string& word = words[wordIndex];
|
||
const auto style = wordStyles[wordIndex];
|
||
|
||
// Collect candidate breakpoints (byte offsets and hyphen requirements).
|
||
auto breakInfos = Hyphenator::breakOffsets(word, allowFallbackBreaks);
|
||
if (breakInfos.empty()) {
|
||
return false;
|
||
}
|
||
|
||
size_t chosenOffset = 0;
|
||
int chosenWidth = -1;
|
||
bool chosenNeedsHyphen = true;
|
||
std::string prefix;
|
||
prefix.reserve(word.size());
|
||
|
||
// Iterate over each legal breakpoint and retain the widest prefix that still fits.
|
||
// Breakpoints are in ascending order, so once a prefix is too wide, all subsequent ones will be too.
|
||
for (const auto& info : breakInfos) {
|
||
const size_t offset = info.byteOffset;
|
||
if (offset == 0 || offset >= word.size()) {
|
||
continue;
|
||
}
|
||
|
||
const bool needsHyphen = info.requiresInsertedHyphen;
|
||
prefix.assign(word, 0, offset);
|
||
const int prefixWidth = measureWordWidth(renderer, fontId, prefix, style, needsHyphen);
|
||
if (prefixWidth > availableWidth) {
|
||
break; // Ascending order: all subsequent breakpoints yield wider prefixes
|
||
}
|
||
if (prefixWidth <= chosenWidth) {
|
||
continue; // Not an improvement
|
||
}
|
||
|
||
chosenWidth = prefixWidth;
|
||
chosenOffset = offset;
|
||
chosenNeedsHyphen = needsHyphen;
|
||
}
|
||
|
||
if (chosenWidth < 0) {
|
||
// No hyphenation point produced a prefix that fits in the remaining space.
|
||
return false;
|
||
}
|
||
|
||
// Split the word at the selected breakpoint and append a hyphen if required.
|
||
std::string remainder = word.substr(chosenOffset);
|
||
words[wordIndex].resize(chosenOffset);
|
||
if (chosenNeedsHyphen) {
|
||
words[wordIndex].push_back('-');
|
||
}
|
||
|
||
// Insert the remainder word (with matching style and continuation flag) directly after the prefix.
|
||
words.insert(words.begin() + wordIndex + 1, remainder);
|
||
wordStyles.insert(wordStyles.begin() + wordIndex + 1, style);
|
||
|
||
// Continuation flag handling after splitting a word into prefix + remainder.
|
||
//
|
||
// The prefix keeps the original word's continuation flag so that no-break-space groups
|
||
// stay linked. The remainder always gets continues=false because it starts on the next
|
||
// line and is not attached to the prefix.
|
||
//
|
||
// Example: "200 Quadratkilometer" produces tokens:
|
||
// [0] "200" continues=false
|
||
// [1] " " continues=true
|
||
// [2] "Quadratkilometer" continues=true <-- the word being split
|
||
//
|
||
// After splitting "Quadratkilometer" at "Quadrat-" / "kilometer":
|
||
// [0] "200" continues=false
|
||
// [1] " " continues=true
|
||
// [2] "Quadrat-" continues=true (KEPT — still attached to the no-break group)
|
||
// [3] "kilometer" continues=false (NEW — starts fresh on the next line)
|
||
//
|
||
// This lets the backtracking loop keep the entire prefix group ("200 Quadrat-") on one
|
||
// line, while "kilometer" moves to the next line.
|
||
// wordContinues[wordIndex] is intentionally left unchanged — the prefix keeps its original attachment.
|
||
wordContinues.insert(wordContinues.begin() + wordIndex + 1, false);
|
||
|
||
// Update cached widths to reflect the new prefix/remainder pairing.
|
||
wordWidths[wordIndex] = static_cast<uint16_t>(chosenWidth);
|
||
const uint16_t remainderWidth = measureWordWidth(renderer, fontId, remainder, style);
|
||
wordWidths.insert(wordWidths.begin() + wordIndex + 1, remainderWidth);
|
||
return true;
|
||
}
|
||
|
||
void ParsedText::extractLine(const size_t breakIndex, const int pageWidth, const std::vector<uint16_t>& wordWidths,
|
||
const std::vector<bool>& continuesVec, const std::vector<size_t>& lineBreakIndices,
|
||
const std::function<void(std::shared_ptr<TextBlock>)>& processLine,
|
||
const GfxRenderer& renderer, const int fontId) {
|
||
const size_t lineBreak = lineBreakIndices[breakIndex];
|
||
const size_t lastBreakAt = breakIndex > 0 ? lineBreakIndices[breakIndex - 1] : 0;
|
||
const size_t lineWordCount = lineBreak - lastBreakAt;
|
||
|
||
// Calculate first line indent (only for left/justified text).
|
||
// Positive text-indent (paragraph indent) is suppressed when extraParagraphSpacing is on.
|
||
// Negative text-indent (hanging indent, e.g. margin-left:3em; text-indent:-1em) always applies —
|
||
// it is structural (positions the bullet/marker), not decorative.
|
||
const bool isFirstLine = breakIndex == 0;
|
||
const int firstLineIndent =
|
||
isFirstLine && blockStyle.textIndentDefined && (blockStyle.textIndent < 0 || !extraParagraphSpacing) &&
|
||
(blockStyle.alignment == CssTextAlign::Justify || blockStyle.alignment == CssTextAlign::Left)
|
||
? blockStyle.textIndent
|
||
: 0;
|
||
|
||
// Calculate total word width for this line, count actual word gaps,
|
||
// and accumulate total natural gap widths (including space kerning adjustments).
|
||
int lineWordWidthSum = 0;
|
||
size_t actualGapCount = 0;
|
||
int totalNaturalGaps = 0;
|
||
|
||
for (size_t wordIdx = 0; wordIdx < lineWordCount; wordIdx++) {
|
||
lineWordWidthSum += wordWidths[lastBreakAt + wordIdx];
|
||
// Count gaps: each word after the first creates a gap, unless it's a continuation
|
||
if (wordIdx > 0 && !continuesVec[lastBreakAt + wordIdx]) {
|
||
actualGapCount++;
|
||
int naturalGap =
|
||
renderer.getSpaceAdvance(fontId, lastCodepoint(words[lastBreakAt + wordIdx - 1]),
|
||
firstCodepoint(words[lastBreakAt + wordIdx]), wordStyles[lastBreakAt + wordIdx - 1]);
|
||
totalNaturalGaps += naturalGap;
|
||
} else if (wordIdx > 0 && continuesVec[lastBreakAt + wordIdx]) {
|
||
// Cross-boundary kerning for continuation words (e.g. nonbreaking spaces, attached punctuation)
|
||
totalNaturalGaps +=
|
||
renderer.getKerning(fontId, lastCodepoint(words[lastBreakAt + wordIdx - 1]),
|
||
firstCodepoint(words[lastBreakAt + wordIdx]), wordStyles[lastBreakAt + wordIdx - 1]);
|
||
}
|
||
}
|
||
|
||
// Calculate spacing (account for indent reducing effective page width on first line)
|
||
const int effectivePageWidth = pageWidth - firstLineIndent;
|
||
const bool isLastLine = breakIndex == lineBreakIndices.size() - 1;
|
||
|
||
// For justified text, compute per-gap extra to distribute remaining space evenly
|
||
const int spareSpace = effectivePageWidth - lineWordWidthSum - totalNaturalGaps;
|
||
const int justifyExtra = (blockStyle.alignment == CssTextAlign::Justify && !isLastLine && actualGapCount >= 1)
|
||
? spareSpace / static_cast<int>(actualGapCount)
|
||
: 0;
|
||
|
||
// Calculate initial x position (first line starts at indent for left/justified text;
|
||
// may be negative for hanging indents, e.g. margin-left:3em; text-indent:-1em).
|
||
auto xpos = static_cast<int16_t>(firstLineIndent);
|
||
if (blockStyle.alignment == CssTextAlign::Right) {
|
||
xpos = effectivePageWidth - lineWordWidthSum - totalNaturalGaps;
|
||
} else if (blockStyle.alignment == CssTextAlign::Center) {
|
||
xpos = (effectivePageWidth - lineWordWidthSum - totalNaturalGaps) / 2;
|
||
}
|
||
|
||
// Pre-calculate X positions for words
|
||
// Continuation words attach to the previous word with no space before them
|
||
std::vector<int16_t> lineXPos;
|
||
lineXPos.reserve(lineWordCount);
|
||
|
||
for (size_t wordIdx = 0; wordIdx < lineWordCount; wordIdx++) {
|
||
lineXPos.push_back(xpos);
|
||
|
||
const bool nextIsContinuation = wordIdx + 1 < lineWordCount && continuesVec[lastBreakAt + wordIdx + 1];
|
||
if (nextIsContinuation) {
|
||
int advance = wordWidths[lastBreakAt + wordIdx];
|
||
// Cross-boundary kerning for continuation words (e.g. nonbreaking spaces, attached punctuation)
|
||
advance +=
|
||
renderer.getKerning(fontId, lastCodepoint(words[lastBreakAt + wordIdx]),
|
||
firstCodepoint(words[lastBreakAt + wordIdx + 1]), wordStyles[lastBreakAt + wordIdx]);
|
||
xpos += advance;
|
||
} else {
|
||
int gap = wordIdx + 1 < lineWordCount
|
||
? renderer.getSpaceAdvance(fontId, lastCodepoint(words[lastBreakAt + wordIdx]),
|
||
firstCodepoint(words[lastBreakAt + wordIdx + 1]),
|
||
wordStyles[lastBreakAt + wordIdx])
|
||
: renderer.getSpaceWidth(fontId, wordStyles[lastBreakAt + wordIdx]);
|
||
if (blockStyle.alignment == CssTextAlign::Justify && !isLastLine) {
|
||
gap += justifyExtra;
|
||
}
|
||
xpos += wordWidths[lastBreakAt + wordIdx] + gap;
|
||
}
|
||
}
|
||
|
||
// Build line data by moving from the original vectors using index range
|
||
std::vector<std::string> lineWords(std::make_move_iterator(words.begin() + lastBreakAt),
|
||
std::make_move_iterator(words.begin() + lineBreak));
|
||
std::vector<EpdFontFamily::Style> lineWordStyles(wordStyles.begin() + lastBreakAt, wordStyles.begin() + lineBreak);
|
||
|
||
for (auto& word : lineWords) {
|
||
if (containsSoftHyphen(word)) {
|
||
stripSoftHyphensInPlace(word);
|
||
}
|
||
}
|
||
|
||
processLine(
|
||
std::make_shared<TextBlock>(std::move(lineWords), std::move(lineXPos), std::move(lineWordStyles), blockStyle));
|
||
}
|