diff --git a/lib/Epub/Epub/ParsedText.cpp b/lib/Epub/Epub/ParsedText.cpp
index c9cd07a..b2ff819 100644
--- a/lib/Epub/Epub/ParsedText.cpp
+++ b/lib/Epub/Epub/ParsedText.cpp
@@ -1,15 +1,78 @@
 #include "ParsedText.h"
 
 #include
+#include "hyphenation/Hyphenator.h"
 
 #include
 #include
 #include
+#include <iterator>
 #include
 #include
 
 constexpr int MAX_COST = std::numeric_limits<int>::max();
 
+namespace {
+
+// Outcome of choosing a hyphenation point inside one word.
+struct HyphenSplitDecision {
+  size_t byteOffset;     // Byte offset in the word at which the tail starts.
+  uint16_t prefixWidth;  // Width of the prefix text plus the width of the trailing "-".
+};
+
+// Word indices of the two fragments produced by a split; the line-break DP never puts both on one line.
+struct HyphenationGuard {
+  size_t prefixIndex;
+  size_t tailIndex;
+};
+
+// Chooses where to hyphenate word so that its prefix plus a trailing "-" fits into availableWidth.
+// Offsets come from Hyphenator::breakOffsets and are tried in order; the last one that fits before the first one
+// that does not is stored in *decision. Returns false, leaving *decision untouched, when decision is null, when
+// availableWidth leaves no room beside the hyphen, or when no offset fits.
+bool chooseSplitForWidth(const GfxRenderer& renderer, const int fontId, const std::string& word,
+                         const EpdFontStyle style, const int availableWidth, const bool includeFallback,
+                         HyphenSplitDecision* decision) {
+  if (!decision || availableWidth <= 0) {
+    return false;
+  }
+
+  const int hyphenWidth = renderer.getTextWidth(fontId, "-", style);
+  const int adjustedWidth = availableWidth - hyphenWidth;
+  if (adjustedWidth <= 0) {
+    return false;
+  }
+
+  auto offsets = Hyphenator::breakOffsets(word, includeFallback);
+  if (offsets.empty()) {
+    return false;
+  }
+
+  size_t chosenOffset = std::numeric_limits<size_t>::max();
+  uint16_t chosenWidth = 0;
+
+  for (const size_t offset : offsets) {
+    const std::string prefix = word.substr(0, offset);
+    const int prefixWidth = renderer.getTextWidth(fontId, prefix.c_str(), style);
+    if (prefixWidth <= adjustedWidth) {
+      chosenOffset = offset;
+      chosenWidth = static_cast<uint16_t>(prefixWidth + hyphenWidth);
+    } else {
+      break;
+    }
+  }
+
+  if (chosenOffset == std::numeric_limits<size_t>::max()) {
+    return false;
+  }
+
+  decision->byteOffset = chosenOffset;
+  decision->prefixWidth = chosenWidth;
+  return true;
+}
+
+}  // namespace
+
 void ParsedText::addWord(std::string word, const EpdFontStyle fontStyle) {
   if (word.empty()) return;
 
@@ -27,8 +90,9 @@ void ParsedText::layoutAndExtractLines(const GfxRenderer& renderer, const int fo
 
   const int pageWidth = renderer.getScreenWidth() - horizontalMargin;
   const int spaceWidth = renderer.getSpaceWidth(fontId);
-  const auto wordWidths = calculateWordWidths(renderer, fontId);
-  const auto lineBreakIndices = computeLineBreaks(pageWidth, spaceWidth, wordWidths);
+  // Pre-split oversized tokens so the DP step always has feasible line candidates.
+  auto wordWidths = calculateWordWidths(renderer, fontId, pageWidth);
+  auto lineBreakIndices = computeLineBreaks(renderer, fontId, pageWidth, spaceWidth, wordWidths);
   const size_t lineCount = includeLastLine ? lineBreakIndices.size() : lineBreakIndices.size() - 1;
 
   for (size_t i = 0; i < lineCount; ++i) {
@@ -36,7 +100,10 @@ void ParsedText::layoutAndExtractLines(const GfxRenderer& renderer, const int fo
   }
 }
 
-std::vector<uint16_t> ParsedText::calculateWordWidths(const GfxRenderer& renderer, const int fontId) {
+// Measures every word. A word wider than pageWidth is split in place into a hyphenated prefix and a tail
+// (fallback break positions allowed); the tail is measured next, so a long word can be split repeatedly.
+std::vector<uint16_t> ParsedText::calculateWordWidths(const GfxRenderer& renderer, const int fontId,
+                                                      const int pageWidth) {
   const size_t totalWordCount = words.size();
 
   std::vector<uint16_t> wordWidths;
@@ -52,7 +119,31 @@ std::vector ParsedText::calculateWordWidths(const GfxRenderer& rendere
   auto wordStylesIt = wordStyles.begin();
 
   while (wordsIt != words.end()) {
-    wordWidths.push_back(renderer.getTextWidth(fontId, wordsIt->c_str(), *wordStylesIt));
+    const uint16_t width = renderer.getTextWidth(fontId, wordsIt->c_str(), *wordStylesIt);
+
+    if (width > pageWidth) {
+      HyphenSplitDecision decision{};
+      // Split only when a non-empty tail remains. Otherwise fall through and keep the word whole: looping back
+      // without advancing the iterators would measure the same word forever.
+      if (chooseSplitForWidth(renderer, fontId, *wordsIt, *wordStylesIt, pageWidth, true, &decision) &&
+          decision.byteOffset < wordsIt->size()) {
+        const std::string tail = wordsIt->substr(decision.byteOffset);
+        const std::string prefix = wordsIt->substr(0, decision.byteOffset) + "-";
+
+        *wordsIt = prefix;
+        const auto nextWordIt = words.insert(std::next(wordsIt), tail);
+        const auto nextStyleIt = wordStyles.insert(std::next(wordStylesIt), *wordStylesIt);
+
+        wordWidths.push_back(decision.prefixWidth);
+
+        // Resume at the freshly inserted tail so cascading splits still respect the limit.
+        wordsIt = nextWordIt;
+        wordStylesIt = nextStyleIt;
+        continue;
+      }
+    }
+
+    wordWidths.push_back(width);
 
     std::advance(wordsIt, 1);
     std::advance(wordStylesIt, 1);
@@ -61,70 +152,183 @@ std::vector ParsedText::calculateWordWidths(const GfxRenderer& rendere
   return wordWidths;
 }
 
-std::vector<size_t> ParsedText::computeLineBreaks(const int pageWidth, const int spaceWidth,
-                                                  const std::vector<uint16_t>& wordWidths) const {
-  const size_t totalWordCount = words.size();
-
-  // DP table to store the minimum badness (cost) of lines starting at index i
-  std::vector<int> dp(totalWordCount);
-  // 'ans[i]' stores the index 'j' of the *last word* in the optimal line starting at 'i'
-  std::vector<size_t> ans(totalWordCount);
-
-  // Base Case
-  dp[totalWordCount - 1] = 0;
-  ans[totalWordCount - 1] = totalWordCount - 1;
-
-  for (int i = totalWordCount - 2; i >= 0; --i) {
-    int currlen = -spaceWidth;
-    dp[i] = MAX_COST;
-
-    for (size_t j = i; j < totalWordCount; ++j) {
-      // Current line length: previous width + space + current word width
-      currlen += wordWidths[j] + spaceWidth;
-
-      if (currlen > pageWidth) {
-        break;
-      }
-
-      int cost;
-      if (j == totalWordCount - 1) {
-        cost = 0;  // Last line
-      } else {
-        const int remainingSpace = pageWidth - currlen;
-        // Use long long for the square to prevent overflow
-        const long long cost_ll = static_cast<long long>(remainingSpace) * remainingSpace + dp[j + 1];
-
-        if (cost_ll > MAX_COST) {
-          cost = MAX_COST;
-        } else {
-          cost = static_cast<int>(cost_ll);
-        }
-      }
-
-      if (cost < dp[i]) {
-        dp[i] = cost;
-        ans[i] = j;  // j is the index of the last word in this optimal line
-      }
-    }
-  }
-
-  // Stores the index of the word that starts the next line (last_word_index + 1)
-  std::vector<size_t> lineBreakIndices;
-  size_t currentWordIndex = 0;
-  constexpr size_t MAX_LINES = 1000;
-
-  while (currentWordIndex < totalWordCount) {
-    if (lineBreakIndices.size() >= MAX_LINES) {
-      break;
-    }
-
-    size_t nextBreakIndex = ans[currentWordIndex] + 1;
-    lineBreakIndices.push_back(nextBreakIndex);
-
-    currentWordIndex = nextBreakIndex;
-  }
-
-  return lineBreakIndices;
+// Returns, for every laid-out line, the index of the word that starts the following line.
+// A minimum-raggedness DP is run first. With hyphenation enabled, the word after each non-final line is then split
+// when one of its non-fallback break offsets fits the width left on that line, and the DP is re-run after each
+// split. Splits are written back into words, wordStyles and wordWidths.
+std::vector<size_t> ParsedText::computeLineBreaks(const GfxRenderer& renderer, const int fontId, const int pageWidth,
+                                                  const int spaceWidth, std::vector<uint16_t>& wordWidths) {
+  if (words.empty() || wordWidths.empty()) {
+    return {};
+  }
+
+  // One entry per split made below; the DP uses them to keep the two fragments on different lines.
+  std::vector<HyphenationGuard> guards;
+
+  // Re-aligns the stored guard indices after a new fragment has been inserted at insertPos.
+  auto shiftGuardIndices = [&](const size_t insertPos) {
+    for (auto& guard : guards) {
+      if (guard.prefixIndex >= insertPos) {
+        guard.prefixIndex++;
+      }
+      // Strictly greater: a fragment inserted exactly at tailIndex is the new head of that tail, so the guard must
+      // keep pointing at it. Shifting here would allow two hyphenated fragments of one word on the same line.
+      if (guard.tailIndex > insertPos) {
+        guard.tailIndex++;
+      }
+    }
+  };
+
+  // Fills lineBreaks with the cheapest layout for the current wordWidths that respects every guard.
+  auto runDp = [&](std::vector<size_t>& lineBreaks) {
+    const size_t totalWordCount = wordWidths.size();
+
+    // dp[i]: minimum cost of laying out words i..end. ans[i]: last word of the line that starts at word i.
+    std::vector<int> dp(totalWordCount);
+    std::vector<size_t> ans(totalWordCount);
+
+    dp[totalWordCount - 1] = 0;
+    ans[totalWordCount - 1] = totalWordCount - 1;
+
+    for (int i = static_cast<int>(totalWordCount) - 2; i >= 0; --i) {
+      const size_t first = static_cast<size_t>(i);
+      int currlen = -spaceWidth;
+      dp[i] = MAX_COST;
+      // Fallback: a line holding only word i. Without it ans[i] keeps its zero-initialised value whenever no
+      // candidate is accepted, and the reconstruction loop below stops advancing until MAX_LINES cuts it off.
+      ans[i] = first;
+
+      for (size_t j = first; j < totalWordCount; ++j) {
+        currlen += wordWidths[j] + spaceWidth;
+
+        if (currlen > pageWidth) {
+          if (j == first) {
+            // Word i alone is wider than the page. It gets a line of its own, and the cost of the rest is passed
+            // through so the lines before it are still optimised instead of all saturating at MAX_COST.
+            dp[i] = dp[i + 1];
+          }
+          break;
+        }
+
+        bool violatesGuard = false;
+        for (const auto& guard : guards) {
+          if (first <= guard.prefixIndex && j >= guard.tailIndex) {
+            violatesGuard = true;
+            break;
+          }
+        }
+        if (violatesGuard) {
+          // Every longer line starting at i contains the same two fragments, so stop extending.
+          break;
+        }
+
+        int cost;
+        if (j == totalWordCount - 1) {
+          cost = 0;  // The last line is not charged for unused space.
+        } else {
+          const int remainingSpace = pageWidth - currlen;
+          // Square in long long so the sum cannot overflow before it is clamped to MAX_COST.
+          const long long cost_ll = static_cast<long long>(remainingSpace) * remainingSpace + dp[j + 1];
+          cost = cost_ll > MAX_COST ? MAX_COST : static_cast<int>(cost_ll);
+        }
+
+        if (cost < dp[i]) {
+          dp[i] = cost;
+          ans[i] = j;
+        }
+      }
+    }
+
+    // Follow ans from the first word; each entry pushed is the index of the next line's first word.
+    lineBreaks.clear();
+    size_t currentWordIndex = 0;
+    constexpr size_t MAX_LINES = 1000;
+
+    while (currentWordIndex < totalWordCount && lineBreaks.size() < MAX_LINES) {
+      const size_t nextBreakIndex = ans[currentWordIndex] + 1;
+      lineBreaks.push_back(nextBreakIndex);
+      currentWordIndex = nextBreakIndex;
+    }
+  };
+
+  std::vector<size_t> lineBreakIndices;
+
+  while (true) {
+    runDp(lineBreakIndices);
+
+    if (!hyphenationEnabled) {
+      return lineBreakIndices;
+    }
+
+    bool insertedSplit = false;
+    size_t lastBreakAt = 0;
+
+    for (size_t lineIdx = 0; lineIdx < lineBreakIndices.size(); ++lineIdx) {
+      const size_t lineBreak = lineBreakIndices[lineIdx];
+      const bool isLastLine = lineIdx == lineBreakIndices.size() - 1;
+      const size_t lineWordCount = lineBreak - lastBreakAt;
+
+      int lineWordWidthSum = 0;
+      for (size_t idx = lastBreakAt; idx < lineBreak; ++idx) {
+        lineWordWidthSum += wordWidths[idx];
+      }
+      lastBreakAt = lineBreak;
+
+      if (isLastLine || lineBreak >= wordWidths.size()) {
+        continue;
+      }
+
+      // Width still free on this line once a separating space has been reserved for the prefix.
+      const size_t spacingCount = lineWordCount > 0 ? lineWordCount - 1 : 0;
+      const int usedSpace = lineWordWidthSum + static_cast<int>(spacingCount) * spaceWidth;
+      const int unusedWidth = pageWidth - usedSpace;
+      const int spaceNeeded = lineWordCount == 0 ? 0 : spaceWidth;
+      const int budgetForPrefix = unusedWidth - spaceNeeded;
+      if (budgetForPrefix <= 0) {
+        continue;
+      }
+
+      auto nextWordIt = words.begin();
+      auto nextStyleIt = wordStyles.begin();
+      std::advance(nextWordIt, lineBreak);
+      std::advance(nextStyleIt, lineBreak);
+
+      if (nextWordIt == words.end()) {
+        break;
+      }
+
+      HyphenSplitDecision decision{};
+      if (!chooseSplitForWidth(renderer, fontId, *nextWordIt, *nextStyleIt, budgetForPrefix, false, &decision)) {
+        continue;
+      }
+
+      const EpdFontStyle styleForSplit = *nextStyleIt;
+      const std::string originalWord = *nextWordIt;
+      const std::string prefix = originalWord.substr(0, decision.byteOffset) + "-";
+      const std::string tail = originalWord.substr(decision.byteOffset);
+      if (tail.empty()) {
+        continue;
+      }
+
+      // The existing element becomes the tail and the prefix is inserted in front of it.
+      *nextWordIt = tail;
+      words.insert(nextWordIt, prefix);
+      wordStyles.insert(nextStyleIt, styleForSplit);
+
+      const uint16_t tailWidth = renderer.getTextWidth(fontId, tail.c_str(), styleForSplit);
+      wordWidths.insert(wordWidths.begin() + lineBreak, decision.prefixWidth);
+      wordWidths[lineBreak + 1] = tailWidth;
+
+      shiftGuardIndices(lineBreak);
+      guards.push_back({lineBreak, lineBreak + 1});
+      insertedSplit = true;
+      break;
+    }
+
+    if (!insertedSplit) {
+      return lineBreakIndices;
+    }
+  }
 }
 
 void ParsedText::extractLine(const size_t breakIndex, const int pageWidth, const int spaceWidth,
@@ -136,8 +340,8 @@ void ParsedText::extractLine(const size_t breakIndex, const int pageWidth, const
 
   // Calculate total word width for this line
   int lineWordWidthSum = 0;
-  for (size_t i = lastBreakAt; i < lineBreak; i++) {
-    lineWordWidthSum += wordWidths[i];
+  for (size_t idx = lastBreakAt; idx < lineBreak; ++idx) {
+    lineWordWidthSum += wordWidths[idx];
   }
 
   // Calculate spacing
diff --git a/lib/Epub/Epub/ParsedText.h b/lib/Epub/Epub/ParsedText.h
index 6f745b9..f0e0405 100644
--- a/lib/Epub/Epub/ParsedText.h
+++ b/lib/Epub/Epub/ParsedText.h
@@ -19,11 +19,12 @@ class ParsedText {
   bool extraParagraphSpacing;
   bool hyphenationEnabled;
 
-  std::vector<size_t> computeLineBreaks(int pageWidth, int spaceWidth, const std::vector<uint16_t>& wordWidths) const;
+  std::vector<size_t> computeLineBreaks(const GfxRenderer& renderer, int fontId, int pageWidth, int spaceWidth,
+                                        std::vector<uint16_t>& wordWidths);
   void extractLine(size_t breakIndex, int pageWidth, int spaceWidth, const std::vector<uint16_t>& wordWidths,
                    const std::vector<size_t>& lineBreakIndices,
                    const std::function)>& processLine);
-  std::vector<uint16_t> calculateWordWidths(const GfxRenderer& renderer, int fontId);
+  std::vector<uint16_t> calculateWordWidths(const GfxRenderer& renderer, int fontId, int pageWidth);
 
  public:
   explicit ParsedText(const TextBlock::BLOCK_STYLE style, const bool extraParagraphSpacing,
diff --git a/lib/Epub/Epub/hyphenation/Hyphenator.cpp b/lib/Epub/Epub/hyphenation/Hyphenator.cpp
index fff1a61..de8cd83 100644
--- a/lib/Epub/Epub/hyphenation/Hyphenator.cpp
+++ b/lib/Epub/Epub/hyphenation/Hyphenator.cpp
@@ -1,11 +1,9 @@
 #include "Hyphenator.h"
 
-#include
 #include
 
 #include
 #include
-#include
 #include
 
 #include "EnglishHyphenator.h"
@@ -87,84 +85,47 @@ size_t byteOffsetForIndex(const std::vector& cps, const size_t in
   return cps[index].byteOffset;
 }
 
-// Safely slices a UTF-8 string without splitting multibyte sequences.
-std::string slice(const std::string& word, const size_t startByte, const size_t endByte) {
-  if (startByte >= endByte || startByte >= word.size()) {
-    return std::string();
-  }
-  const size_t boundedEnd = std::min(endByte, word.size());
-  return word.substr(startByte, boundedEnd - startByte);
-}
-
 }  // namespace
 
-bool Hyphenator::splitWord(const GfxRenderer& renderer, const int fontId, const std::string& word,
-                           const EpdFontStyle style, const int availableWidth, HyphenationResult* result,
-                           const bool force) {
-  if (!result || word.empty()) {
-    return false;
+// Collects candidate hyphenation points for word as byte offsets, sorted and de-duplicated.
+// Words accepted by hasOnlyAlphabetic contribute the indexes from collectBreakIndexes; includeFallback additionally
+// contributes every codepoint index that leaves at least MIN_PREFIX_CP before and MIN_SUFFIX_CP after the break.
+std::vector<size_t> Hyphenator::breakOffsets(const std::string& word, const bool includeFallback) {
+  std::vector<size_t> byteOffsets;
+  if (word.empty()) {
+    return byteOffsets;
   }
 
   auto cps = collectCodepoints(word);
   if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) {
-    return false;
+    return byteOffsets;
   }
 
-  // Skip mixed tokens (e.g., "v2.0") unless the caller forces a split due to overflow.
-  if (!force && !hasOnlyAlphabetic(cps)) {
-    return false;
+  std::vector<size_t> indexes;
+  indexes.reserve(cps.size());
+
+  if (hasOnlyAlphabetic(cps)) {
+    auto dictBreaks = collectBreakIndexes(cps);
+    indexes.insert(indexes.end(), dictBreaks.begin(), dictBreaks.end());
   }
 
-  const auto breakIndexes = collectBreakIndexes(cps);
-  // Budget for a trailing hyphen so rendered width matches the layout test.
-  const int hyphenWidth = renderer.getTextWidth(fontId, "-", style);
-  const int adjustedWidth = availableWidth - hyphenWidth;
-
-  size_t chosenIndex = std::numeric_limits<size_t>::max();
-
-  // Prefer dictionary-style break points emitted by language hyphenators.
-  if (adjustedWidth > 0) {
-    for (const size_t idx : breakIndexes) {
-      const size_t byteOffset = byteOffsetForIndex(cps, idx);
-      const std::string prefix = word.substr(0, byteOffset);
-      const int prefixWidth = renderer.getTextWidth(fontId, prefix.c_str(), style);
-      if (prefixWidth <= adjustedWidth) {
-        chosenIndex = idx;
-      } else {
-        break;
-      }
-    }
-  }
-
-  if (chosenIndex == std::numeric_limits<size_t>::max() && force) {
-    // Emergency fallback: brute-force through codepoints to avoid overflow when no legal breaks fit.
+  if (includeFallback) {
     for (size_t idx = MIN_PREFIX_CP; idx + MIN_SUFFIX_CP <= cps.size(); ++idx) {
-      const size_t byteOffset = byteOffsetForIndex(cps, idx);
-      const std::string prefix = word.substr(0, byteOffset);
-      const int prefixWidth = renderer.getTextWidth(fontId, prefix.c_str(), style);
-      if (adjustedWidth <= 0 || prefixWidth <= adjustedWidth) {
-        chosenIndex = idx;
-        if (adjustedWidth > 0 && prefixWidth > adjustedWidth) {
-          break;
-        }
-      }
+      indexes.push_back(idx);
     }
   }
 
-  if (chosenIndex == std::numeric_limits<size_t>::max()) {
-    return false;
+  if (indexes.empty()) {
+    return byteOffsets;
   }
 
-  const size_t splitByte = byteOffsetForIndex(cps, chosenIndex);
-  const std::string head = word.substr(0, splitByte);
-  const std::string tail = slice(word, splitByte, word.size());
+  std::sort(indexes.begin(), indexes.end());
+  indexes.erase(std::unique(indexes.begin(), indexes.end()), indexes.end());
 
-  if (head.empty() || tail.empty()) {
-    return false;
+  byteOffsets.reserve(indexes.size());
+  for (const size_t idx : indexes) {
+    byteOffsets.push_back(byteOffsetForIndex(cps, idx));
   }
 
-  // Append the printed hyphen to the prefix while leaving the tail untouched.
-  result->head = head + "-";
-  result->tail = tail;
-  return true;
+  return byteOffsets;
 }
diff --git a/lib/Epub/Epub/hyphenation/Hyphenator.h b/lib/Epub/Epub/hyphenation/Hyphenator.h
index 8c0bd78..ba0319d 100644
--- a/lib/Epub/Epub/hyphenation/Hyphenator.h
+++ b/lib/Epub/Epub/hyphenation/Hyphenator.h
@@ -1,20 +1,12 @@
 #pragma once
 
-#include
-
+#include <cstddef>
 #include <string>
-
-class GfxRenderer;
-
-// Holds the split portions of a hyphenated word.
-struct HyphenationResult {
-  std::string head;
-  std::string tail;
-};
+#include <vector>
 
 class Hyphenator {
  public:
-  // Splits a word so it fits within availableWidth, appending a hyphen to the head when needed.
-  static bool splitWord(const GfxRenderer& renderer, int fontId, const std::string& word, EpdFontStyle style,
-                        int availableWidth, HyphenationResult* result, bool force);
+  // Returns byte offsets where the word may be hyphenated. When includeFallback is true, all positions obeying the
+  // minimum prefix/suffix constraints are returned even if no language-specific rule matches.
+  static std::vector<size_t> breakOffsets(const std::string& word, bool includeFallback);
 };
\ No newline at end of file