From 299623927ea9d67fe371a56f78142b17b7e70fd9 Mon Sep 17 00:00:00 2001 From: Dave Allie Date: Sun, 21 Dec 2025 13:43:19 +1100 Subject: [PATCH] Build out lines when parsing html and holding >750 words in buffer (#73) ## Summary * Build out lines for pages when holding over 750 buffered words * Should fix issues with parsing long blocks of text causing memory crashes --- lib/Epub/Epub/ParsedText.cpp | 139 ++++++++++-------- lib/Epub/Epub/ParsedText.h | 12 +- .../Epub/parsers/ChapterHtmlSlimParser.cpp | 11 ++ 3 files changed, 96 insertions(+), 66 deletions(-) diff --git a/lib/Epub/Epub/ParsedText.cpp b/lib/Epub/Epub/ParsedText.cpp index 3747246..eff3fd6 100644 --- a/lib/Epub/Epub/ParsedText.cpp +++ b/lib/Epub/Epub/ParsedText.cpp @@ -19,14 +19,25 @@ void ParsedText::addWord(std::string word, const EpdFontStyle fontStyle) { // Consumes data to minimize memory usage void ParsedText::layoutAndExtractLines(const GfxRenderer& renderer, const int fontId, const int horizontalMargin, - const std::function)>& processLine) { + const std::function)>& processLine, + const bool includeLastLine) { if (words.empty()) { return; } - const size_t totalWordCount = words.size(); const int pageWidth = renderer.getScreenWidth() - horizontalMargin; const int spaceWidth = renderer.getSpaceWidth(fontId); + const auto wordWidths = calculateWordWidths(renderer, fontId); + const auto lineBreakIndices = computeLineBreaks(pageWidth, spaceWidth, wordWidths); + const size_t lineCount = includeLastLine ? 
lineBreakIndices.size() : lineBreakIndices.size() - 1; + + for (size_t i = 0; i < lineCount; ++i) { + extractLine(i, pageWidth, spaceWidth, wordWidths, lineBreakIndices, processLine); + } +} + +std::vector ParsedText::calculateWordWidths(const GfxRenderer& renderer, const int fontId) { + const size_t totalWordCount = words.size(); std::vector wordWidths; wordWidths.reserve(totalWordCount); @@ -47,6 +58,13 @@ void ParsedText::layoutAndExtractLines(const GfxRenderer& renderer, const int fo std::advance(wordStylesIt, 1); } + return wordWidths; +} + +std::vector ParsedText::computeLineBreaks(const int pageWidth, const int spaceWidth, + const std::vector& wordWidths) const { + const size_t totalWordCount = words.size(); + // DP table to store the minimum badness (cost) of lines starting at index i std::vector dp(totalWordCount); // 'ans[i]' stores the index 'j' of the *last word* in the optimal line starting at 'i' @@ -106,66 +124,59 @@ void ParsedText::layoutAndExtractLines(const GfxRenderer& renderer, const int fo currentWordIndex = nextBreakIndex; } - // Initialize iterators for consumption - auto wordStartIt = words.begin(); - auto wordStyleStartIt = wordStyles.begin(); - size_t wordWidthIndex = 0; - - size_t lastBreakAt = 0; - for (const size_t lineBreak : lineBreakIndices) { - const size_t lineWordCount = lineBreak - lastBreakAt; - - // Calculate end iterators for the range to splice - auto wordEndIt = wordStartIt; - auto wordStyleEndIt = wordStyleStartIt; - std::advance(wordEndIt, lineWordCount); - std::advance(wordStyleEndIt, lineWordCount); - - // Calculate total word width for this line - int lineWordWidthSum = 0; - for (size_t i = 0; i < lineWordCount; ++i) { - lineWordWidthSum += wordWidths[wordWidthIndex + i]; - } - - // Calculate spacing - int spareSpace = pageWidth - lineWordWidthSum; - - int spacing = spaceWidth; - const bool isLastLine = lineBreak == totalWordCount; - - if (style == TextBlock::JUSTIFIED && !isLastLine && lineWordCount >= 2) { - spacing 
= spareSpace / (lineWordCount - 1); - } - - // Calculate initial x position - uint16_t xpos = 0; - if (style == TextBlock::RIGHT_ALIGN) { - xpos = spareSpace - (lineWordCount - 1) * spaceWidth; - } else if (style == TextBlock::CENTER_ALIGN) { - xpos = (spareSpace - (lineWordCount - 1) * spaceWidth) / 2; - } - - // Pre-calculate X positions for words - std::list lineXPos; - for (size_t i = 0; i < lineWordCount; ++i) { - const uint16_t currentWordWidth = wordWidths[wordWidthIndex + i]; - lineXPos.push_back(xpos); - xpos += currentWordWidth + spacing; - } - - // *** CRITICAL STEP: CONSUME DATA USING SPLICE *** - std::list lineWords; - lineWords.splice(lineWords.begin(), words, wordStartIt, wordEndIt); - std::list lineWordStyles; - lineWordStyles.splice(lineWordStyles.begin(), wordStyles, wordStyleStartIt, wordStyleEndIt); - - processLine( - std::make_shared(std::move(lineWords), std::move(lineXPos), std::move(lineWordStyles), style)); - - // Update pointers/indices for the next line - wordStartIt = wordEndIt; - wordStyleStartIt = wordStyleEndIt; - wordWidthIndex += lineWordCount; - lastBreakAt = lineBreak; - } + return lineBreakIndices; +} + +void ParsedText::extractLine(const size_t breakIndex, const int pageWidth, const int spaceWidth, + const std::vector& wordWidths, const std::vector& lineBreakIndices, + const std::function)>& processLine) { + const size_t lineBreak = lineBreakIndices[breakIndex]; + const size_t lastBreakAt = breakIndex > 0 ? 
lineBreakIndices[breakIndex - 1] : 0; + const size_t lineWordCount = lineBreak - lastBreakAt; + + // Calculate total word width for this line + int lineWordWidthSum = 0; + for (size_t i = lastBreakAt; i < lineBreak; i++) { + lineWordWidthSum += wordWidths[i]; + } + + // Calculate spacing (words shrinks as lines are spliced out, so detect the last line via the break index) + const int spareSpace = pageWidth - lineWordWidthSum; + + int spacing = spaceWidth; + const bool isLastLine = breakIndex + 1 == lineBreakIndices.size(); + + if (style == TextBlock::JUSTIFIED && !isLastLine && lineWordCount >= 2) { + spacing = spareSpace / (lineWordCount - 1); + } + + // Calculate initial x position + uint16_t xpos = 0; + if (style == TextBlock::RIGHT_ALIGN) { + xpos = spareSpace - (lineWordCount - 1) * spaceWidth; + } else if (style == TextBlock::CENTER_ALIGN) { + xpos = (spareSpace - (lineWordCount - 1) * spaceWidth) / 2; + } + + // Pre-calculate X positions for words + std::list lineXPos; + for (size_t i = lastBreakAt; i < lineBreak; i++) { + const uint16_t currentWordWidth = wordWidths[i]; + lineXPos.push_back(xpos); + xpos += currentWordWidth + spacing; + } + + // Iterators always start at the beginning as we are moving content with splice below + auto wordEndIt = words.begin(); + auto wordStyleEndIt = wordStyles.begin(); + std::advance(wordEndIt, lineWordCount); + std::advance(wordStyleEndIt, lineWordCount); + + // *** CRITICAL STEP: CONSUME DATA USING SPLICE *** + std::list lineWords; + lineWords.splice(lineWords.begin(), words, words.begin(), wordEndIt); + std::list lineWordStyles; + lineWordStyles.splice(lineWordStyles.begin(), wordStyles, wordStyles.begin(), wordStyleEndIt); + + processLine(std::make_shared(std::move(lineWords), std::move(lineXPos), std::move(lineWordStyles), style)); } diff --git a/lib/Epub/Epub/ParsedText.h b/lib/Epub/Epub/ParsedText.h index 0bd2544..7fdb128 100644 --- a/lib/Epub/Epub/ParsedText.h +++ b/lib/Epub/Epub/ParsedText.h @@ -2,11 +2,11 @@ #include -#include #include #include #include #include +#include #include "blocks/TextBlock.h" @@ -18,6 
+18,12 @@ class ParsedText { TextBlock::BLOCK_STYLE style; bool extraParagraphSpacing; + std::vector computeLineBreaks(int pageWidth, int spaceWidth, const std::vector& wordWidths) const; + void extractLine(size_t breakIndex, int pageWidth, int spaceWidth, const std::vector& wordWidths, + const std::vector& lineBreakIndices, + const std::function)>& processLine); + std::vector calculateWordWidths(const GfxRenderer& renderer, int fontId); + public: explicit ParsedText(const TextBlock::BLOCK_STYLE style, const bool extraParagraphSpacing) : style(style), extraParagraphSpacing(extraParagraphSpacing) {} @@ -26,7 +32,9 @@ class ParsedText { void addWord(std::string word, EpdFontStyle fontStyle); void setStyle(const TextBlock::BLOCK_STYLE style) { this->style = style; } TextBlock::BLOCK_STYLE getStyle() const { return style; } + size_t size() const { return words.size(); } bool isEmpty() const { return words.empty(); } void layoutAndExtractLines(const GfxRenderer& renderer, int fontId, int horizontalMargin, - const std::function)>& processLine); + const std::function)>& processLine, + bool includeLastLine = true); }; diff --git a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp index d4edc33..718f4d7 100644 --- a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp +++ b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp @@ -143,6 +143,17 @@ void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char self->partWordBuffer[self->partWordBufferIndex++] = s[i]; } + + // If we have > 750 words buffered up, perform the layout and consume out all but the last line + // There should be enough here to build out 1-2 full pages and doing this will free up a lot of + // memory. + // Spotted when reading Intermezzo, there are some really long text blocks in there. 
+ if (self->currentTextBlock->size() > 750) { + Serial.printf("[%lu] [EHP] Text block too long, splitting into multiple pages\n", millis()); + self->currentTextBlock->layoutAndExtractLines( + self->renderer, self->fontId, self->marginLeft + self->marginRight, + [self](const std::shared_ptr& textBlock) { self->addLineToPage(textBlock); }, false); + } } void XMLCALL ChapterHtmlSlimParser::endElement(void* userData, const XML_Char* name) {