Files
crosspoint-reader-mod/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp
jpirnay 6be4413c97 fix: Don't extract unsupported formats (#977)
## Summary

* During chapter parsing, every <img> tag triggered ZIP decompression
and an SD card write regardless of whether the image format was
supported. The mandatory delay(50) after each SD write compounded the
cost. A chapter with 6 GIF images (a common decorative element in older
EPUBs) wasted ~750 ms before any text rendering began.
* **What changes are included?**
Added an ``ImageDecoderFactory::isFormatSupported()`` check before any
file I/O in the img-handler. Only JPEG and PNG proceed to extraction;
all other formats (GIF, SVG, WebP, etc.) fall through immediately to
alt-text rendering with no SD card access.
## Additional Context


Measured impact on a representative chapter with 6 GIF decorations:
|  | Before | After|
|-- | -- | --|
|Total parse time | ~882 ms | ~207 ms|
|Image handling | ~750 ms | ~76 ms|
---

### AI Usage

While CrossPoint doesn't have restrictions on AI tools in contributing,
please be transparent about their usage as it
helps set the right context for reviewers.

Did you use AI tools to help write this code? _**NO**_
2026-02-19 20:35:13 +11:00

761 lines
29 KiB
C++

#include "ChapterHtmlSlimParser.h"
#include <FsHelpers.h>
#include <GfxRenderer.h>
#include <HalStorage.h>
#include <Logging.h>
#include <expat.h>
#include "../../Epub.h"
#include "../Page.h"
#include "../converters/ImageDecoderFactory.h"
#include "../converters/ImageToFramebufferDecoder.h"
#include "../htmlEntities.h"
const char* HEADER_TAGS[] = {"h1", "h2", "h3", "h4", "h5", "h6"};
constexpr int NUM_HEADER_TAGS = sizeof(HEADER_TAGS) / sizeof(HEADER_TAGS[0]);
// Minimum file size (in bytes) to show indexing popup - smaller chapters don't benefit from it
constexpr size_t MIN_SIZE_FOR_POPUP = 10 * 1024; // 10KB
constexpr size_t PARSE_BUFFER_SIZE = 1024;
const char* BLOCK_TAGS[] = {"p", "li", "div", "br", "blockquote"};
constexpr int NUM_BLOCK_TAGS = sizeof(BLOCK_TAGS) / sizeof(BLOCK_TAGS[0]);
const char* BOLD_TAGS[] = {"b", "strong"};
constexpr int NUM_BOLD_TAGS = sizeof(BOLD_TAGS) / sizeof(BOLD_TAGS[0]);
const char* ITALIC_TAGS[] = {"i", "em"};
constexpr int NUM_ITALIC_TAGS = sizeof(ITALIC_TAGS) / sizeof(ITALIC_TAGS[0]);
const char* UNDERLINE_TAGS[] = {"u", "ins"};
constexpr int NUM_UNDERLINE_TAGS = sizeof(UNDERLINE_TAGS) / sizeof(UNDERLINE_TAGS[0]);
const char* IMAGE_TAGS[] = {"img"};
constexpr int NUM_IMAGE_TAGS = sizeof(IMAGE_TAGS) / sizeof(IMAGE_TAGS[0]);
const char* SKIP_TAGS[] = {"head"};
constexpr int NUM_SKIP_TAGS = sizeof(SKIP_TAGS) / sizeof(SKIP_TAGS[0]);
bool isWhitespace(const char c) { return c == ' ' || c == '\r' || c == '\n' || c == '\t'; }
// given the start and end of a tag, check to see if it matches a known tag
bool matches(const char* tag_name, const char* possible_tags[], const int possible_tag_count) {
for (int i = 0; i < possible_tag_count; i++) {
if (strcmp(tag_name, possible_tags[i]) == 0) {
return true;
}
}
return false;
}
bool isHeaderOrBlock(const char* name) {
return matches(name, HEADER_TAGS, NUM_HEADER_TAGS) || matches(name, BLOCK_TAGS, NUM_BLOCK_TAGS);
}
// Update effective bold/italic/underline based on block style and inline style stack
void ChapterHtmlSlimParser::updateEffectiveInlineStyle() {
// Start with block-level styles
effectiveBold = currentCssStyle.hasFontWeight() && currentCssStyle.fontWeight == CssFontWeight::Bold;
effectiveItalic = currentCssStyle.hasFontStyle() && currentCssStyle.fontStyle == CssFontStyle::Italic;
effectiveUnderline =
currentCssStyle.hasTextDecoration() && currentCssStyle.textDecoration == CssTextDecoration::Underline;
// Apply inline style stack in order
for (const auto& entry : inlineStyleStack) {
if (entry.hasBold) {
effectiveBold = entry.bold;
}
if (entry.hasItalic) {
effectiveItalic = entry.italic;
}
if (entry.hasUnderline) {
effectiveUnderline = entry.underline;
}
}
}
// flush the contents of partWordBuffer to currentTextBlock
void ChapterHtmlSlimParser::flushPartWordBuffer() {
// Determine font style from depth-based tracking and CSS effective style
const bool isBold = boldUntilDepth < depth || effectiveBold;
const bool isItalic = italicUntilDepth < depth || effectiveItalic;
const bool isUnderline = underlineUntilDepth < depth || effectiveUnderline;
// Combine style flags using bitwise OR
EpdFontFamily::Style fontStyle = EpdFontFamily::REGULAR;
if (isBold) {
fontStyle = static_cast<EpdFontFamily::Style>(fontStyle | EpdFontFamily::BOLD);
}
if (isItalic) {
fontStyle = static_cast<EpdFontFamily::Style>(fontStyle | EpdFontFamily::ITALIC);
}
if (isUnderline) {
fontStyle = static_cast<EpdFontFamily::Style>(fontStyle | EpdFontFamily::UNDERLINE);
}
// flush the buffer
partWordBuffer[partWordBufferIndex] = '\0';
currentTextBlock->addWord(partWordBuffer, fontStyle, false, nextWordContinues);
partWordBufferIndex = 0;
nextWordContinues = false;
}
// start a new text block if needed
void ChapterHtmlSlimParser::startNewTextBlock(const BlockStyle& blockStyle) {
nextWordContinues = false; // New block = new paragraph, no continuation
if (currentTextBlock) {
// already have a text block running and it is empty - just reuse it
if (currentTextBlock->isEmpty()) {
// Merge with existing block style to accumulate CSS styling from parent block elements.
// This handles cases like <div style="margin-bottom:2em"><h1>text</h1></div> where the
// div's margin should be preserved, even though it has no direct text content.
currentTextBlock->setBlockStyle(currentTextBlock->getBlockStyle().getCombinedBlockStyle(blockStyle));
return;
}
makePages();
}
currentTextBlock.reset(new ParsedText(extraParagraphSpacing, hyphenationEnabled, blockStyle));
}
void XMLCALL ChapterHtmlSlimParser::startElement(void* userData, const XML_Char* name, const XML_Char** atts) {
auto* self = static_cast<ChapterHtmlSlimParser*>(userData);
// Middle of skip
if (self->skipUntilDepth < self->depth) {
self->depth += 1;
return;
}
// Extract class and style attributes for CSS processing
std::string classAttr;
std::string styleAttr;
if (atts != nullptr) {
for (int i = 0; atts[i]; i += 2) {
if (strcmp(atts[i], "class") == 0) {
classAttr = atts[i + 1];
} else if (strcmp(atts[i], "style") == 0) {
styleAttr = atts[i + 1];
}
}
}
auto centeredBlockStyle = BlockStyle();
centeredBlockStyle.textAlignDefined = true;
centeredBlockStyle.alignment = CssTextAlign::Center;
// Special handling for tables - show placeholder text instead of dropping silently
if (strcmp(name, "table") == 0) {
// Add placeholder text
self->startNewTextBlock(centeredBlockStyle);
self->italicUntilDepth = min(self->italicUntilDepth, self->depth);
// Advance depth before processing character data (like you would for an element with text)
self->depth += 1;
self->characterData(userData, "[Table omitted]", strlen("[Table omitted]"));
// Skip table contents (skip until parent as we pre-advanced depth above)
self->skipUntilDepth = self->depth - 1;
return;
}
if (matches(name, IMAGE_TAGS, NUM_IMAGE_TAGS)) {
std::string src;
std::string alt;
if (atts != nullptr) {
for (int i = 0; atts[i]; i += 2) {
if (strcmp(atts[i], "src") == 0) {
src = atts[i + 1];
} else if (strcmp(atts[i], "alt") == 0) {
alt = atts[i + 1];
}
}
if (!src.empty()) {
LOG_DBG("EHP", "Found image: src=%s", src.c_str());
{
// Resolve the image path relative to the HTML file
std::string resolvedPath = FsHelpers::normalisePath(self->contentBase + src);
if (ImageDecoderFactory::isFormatSupported(resolvedPath)) {
// Create a unique filename for the cached image
std::string ext;
size_t extPos = resolvedPath.rfind('.');
if (extPos != std::string::npos) {
ext = resolvedPath.substr(extPos);
}
std::string cachedImagePath = self->imageBasePath + std::to_string(self->imageCounter++) + ext;
// Extract image to cache file
FsFile cachedImageFile;
bool extractSuccess = false;
if (Storage.openFileForWrite("EHP", cachedImagePath, cachedImageFile)) {
extractSuccess = self->epub->readItemContentsToStream(resolvedPath, cachedImageFile, 4096);
cachedImageFile.flush();
cachedImageFile.close();
delay(50); // Give SD card time to sync
}
if (extractSuccess) {
// Get image dimensions
ImageDimensions dims = {0, 0};
ImageToFramebufferDecoder* decoder = ImageDecoderFactory::getDecoder(cachedImagePath);
if (decoder && decoder->getDimensions(cachedImagePath, dims)) {
LOG_DBG("EHP", "Image dimensions: %dx%d", dims.width, dims.height);
// Scale to fit viewport while maintaining aspect ratio
int maxWidth = self->viewportWidth;
int maxHeight = self->viewportHeight;
float scaleX = (dims.width > maxWidth) ? (float)maxWidth / dims.width : 1.0f;
float scaleY = (dims.height > maxHeight) ? (float)maxHeight / dims.height : 1.0f;
float scale = (scaleX < scaleY) ? scaleX : scaleY;
if (scale > 1.0f) scale = 1.0f;
int displayWidth = (int)(dims.width * scale);
int displayHeight = (int)(dims.height * scale);
LOG_DBG("EHP", "Display size: %dx%d (scale %.2f)", displayWidth, displayHeight, scale);
// Create page for image - only break if image won't fit remaining space
if (self->currentPage && !self->currentPage->elements.empty() &&
(self->currentPageNextY + displayHeight > self->viewportHeight)) {
self->completePageFn(std::move(self->currentPage));
self->currentPage.reset(new Page());
if (!self->currentPage) {
LOG_ERR("EHP", "Failed to create new page");
return;
}
self->currentPageNextY = 0;
} else if (!self->currentPage) {
self->currentPage.reset(new Page());
if (!self->currentPage) {
LOG_ERR("EHP", "Failed to create initial page");
return;
}
self->currentPageNextY = 0;
}
// Create ImageBlock and add to page
auto imageBlock = std::make_shared<ImageBlock>(cachedImagePath, displayWidth, displayHeight);
if (!imageBlock) {
LOG_ERR("EHP", "Failed to create ImageBlock");
return;
}
int xPos = (self->viewportWidth - displayWidth) / 2;
auto pageImage = std::make_shared<PageImage>(imageBlock, xPos, self->currentPageNextY);
if (!pageImage) {
LOG_ERR("EHP", "Failed to create PageImage");
return;
}
self->currentPage->elements.push_back(pageImage);
self->currentPageNextY += displayHeight;
self->depth += 1;
return;
} else {
LOG_ERR("EHP", "Failed to get image dimensions");
Storage.remove(cachedImagePath.c_str());
}
} else {
LOG_ERR("EHP", "Failed to extract image");
}
} // isFormatSupported
}
}
// Fallback to alt text if image processing fails
if (!alt.empty()) {
alt = "[Image: " + alt + "]";
self->startNewTextBlock(centeredBlockStyle);
self->italicUntilDepth = std::min(self->italicUntilDepth, self->depth);
self->depth += 1;
self->characterData(userData, alt.c_str(), alt.length());
// Skip any child content (skip until parent as we pre-advanced depth above)
self->skipUntilDepth = self->depth - 1;
return;
}
// No alt text, skip
self->skipUntilDepth = self->depth;
self->depth += 1;
return;
}
}
if (matches(name, SKIP_TAGS, NUM_SKIP_TAGS)) {
// start skip
self->skipUntilDepth = self->depth;
self->depth += 1;
return;
}
// Skip blocks with role="doc-pagebreak" and epub:type="pagebreak"
if (atts != nullptr) {
for (int i = 0; atts[i]; i += 2) {
if (strcmp(atts[i], "role") == 0 && strcmp(atts[i + 1], "doc-pagebreak") == 0 ||
strcmp(atts[i], "epub:type") == 0 && strcmp(atts[i + 1], "pagebreak") == 0) {
self->skipUntilDepth = self->depth;
self->depth += 1;
return;
}
}
}
// Compute CSS style for this element
CssStyle cssStyle;
if (self->cssParser) {
// Get combined tag + class styles
cssStyle = self->cssParser->resolveStyle(name, classAttr);
// Merge inline style (highest priority)
if (!styleAttr.empty()) {
CssStyle inlineStyle = CssParser::parseInlineStyle(styleAttr);
cssStyle.applyOver(inlineStyle);
}
}
const float emSize = static_cast<float>(self->renderer.getLineHeight(self->fontId)) * self->lineCompression;
const auto userAlignmentBlockStyle = BlockStyle::fromCssStyle(
cssStyle, emSize, static_cast<CssTextAlign>(self->paragraphAlignment), self->viewportWidth);
if (matches(name, HEADER_TAGS, NUM_HEADER_TAGS)) {
self->currentCssStyle = cssStyle;
auto headerBlockStyle = BlockStyle::fromCssStyle(cssStyle, emSize, CssTextAlign::Center, self->viewportWidth);
headerBlockStyle.textAlignDefined = true;
if (self->embeddedStyle && cssStyle.hasTextAlign()) {
headerBlockStyle.alignment = cssStyle.textAlign;
}
self->startNewTextBlock(headerBlockStyle);
self->boldUntilDepth = std::min(self->boldUntilDepth, self->depth);
self->updateEffectiveInlineStyle();
} else if (matches(name, BLOCK_TAGS, NUM_BLOCK_TAGS)) {
if (strcmp(name, "br") == 0) {
if (self->partWordBufferIndex > 0) {
// flush word preceding <br/> to currentTextBlock before calling startNewTextBlock
self->flushPartWordBuffer();
}
self->startNewTextBlock(self->currentTextBlock->getBlockStyle());
} else {
self->currentCssStyle = cssStyle;
self->startNewTextBlock(userAlignmentBlockStyle);
self->updateEffectiveInlineStyle();
if (strcmp(name, "li") == 0) {
self->currentTextBlock->addWord("\xe2\x80\xa2", EpdFontFamily::REGULAR);
}
}
} else if (matches(name, UNDERLINE_TAGS, NUM_UNDERLINE_TAGS)) {
// Flush buffer before style change so preceding text gets current style
if (self->partWordBufferIndex > 0) {
self->flushPartWordBuffer();
self->nextWordContinues = true;
}
self->underlineUntilDepth = std::min(self->underlineUntilDepth, self->depth);
// Push inline style entry for underline tag
StyleStackEntry entry;
entry.depth = self->depth; // Track depth for matching pop
entry.hasUnderline = true;
entry.underline = true;
if (cssStyle.hasFontWeight()) {
entry.hasBold = true;
entry.bold = cssStyle.fontWeight == CssFontWeight::Bold;
}
if (cssStyle.hasFontStyle()) {
entry.hasItalic = true;
entry.italic = cssStyle.fontStyle == CssFontStyle::Italic;
}
self->inlineStyleStack.push_back(entry);
self->updateEffectiveInlineStyle();
} else if (matches(name, BOLD_TAGS, NUM_BOLD_TAGS)) {
// Flush buffer before style change so preceding text gets current style
if (self->partWordBufferIndex > 0) {
self->flushPartWordBuffer();
self->nextWordContinues = true;
}
self->boldUntilDepth = std::min(self->boldUntilDepth, self->depth);
// Push inline style entry for bold tag
StyleStackEntry entry;
entry.depth = self->depth; // Track depth for matching pop
entry.hasBold = true;
entry.bold = true;
if (cssStyle.hasFontStyle()) {
entry.hasItalic = true;
entry.italic = cssStyle.fontStyle == CssFontStyle::Italic;
}
if (cssStyle.hasTextDecoration()) {
entry.hasUnderline = true;
entry.underline = cssStyle.textDecoration == CssTextDecoration::Underline;
}
self->inlineStyleStack.push_back(entry);
self->updateEffectiveInlineStyle();
} else if (matches(name, ITALIC_TAGS, NUM_ITALIC_TAGS)) {
// Flush buffer before style change so preceding text gets current style
if (self->partWordBufferIndex > 0) {
self->flushPartWordBuffer();
self->nextWordContinues = true;
}
self->italicUntilDepth = std::min(self->italicUntilDepth, self->depth);
// Push inline style entry for italic tag
StyleStackEntry entry;
entry.depth = self->depth; // Track depth for matching pop
entry.hasItalic = true;
entry.italic = true;
if (cssStyle.hasFontWeight()) {
entry.hasBold = true;
entry.bold = cssStyle.fontWeight == CssFontWeight::Bold;
}
if (cssStyle.hasTextDecoration()) {
entry.hasUnderline = true;
entry.underline = cssStyle.textDecoration == CssTextDecoration::Underline;
}
self->inlineStyleStack.push_back(entry);
self->updateEffectiveInlineStyle();
} else if (strcmp(name, "span") == 0 || !isHeaderOrBlock(name)) {
// Handle span and other inline elements for CSS styling
if (cssStyle.hasFontWeight() || cssStyle.hasFontStyle() || cssStyle.hasTextDecoration()) {
// Flush buffer before style change so preceding text gets current style
if (self->partWordBufferIndex > 0) {
self->flushPartWordBuffer();
self->nextWordContinues = true;
}
StyleStackEntry entry;
entry.depth = self->depth; // Track depth for matching pop
if (cssStyle.hasFontWeight()) {
entry.hasBold = true;
entry.bold = cssStyle.fontWeight == CssFontWeight::Bold;
}
if (cssStyle.hasFontStyle()) {
entry.hasItalic = true;
entry.italic = cssStyle.fontStyle == CssFontStyle::Italic;
}
if (cssStyle.hasTextDecoration()) {
entry.hasUnderline = true;
entry.underline = cssStyle.textDecoration == CssTextDecoration::Underline;
}
self->inlineStyleStack.push_back(entry);
self->updateEffectiveInlineStyle();
}
}
// Unprocessed tag, just increasing depth and continue forward
self->depth += 1;
}
void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char* s, const int len) {
auto* self = static_cast<ChapterHtmlSlimParser*>(userData);
// Middle of skip
if (self->skipUntilDepth < self->depth) {
return;
}
for (int i = 0; i < len; i++) {
if (isWhitespace(s[i])) {
// Currently looking at whitespace, if there's anything in the partWordBuffer, flush it
if (self->partWordBufferIndex > 0) {
self->flushPartWordBuffer();
}
// Whitespace is a real word boundary — reset continuation state
self->nextWordContinues = false;
// Skip the whitespace char
continue;
}
// Detect U+00A0 (non-breaking space): UTF-8 encoding is 0xC2 0xA0
// Render a visible space without allowing a line break around it.
if (static_cast<uint8_t>(s[i]) == 0xC2 && i + 1 < len && static_cast<uint8_t>(s[i + 1]) == 0xA0) {
// Flush any pending text so style is applied correctly.
if (self->partWordBufferIndex > 0) {
self->flushPartWordBuffer();
}
// Add a standalone space that attaches to the previous word.
self->partWordBuffer[0] = ' ';
self->partWordBuffer[1] = '\0';
self->partWordBufferIndex = 1;
self->nextWordContinues = true; // Attach space to previous word (no break).
self->flushPartWordBuffer();
// Ensure the next real word attaches to this space (no break).
self->nextWordContinues = true;
i++; // Skip the second byte (0xA0)
continue;
}
// Skip Zero Width No-Break Space / BOM (U+FEFF) = 0xEF 0xBB 0xBF
const XML_Char FEFF_BYTE_1 = static_cast<XML_Char>(0xEF);
const XML_Char FEFF_BYTE_2 = static_cast<XML_Char>(0xBB);
const XML_Char FEFF_BYTE_3 = static_cast<XML_Char>(0xBF);
if (s[i] == FEFF_BYTE_1) {
// Check if the next two bytes complete the 3-byte sequence
if ((i + 2 < len) && (s[i + 1] == FEFF_BYTE_2) && (s[i + 2] == FEFF_BYTE_3)) {
// Sequence 0xEF 0xBB 0xBF found!
i += 2; // Skip the next two bytes
continue; // Move to the next iteration
}
}
// If we're about to run out of space, then cut the word off and start a new one
if (self->partWordBufferIndex >= MAX_WORD_SIZE) {
self->flushPartWordBuffer();
}
self->partWordBuffer[self->partWordBufferIndex++] = s[i];
}
// If we have > 750 words buffered up, perform the layout and consume out all but the last line
// There should be enough here to build out 1-2 full pages and doing this will free up a lot of
// memory.
// Spotted when reading Intermezzo, there are some really long text blocks in there.
if (self->currentTextBlock->size() > 750) {
LOG_DBG("EHP", "Text block too long, splitting into multiple pages");
self->currentTextBlock->layoutAndExtractLines(
self->renderer, self->fontId, self->viewportWidth,
[self](const std::shared_ptr<TextBlock>& textBlock) { self->addLineToPage(textBlock); }, false);
}
}
void XMLCALL ChapterHtmlSlimParser::defaultHandlerExpand(void* userData, const XML_Char* s, const int len) {
// Check if this looks like an entity reference (&...;)
if (len >= 3 && s[0] == '&' && s[len - 1] == ';') {
const char* utf8Value = lookupHtmlEntity(s, len);
if (utf8Value != nullptr) {
// Known entity: expand to its UTF-8 value
characterData(userData, utf8Value, strlen(utf8Value));
return;
}
// Unknown entity: preserve original &...; sequence
characterData(userData, s, len);
return;
}
// Not an entity we recognize - skip it
}
void XMLCALL ChapterHtmlSlimParser::endElement(void* userData, const XML_Char* name) {
auto* self = static_cast<ChapterHtmlSlimParser*>(userData);
// Check if any style state will change after we decrement depth
// If so, we MUST flush the partWordBuffer with the CURRENT style first
// Note: depth hasn't been decremented yet, so we check against (depth - 1)
const bool willPopStyleStack =
!self->inlineStyleStack.empty() && self->inlineStyleStack.back().depth == self->depth - 1;
const bool willClearBold = self->boldUntilDepth == self->depth - 1;
const bool willClearItalic = self->italicUntilDepth == self->depth - 1;
const bool willClearUnderline = self->underlineUntilDepth == self->depth - 1;
const bool styleWillChange = willPopStyleStack || willClearBold || willClearItalic || willClearUnderline;
const bool headerOrBlockTag = isHeaderOrBlock(name);
// Flush buffer with current style BEFORE any style changes
if (self->partWordBufferIndex > 0) {
// Flush if style will change OR if we're closing a block/structural element
const bool isInlineTag = !headerOrBlockTag && strcmp(name, "table") != 0 &&
!matches(name, IMAGE_TAGS, NUM_IMAGE_TAGS) && self->depth != 1;
const bool shouldFlush = styleWillChange || headerOrBlockTag || matches(name, BOLD_TAGS, NUM_BOLD_TAGS) ||
matches(name, ITALIC_TAGS, NUM_ITALIC_TAGS) ||
matches(name, UNDERLINE_TAGS, NUM_UNDERLINE_TAGS) || strcmp(name, "table") == 0 ||
matches(name, IMAGE_TAGS, NUM_IMAGE_TAGS) || self->depth == 1;
if (shouldFlush) {
self->flushPartWordBuffer();
// If closing an inline element, the next word fragment continues the same visual word
if (isInlineTag) {
self->nextWordContinues = true;
}
}
}
self->depth -= 1;
// Leaving skip
if (self->skipUntilDepth == self->depth) {
self->skipUntilDepth = INT_MAX;
}
// Leaving bold tag
if (self->boldUntilDepth == self->depth) {
self->boldUntilDepth = INT_MAX;
}
// Leaving italic tag
if (self->italicUntilDepth == self->depth) {
self->italicUntilDepth = INT_MAX;
}
// Leaving underline tag
if (self->underlineUntilDepth == self->depth) {
self->underlineUntilDepth = INT_MAX;
}
// Pop from inline style stack if we pushed an entry at this depth
// This handles all inline elements: b, i, u, span, etc.
if (!self->inlineStyleStack.empty() && self->inlineStyleStack.back().depth == self->depth) {
self->inlineStyleStack.pop_back();
self->updateEffectiveInlineStyle();
}
// Clear block style when leaving header or block elements
if (headerOrBlockTag) {
self->currentCssStyle.reset();
self->updateEffectiveInlineStyle();
}
}
bool ChapterHtmlSlimParser::parseAndBuildPages() {
auto paragraphAlignmentBlockStyle = BlockStyle();
paragraphAlignmentBlockStyle.textAlignDefined = true;
// Resolve None sentinel to Justify for initial block (no CSS context yet)
const auto align = (this->paragraphAlignment == static_cast<uint8_t>(CssTextAlign::None))
? CssTextAlign::Justify
: static_cast<CssTextAlign>(this->paragraphAlignment);
paragraphAlignmentBlockStyle.alignment = align;
startNewTextBlock(paragraphAlignmentBlockStyle);
const XML_Parser parser = XML_ParserCreate(nullptr);
int done;
if (!parser) {
LOG_ERR("EHP", "Couldn't allocate memory for parser");
return false;
}
// Handle HTML entities (like &nbsp;) that aren't in XML spec or DTD
// Using DefaultHandlerExpand preserves normal entity expansion from DOCTYPE
XML_SetDefaultHandlerExpand(parser, defaultHandlerExpand);
FsFile file;
if (!Storage.openFileForRead("EHP", filepath, file)) {
XML_ParserFree(parser);
return false;
}
// Get file size to decide whether to show indexing popup.
if (popupFn && file.size() >= MIN_SIZE_FOR_POPUP) {
popupFn();
}
XML_SetUserData(parser, this);
XML_SetElementHandler(parser, startElement, endElement);
XML_SetCharacterDataHandler(parser, characterData);
// Compute the time taken to parse and build pages
const uint32_t chapterStartTime = millis();
do {
void* const buf = XML_GetBuffer(parser, PARSE_BUFFER_SIZE);
if (!buf) {
LOG_ERR("EHP", "Couldn't allocate memory for buffer");
XML_StopParser(parser, XML_FALSE); // Stop any pending processing
XML_SetElementHandler(parser, nullptr, nullptr); // Clear callbacks
XML_SetCharacterDataHandler(parser, nullptr);
XML_ParserFree(parser);
file.close();
return false;
}
const size_t len = file.read(buf, PARSE_BUFFER_SIZE);
if (len == 0 && file.available() > 0) {
LOG_ERR("EHP", "File read error");
XML_StopParser(parser, XML_FALSE); // Stop any pending processing
XML_SetElementHandler(parser, nullptr, nullptr); // Clear callbacks
XML_SetCharacterDataHandler(parser, nullptr);
XML_ParserFree(parser);
file.close();
return false;
}
done = file.available() == 0;
if (XML_ParseBuffer(parser, static_cast<int>(len), done) == XML_STATUS_ERROR) {
LOG_ERR("EHP", "Parse error at line %lu:\n%s", XML_GetCurrentLineNumber(parser),
XML_ErrorString(XML_GetErrorCode(parser)));
XML_StopParser(parser, XML_FALSE); // Stop any pending processing
XML_SetElementHandler(parser, nullptr, nullptr); // Clear callbacks
XML_SetCharacterDataHandler(parser, nullptr);
XML_ParserFree(parser);
file.close();
return false;
}
} while (!done);
LOG_DBG("EHP", "Time to parse and build pages: %lu ms", millis() - chapterStartTime);
XML_StopParser(parser, XML_FALSE); // Stop any pending processing
XML_SetElementHandler(parser, nullptr, nullptr); // Clear callbacks
XML_SetCharacterDataHandler(parser, nullptr);
XML_ParserFree(parser);
file.close();
// Process last page if there is still text
if (currentTextBlock) {
makePages();
completePageFn(std::move(currentPage));
currentPage.reset();
currentTextBlock.reset();
}
return true;
}
void ChapterHtmlSlimParser::addLineToPage(std::shared_ptr<TextBlock> line) {
const int lineHeight = renderer.getLineHeight(fontId) * lineCompression;
if (currentPageNextY + lineHeight > viewportHeight) {
completePageFn(std::move(currentPage));
currentPage.reset(new Page());
currentPageNextY = 0;
}
// Apply horizontal left inset (margin + padding) as x position offset
const int16_t xOffset = line->getBlockStyle().leftInset();
currentPage->elements.push_back(std::make_shared<PageLine>(line, xOffset, currentPageNextY));
currentPageNextY += lineHeight;
}
void ChapterHtmlSlimParser::makePages() {
if (!currentTextBlock) {
LOG_ERR("EHP", "!! No text block to make pages for !!");
return;
}
if (!currentPage) {
currentPage.reset(new Page());
currentPageNextY = 0;
}
const int lineHeight = renderer.getLineHeight(fontId) * lineCompression;
// Apply top spacing before the paragraph (stored in pixels)
const BlockStyle& blockStyle = currentTextBlock->getBlockStyle();
if (blockStyle.marginTop > 0) {
currentPageNextY += blockStyle.marginTop;
}
if (blockStyle.paddingTop > 0) {
currentPageNextY += blockStyle.paddingTop;
}
// Calculate effective width accounting for horizontal margins/padding
const int horizontalInset = blockStyle.totalHorizontalInset();
const uint16_t effectiveWidth =
(horizontalInset < viewportWidth) ? static_cast<uint16_t>(viewportWidth - horizontalInset) : viewportWidth;
currentTextBlock->layoutAndExtractLines(
renderer, fontId, effectiveWidth,
[this](const std::shared_ptr<TextBlock>& textBlock) { addLineToPage(textBlock); });
// Apply bottom spacing after the paragraph (stored in pixels)
if (blockStyle.marginBottom > 0) {
currentPageNextY += blockStyle.marginBottom;
}
if (blockStyle.paddingBottom > 0) {
currentPageNextY += blockStyle.paddingBottom;
}
// Extra paragraph spacing if enabled (default behavior)
if (extraParagraphSpacing) {
currentPageNextY += lineHeight / 2;
}
}