#include "ChapterHtmlSlimParser.h"

// NOTE(review): the angle-bracket include targets and every template argument in this file were
// lost in extraction and have been reconstructed from usage (GfxRenderer / Storage / LOG_* / expat,
// TextBlock, PageLine). Confirm against version control before merging.
#include <GfxRenderer.h>
#include <HalStorage.h>
#include <Logging.h>
#include <expat.h>

#include <algorithm>
#include <climits>
#include <cstdint>
#include <cstring>
#include <memory>
#include <string>

#include "../Page.h"

const char* HEADER_TAGS[] = {"h1", "h2", "h3", "h4", "h5", "h6"};
constexpr int NUM_HEADER_TAGS = sizeof(HEADER_TAGS) / sizeof(HEADER_TAGS[0]);

// Minimum file size (in bytes) to show indexing popup - smaller chapters don't benefit from it
constexpr size_t MIN_SIZE_FOR_POPUP = 10 * 1024;  // 10KB

const char* BLOCK_TAGS[] = {"p", "li", "div", "br", "blockquote"};
constexpr int NUM_BLOCK_TAGS = sizeof(BLOCK_TAGS) / sizeof(BLOCK_TAGS[0]);

const char* BOLD_TAGS[] = {"b", "strong"};
constexpr int NUM_BOLD_TAGS = sizeof(BOLD_TAGS) / sizeof(BOLD_TAGS[0]);

const char* ITALIC_TAGS[] = {"i", "em"};
constexpr int NUM_ITALIC_TAGS = sizeof(ITALIC_TAGS) / sizeof(ITALIC_TAGS[0]);

const char* UNDERLINE_TAGS[] = {"u", "ins"};
constexpr int NUM_UNDERLINE_TAGS = sizeof(UNDERLINE_TAGS) / sizeof(UNDERLINE_TAGS[0]);

const char* IMAGE_TAGS[] = {"img"};
constexpr int NUM_IMAGE_TAGS = sizeof(IMAGE_TAGS) / sizeof(IMAGE_TAGS[0]);

const char* SKIP_TAGS[] = {"head"};
constexpr int NUM_SKIP_TAGS = sizeof(SKIP_TAGS) / sizeof(SKIP_TAGS[0]);

namespace {

// Number of bytes requested from the file per XML_GetBuffer / XML_ParseBuffer round trip.
constexpr size_t PARSE_CHUNK_SIZE = 1024;

// Width available for a block's lines: the viewport minus the block's horizontal margin/padding.
// Falls back to the full viewport when the inset would consume all of it.
uint16_t effectiveLayoutWidth(const BlockStyle& blockStyle, const uint16_t viewportWidth) {
  const int horizontalInset = blockStyle.totalHorizontalInset();
  return (horizontalInset < viewportWidth) ? static_cast<uint16_t>(viewportWidth - horizontalInset) : viewportWidth;
}

}  // namespace

// True for the ASCII whitespace bytes that separate words in chapter text.
bool isWhitespace(const char c) { return c == ' ' || c == '\r' || c == '\n' || c == '\t'; }

// Check whether tag_name is exactly equal (case-sensitive) to one of the first
// possible_tag_count entries of possible_tags.
bool matches(const char* tag_name, const char* possible_tags[], const int possible_tag_count) {
  for (int i = 0; i < possible_tag_count; i++) {
    if (strcmp(tag_name, possible_tags[i]) == 0) {
      return true;
    }
  }
  return false;
}

// True when name is one of HEADER_TAGS or BLOCK_TAGS.
bool isHeaderOrBlock(const char* name) {
  return matches(name, HEADER_TAGS, NUM_HEADER_TAGS) || matches(name, BLOCK_TAGS, NUM_BLOCK_TAGS);
}

// Update effective bold/italic/underline based on block style and inline style stack.
// Later (more deeply nested) stack entries override earlier ones, and any entry may
// explicitly switch a style off.
void ChapterHtmlSlimParser::updateEffectiveInlineStyle() {
  // Start with block-level styles
  effectiveBold = currentCssStyle.hasFontWeight() && currentCssStyle.fontWeight == CssFontWeight::Bold;
  effectiveItalic = currentCssStyle.hasFontStyle() && currentCssStyle.fontStyle == CssFontStyle::Italic;
  effectiveUnderline =
      currentCssStyle.hasTextDecoration() && currentCssStyle.textDecoration == CssTextDecoration::Underline;

  // Apply inline style stack in order
  for (const auto& entry : inlineStyleStack) {
    if (entry.hasBold) {
      effectiveBold = entry.bold;
    }
    if (entry.hasItalic) {
      effectiveItalic = entry.italic;
    }
    if (entry.hasUnderline) {
      effectiveUnderline = entry.underline;
    }
  }
}

// Flush the contents of partWordBuffer to currentTextBlock as one word, styled with the
// bold/italic/underline state in effect right now, then reset the buffer and the
// continuation flag.
void ChapterHtmlSlimParser::flushPartWordBuffer() {
  // Determine font style from depth-based tracking and CSS effective style
  const bool isBold = boldUntilDepth < depth || effectiveBold;
  const bool isItalic = italicUntilDepth < depth || effectiveItalic;
  const bool isUnderline = underlineUntilDepth < depth || effectiveUnderline;

  // Combine style flags using bitwise OR
  EpdFontFamily::Style fontStyle = EpdFontFamily::REGULAR;
  if (isBold) {
    fontStyle = static_cast<EpdFontFamily::Style>(fontStyle | EpdFontFamily::BOLD);
  }
  if (isItalic) {
    fontStyle = static_cast<EpdFontFamily::Style>(fontStyle | EpdFontFamily::ITALIC);
  }
  if (isUnderline) {
    fontStyle = static_cast<EpdFontFamily::Style>(fontStyle | EpdFontFamily::UNDERLINE);
  }

  // Terminate and hand over the buffered bytes
  partWordBuffer[partWordBufferIndex] = '\0';
  currentTextBlock->addWord(partWordBuffer, fontStyle, false, nextWordContinues);
  partWordBufferIndex = 0;
  nextWordContinues = false;
}

// Start a new text block if needed. An existing empty block is reused (its style is merged
// with blockStyle); an existing non-empty block is laid out into pages first.
void ChapterHtmlSlimParser::startNewTextBlock(const BlockStyle& blockStyle) {
  nextWordContinues = false;  // New block = new paragraph, no continuation

  if (currentTextBlock) {
    // already have a text block running and it is empty - just reuse it
    if (currentTextBlock->isEmpty()) {
      // Merge with existing block style to accumulate CSS styling from parent block elements.
      // This handles cases like <div style="margin-bottom: 2em"><h1>text</h1></div> where the
      // div's margin should be preserved, even though it has no direct text content.
      currentTextBlock->setBlockStyle(currentTextBlock->getBlockStyle().getCombinedBlockStyle(blockStyle));
      return;
    }

    makePages();
  }

  currentTextBlock.reset(new ParsedText(extraParagraphSpacing, hyphenationEnabled, blockStyle));
}

// expat start-tag callback. userData is the ChapterHtmlSlimParser instance registered with
// XML_SetUserData; atts is expat's null-terminated name/value pair array.
// Every path through this function increments depth exactly once.
void XMLCALL ChapterHtmlSlimParser::startElement(void* userData, const XML_Char* name, const XML_Char** atts) {
  auto* self = static_cast<ChapterHtmlSlimParser*>(userData);

  // Middle of skip
  if (self->skipUntilDepth < self->depth) {
    self->depth += 1;
    return;
  }

  // Extract class and style attributes for CSS processing
  std::string classAttr;
  std::string styleAttr;
  if (atts != nullptr) {
    for (int i = 0; atts[i]; i += 2) {
      if (strcmp(atts[i], "class") == 0) {
        classAttr = atts[i + 1];
      } else if (strcmp(atts[i], "style") == 0) {
        styleAttr = atts[i + 1];
      }
    }
  }

  // Emits a centered, italic placeholder paragraph in place of an element we cannot render,
  // then skips everything inside that element.
  const auto emitPlaceholder = [self, userData](const char* text, const int textLen) {
    auto centeredBlockStyle = BlockStyle();
    centeredBlockStyle.textAlignDefined = true;
    centeredBlockStyle.alignment = CssTextAlign::Center;

    self->startNewTextBlock(centeredBlockStyle);
    self->italicUntilDepth = std::min(self->italicUntilDepth, self->depth);
    // Advance depth before processing character data (like you would for an element with text)
    self->depth += 1;
    self->characterData(userData, text, textLen);
    // Skip the element's contents (skip until parent as we pre-advanced depth above)
    self->skipUntilDepth = self->depth - 1;
  };

  // Flush buffered text before a style change so preceding text keeps the current style.
  // The fragment that follows belongs to the same visual word, hence nextWordContinues.
  const auto flushBeforeStyleChange = [self]() {
    if (self->partWordBufferIndex > 0) {
      self->flushPartWordBuffer();
      self->nextWordContinues = true;
    }
  };

  // Special handling for tables - show placeholder text instead of dropping silently
  if (strcmp(name, "table") == 0) {
    emitPlaceholder("[Table omitted]", static_cast<int>(strlen("[Table omitted]")));
    return;
  }

  if (matches(name, IMAGE_TAGS, NUM_IMAGE_TAGS)) {
    // TODO: Start processing image tags
    std::string alt = "[Image]";
    if (atts != nullptr) {
      for (int i = 0; atts[i]; i += 2) {
        if (strcmp(atts[i], "alt") == 0) {
          if (strlen(atts[i + 1]) > 0) {
            alt = "[Image: " + std::string(atts[i + 1]) + "]";
          }
          break;
        }
      }
    }

    LOG_DBG("EHP", "Image alt: %s", alt.c_str());

    emitPlaceholder(alt.c_str(), static_cast<int>(alt.length()));
    return;
  }

  if (matches(name, SKIP_TAGS, NUM_SKIP_TAGS)) {
    // start skip
    self->skipUntilDepth = self->depth;
    self->depth += 1;
    return;
  }

  // Skip blocks with role="doc-pagebreak" and epub:type="pagebreak"
  if (atts != nullptr) {
    for (int i = 0; atts[i]; i += 2) {
      const bool isRolePagebreak = (strcmp(atts[i], "role") == 0) && (strcmp(atts[i + 1], "doc-pagebreak") == 0);
      const bool isEpubPagebreak = (strcmp(atts[i], "epub:type") == 0) && (strcmp(atts[i + 1], "pagebreak") == 0);
      if (isRolePagebreak || isEpubPagebreak) {
        self->skipUntilDepth = self->depth;
        self->depth += 1;
        return;
      }
    }
  }

  // Compute CSS style for this element
  CssStyle cssStyle;
  if (self->cssParser) {
    // Get combined tag + class styles
    cssStyle = self->cssParser->resolveStyle(name, classAttr);
    // Merge inline style (highest priority)
    if (!styleAttr.empty()) {
      CssStyle inlineStyle = CssParser::parseInlineStyle(styleAttr);
      cssStyle.applyOver(inlineStyle);
    }
  }

  const float emSize = static_cast<float>(self->renderer.getLineHeight(self->fontId)) * self->lineCompression;
  const auto userAlignmentBlockStyle = BlockStyle::fromCssStyle(
      cssStyle, emSize, static_cast<CssTextAlign>(self->paragraphAlignment), self->viewportWidth);

  if (matches(name, HEADER_TAGS, NUM_HEADER_TAGS)) {
    // Headers are centered and bold unless embedded CSS specifies an alignment
    self->currentCssStyle = cssStyle;
    auto headerBlockStyle = BlockStyle::fromCssStyle(cssStyle, emSize, CssTextAlign::Center, self->viewportWidth);
    headerBlockStyle.textAlignDefined = true;
    if (self->embeddedStyle && cssStyle.hasTextAlign()) {
      headerBlockStyle.alignment = cssStyle.textAlign;
    }
    self->startNewTextBlock(headerBlockStyle);
    self->boldUntilDepth = std::min(self->boldUntilDepth, self->depth);
    self->updateEffectiveInlineStyle();
  } else if (matches(name, BLOCK_TAGS, NUM_BLOCK_TAGS)) {
    if (strcmp(name, "br") == 0) {
      if (self->partWordBufferIndex > 0) {
        // flush word preceding <br/> to currentTextBlock before calling startNewTextBlock
        self->flushPartWordBuffer();
      }
      // Copy the style: startNewTextBlock replaces currentTextBlock, so a reference into the
      // old block must not be what it reads from.
      const BlockStyle continuedStyle = self->currentTextBlock->getBlockStyle();
      self->startNewTextBlock(continuedStyle);
    } else {
      self->currentCssStyle = cssStyle;
      self->startNewTextBlock(userAlignmentBlockStyle);
      self->updateEffectiveInlineStyle();

      if (strcmp(name, "li") == 0) {
        // UTF-8 bullet (U+2022) in front of list items
        self->currentTextBlock->addWord("\xe2\x80\xa2", EpdFontFamily::REGULAR);
      }
    }
  } else if (matches(name, UNDERLINE_TAGS, NUM_UNDERLINE_TAGS)) {
    flushBeforeStyleChange();
    self->underlineUntilDepth = std::min(self->underlineUntilDepth, self->depth);

    // Push inline style entry for underline tag
    StyleStackEntry entry;
    entry.depth = self->depth;  // Track depth for matching pop
    entry.hasUnderline = true;
    entry.underline = true;
    if (cssStyle.hasFontWeight()) {
      entry.hasBold = true;
      entry.bold = cssStyle.fontWeight == CssFontWeight::Bold;
    }
    if (cssStyle.hasFontStyle()) {
      entry.hasItalic = true;
      entry.italic = cssStyle.fontStyle == CssFontStyle::Italic;
    }
    self->inlineStyleStack.push_back(entry);
    self->updateEffectiveInlineStyle();
  } else if (matches(name, BOLD_TAGS, NUM_BOLD_TAGS)) {
    flushBeforeStyleChange();
    self->boldUntilDepth = std::min(self->boldUntilDepth, self->depth);

    // Push inline style entry for bold tag
    StyleStackEntry entry;
    entry.depth = self->depth;  // Track depth for matching pop
    entry.hasBold = true;
    entry.bold = true;
    if (cssStyle.hasFontStyle()) {
      entry.hasItalic = true;
      entry.italic = cssStyle.fontStyle == CssFontStyle::Italic;
    }
    if (cssStyle.hasTextDecoration()) {
      entry.hasUnderline = true;
      entry.underline = cssStyle.textDecoration == CssTextDecoration::Underline;
    }
    self->inlineStyleStack.push_back(entry);
    self->updateEffectiveInlineStyle();
  } else if (matches(name, ITALIC_TAGS, NUM_ITALIC_TAGS)) {
    flushBeforeStyleChange();
    self->italicUntilDepth = std::min(self->italicUntilDepth, self->depth);

    // Push inline style entry for italic tag
    StyleStackEntry entry;
    entry.depth = self->depth;  // Track depth for matching pop
    entry.hasItalic = true;
    entry.italic = true;
    if (cssStyle.hasFontWeight()) {
      entry.hasBold = true;
      entry.bold = cssStyle.fontWeight == CssFontWeight::Bold;
    }
    if (cssStyle.hasTextDecoration()) {
      entry.hasUnderline = true;
      entry.underline = cssStyle.textDecoration == CssTextDecoration::Underline;
    }
    self->inlineStyleStack.push_back(entry);
    self->updateEffectiveInlineStyle();
  } else {
    // Span and every other inline element: header and block tags were handled above, so
    // anything reaching this branch only contributes CSS-driven inline styling.
    if (cssStyle.hasFontWeight() || cssStyle.hasFontStyle() || cssStyle.hasTextDecoration()) {
      flushBeforeStyleChange();

      StyleStackEntry entry;
      entry.depth = self->depth;  // Track depth for matching pop
      if (cssStyle.hasFontWeight()) {
        entry.hasBold = true;
        entry.bold = cssStyle.fontWeight == CssFontWeight::Bold;
      }
      if (cssStyle.hasFontStyle()) {
        entry.hasItalic = true;
        entry.italic = cssStyle.fontStyle == CssFontStyle::Italic;
      }
      if (cssStyle.hasTextDecoration()) {
        entry.hasUnderline = true;
        entry.underline = cssStyle.textDecoration == CssTextDecoration::Underline;
      }
      self->inlineStyleStack.push_back(entry);
      self->updateEffectiveInlineStyle();
    }
  }

  // Unprocessed tag, just increasing depth and continue forward
  self->depth += 1;
}

// expat character-data callback. Splits the len bytes at s into whitespace-delimited words,
// buffering a partial word across calls, and flushes very long paragraphs to pages early.
// Relies on currentTextBlock having been created by parseAndBuildPages() before parsing starts.
void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char* s, const int len) {
  auto* self = static_cast<ChapterHtmlSlimParser*>(userData);

  // Middle of skip
  if (self->skipUntilDepth < self->depth) {
    return;
  }

  // Zero Width No-Break Space / BOM (U+FEFF) = 0xEF 0xBB 0xBF
  const XML_Char FEFF_BYTE_1 = static_cast<XML_Char>(0xEF);
  const XML_Char FEFF_BYTE_2 = static_cast<XML_Char>(0xBB);
  const XML_Char FEFF_BYTE_3 = static_cast<XML_Char>(0xBF);

  for (int i = 0; i < len; i++) {
    if (isWhitespace(s[i])) {
      // Currently looking at whitespace, if there's anything in the partWordBuffer, flush it
      if (self->partWordBufferIndex > 0) {
        self->flushPartWordBuffer();
      }
      // Whitespace is a real word boundary - reset continuation state
      self->nextWordContinues = false;
      // Skip the whitespace char
      continue;
    }

    // Drop U+FEFF when all three of its bytes are present in this chunk
    if (s[i] == FEFF_BYTE_1 && (i + 2 < len) && (s[i + 1] == FEFF_BYTE_2) && (s[i + 2] == FEFF_BYTE_3)) {
      i += 2;  // Skip the next two bytes; the loop increment skips the third
      continue;
    }

    // If we're about to run out of space, then cut the word off and start a new one
    if (self->partWordBufferIndex >= MAX_WORD_SIZE) {
      self->flushPartWordBuffer();
    }

    self->partWordBuffer[self->partWordBufferIndex++] = s[i];
  }

  // If we have > 750 words buffered up, perform the layout and consume out all but the last line
  // There should be enough here to build out 1-2 full pages and doing this will free up a lot of
  // memory.
  // Spotted when reading Intermezzo, there are some really long text blocks in there.
  if (self->currentTextBlock->size() > 750) {
    LOG_DBG("EHP", "Text block too long, splitting into multiple pages");
    // Use the same inset-adjusted width as makePages() so every line of the paragraph is
    // wrapped to one width, regardless of which path emitted it.
    const uint16_t layoutWidth = effectiveLayoutWidth(self->currentTextBlock->getBlockStyle(), self->viewportWidth);
    self->currentTextBlock->layoutAndExtractLines(
        self->renderer, self->fontId, layoutWidth,
        [self](const std::shared_ptr<TextBlock>& textBlock) { self->addLineToPage(textBlock); }, false);
  }
}

// expat end-tag callback. Flushes any buffered word with the style that was active inside the
// element, then unwinds depth and whatever skip/bold/italic/underline/inline-stack state was
// opened at this element's depth.
void XMLCALL ChapterHtmlSlimParser::endElement(void* userData, const XML_Char* name) {
  auto* self = static_cast<ChapterHtmlSlimParser*>(userData);

  // Check if any style state will change after we decrement depth
  // If so, we MUST flush the partWordBuffer with the CURRENT style first
  // Note: depth hasn't been decremented yet, so we check against (depth - 1)
  const int closingDepth = self->depth - 1;
  const bool willPopStyleStack =
      !self->inlineStyleStack.empty() && self->inlineStyleStack.back().depth == closingDepth;
  const bool willClearBold = self->boldUntilDepth == closingDepth;
  const bool willClearItalic = self->italicUntilDepth == closingDepth;
  const bool willClearUnderline = self->underlineUntilDepth == closingDepth;

  const bool styleWillChange = willPopStyleStack || willClearBold || willClearItalic || willClearUnderline;
  const bool headerOrBlockTag = isHeaderOrBlock(name);

  // Flush buffer with current style BEFORE any style changes
  if (self->partWordBufferIndex > 0) {
    const bool isTableTag = strcmp(name, "table") == 0;
    const bool isImageTag = matches(name, IMAGE_TAGS, NUM_IMAGE_TAGS);
    const bool isRootElement = self->depth == 1;

    // Anything that is not a block, table, image or the root counts as inline here
    const bool isInlineTag = !headerOrBlockTag && !isTableTag && !isImageTag && !isRootElement;

    // Flush if style will change OR if we're closing a block/structural element
    const bool shouldFlush = styleWillChange || headerOrBlockTag || matches(name, BOLD_TAGS, NUM_BOLD_TAGS) ||
                             matches(name, ITALIC_TAGS, NUM_ITALIC_TAGS) ||
                             matches(name, UNDERLINE_TAGS, NUM_UNDERLINE_TAGS) || isTableTag || isImageTag ||
                             isRootElement;

    if (shouldFlush) {
      self->flushPartWordBuffer();
      // If closing an inline element, the next word fragment continues the same visual word
      if (isInlineTag) {
        self->nextWordContinues = true;
      }
    }
  }

  self->depth -= 1;

  // Leaving skip
  if (self->skipUntilDepth == self->depth) {
    self->skipUntilDepth = INT_MAX;
  }

  // Leaving bold tag
  if (self->boldUntilDepth == self->depth) {
    self->boldUntilDepth = INT_MAX;
  }

  // Leaving italic tag
  if (self->italicUntilDepth == self->depth) {
    self->italicUntilDepth = INT_MAX;
  }

  // Leaving underline tag
  if (self->underlineUntilDepth == self->depth) {
    self->underlineUntilDepth = INT_MAX;
  }

  // Pop from inline style stack if we pushed an entry at this depth
  // This handles all inline elements: b, i, u, span, etc.
  if (!self->inlineStyleStack.empty() && self->inlineStyleStack.back().depth == self->depth) {
    self->inlineStyleStack.pop_back();
    self->updateEffectiveInlineStyle();
  }

  // Clear block style when leaving header or block elements
  if (headerOrBlockTag) {
    self->currentCssStyle.reset();
    self->updateEffectiveInlineStyle();
  }
}

// Parse the chapter file at filepath with expat, feeding it in PARSE_CHUNK_SIZE pieces, and emit
// finished pages through completePageFn. Returns false if the parser or its buffer cannot be
// allocated, the file cannot be opened or read, or expat reports a parse error.
bool ChapterHtmlSlimParser::parseAndBuildPages() {
  auto paragraphAlignmentBlockStyle = BlockStyle();
  paragraphAlignmentBlockStyle.textAlignDefined = true;
  // Resolve None sentinel to Justify for initial block (no CSS context yet)
  const auto configuredAlign = static_cast<CssTextAlign>(this->paragraphAlignment);
  paragraphAlignmentBlockStyle.alignment =
      (configuredAlign == CssTextAlign::None) ? CssTextAlign::Justify : configuredAlign;
  startNewTextBlock(paragraphAlignmentBlockStyle);

  const XML_Parser parser = XML_ParserCreate(nullptr);
  if (!parser) {
    LOG_ERR("EHP", "Couldn't allocate memory for parser");
    return false;
  }

  FsFile file;
  if (!Storage.openFileForRead("EHP", filepath, file)) {
    XML_ParserFree(parser);
    return false;
  }

  // Get file size to decide whether to show indexing popup.
  if (popupFn && file.size() >= MIN_SIZE_FOR_POPUP) {
    popupFn();
  }

  XML_SetUserData(parser, this);
  XML_SetElementHandler(parser, startElement, endElement);
  XML_SetCharacterDataHandler(parser, characterData);

  // Single teardown path shared by success and every failure exit below
  const auto releaseParserAndFile = [parser, &file]() {
    XML_StopParser(parser, XML_FALSE);                // Stop any pending processing
    XML_SetElementHandler(parser, nullptr, nullptr);  // Clear callbacks
    XML_SetCharacterDataHandler(parser, nullptr);
    XML_ParserFree(parser);
    file.close();
  };

  int done = 0;
  do {
    void* const buf = XML_GetBuffer(parser, static_cast<int>(PARSE_CHUNK_SIZE));
    if (!buf) {
      LOG_ERR("EHP", "Couldn't allocate memory for buffer");
      releaseParserAndFile();
      return false;
    }

    const size_t len = file.read(buf, PARSE_CHUNK_SIZE);

    // A zero-length read with data still pending is an error. So is a length larger than what
    // was requested: that is what a negative error return looks like once stored in size_t.
    if ((len == 0 && file.available() > 0) || len > PARSE_CHUNK_SIZE) {
      LOG_ERR("EHP", "File read error");
      releaseParserAndFile();
      return false;
    }

    done = file.available() == 0;

    if (XML_ParseBuffer(parser, static_cast<int>(len), done) == XML_STATUS_ERROR) {
      LOG_ERR("EHP", "Parse error at line %lu:\n%s", XML_GetCurrentLineNumber(parser),
              XML_ErrorString(XML_GetErrorCode(parser)));
      releaseParserAndFile();
      return false;
    }
  } while (!done);

  releaseParserAndFile();

  // Process last page if there is still text
  if (currentTextBlock) {
    makePages();
    completePageFn(std::move(currentPage));
    currentPage.reset();
    currentTextBlock.reset();
  }

  return true;
}

// Append one laid-out line to the current page, starting a new page first when the line would
// not fit vertically. Creates the first page on demand so it is safe to call before makePages().
void ChapterHtmlSlimParser::addLineToPage(std::shared_ptr<TextBlock> line) {
  const int lineHeight = renderer.getLineHeight(fontId) * lineCompression;

  if (!currentPage) {
    // Reached when a very long first paragraph is flushed from characterData() before any page
    // exists; never hand a null page to completePageFn or dereference it below.
    currentPage.reset(new Page());
    currentPageNextY = 0;
  } else if (currentPageNextY + lineHeight > viewportHeight) {
    completePageFn(std::move(currentPage));
    currentPage.reset(new Page());
    currentPageNextY = 0;
  }

  // Apply horizontal left inset (margin + padding) as x position offset
  const int16_t xOffset = line->getBlockStyle().leftInset();
  currentPage->elements.push_back(std::make_shared<PageLine>(line, xOffset, currentPageNextY));
  currentPageNextY += lineHeight;
}

// Lay out currentTextBlock into lines and place them on pages, applying the block's vertical
// margins/padding before and after, plus the optional extra paragraph spacing.
void ChapterHtmlSlimParser::makePages() {
  if (!currentTextBlock) {
    LOG_ERR("EHP", "!! No text block to make pages for !!");
    return;
  }

  if (!currentPage) {
    currentPage.reset(new Page());
    currentPageNextY = 0;
  }

  const int lineHeight = renderer.getLineHeight(fontId) * lineCompression;

  // Apply top spacing before the paragraph (stored in pixels)
  const BlockStyle& blockStyle = currentTextBlock->getBlockStyle();
  if (blockStyle.marginTop > 0) {
    currentPageNextY += blockStyle.marginTop;
  }
  if (blockStyle.paddingTop > 0) {
    currentPageNextY += blockStyle.paddingTop;
  }

  // Calculate effective width accounting for horizontal margins/padding
  const uint16_t effectiveWidth = effectiveLayoutWidth(blockStyle, viewportWidth);

  currentTextBlock->layoutAndExtractLines(
      renderer, fontId, effectiveWidth,
      [this](const std::shared_ptr<TextBlock>& textBlock) { addLineToPage(textBlock); });

  // Apply bottom spacing after the paragraph (stored in pixels)
  if (blockStyle.marginBottom > 0) {
    currentPageNextY += blockStyle.marginBottom;
  }
  if (blockStyle.paddingBottom > 0) {
    currentPageNextY += blockStyle.paddingBottom;
  }

  // Extra paragraph spacing if enabled (default behavior)
  if (extraParagraphSpacing) {
    currentPageNextY += lineHeight / 2;
  }
}