## Summary

* During chapter parsing, every `<img>` tag triggered ZIP decompression and an SD card write regardless of whether the image format was supported. The mandatory `delay(50)` after each SD write compounded the cost. A chapter with 6 GIF images (a common decorative element in older EPUBs) wasted ~750 ms before any text rendering began.
* **What changes are included?** Added an `ImageDecoderFactory::isFormatSupported()` check before any file I/O in the img-handler. Only JPEG and PNG proceed to extraction; all other formats (GIF, SVG, WebP, etc.) fall through immediately to alt-text rendering with no SD card access.

## Additional Context

Measured impact on a representative chapter with 6 GIF decorations:

| | Before | After |
| -- | -- | -- |
| Total parse time | ~882 ms | ~207 ms |
| Image handling | ~750 ms | ~76 ms |

---

### AI Usage

While CrossPoint doesn't have restrictions on AI tools in contributing, please be transparent about their usage as it helps set the right context for reviewers.

Did you use AI tools to help write this code? _**NO**_
761 lines
29 KiB
C++
761 lines
29 KiB
C++
#include "ChapterHtmlSlimParser.h"
|
|
|
|
#include <FsHelpers.h>
|
|
#include <GfxRenderer.h>
|
|
#include <HalStorage.h>
|
|
#include <Logging.h>
|
|
#include <expat.h>
|
|
|
|
#include "../../Epub.h"
|
|
#include "../Page.h"
|
|
#include "../converters/ImageDecoderFactory.h"
|
|
#include "../converters/ImageToFramebufferDecoder.h"
|
|
#include "../htmlEntities.h"
|
|
|
|
// Tag-name tables used to classify elements during parsing. Each table is paired with its
// element count so both can be passed to matches().

// Heading elements
const char* HEADER_TAGS[] = {"h1", "h2", "h3", "h4", "h5", "h6"};
constexpr int NUM_HEADER_TAGS = sizeof(HEADER_TAGS) / sizeof(*HEADER_TAGS);

// Chapters smaller than this many bytes do not trigger the indexing popup
constexpr size_t MIN_SIZE_FOR_POPUP = 10 * 1024;  // 10KB
// Number of bytes requested from the chapter file for each parser buffer fill
constexpr size_t PARSE_BUFFER_SIZE = 1024;

// Elements that start a new text block
const char* BLOCK_TAGS[] = {"p", "li", "div", "br", "blockquote"};
constexpr int NUM_BLOCK_TAGS = sizeof(BLOCK_TAGS) / sizeof(*BLOCK_TAGS);

// Elements that force bold text
const char* BOLD_TAGS[] = {"b", "strong"};
constexpr int NUM_BOLD_TAGS = sizeof(BOLD_TAGS) / sizeof(*BOLD_TAGS);

// Elements that force italic text
const char* ITALIC_TAGS[] = {"i", "em"};
constexpr int NUM_ITALIC_TAGS = sizeof(ITALIC_TAGS) / sizeof(*ITALIC_TAGS);

// Elements that force underlined text
const char* UNDERLINE_TAGS[] = {"u", "ins"};
constexpr int NUM_UNDERLINE_TAGS = sizeof(UNDERLINE_TAGS) / sizeof(*UNDERLINE_TAGS);

// Elements handled as images
const char* IMAGE_TAGS[] = {"img"};
constexpr int NUM_IMAGE_TAGS = sizeof(IMAGE_TAGS) / sizeof(*IMAGE_TAGS);

// Elements whose whole subtree is ignored
const char* SKIP_TAGS[] = {"head"};
constexpr int NUM_SKIP_TAGS = sizeof(SKIP_TAGS) / sizeof(*SKIP_TAGS);
|
|
|
|
// Returns true for the four characters treated as whitespace here: space, CR, LF and tab.
bool isWhitespace(const char c) {
  switch (c) {
    case ' ':
    case '\r':
    case '\n':
    case '\t':
      return true;
    default:
      return false;
  }
}
|
|
|
|
// Returns true when tag_name is exactly equal (strcmp) to one of the first possible_tag_count
// entries of possible_tags; false otherwise, including when the count is zero or negative.
bool matches(const char* tag_name, const char* possible_tags[], const int possible_tag_count) {
  const char* const* candidate = possible_tags;
  int remaining = possible_tag_count;
  while (remaining-- > 0) {
    if (strcmp(tag_name, *candidate++) == 0) {
      return true;
    }
  }
  return false;
}
|
|
|
|
bool isHeaderOrBlock(const char* name) {
|
|
return matches(name, HEADER_TAGS, NUM_HEADER_TAGS) || matches(name, BLOCK_TAGS, NUM_BLOCK_TAGS);
|
|
}
|
|
|
|
// Update effective bold/italic/underline based on block style and inline style stack
|
|
void ChapterHtmlSlimParser::updateEffectiveInlineStyle() {
|
|
// Start with block-level styles
|
|
effectiveBold = currentCssStyle.hasFontWeight() && currentCssStyle.fontWeight == CssFontWeight::Bold;
|
|
effectiveItalic = currentCssStyle.hasFontStyle() && currentCssStyle.fontStyle == CssFontStyle::Italic;
|
|
effectiveUnderline =
|
|
currentCssStyle.hasTextDecoration() && currentCssStyle.textDecoration == CssTextDecoration::Underline;
|
|
|
|
// Apply inline style stack in order
|
|
for (const auto& entry : inlineStyleStack) {
|
|
if (entry.hasBold) {
|
|
effectiveBold = entry.bold;
|
|
}
|
|
if (entry.hasItalic) {
|
|
effectiveItalic = entry.italic;
|
|
}
|
|
if (entry.hasUnderline) {
|
|
effectiveUnderline = entry.underline;
|
|
}
|
|
}
|
|
}
|
|
|
|
// flush the contents of partWordBuffer to currentTextBlock
|
|
void ChapterHtmlSlimParser::flushPartWordBuffer() {
|
|
// Determine font style from depth-based tracking and CSS effective style
|
|
const bool isBold = boldUntilDepth < depth || effectiveBold;
|
|
const bool isItalic = italicUntilDepth < depth || effectiveItalic;
|
|
const bool isUnderline = underlineUntilDepth < depth || effectiveUnderline;
|
|
|
|
// Combine style flags using bitwise OR
|
|
EpdFontFamily::Style fontStyle = EpdFontFamily::REGULAR;
|
|
if (isBold) {
|
|
fontStyle = static_cast<EpdFontFamily::Style>(fontStyle | EpdFontFamily::BOLD);
|
|
}
|
|
if (isItalic) {
|
|
fontStyle = static_cast<EpdFontFamily::Style>(fontStyle | EpdFontFamily::ITALIC);
|
|
}
|
|
if (isUnderline) {
|
|
fontStyle = static_cast<EpdFontFamily::Style>(fontStyle | EpdFontFamily::UNDERLINE);
|
|
}
|
|
|
|
// flush the buffer
|
|
partWordBuffer[partWordBufferIndex] = '\0';
|
|
currentTextBlock->addWord(partWordBuffer, fontStyle, false, nextWordContinues);
|
|
partWordBufferIndex = 0;
|
|
nextWordContinues = false;
|
|
}
|
|
|
|
// start a new text block if needed
|
|
void ChapterHtmlSlimParser::startNewTextBlock(const BlockStyle& blockStyle) {
|
|
nextWordContinues = false; // New block = new paragraph, no continuation
|
|
if (currentTextBlock) {
|
|
// already have a text block running and it is empty - just reuse it
|
|
if (currentTextBlock->isEmpty()) {
|
|
// Merge with existing block style to accumulate CSS styling from parent block elements.
|
|
// This handles cases like <div style="margin-bottom:2em"><h1>text</h1></div> where the
|
|
// div's margin should be preserved, even though it has no direct text content.
|
|
currentTextBlock->setBlockStyle(currentTextBlock->getBlockStyle().getCombinedBlockStyle(blockStyle));
|
|
return;
|
|
}
|
|
|
|
makePages();
|
|
}
|
|
currentTextBlock.reset(new ParsedText(extraParagraphSpacing, hyphenationEnabled, blockStyle));
|
|
}
|
|
|
|
// expat start-tag callback: classifies the opening element and updates parser state.
//
// userData is the ChapterHtmlSlimParser instance, name the element name, and atts the
// NULL-terminated list of alternating attribute names and values.
//
// Handling, in order of precedence:
//   - inside a skipped subtree: only the depth counter advances
//   - <table>: replaced by an italic, centered "[Table omitted]" placeholder, contents skipped
//   - <img>: supported formats are extracted to the image cache and placed on the page;
//            otherwise the alt text (if any) is rendered as "[Image: ...]"
//   - <head> and page-break markers: subtree skipped
//   - headings / block tags: start a new text block
//   - bold/italic/underline tags and any other element carrying font CSS: push an inline
//     style entry
//
// Every path increments depth exactly once, because endElement() decrements it unconditionally.
void XMLCALL ChapterHtmlSlimParser::startElement(void* userData, const XML_Char* name, const XML_Char** atts) {
  auto* self = static_cast<ChapterHtmlSlimParser*>(userData);

  // Middle of skip
  if (self->skipUntilDepth < self->depth) {
    self->depth += 1;
    return;
  }

  // Extract class and style attributes for CSS processing
  std::string classAttr;
  std::string styleAttr;
  if (atts != nullptr) {
    for (int i = 0; atts[i]; i += 2) {
      if (strcmp(atts[i], "class") == 0) {
        classAttr = atts[i + 1];
      } else if (strcmp(atts[i], "style") == 0) {
        styleAttr = atts[i + 1];
      }
    }
  }

  auto centeredBlockStyle = BlockStyle();
  centeredBlockStyle.textAlignDefined = true;
  centeredBlockStyle.alignment = CssTextAlign::Center;

  // Special handling for tables - show placeholder text instead of dropping silently
  if (strcmp(name, "table") == 0) {
    // Add placeholder text
    self->startNewTextBlock(centeredBlockStyle);

    self->italicUntilDepth = std::min(self->italicUntilDepth, self->depth);
    // Advance depth before processing character data (like you would for an element with text)
    self->depth += 1;
    self->characterData(userData, "[Table omitted]", strlen("[Table omitted]"));

    // Skip table contents (skip until parent as we pre-advanced depth above)
    self->skipUntilDepth = self->depth - 1;
    return;
  }

  if (matches(name, IMAGE_TAGS, NUM_IMAGE_TAGS)) {
    std::string src;
    std::string alt;
    if (atts != nullptr) {
      for (int i = 0; atts[i]; i += 2) {
        if (strcmp(atts[i], "src") == 0) {
          src = atts[i + 1];
        } else if (strcmp(atts[i], "alt") == 0) {
          alt = atts[i + 1];
        }
      }

      if (!src.empty()) {
        LOG_DBG("EHP", "Found image: src=%s", src.c_str());

        // Resolve the image path relative to the HTML file
        std::string resolvedPath = FsHelpers::normalisePath(self->contentBase + src);

        // Only formats with a decoder are extracted; everything else goes straight to the
        // alt-text fallback below without touching the SD card.
        if (ImageDecoderFactory::isFormatSupported(resolvedPath)) {
          // Create a unique filename for the cached image, keeping the original extension
          std::string ext;
          const size_t extPos = resolvedPath.rfind('.');
          if (extPos != std::string::npos) {
            ext = resolvedPath.substr(extPos);
          }
          std::string cachedImagePath = self->imageBasePath + std::to_string(self->imageCounter++) + ext;

          // Extract image to cache file
          FsFile cachedImageFile;
          bool cacheFileOpened = false;
          bool extractSuccess = false;
          if (Storage.openFileForWrite("EHP", cachedImagePath, cachedImageFile)) {
            cacheFileOpened = true;
            extractSuccess = self->epub->readItemContentsToStream(resolvedPath, cachedImageFile, 4096);
            cachedImageFile.flush();
            cachedImageFile.close();
            delay(50);  // Give SD card time to sync
          }

          if (extractSuccess) {
            // Get image dimensions
            ImageDimensions dims = {0, 0};
            ImageToFramebufferDecoder* decoder = ImageDecoderFactory::getDecoder(cachedImagePath);
            if (decoder && decoder->getDimensions(cachedImagePath, dims)) {
              LOG_DBG("EHP", "Image dimensions: %dx%d", dims.width, dims.height);

              // Scale to fit viewport while maintaining aspect ratio (never upscale)
              const int maxWidth = self->viewportWidth;
              const int maxHeight = self->viewportHeight;
              const float scaleX = (dims.width > maxWidth) ? static_cast<float>(maxWidth) / dims.width : 1.0f;
              const float scaleY = (dims.height > maxHeight) ? static_cast<float>(maxHeight) / dims.height : 1.0f;
              float scale = (scaleX < scaleY) ? scaleX : scaleY;
              if (scale > 1.0f) scale = 1.0f;

              const int displayWidth = static_cast<int>(dims.width * scale);
              const int displayHeight = static_cast<int>(dims.height * scale);

              LOG_DBG("EHP", "Display size: %dx%d (scale %.2f)", displayWidth, displayHeight, scale);

              // NOTE(review): text still buffered in currentTextBlock has not been laid out at
              // this point, so an image that follows text inside the same block appears to be
              // placed above that text - confirm whether the pending block should be flushed first.

              // Create page for image - only break if image won't fit remaining space
              if (self->currentPage && !self->currentPage->elements.empty() &&
                  (self->currentPageNextY + displayHeight > self->viewportHeight)) {
                self->completePageFn(std::move(self->currentPage));
                self->currentPage.reset(new Page());
                if (!self->currentPage) {
                  LOG_ERR("EHP", "Failed to create new page");
                  self->depth += 1;  // stay balanced with the endElement() call that follows
                  return;
                }
                self->currentPageNextY = 0;
              } else if (!self->currentPage) {
                self->currentPage.reset(new Page());
                if (!self->currentPage) {
                  LOG_ERR("EHP", "Failed to create initial page");
                  self->depth += 1;  // stay balanced with the endElement() call that follows
                  return;
                }
                self->currentPageNextY = 0;
              }

              // Create ImageBlock and add to page, horizontally centered
              auto imageBlock = std::make_shared<ImageBlock>(cachedImagePath, displayWidth, displayHeight);
              if (!imageBlock) {
                LOG_ERR("EHP", "Failed to create ImageBlock");
                self->depth += 1;  // stay balanced with the endElement() call that follows
                return;
              }
              const int xPos = (self->viewportWidth - displayWidth) / 2;
              auto pageImage = std::make_shared<PageImage>(imageBlock, xPos, self->currentPageNextY);
              if (!pageImage) {
                LOG_ERR("EHP", "Failed to create PageImage");
                self->depth += 1;  // stay balanced with the endElement() call that follows
                return;
              }
              self->currentPage->elements.push_back(pageImage);
              self->currentPageNextY += displayHeight;

              self->depth += 1;
              return;
            } else {
              LOG_ERR("EHP", "Failed to get image dimensions");
              Storage.remove(cachedImagePath.c_str());
            }
          } else {
            LOG_ERR("EHP", "Failed to extract image");
            // Do not leave a partially written cache file behind
            if (cacheFileOpened) {
              Storage.remove(cachedImagePath.c_str());
            }
          }
        }  // isFormatSupported
      }

      // Fallback to alt text if image processing fails or the format is unsupported
      if (!alt.empty()) {
        alt = "[Image: " + alt + "]";
        self->startNewTextBlock(centeredBlockStyle);
        self->italicUntilDepth = std::min(self->italicUntilDepth, self->depth);
        self->depth += 1;
        self->characterData(userData, alt.c_str(), static_cast<int>(alt.length()));
        // Skip any child content (skip until parent as we pre-advanced depth above)
        self->skipUntilDepth = self->depth - 1;
        return;
      }

      // No alt text, skip
      self->skipUntilDepth = self->depth;
      self->depth += 1;
      return;
    }
  }

  if (matches(name, SKIP_TAGS, NUM_SKIP_TAGS)) {
    // start skip
    self->skipUntilDepth = self->depth;
    self->depth += 1;
    return;
  }

  // Skip blocks with role="doc-pagebreak" and epub:type="pagebreak"
  if (atts != nullptr) {
    for (int i = 0; atts[i]; i += 2) {
      const bool isRolePageBreak = strcmp(atts[i], "role") == 0 && strcmp(atts[i + 1], "doc-pagebreak") == 0;
      const bool isEpubPageBreak = strcmp(atts[i], "epub:type") == 0 && strcmp(atts[i + 1], "pagebreak") == 0;
      if (isRolePageBreak || isEpubPageBreak) {
        self->skipUntilDepth = self->depth;
        self->depth += 1;
        return;
      }
    }
  }

  // Compute CSS style for this element
  CssStyle cssStyle;
  if (self->cssParser) {
    // Get combined tag + class styles
    cssStyle = self->cssParser->resolveStyle(name, classAttr);
    // Merge inline style (highest priority)
    if (!styleAttr.empty()) {
      CssStyle inlineStyle = CssParser::parseInlineStyle(styleAttr);
      cssStyle.applyOver(inlineStyle);
    }
  }

  const float emSize = static_cast<float>(self->renderer.getLineHeight(self->fontId)) * self->lineCompression;
  const auto userAlignmentBlockStyle = BlockStyle::fromCssStyle(
      cssStyle, emSize, static_cast<CssTextAlign>(self->paragraphAlignment), self->viewportWidth);

  // Flush buffered characters so text preceding this tag keeps the style it was read under.
  // The flushed fragment and whatever follows belong to the same visual word.
  const auto flushBeforeStyleChange = [self]() {
    if (self->partWordBufferIndex > 0) {
      self->flushPartWordBuffer();
      self->nextWordContinues = true;
    }
  };

  // Push an inline style entry for the current depth. Properties come from the element's CSS;
  // a force* flag pins the corresponding property to "on" regardless of CSS.
  const auto pushInlineStyle = [self, &cssStyle](const bool forceBold, const bool forceItalic,
                                                 const bool forceUnderline) {
    StyleStackEntry entry;
    entry.depth = self->depth;  // Track depth for matching pop
    if (cssStyle.hasFontWeight()) {
      entry.hasBold = true;
      entry.bold = cssStyle.fontWeight == CssFontWeight::Bold;
    }
    if (cssStyle.hasFontStyle()) {
      entry.hasItalic = true;
      entry.italic = cssStyle.fontStyle == CssFontStyle::Italic;
    }
    if (cssStyle.hasTextDecoration()) {
      entry.hasUnderline = true;
      entry.underline = cssStyle.textDecoration == CssTextDecoration::Underline;
    }
    if (forceBold) {
      entry.hasBold = true;
      entry.bold = true;
    }
    if (forceItalic) {
      entry.hasItalic = true;
      entry.italic = true;
    }
    if (forceUnderline) {
      entry.hasUnderline = true;
      entry.underline = true;
    }
    self->inlineStyleStack.push_back(entry);
    self->updateEffectiveInlineStyle();
  };

  if (matches(name, HEADER_TAGS, NUM_HEADER_TAGS)) {
    // Headings: centered by default (CSS alignment wins when embedded styles are enabled), bold
    self->currentCssStyle = cssStyle;
    auto headerBlockStyle = BlockStyle::fromCssStyle(cssStyle, emSize, CssTextAlign::Center, self->viewportWidth);
    headerBlockStyle.textAlignDefined = true;
    if (self->embeddedStyle && cssStyle.hasTextAlign()) {
      headerBlockStyle.alignment = cssStyle.textAlign;
    }
    self->startNewTextBlock(headerBlockStyle);
    self->boldUntilDepth = std::min(self->boldUntilDepth, self->depth);
    self->updateEffectiveInlineStyle();
  } else if (matches(name, BLOCK_TAGS, NUM_BLOCK_TAGS)) {
    if (strcmp(name, "br") == 0) {
      if (self->partWordBufferIndex > 0) {
        // flush word preceding <br/> to currentTextBlock before calling startNewTextBlock
        self->flushPartWordBuffer();
      }
      // A line break continues with the style of the block it interrupts
      self->startNewTextBlock(self->currentTextBlock->getBlockStyle());
    } else {
      self->currentCssStyle = cssStyle;
      self->startNewTextBlock(userAlignmentBlockStyle);
      self->updateEffectiveInlineStyle();

      if (strcmp(name, "li") == 0) {
        // List items start with a bullet (U+2022)
        self->currentTextBlock->addWord("\xe2\x80\xa2", EpdFontFamily::REGULAR);
      }
    }
  } else if (matches(name, UNDERLINE_TAGS, NUM_UNDERLINE_TAGS)) {
    // The flush must precede the depth-marker update so preceding text keeps its old style
    flushBeforeStyleChange();
    self->underlineUntilDepth = std::min(self->underlineUntilDepth, self->depth);
    pushInlineStyle(/*forceBold=*/false, /*forceItalic=*/false, /*forceUnderline=*/true);
  } else if (matches(name, BOLD_TAGS, NUM_BOLD_TAGS)) {
    flushBeforeStyleChange();
    self->boldUntilDepth = std::min(self->boldUntilDepth, self->depth);
    pushInlineStyle(/*forceBold=*/true, /*forceItalic=*/false, /*forceUnderline=*/false);
  } else if (matches(name, ITALIC_TAGS, NUM_ITALIC_TAGS)) {
    flushBeforeStyleChange();
    self->italicUntilDepth = std::min(self->italicUntilDepth, self->depth);
    pushInlineStyle(/*forceBold=*/false, /*forceItalic=*/true, /*forceUnderline=*/false);
  } else {
    // span and every other remaining element (heading and block tags were handled above):
    // only relevant when its CSS changes weight, style or decoration
    if (cssStyle.hasFontWeight() || cssStyle.hasFontStyle() || cssStyle.hasTextDecoration()) {
      flushBeforeStyleChange();
      pushInlineStyle(/*forceBold=*/false, /*forceItalic=*/false, /*forceUnderline=*/false);
    }
  }

  // Every element that reaches this point opens one nesting level
  self->depth += 1;
}
|
|
|
|
// expat character-data callback: splits the incoming UTF-8 bytes into words and appends them to
// the current text block.
//
// userData is the ChapterHtmlSlimParser instance; s points to len bytes of text (not
// NUL-terminated). Nothing is recorded while inside a skipped subtree.
//
// Whitespace ends the word being collected. U+00A0 becomes a space glued to its neighbours
// (no line break allowed around it), and U+FEFF is dropped. Once the block holds more than
// 750 words it is laid out early to release memory.
void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char* s, const int len) {
  auto* self = static_cast<ChapterHtmlSlimParser*>(userData);

  // Middle of skip
  if (self->skipUntilDepth < self->depth) {
    return;
  }

  // Zero Width No-Break Space / BOM (U+FEFF) = 0xEF 0xBB 0xBF
  const XML_Char FEFF_BYTE_1 = static_cast<XML_Char>(0xEF);
  const XML_Char FEFF_BYTE_2 = static_cast<XML_Char>(0xBB);
  const XML_Char FEFF_BYTE_3 = static_cast<XML_Char>(0xBF);

  for (int i = 0; i < len; i++) {
    if (isWhitespace(s[i])) {
      // Currently looking at whitespace, if there's anything in the partWordBuffer, flush it
      if (self->partWordBufferIndex > 0) {
        self->flushPartWordBuffer();
      }
      // Whitespace is a real word boundary - reset continuation state
      self->nextWordContinues = false;
      // Skip the whitespace char
      continue;
    }

    // Detect U+00A0 (non-breaking space): UTF-8 encoding is 0xC2 0xA0
    // Render a visible space without allowing a line break around it.
    if (static_cast<uint8_t>(s[i]) == 0xC2 && i + 1 < len && static_cast<uint8_t>(s[i + 1]) == 0xA0) {
      // Flush any pending text so style is applied correctly.
      if (self->partWordBufferIndex > 0) {
        self->flushPartWordBuffer();
      }

      // Add a standalone space that attaches to the previous word.
      self->partWordBuffer[0] = ' ';
      self->partWordBuffer[1] = '\0';
      self->partWordBufferIndex = 1;
      self->nextWordContinues = true;  // Attach space to previous word (no break).
      self->flushPartWordBuffer();

      // Ensure the next real word attaches to this space (no break).
      self->nextWordContinues = true;

      i++;  // Skip the second byte (0xA0)
      continue;
    }

    // Skip U+FEFF when all three bytes of the sequence are present in this chunk
    if (s[i] == FEFF_BYTE_1 && (i + 2 < len) && (s[i + 1] == FEFF_BYTE_2) && (s[i + 2] == FEFF_BYTE_3)) {
      i += 2;  // Skip the next two bytes
      continue;
    }

    // If we're about to run out of space, then cut the word off and start a new one
    if (self->partWordBufferIndex >= MAX_WORD_SIZE) {
      self->flushPartWordBuffer();
    }

    self->partWordBuffer[self->partWordBufferIndex++] = s[i];
  }

  // If we have > 750 words buffered up, perform the layout and consume out all but the last line
  // There should be enough here to build out 1-2 full pages and doing this will free up a lot of
  // memory.
  // Spotted when reading Intermezzo, there are some really long text blocks in there.
  if (self->currentTextBlock->size() > 750) {
    LOG_DBG("EHP", "Text block too long, splitting into multiple pages");

    // Use the same inset-adjusted width as makePages(): addLineToPage() shifts each line by the
    // block's left inset, so laying out at the full viewport width would overrun the right edge
    // and wrap differently from the rest of the block.
    const int horizontalInset = self->currentTextBlock->getBlockStyle().totalHorizontalInset();
    const uint16_t effectiveWidth = (horizontalInset < self->viewportWidth)
                                        ? static_cast<uint16_t>(self->viewportWidth - horizontalInset)
                                        : self->viewportWidth;

    self->currentTextBlock->layoutAndExtractLines(
        self->renderer, self->fontId, effectiveWidth,
        [self](const std::shared_ptr<TextBlock>& textBlock) { self->addLineToPage(textBlock); }, false);
  }
}
|
|
|
|
// expat default handler: deals with entity references the XML parser did not expand itself.
// A known HTML entity is replaced by its UTF-8 text, an unknown "&...;" sequence is passed
// through verbatim, and anything else is ignored.
void XMLCALL ChapterHtmlSlimParser::defaultHandlerExpand(void* userData, const XML_Char* s, const int len) {
  // Only text shaped like "&...;" is of interest
  const bool looksLikeEntity = len >= 3 && s[0] == '&' && s[len - 1] == ';';
  if (!looksLikeEntity) {
    return;
  }

  const char* const expansion = lookupHtmlEntity(s, len);
  if (expansion == nullptr) {
    // Not in the entity table: keep the original characters
    characterData(userData, s, len);
    return;
  }

  // Known entity: emit its UTF-8 value instead
  characterData(userData, expansion, strlen(expansion));
}
|
|
|
|
// expat end-tag callback: flushes pending characters under the style that was active inside the
// element, then unwinds depth, skip state, style markers and the inline style stack.
void XMLCALL ChapterHtmlSlimParser::endElement(void* userData, const XML_Char* name) {
  auto* self = static_cast<ChapterHtmlSlimParser*>(userData);

  // depth is decremented further down; markers recorded when this element was opened
  // therefore hold (depth - 1).
  const int closingDepth = self->depth - 1;
  const bool popsStyleEntry =
      !self->inlineStyleStack.empty() && self->inlineStyleStack.back().depth == closingDepth;
  const bool endsBold = self->boldUntilDepth == closingDepth;
  const bool endsItalic = self->italicUntilDepth == closingDepth;
  const bool endsUnderline = self->underlineUntilDepth == closingDepth;
  const bool closesHeaderOrBlock = isHeaderOrBlock(name);

  // Pending characters must be emitted with the CURRENT style, before any of it is undone
  if (self->partWordBufferIndex > 0) {
    const bool styleChanges = popsStyleEntry || endsBold || endsItalic || endsUnderline;
    const bool closesStyleTag = matches(name, BOLD_TAGS, NUM_BOLD_TAGS) ||
                                matches(name, ITALIC_TAGS, NUM_ITALIC_TAGS) ||
                                matches(name, UNDERLINE_TAGS, NUM_UNDERLINE_TAGS);
    // Structural closes: header/block, table, image, or the outermost element
    const bool closesStructure = closesHeaderOrBlock || strcmp(name, "table") == 0 ||
                                 matches(name, IMAGE_TAGS, NUM_IMAGE_TAGS) || self->depth == 1;

    if (styleChanges || closesStyleTag || closesStructure) {
      self->flushPartWordBuffer();
      if (!closesStructure) {
        // Closing an inline element: the next fragment continues the same visual word
        self->nextWordContinues = true;
      }
    }
  }

  self->depth -= 1;

  // Markers equal to the new depth were set by the element that just closed: clear them
  if (self->skipUntilDepth == self->depth) {
    self->skipUntilDepth = INT_MAX;  // leaving skipped subtree
  }
  if (self->boldUntilDepth == self->depth) {
    self->boldUntilDepth = INT_MAX;  // leaving bold
  }
  if (self->italicUntilDepth == self->depth) {
    self->italicUntilDepth = INT_MAX;  // leaving italic
  }
  if (self->underlineUntilDepth == self->depth) {
    self->underlineUntilDepth = INT_MAX;  // leaving underline
  }

  // Drop the inline style entry pushed at this depth, if any (b, i, u, span, ...)
  const bool ownsTopEntry = !self->inlineStyleStack.empty() && self->inlineStyleStack.back().depth == self->depth;
  if (ownsTopEntry) {
    self->inlineStyleStack.pop_back();
    self->updateEffectiveInlineStyle();
  }

  // Block-level CSS ends together with its header or block element
  if (closesHeaderOrBlock) {
    self->currentCssStyle.reset();
    self->updateEffectiveInlineStyle();
  }
}
|
|
|
|
bool ChapterHtmlSlimParser::parseAndBuildPages() {
|
|
auto paragraphAlignmentBlockStyle = BlockStyle();
|
|
paragraphAlignmentBlockStyle.textAlignDefined = true;
|
|
// Resolve None sentinel to Justify for initial block (no CSS context yet)
|
|
const auto align = (this->paragraphAlignment == static_cast<uint8_t>(CssTextAlign::None))
|
|
? CssTextAlign::Justify
|
|
: static_cast<CssTextAlign>(this->paragraphAlignment);
|
|
paragraphAlignmentBlockStyle.alignment = align;
|
|
startNewTextBlock(paragraphAlignmentBlockStyle);
|
|
|
|
const XML_Parser parser = XML_ParserCreate(nullptr);
|
|
int done;
|
|
|
|
if (!parser) {
|
|
LOG_ERR("EHP", "Couldn't allocate memory for parser");
|
|
return false;
|
|
}
|
|
|
|
// Handle HTML entities (like ) that aren't in XML spec or DTD
|
|
// Using DefaultHandlerExpand preserves normal entity expansion from DOCTYPE
|
|
XML_SetDefaultHandlerExpand(parser, defaultHandlerExpand);
|
|
|
|
FsFile file;
|
|
if (!Storage.openFileForRead("EHP", filepath, file)) {
|
|
XML_ParserFree(parser);
|
|
return false;
|
|
}
|
|
|
|
// Get file size to decide whether to show indexing popup.
|
|
if (popupFn && file.size() >= MIN_SIZE_FOR_POPUP) {
|
|
popupFn();
|
|
}
|
|
|
|
XML_SetUserData(parser, this);
|
|
XML_SetElementHandler(parser, startElement, endElement);
|
|
XML_SetCharacterDataHandler(parser, characterData);
|
|
|
|
// Compute the time taken to parse and build pages
|
|
const uint32_t chapterStartTime = millis();
|
|
do {
|
|
void* const buf = XML_GetBuffer(parser, PARSE_BUFFER_SIZE);
|
|
if (!buf) {
|
|
LOG_ERR("EHP", "Couldn't allocate memory for buffer");
|
|
XML_StopParser(parser, XML_FALSE); // Stop any pending processing
|
|
XML_SetElementHandler(parser, nullptr, nullptr); // Clear callbacks
|
|
XML_SetCharacterDataHandler(parser, nullptr);
|
|
XML_ParserFree(parser);
|
|
file.close();
|
|
return false;
|
|
}
|
|
|
|
const size_t len = file.read(buf, PARSE_BUFFER_SIZE);
|
|
|
|
if (len == 0 && file.available() > 0) {
|
|
LOG_ERR("EHP", "File read error");
|
|
XML_StopParser(parser, XML_FALSE); // Stop any pending processing
|
|
XML_SetElementHandler(parser, nullptr, nullptr); // Clear callbacks
|
|
XML_SetCharacterDataHandler(parser, nullptr);
|
|
XML_ParserFree(parser);
|
|
file.close();
|
|
return false;
|
|
}
|
|
|
|
done = file.available() == 0;
|
|
|
|
if (XML_ParseBuffer(parser, static_cast<int>(len), done) == XML_STATUS_ERROR) {
|
|
LOG_ERR("EHP", "Parse error at line %lu:\n%s", XML_GetCurrentLineNumber(parser),
|
|
XML_ErrorString(XML_GetErrorCode(parser)));
|
|
XML_StopParser(parser, XML_FALSE); // Stop any pending processing
|
|
XML_SetElementHandler(parser, nullptr, nullptr); // Clear callbacks
|
|
XML_SetCharacterDataHandler(parser, nullptr);
|
|
XML_ParserFree(parser);
|
|
file.close();
|
|
return false;
|
|
}
|
|
} while (!done);
|
|
LOG_DBG("EHP", "Time to parse and build pages: %lu ms", millis() - chapterStartTime);
|
|
|
|
XML_StopParser(parser, XML_FALSE); // Stop any pending processing
|
|
XML_SetElementHandler(parser, nullptr, nullptr); // Clear callbacks
|
|
XML_SetCharacterDataHandler(parser, nullptr);
|
|
XML_ParserFree(parser);
|
|
file.close();
|
|
|
|
// Process last page if there is still text
|
|
if (currentTextBlock) {
|
|
makePages();
|
|
completePageFn(std::move(currentPage));
|
|
currentPage.reset();
|
|
currentTextBlock.reset();
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
void ChapterHtmlSlimParser::addLineToPage(std::shared_ptr<TextBlock> line) {
|
|
const int lineHeight = renderer.getLineHeight(fontId) * lineCompression;
|
|
|
|
if (currentPageNextY + lineHeight > viewportHeight) {
|
|
completePageFn(std::move(currentPage));
|
|
currentPage.reset(new Page());
|
|
currentPageNextY = 0;
|
|
}
|
|
|
|
// Apply horizontal left inset (margin + padding) as x position offset
|
|
const int16_t xOffset = line->getBlockStyle().leftInset();
|
|
currentPage->elements.push_back(std::make_shared<PageLine>(line, xOffset, currentPageNextY));
|
|
currentPageNextY += lineHeight;
|
|
}
|
|
|
|
void ChapterHtmlSlimParser::makePages() {
|
|
if (!currentTextBlock) {
|
|
LOG_ERR("EHP", "!! No text block to make pages for !!");
|
|
return;
|
|
}
|
|
|
|
if (!currentPage) {
|
|
currentPage.reset(new Page());
|
|
currentPageNextY = 0;
|
|
}
|
|
|
|
const int lineHeight = renderer.getLineHeight(fontId) * lineCompression;
|
|
|
|
// Apply top spacing before the paragraph (stored in pixels)
|
|
const BlockStyle& blockStyle = currentTextBlock->getBlockStyle();
|
|
if (blockStyle.marginTop > 0) {
|
|
currentPageNextY += blockStyle.marginTop;
|
|
}
|
|
if (blockStyle.paddingTop > 0) {
|
|
currentPageNextY += blockStyle.paddingTop;
|
|
}
|
|
|
|
// Calculate effective width accounting for horizontal margins/padding
|
|
const int horizontalInset = blockStyle.totalHorizontalInset();
|
|
const uint16_t effectiveWidth =
|
|
(horizontalInset < viewportWidth) ? static_cast<uint16_t>(viewportWidth - horizontalInset) : viewportWidth;
|
|
|
|
currentTextBlock->layoutAndExtractLines(
|
|
renderer, fontId, effectiveWidth,
|
|
[this](const std::shared_ptr<TextBlock>& textBlock) { addLineToPage(textBlock); });
|
|
|
|
// Apply bottom spacing after the paragraph (stored in pixels)
|
|
if (blockStyle.marginBottom > 0) {
|
|
currentPageNextY += blockStyle.marginBottom;
|
|
}
|
|
if (blockStyle.paddingBottom > 0) {
|
|
currentPageNextY += blockStyle.paddingBottom;
|
|
}
|
|
|
|
// Extra paragraph spacing if enabled (default behavior)
|
|
if (extraParagraphSpacing) {
|
|
currentPageNextY += lineHeight / 2;
|
|
}
|
|
}
|