## Summary * Correctly render italics on image alt placeholders * Parser incorrectly handled depth of self-closing tags * Self-closing tags immediately call start and end tag ## Additional Context * Previously, it would incorrectly make the whole chapter bold/italics, or not italicised the image alt --- ### AI Usage While CrossPoint doesn't have restrictions on AI tools in contributing, please be transparent about their usage as it helps set the right context for reviewers. Did you use AI tools to help write this code? No
400 lines
13 KiB
C++
400 lines
13 KiB
C++
#include "ChapterHtmlSlimParser.h"
|
|
|
|
#include <GfxRenderer.h>
|
|
#include <HardwareSerial.h>
|
|
#include <SDCardManager.h>
|
|
#include <expat.h>
|
|
|
|
#include "../Page.h"
|
|
|
|
const char* HEADER_TAGS[] = {"h1", "h2", "h3", "h4", "h5", "h6"};
|
|
constexpr int NUM_HEADER_TAGS = sizeof(HEADER_TAGS) / sizeof(HEADER_TAGS[0]);
|
|
|
|
// Minimum file size (in bytes) to show progress bar - smaller chapters don't benefit from it
|
|
constexpr size_t MIN_SIZE_FOR_PROGRESS = 50 * 1024; // 50KB
|
|
|
|
const char* BLOCK_TAGS[] = {"p", "li", "div", "br", "blockquote"};
|
|
constexpr int NUM_BLOCK_TAGS = sizeof(BLOCK_TAGS) / sizeof(BLOCK_TAGS[0]);
|
|
|
|
const char* BOLD_TAGS[] = {"b", "strong"};
|
|
constexpr int NUM_BOLD_TAGS = sizeof(BOLD_TAGS) / sizeof(BOLD_TAGS[0]);
|
|
|
|
const char* ITALIC_TAGS[] = {"i", "em"};
|
|
constexpr int NUM_ITALIC_TAGS = sizeof(ITALIC_TAGS) / sizeof(ITALIC_TAGS[0]);
|
|
|
|
const char* IMAGE_TAGS[] = {"img"};
|
|
constexpr int NUM_IMAGE_TAGS = sizeof(IMAGE_TAGS) / sizeof(IMAGE_TAGS[0]);
|
|
|
|
const char* SKIP_TAGS[] = {"head"};
|
|
constexpr int NUM_SKIP_TAGS = sizeof(SKIP_TAGS) / sizeof(SKIP_TAGS[0]);
|
|
|
|
bool isWhitespace(const char c) { return c == ' ' || c == '\r' || c == '\n' || c == '\t'; }
|
|
|
|
// given the start and end of a tag, check to see if it matches a known tag
|
|
bool matches(const char* tag_name, const char* possible_tags[], const int possible_tag_count) {
|
|
for (int i = 0; i < possible_tag_count; i++) {
|
|
if (strcmp(tag_name, possible_tags[i]) == 0) {
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
// flush the contents of partWordBuffer to currentTextBlock
|
|
void ChapterHtmlSlimParser::flushPartWordBuffer() {
|
|
// determine font style
|
|
EpdFontFamily::Style fontStyle = EpdFontFamily::REGULAR;
|
|
if (boldUntilDepth < depth && italicUntilDepth < depth) {
|
|
fontStyle = EpdFontFamily::BOLD_ITALIC;
|
|
} else if (boldUntilDepth < depth) {
|
|
fontStyle = EpdFontFamily::BOLD;
|
|
} else if (italicUntilDepth < depth) {
|
|
fontStyle = EpdFontFamily::ITALIC;
|
|
}
|
|
// flush the buffer
|
|
partWordBuffer[partWordBufferIndex] = '\0';
|
|
currentTextBlock->addWord(partWordBuffer, fontStyle);
|
|
partWordBufferIndex = 0;
|
|
}
|
|
|
|
// start a new text block if needed
|
|
void ChapterHtmlSlimParser::startNewTextBlock(const TextBlock::Style style) {
|
|
if (currentTextBlock) {
|
|
// already have a text block running and it is empty - just reuse it
|
|
if (currentTextBlock->isEmpty()) {
|
|
currentTextBlock->setStyle(style);
|
|
return;
|
|
}
|
|
|
|
makePages();
|
|
}
|
|
currentTextBlock.reset(new ParsedText(style, extraParagraphSpacing, hyphenationEnabled));
|
|
}
|
|
|
|
void XMLCALL ChapterHtmlSlimParser::startElement(void* userData, const XML_Char* name, const XML_Char** atts) {
|
|
auto* self = static_cast<ChapterHtmlSlimParser*>(userData);
|
|
|
|
// Middle of skip
|
|
if (self->skipUntilDepth < self->depth) {
|
|
self->depth += 1;
|
|
return;
|
|
}
|
|
|
|
// Special handling for tables - show placeholder text instead of dropping silently
|
|
if (strcmp(name, "table") == 0) {
|
|
// Add placeholder text
|
|
self->startNewTextBlock(TextBlock::CENTER_ALIGN);
|
|
|
|
self->italicUntilDepth = min(self->italicUntilDepth, self->depth);
|
|
// Advance depth before processing character data (like you would for a element with text)
|
|
self->depth += 1;
|
|
self->characterData(userData, "[Table omitted]", strlen("[Table omitted]"));
|
|
|
|
// Skip table contents (skip until parent as we pre-advanced depth above)
|
|
self->skipUntilDepth = self->depth - 1;
|
|
return;
|
|
}
|
|
|
|
if (matches(name, IMAGE_TAGS, NUM_IMAGE_TAGS)) {
|
|
// TODO: Start processing image tags
|
|
std::string alt = "[Image]";
|
|
if (atts != nullptr) {
|
|
for (int i = 0; atts[i]; i += 2) {
|
|
if (strcmp(atts[i], "alt") == 0) {
|
|
if (strlen(atts[i + 1]) > 0) {
|
|
alt = "[Image: " + std::string(atts[i + 1]) + "]";
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
Serial.printf("[%lu] [EHP] Image alt: %s\n", millis(), alt.c_str());
|
|
|
|
self->startNewTextBlock(TextBlock::CENTER_ALIGN);
|
|
self->italicUntilDepth = min(self->italicUntilDepth, self->depth);
|
|
// Advance depth before processing character data (like you would for a element with text)
|
|
self->depth += 1;
|
|
self->characterData(userData, alt.c_str(), alt.length());
|
|
|
|
// Skip table contents (skip until parent as we pre-advanced depth above)
|
|
self->skipUntilDepth = self->depth - 1;
|
|
return;
|
|
}
|
|
|
|
if (matches(name, SKIP_TAGS, NUM_SKIP_TAGS)) {
|
|
// start skip
|
|
self->skipUntilDepth = self->depth;
|
|
self->depth += 1;
|
|
return;
|
|
}
|
|
|
|
// Skip blocks with role="doc-pagebreak" and epub:type="pagebreak"
|
|
if (atts != nullptr) {
|
|
for (int i = 0; atts[i]; i += 2) {
|
|
if (strcmp(atts[i], "role") == 0 && strcmp(atts[i + 1], "doc-pagebreak") == 0 ||
|
|
strcmp(atts[i], "epub:type") == 0 && strcmp(atts[i + 1], "pagebreak") == 0) {
|
|
self->skipUntilDepth = self->depth;
|
|
self->depth += 1;
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (matches(name, HEADER_TAGS, NUM_HEADER_TAGS)) {
|
|
self->startNewTextBlock(TextBlock::CENTER_ALIGN);
|
|
self->boldUntilDepth = std::min(self->boldUntilDepth, self->depth);
|
|
self->depth += 1;
|
|
return;
|
|
}
|
|
|
|
if (matches(name, BLOCK_TAGS, NUM_BLOCK_TAGS)) {
|
|
if (strcmp(name, "br") == 0) {
|
|
if (self->partWordBufferIndex > 0) {
|
|
// flush word preceding <br/> to currentTextBlock before calling startNewTextBlock
|
|
self->flushPartWordBuffer();
|
|
}
|
|
self->startNewTextBlock(self->currentTextBlock->getStyle());
|
|
self->depth += 1;
|
|
return;
|
|
}
|
|
|
|
self->startNewTextBlock(static_cast<TextBlock::Style>(self->paragraphAlignment));
|
|
if (strcmp(name, "li") == 0) {
|
|
self->currentTextBlock->addWord("\xe2\x80\xa2", EpdFontFamily::REGULAR);
|
|
}
|
|
|
|
self->depth += 1;
|
|
return;
|
|
}
|
|
|
|
if (matches(name, BOLD_TAGS, NUM_BOLD_TAGS)) {
|
|
self->boldUntilDepth = std::min(self->boldUntilDepth, self->depth);
|
|
self->depth += 1;
|
|
return;
|
|
}
|
|
|
|
if (matches(name, ITALIC_TAGS, NUM_ITALIC_TAGS)) {
|
|
self->italicUntilDepth = std::min(self->italicUntilDepth, self->depth);
|
|
self->depth += 1;
|
|
return;
|
|
}
|
|
|
|
// Unprocessed tag, just increasing depth and continue forward
|
|
self->depth += 1;
|
|
}
|
|
|
|
void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char* s, const int len) {
|
|
auto* self = static_cast<ChapterHtmlSlimParser*>(userData);
|
|
|
|
// Middle of skip
|
|
if (self->skipUntilDepth < self->depth) {
|
|
return;
|
|
}
|
|
|
|
for (int i = 0; i < len; i++) {
|
|
if (isWhitespace(s[i])) {
|
|
// Currently looking at whitespace, if there's anything in the partWordBuffer, flush it
|
|
if (self->partWordBufferIndex > 0) {
|
|
self->flushPartWordBuffer();
|
|
}
|
|
// Skip the whitespace char
|
|
continue;
|
|
}
|
|
|
|
// Skip Zero Width No-Break Space / BOM (U+FEFF) = 0xEF 0xBB 0xBF
|
|
const XML_Char FEFF_BYTE_1 = static_cast<XML_Char>(0xEF);
|
|
const XML_Char FEFF_BYTE_2 = static_cast<XML_Char>(0xBB);
|
|
const XML_Char FEFF_BYTE_3 = static_cast<XML_Char>(0xBF);
|
|
|
|
if (s[i] == FEFF_BYTE_1) {
|
|
// Check if the next two bytes complete the 3-byte sequence
|
|
if ((i + 2 < len) && (s[i + 1] == FEFF_BYTE_2) && (s[i + 2] == FEFF_BYTE_3)) {
|
|
// Sequence 0xEF 0xBB 0xBF found!
|
|
i += 2; // Skip the next two bytes
|
|
continue; // Move to the next iteration
|
|
}
|
|
}
|
|
|
|
// If we're about to run out of space, then cut the word off and start a new one
|
|
if (self->partWordBufferIndex >= MAX_WORD_SIZE) {
|
|
self->flushPartWordBuffer();
|
|
}
|
|
|
|
self->partWordBuffer[self->partWordBufferIndex++] = s[i];
|
|
}
|
|
|
|
// If we have > 750 words buffered up, perform the layout and consume out all but the last line
|
|
// There should be enough here to build out 1-2 full pages and doing this will free up a lot of
|
|
// memory.
|
|
// Spotted when reading Intermezzo, there are some really long text blocks in there.
|
|
if (self->currentTextBlock->size() > 750) {
|
|
Serial.printf("[%lu] [EHP] Text block too long, splitting into multiple pages\n", millis());
|
|
self->currentTextBlock->layoutAndExtractLines(
|
|
self->renderer, self->fontId, self->viewportWidth,
|
|
[self](const std::shared_ptr<TextBlock>& textBlock) { self->addLineToPage(textBlock); }, false);
|
|
}
|
|
}
|
|
|
|
void XMLCALL ChapterHtmlSlimParser::endElement(void* userData, const XML_Char* name) {
|
|
auto* self = static_cast<ChapterHtmlSlimParser*>(userData);
|
|
|
|
if (self->partWordBufferIndex > 0) {
|
|
// Only flush out part word buffer if we're closing a block tag or are at the top of the HTML file.
|
|
// We don't want to flush out content when closing inline tags like <span>.
|
|
// Currently this also flushes out on closing <b> and <i> tags, but they are line tags so that shouldn't happen,
|
|
// text styling needs to be overhauled to fix it.
|
|
const bool shouldBreakText =
|
|
matches(name, BLOCK_TAGS, NUM_BLOCK_TAGS) || matches(name, HEADER_TAGS, NUM_HEADER_TAGS) ||
|
|
matches(name, BOLD_TAGS, NUM_BOLD_TAGS) || matches(name, ITALIC_TAGS, NUM_ITALIC_TAGS) ||
|
|
strcmp(name, "table") == 0 || matches(name, IMAGE_TAGS, NUM_IMAGE_TAGS) || self->depth == 1;
|
|
|
|
if (shouldBreakText) {
|
|
self->flushPartWordBuffer();
|
|
}
|
|
}
|
|
|
|
self->depth -= 1;
|
|
|
|
// Leaving skip
|
|
if (self->skipUntilDepth == self->depth) {
|
|
self->skipUntilDepth = INT_MAX;
|
|
}
|
|
|
|
// Leaving bold
|
|
if (self->boldUntilDepth == self->depth) {
|
|
self->boldUntilDepth = INT_MAX;
|
|
}
|
|
|
|
// Leaving italic
|
|
if (self->italicUntilDepth == self->depth) {
|
|
self->italicUntilDepth = INT_MAX;
|
|
}
|
|
}
|
|
|
|
bool ChapterHtmlSlimParser::parseAndBuildPages() {
|
|
startNewTextBlock((TextBlock::Style)this->paragraphAlignment);
|
|
|
|
const XML_Parser parser = XML_ParserCreate(nullptr);
|
|
int done;
|
|
|
|
if (!parser) {
|
|
Serial.printf("[%lu] [EHP] Couldn't allocate memory for parser\n", millis());
|
|
return false;
|
|
}
|
|
|
|
FsFile file;
|
|
if (!SdMan.openFileForRead("EHP", filepath, file)) {
|
|
XML_ParserFree(parser);
|
|
return false;
|
|
}
|
|
|
|
// Get file size for progress calculation
|
|
const size_t totalSize = file.size();
|
|
size_t bytesRead = 0;
|
|
int lastProgress = -1;
|
|
|
|
XML_SetUserData(parser, this);
|
|
XML_SetElementHandler(parser, startElement, endElement);
|
|
XML_SetCharacterDataHandler(parser, characterData);
|
|
|
|
do {
|
|
void* const buf = XML_GetBuffer(parser, 1024);
|
|
if (!buf) {
|
|
Serial.printf("[%lu] [EHP] Couldn't allocate memory for buffer\n", millis());
|
|
XML_StopParser(parser, XML_FALSE); // Stop any pending processing
|
|
XML_SetElementHandler(parser, nullptr, nullptr); // Clear callbacks
|
|
XML_SetCharacterDataHandler(parser, nullptr);
|
|
XML_ParserFree(parser);
|
|
file.close();
|
|
return false;
|
|
}
|
|
|
|
const size_t len = file.read(buf, 1024);
|
|
|
|
if (len == 0 && file.available() > 0) {
|
|
Serial.printf("[%lu] [EHP] File read error\n", millis());
|
|
XML_StopParser(parser, XML_FALSE); // Stop any pending processing
|
|
XML_SetElementHandler(parser, nullptr, nullptr); // Clear callbacks
|
|
XML_SetCharacterDataHandler(parser, nullptr);
|
|
XML_ParserFree(parser);
|
|
file.close();
|
|
return false;
|
|
}
|
|
|
|
// Update progress (call every 10% change to avoid too frequent updates)
|
|
// Only show progress for larger chapters where rendering overhead is worth it
|
|
bytesRead += len;
|
|
if (progressFn && totalSize >= MIN_SIZE_FOR_PROGRESS) {
|
|
const int progress = static_cast<int>((bytesRead * 100) / totalSize);
|
|
if (lastProgress / 10 != progress / 10) {
|
|
lastProgress = progress;
|
|
progressFn(progress);
|
|
}
|
|
}
|
|
|
|
done = file.available() == 0;
|
|
|
|
if (XML_ParseBuffer(parser, static_cast<int>(len), done) == XML_STATUS_ERROR) {
|
|
Serial.printf("[%lu] [EHP] Parse error at line %lu:\n%s\n", millis(), XML_GetCurrentLineNumber(parser),
|
|
XML_ErrorString(XML_GetErrorCode(parser)));
|
|
XML_StopParser(parser, XML_FALSE); // Stop any pending processing
|
|
XML_SetElementHandler(parser, nullptr, nullptr); // Clear callbacks
|
|
XML_SetCharacterDataHandler(parser, nullptr);
|
|
XML_ParserFree(parser);
|
|
file.close();
|
|
return false;
|
|
}
|
|
} while (!done);
|
|
|
|
XML_StopParser(parser, XML_FALSE); // Stop any pending processing
|
|
XML_SetElementHandler(parser, nullptr, nullptr); // Clear callbacks
|
|
XML_SetCharacterDataHandler(parser, nullptr);
|
|
XML_ParserFree(parser);
|
|
file.close();
|
|
|
|
// Process last page if there is still text
|
|
if (currentTextBlock) {
|
|
makePages();
|
|
completePageFn(std::move(currentPage));
|
|
currentPage.reset();
|
|
currentTextBlock.reset();
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
void ChapterHtmlSlimParser::addLineToPage(std::shared_ptr<TextBlock> line) {
|
|
const int lineHeight = renderer.getLineHeight(fontId) * lineCompression;
|
|
|
|
if (currentPageNextY + lineHeight > viewportHeight) {
|
|
completePageFn(std::move(currentPage));
|
|
currentPage.reset(new Page());
|
|
currentPageNextY = 0;
|
|
}
|
|
|
|
currentPage->elements.push_back(std::make_shared<PageLine>(line, 0, currentPageNextY));
|
|
currentPageNextY += lineHeight;
|
|
}
|
|
|
|
void ChapterHtmlSlimParser::makePages() {
|
|
if (!currentTextBlock) {
|
|
Serial.printf("[%lu] [EHP] !! No text block to make pages for !!\n", millis());
|
|
return;
|
|
}
|
|
|
|
if (!currentPage) {
|
|
currentPage.reset(new Page());
|
|
currentPageNextY = 0;
|
|
}
|
|
|
|
const int lineHeight = renderer.getLineHeight(fontId) * lineCompression;
|
|
currentTextBlock->layoutAndExtractLines(
|
|
renderer, fontId, viewportWidth,
|
|
[this](const std::shared_ptr<TextBlock>& textBlock) { addLineToPage(textBlock); });
|
|
// Extra paragraph spacing if enabled
|
|
if (extraParagraphSpacing) {
|
|
currentPageNextY += lineHeight / 2;
|
|
}
|
|
}
|