## Summary

This change should address the issues discussed in #168 and potentially fixes #478.

---

### AI Usage

While CrossPoint doesn't have restrictions on AI tools in contributing, please be transparent about their usage, as it helps set the right context for reviewers.

Did you use AI tools to help write this code? _**PARTIALLY**_
380 lines
13 KiB
C++
380 lines
13 KiB
C++
#include "ChapterHtmlSlimParser.h"
|
|
|
|
#include <GfxRenderer.h>
|
|
#include <HardwareSerial.h>
|
|
#include <SDCardManager.h>
|
|
#include <expat.h>
|
|
|
|
#include "../Page.h"
|
|
|
|
// Tag-name tables consulted while walking the chapter markup. Every table is paired
// with an element count so the pair can be passed straight to matches().

// Heading elements
const char* HEADER_TAGS[] = {"h1", "h2", "h3", "h4", "h5", "h6"};
constexpr int NUM_HEADER_TAGS = sizeof(HEADER_TAGS) / sizeof(*HEADER_TAGS);

// Chapters below this size (in bytes) are parsed without reporting progress - the
// progress bar is not worth showing for them
constexpr size_t MIN_SIZE_FOR_PROGRESS = 50 * 1024;  // 50KB

// Elements that begin a new text block
const char* BLOCK_TAGS[] = {"p", "li", "div", "br", "blockquote"};
constexpr int NUM_BLOCK_TAGS = sizeof(BLOCK_TAGS) / sizeof(*BLOCK_TAGS);

// Elements that switch bold on
const char* BOLD_TAGS[] = {"b", "strong"};
constexpr int NUM_BOLD_TAGS = sizeof(BOLD_TAGS) / sizeof(*BOLD_TAGS);

// Elements that switch italic on
const char* ITALIC_TAGS[] = {"i", "em"};
constexpr int NUM_ITALIC_TAGS = sizeof(ITALIC_TAGS) / sizeof(*ITALIC_TAGS);

// Image elements
const char* IMAGE_TAGS[] = {"img"};
constexpr int NUM_IMAGE_TAGS = sizeof(IMAGE_TAGS) / sizeof(*IMAGE_TAGS);

// Elements whose whole subtree is ignored
const char* SKIP_TAGS[] = {"head"};
constexpr int NUM_SKIP_TAGS = sizeof(SKIP_TAGS) / sizeof(*SKIP_TAGS);
|
|
|
|
// Reports whether `c` is one of the characters that separate words:
// space, carriage return, line feed or horizontal tab.
bool isWhitespace(const char c) {
  switch (c) {
    case ' ':
    case '\r':
    case '\n':
    case '\t':
      return true;
    default:
      return false;
  }
}
|
|
|
|
// Returns true when `tag_name` is exactly equal (case-sensitive) to one of the first
// `possible_tag_count` entries of `possible_tags`; false otherwise, including when the
// count is zero or negative.
bool matches(const char* tag_name, const char* possible_tags[], const int possible_tag_count) {
  const char* const* candidate = possible_tags;
  int remaining = possible_tag_count;

  while (remaining > 0) {
    if (strcmp(tag_name, *candidate) == 0) {
      return true;
    }
    ++candidate;
    --remaining;
  }

  return false;
}
|
|
|
|
// flush the contents of partWordBuffer to currentTextBlock
|
|
void ChapterHtmlSlimParser::flushPartWordBuffer() {
|
|
// determine font style
|
|
EpdFontFamily::Style fontStyle = EpdFontFamily::REGULAR;
|
|
if (boldUntilDepth < depth && italicUntilDepth < depth) {
|
|
fontStyle = EpdFontFamily::BOLD_ITALIC;
|
|
} else if (boldUntilDepth < depth) {
|
|
fontStyle = EpdFontFamily::BOLD;
|
|
} else if (italicUntilDepth < depth) {
|
|
fontStyle = EpdFontFamily::ITALIC;
|
|
}
|
|
// flush the buffer
|
|
partWordBuffer[partWordBufferIndex] = '\0';
|
|
currentTextBlock->addWord(partWordBuffer, fontStyle);
|
|
partWordBufferIndex = 0;
|
|
}
|
|
|
|
// start a new text block if needed
|
|
void ChapterHtmlSlimParser::startNewTextBlock(const TextBlock::Style style) {
|
|
if (currentTextBlock) {
|
|
// already have a text block running and it is empty - just reuse it
|
|
if (currentTextBlock->isEmpty()) {
|
|
currentTextBlock->setStyle(style);
|
|
return;
|
|
}
|
|
|
|
makePages();
|
|
}
|
|
currentTextBlock.reset(new ParsedText(style, extraParagraphSpacing, hyphenationEnabled));
|
|
}
|
|
|
|
// Start-tag callback registered with the XML parser.
//
// userData - the ChapterHtmlSlimParser instance driving the parse
// name     - name of the element being opened
// atts     - name/value pairs, terminated by a null name (may be null)
//
// Every path through this function increases `depth` by exactly one. Depending on the
// element it also starts skipping a subtree, starts a new text block, or records the
// depth at which bold/italic styling began.
void XMLCALL ChapterHtmlSlimParser::startElement(void* userData, const XML_Char* name, const XML_Char** atts) {
  auto* self = static_cast<ChapterHtmlSlimParser*>(userData);

  // Middle of skip
  if (self->skipUntilDepth < self->depth) {
    self->depth += 1;
    return;
  }

  // Text that directly precedes a block-level element (no whitespace in between) is still
  // sitting in partWordBuffer. It has to be committed to the block it was read in before a
  // new block is started, otherwise it ends up glued to the first word of the new block.
  const auto flushPendingWord = [self]() {
    if (self->partWordBufferIndex > 0) {
      self->flushPartWordBuffer();
    }
  };

  // Special handling for tables - show placeholder text instead of dropping silently
  if (strcmp(name, "table") == 0) {
    flushPendingWord();

    // Add placeholder text
    self->startNewTextBlock(TextBlock::CENTER_ALIGN);
    if (self->currentTextBlock) {
      self->currentTextBlock->addWord("[Table omitted]", EpdFontFamily::ITALIC);
    }

    // Skip table contents
    self->skipUntilDepth = self->depth;
    self->depth += 1;
    return;
  }

  if (matches(name, IMAGE_TAGS, NUM_IMAGE_TAGS)) {
    // TODO: Start processing image tags
    if (atts != nullptr) {
      std::string alt;
      for (int i = 0; atts[i]; i += 2) {
        if (strcmp(atts[i], "alt") == 0) {
          // add " " (counts as whitespace) at the end of alt
          // so the corresponding text block ends.
          // TODO: A zero-width breaking space would be more appropriate (once/if we support it)
          alt = "[Image: " + std::string(atts[i + 1]) + "] ";
        }
      }
      Serial.printf("[%lu] [EHP] Image alt: %s\n", millis(), alt.c_str());

      // Commit preceding text with its own style before the italic placeholder block begins
      flushPendingWord();
      self->startNewTextBlock(TextBlock::CENTER_ALIGN);
      self->italicUntilDepth = std::min(self->italicUntilDepth, self->depth);
      self->depth += 1;
      // Feed the placeholder through the regular text path
      self->characterData(userData, alt.c_str(), static_cast<int>(alt.length()));
      return;
    }

    // Skip for now
    self->skipUntilDepth = self->depth;
    self->depth += 1;
    return;
  }

  if (matches(name, SKIP_TAGS, NUM_SKIP_TAGS)) {
    // start skip
    self->skipUntilDepth = self->depth;
    self->depth += 1;
    return;
  }

  // Skip blocks with role="doc-pagebreak" and epub:type="pagebreak"
  if (atts != nullptr) {
    for (int i = 0; atts[i]; i += 2) {
      const bool isRolePageBreak = (strcmp(atts[i], "role") == 0) && (strcmp(atts[i + 1], "doc-pagebreak") == 0);
      const bool isEpubPageBreak = (strcmp(atts[i], "epub:type") == 0) && (strcmp(atts[i + 1], "pagebreak") == 0);
      if (isRolePageBreak || isEpubPageBreak) {
        self->skipUntilDepth = self->depth;
        self->depth += 1;
        return;
      }
    }
  }

  if (matches(name, HEADER_TAGS, NUM_HEADER_TAGS)) {
    flushPendingWord();
    self->startNewTextBlock(TextBlock::CENTER_ALIGN);
    self->boldUntilDepth = std::min(self->boldUntilDepth, self->depth);
  } else if (matches(name, BLOCK_TAGS, NUM_BLOCK_TAGS)) {
    // flush the word preceding the tag to currentTextBlock before calling startNewTextBlock
    flushPendingWord();
    if (strcmp(name, "br") == 0) {
      // A line break keeps the style of the block it interrupts
      self->startNewTextBlock(self->currentTextBlock->getStyle());
    } else {
      self->startNewTextBlock(static_cast<TextBlock::Style>(self->paragraphAlignment));
      if (strcmp(name, "li") == 0) {
        // UTF-8 encoded bullet (U+2022)
        self->currentTextBlock->addWord("\xe2\x80\xa2", EpdFontFamily::REGULAR);
      }
    }
  } else if (matches(name, BOLD_TAGS, NUM_BOLD_TAGS)) {
    self->boldUntilDepth = std::min(self->boldUntilDepth, self->depth);
  } else if (matches(name, ITALIC_TAGS, NUM_ITALIC_TAGS)) {
    self->italicUntilDepth = std::min(self->italicUntilDepth, self->depth);
  }

  self->depth += 1;
}
|
|
|
|
// True when `byte` is a UTF-8 continuation byte (bit pattern 10xxxxxx).
static bool isUtf8ContinuationByte(const char byte) { return (static_cast<unsigned char>(byte) & 0xC0) == 0x80; }

// True when `byte` opens a multi-byte UTF-8 sequence (bit pattern 11xxxxxx).
static bool isUtf8LeadByte(const char byte) { return (static_cast<unsigned char>(byte) & 0xC0) == 0xC0; }

// Character-data callback registered with the XML parser.
//
// userData - the ChapterHtmlSlimParser instance driving the parse
// s        - text bytes (not null-terminated)
// len      - number of bytes in `s`
//
// Splits the text into whitespace-separated words, collecting the bytes of the word in
// progress in partWordBuffer and flushing it to currentTextBlock whenever whitespace is
// reached. Nothing is collected while a subtree is being skipped.
void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char* s, const int len) {
  auto* self = static_cast<ChapterHtmlSlimParser*>(userData);

  // Middle of skip
  if (self->skipUntilDepth < self->depth) {
    return;
  }

  // Zero Width No-Break Space / BOM (U+FEFF) = 0xEF 0xBB 0xBF
  const XML_Char FEFF_BYTE_1 = static_cast<XML_Char>(0xEF);
  const XML_Char FEFF_BYTE_2 = static_cast<XML_Char>(0xBB);
  const XML_Char FEFF_BYTE_3 = static_cast<XML_Char>(0xBF);

  for (int i = 0; i < len; i++) {
    if (isWhitespace(s[i])) {
      // Currently looking at whitespace, if there's anything in the partWordBuffer, flush it
      if (self->partWordBufferIndex > 0) {
        self->flushPartWordBuffer();
      }
      // Skip the whitespace char
      continue;
    }

    // Skip U+FEFF when all three of its bytes are present in this chunk
    if (s[i] == FEFF_BYTE_1 && (i + 2 < len) && (s[i + 1] == FEFF_BYTE_2) && (s[i + 2] == FEFF_BYTE_3)) {
      i += 2;  // Skip the next two bytes
      continue;
    }

    // If we're about to run out of space, then cut the word off and start a new one
    if (self->partWordBufferIndex >= MAX_WORD_SIZE) {
      // Never cut inside a multi-byte UTF-8 character: when the incoming byte continues a
      // sequence whose first bytes are already buffered, those bytes are moved over to the
      // next word so that both words stay valid UTF-8. A sequence is at most 4 bytes long,
      // so at most 3 bytes (lead + 2 continuation bytes) ever need to be carried.
      char carried[3];
      int carriedLen = 0;

      if (isUtf8ContinuationByte(s[i])) {
        const int filled = self->partWordBufferIndex;
        int trailing = 0;
        while (trailing < 2 && trailing + 1 < filled &&
               isUtf8ContinuationByte(self->partWordBuffer[filled - 1 - trailing])) {
          trailing++;
        }
        const int leadPos = filled - 1 - trailing;
        // leadPos > 0 keeps at least one byte in the word being flushed
        if (leadPos > 0 && isUtf8LeadByte(self->partWordBuffer[leadPos])) {
          carriedLen = trailing + 1;
          for (int k = 0; k < carriedLen; k++) {
            carried[k] = self->partWordBuffer[leadPos + k];
          }
          self->partWordBufferIndex = leadPos;
        }
      }

      self->flushPartWordBuffer();

      for (int k = 0; k < carriedLen; k++) {
        self->partWordBuffer[self->partWordBufferIndex++] = carried[k];
      }
    }

    self->partWordBuffer[self->partWordBufferIndex++] = s[i];
  }

  // If we have > 750 words buffered up, perform the layout and consume out all but the last line
  // There should be enough here to build out 1-2 full pages and doing this will free up a lot of
  // memory.
  // Spotted when reading Intermezzo, there are some really long text blocks in there.
  if (self->currentTextBlock->size() > 750) {
    Serial.printf("[%lu] [EHP] Text block too long, splitting into multiple pages\n", millis());
    self->currentTextBlock->layoutAndExtractLines(
        self->renderer, self->fontId, self->viewportWidth,
        [self](const std::shared_ptr<TextBlock>& textBlock) { self->addLineToPage(textBlock); }, false);
  }
}
|
|
|
|
// End-tag callback registered with the XML parser.
//
// userData - the ChapterHtmlSlimParser instance driving the parse
// name     - name of the element being closed
//
// Optionally flushes the word in progress, lowers `depth` by one and clears whichever of
// the skip / bold / italic markers were recorded at the depth just returned to.
void XMLCALL ChapterHtmlSlimParser::endElement(void* userData, const XML_Char* name) {
  auto* parser = static_cast<ChapterHtmlSlimParser*>(userData);

  const bool hasPendingWord = parser->partWordBufferIndex > 0;
  if (hasPendingWord) {
    // The pending word is only emitted when a block or header tag closes, or when the
    // outermost element closes; inline tags such as <span> must not break the word.
    // Closing <b>/<i> currently breaks the word as well even though those are inline tags -
    // avoiding that needs an overhaul of the text styling.
    const bool closesBlockLevel =
        matches(name, BLOCK_TAGS, NUM_BLOCK_TAGS) || matches(name, HEADER_TAGS, NUM_HEADER_TAGS);
    const bool closesStyleTag = matches(name, BOLD_TAGS, NUM_BOLD_TAGS) || matches(name, ITALIC_TAGS, NUM_ITALIC_TAGS);
    const bool closesOutermost = parser->depth == 1;

    if (closesBlockLevel || closesStyleTag || closesOutermost) {
      parser->flushPartWordBuffer();
    }
  }

  parser->depth -= 1;

  // A marker equal to the new depth belongs to the element that has just been closed
  if (parser->skipUntilDepth == parser->depth) {
    parser->skipUntilDepth = INT_MAX;  // leaving skip
  }
  if (parser->boldUntilDepth == parser->depth) {
    parser->boldUntilDepth = INT_MAX;  // leaving bold
  }
  if (parser->italicUntilDepth == parser->depth) {
    parser->italicUntilDepth = INT_MAX;  // leaving italic
  }
}
|
|
|
|
bool ChapterHtmlSlimParser::parseAndBuildPages() {
|
|
startNewTextBlock((TextBlock::Style)this->paragraphAlignment);
|
|
|
|
const XML_Parser parser = XML_ParserCreate(nullptr);
|
|
int done;
|
|
|
|
if (!parser) {
|
|
Serial.printf("[%lu] [EHP] Couldn't allocate memory for parser\n", millis());
|
|
return false;
|
|
}
|
|
|
|
FsFile file;
|
|
if (!SdMan.openFileForRead("EHP", filepath, file)) {
|
|
XML_ParserFree(parser);
|
|
return false;
|
|
}
|
|
|
|
// Get file size for progress calculation
|
|
const size_t totalSize = file.size();
|
|
size_t bytesRead = 0;
|
|
int lastProgress = -1;
|
|
|
|
XML_SetUserData(parser, this);
|
|
XML_SetElementHandler(parser, startElement, endElement);
|
|
XML_SetCharacterDataHandler(parser, characterData);
|
|
|
|
do {
|
|
void* const buf = XML_GetBuffer(parser, 1024);
|
|
if (!buf) {
|
|
Serial.printf("[%lu] [EHP] Couldn't allocate memory for buffer\n", millis());
|
|
XML_StopParser(parser, XML_FALSE); // Stop any pending processing
|
|
XML_SetElementHandler(parser, nullptr, nullptr); // Clear callbacks
|
|
XML_SetCharacterDataHandler(parser, nullptr);
|
|
XML_ParserFree(parser);
|
|
file.close();
|
|
return false;
|
|
}
|
|
|
|
const size_t len = file.read(buf, 1024);
|
|
|
|
if (len == 0 && file.available() > 0) {
|
|
Serial.printf("[%lu] [EHP] File read error\n", millis());
|
|
XML_StopParser(parser, XML_FALSE); // Stop any pending processing
|
|
XML_SetElementHandler(parser, nullptr, nullptr); // Clear callbacks
|
|
XML_SetCharacterDataHandler(parser, nullptr);
|
|
XML_ParserFree(parser);
|
|
file.close();
|
|
return false;
|
|
}
|
|
|
|
// Update progress (call every 10% change to avoid too frequent updates)
|
|
// Only show progress for larger chapters where rendering overhead is worth it
|
|
bytesRead += len;
|
|
if (progressFn && totalSize >= MIN_SIZE_FOR_PROGRESS) {
|
|
const int progress = static_cast<int>((bytesRead * 100) / totalSize);
|
|
if (lastProgress / 10 != progress / 10) {
|
|
lastProgress = progress;
|
|
progressFn(progress);
|
|
}
|
|
}
|
|
|
|
done = file.available() == 0;
|
|
|
|
if (XML_ParseBuffer(parser, static_cast<int>(len), done) == XML_STATUS_ERROR) {
|
|
Serial.printf("[%lu] [EHP] Parse error at line %lu:\n%s\n", millis(), XML_GetCurrentLineNumber(parser),
|
|
XML_ErrorString(XML_GetErrorCode(parser)));
|
|
XML_StopParser(parser, XML_FALSE); // Stop any pending processing
|
|
XML_SetElementHandler(parser, nullptr, nullptr); // Clear callbacks
|
|
XML_SetCharacterDataHandler(parser, nullptr);
|
|
XML_ParserFree(parser);
|
|
file.close();
|
|
return false;
|
|
}
|
|
} while (!done);
|
|
|
|
XML_StopParser(parser, XML_FALSE); // Stop any pending processing
|
|
XML_SetElementHandler(parser, nullptr, nullptr); // Clear callbacks
|
|
XML_SetCharacterDataHandler(parser, nullptr);
|
|
XML_ParserFree(parser);
|
|
file.close();
|
|
|
|
// Process last page if there is still text
|
|
if (currentTextBlock) {
|
|
makePages();
|
|
completePageFn(std::move(currentPage));
|
|
currentPage.reset();
|
|
currentTextBlock.reset();
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
void ChapterHtmlSlimParser::addLineToPage(std::shared_ptr<TextBlock> line) {
|
|
const int lineHeight = renderer.getLineHeight(fontId) * lineCompression;
|
|
|
|
if (currentPageNextY + lineHeight > viewportHeight) {
|
|
completePageFn(std::move(currentPage));
|
|
currentPage.reset(new Page());
|
|
currentPageNextY = 0;
|
|
}
|
|
|
|
currentPage->elements.push_back(std::make_shared<PageLine>(line, 0, currentPageNextY));
|
|
currentPageNextY += lineHeight;
|
|
}
|
|
|
|
void ChapterHtmlSlimParser::makePages() {
|
|
if (!currentTextBlock) {
|
|
Serial.printf("[%lu] [EHP] !! No text block to make pages for !!\n", millis());
|
|
return;
|
|
}
|
|
|
|
if (!currentPage) {
|
|
currentPage.reset(new Page());
|
|
currentPageNextY = 0;
|
|
}
|
|
|
|
const int lineHeight = renderer.getLineHeight(fontId) * lineCompression;
|
|
currentTextBlock->layoutAndExtractLines(
|
|
renderer, fontId, viewportWidth,
|
|
[this](const std::shared_ptr<TextBlock>& textBlock) { addLineToPage(textBlock); });
|
|
// Extra paragraph spacing if enabled
|
|
if (extraParagraphSpacing) {
|
|
currentPageNextY += lineHeight / 2;
|
|
}
|
|
}
|