crosspoint-reader/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp
Dave Allie b9e46641cb
Skip pagebreak blocks when parsing epub file
These blocks break the flow and often contain the page number in them which
should not interrupt the flow of the content
2025-12-19 01:05:39 +11:00

294 lines
9.3 KiB
C++

#include "ChapterHtmlSlimParser.h"
#include <GfxRenderer.h>
#include <HardwareSerial.h>
#include <expat.h>
#include "../Page.h"
#include "../htmlEntities.h"
const char* HEADER_TAGS[] = {"h1", "h2", "h3", "h4", "h5", "h6"};
constexpr int NUM_HEADER_TAGS = sizeof(HEADER_TAGS) / sizeof(HEADER_TAGS[0]);
const char* BLOCK_TAGS[] = {"p", "li", "div", "br"};
constexpr int NUM_BLOCK_TAGS = sizeof(BLOCK_TAGS) / sizeof(BLOCK_TAGS[0]);
const char* BOLD_TAGS[] = {"b"};
constexpr int NUM_BOLD_TAGS = sizeof(BOLD_TAGS) / sizeof(BOLD_TAGS[0]);
const char* ITALIC_TAGS[] = {"i"};
constexpr int NUM_ITALIC_TAGS = sizeof(ITALIC_TAGS) / sizeof(ITALIC_TAGS[0]);
const char* IMAGE_TAGS[] = {"img"};
constexpr int NUM_IMAGE_TAGS = sizeof(IMAGE_TAGS) / sizeof(IMAGE_TAGS[0]);
const char* SKIP_TAGS[] = {"head", "table"};
constexpr int NUM_SKIP_TAGS = sizeof(SKIP_TAGS) / sizeof(SKIP_TAGS[0]);
bool isWhitespace(const char c) { return c == ' ' || c == '\r' || c == '\n' || c == '\t'; }
// given the start and end of a tag, check to see if it matches a known tag
bool matches(const char* tag_name, const char* possible_tags[], const int possible_tag_count) {
for (int i = 0; i < possible_tag_count; i++) {
if (strcmp(tag_name, possible_tags[i]) == 0) {
return true;
}
}
return false;
}
// start a new text block if needed
void ChapterHtmlSlimParser::startNewTextBlock(const TextBlock::BLOCK_STYLE style) {
if (currentTextBlock) {
// already have a text block running and it is empty - just reuse it
if (currentTextBlock->isEmpty()) {
currentTextBlock->setStyle(style);
return;
}
makePages();
}
currentTextBlock.reset(new ParsedText(style, extraParagraphSpacing));
}
void XMLCALL ChapterHtmlSlimParser::startElement(void* userData, const XML_Char* name, const XML_Char** atts) {
auto* self = static_cast<ChapterHtmlSlimParser*>(userData);
(void)atts;
// Middle of skip
if (self->skipUntilDepth < self->depth) {
self->depth += 1;
return;
}
if (matches(name, IMAGE_TAGS, NUM_IMAGE_TAGS)) {
// TODO: Start processing image tags
self->skipUntilDepth = self->depth;
self->depth += 1;
return;
}
if (matches(name, SKIP_TAGS, NUM_SKIP_TAGS)) {
// start skip
self->skipUntilDepth = self->depth;
self->depth += 1;
return;
}
// Skip blocks with role="doc-pagebreak" and epub:type="pagebreak"
if (atts != nullptr) {
for (int i = 0; atts[i]; i += 2) {
if (strcmp(atts[i], "role") == 0 && strcmp(atts[i + 1], "doc-pagebreak") == 0 ||
strcmp(atts[i], "epub:type") == 0 && strcmp(atts[i + 1], "pagebreak") == 0) {
self->skipUntilDepth = self->depth;
self->depth += 1;
return;
}
}
}
if (matches(name, HEADER_TAGS, NUM_HEADER_TAGS)) {
self->startNewTextBlock(TextBlock::CENTER_ALIGN);
self->boldUntilDepth = min(self->boldUntilDepth, self->depth);
} else if (matches(name, BLOCK_TAGS, NUM_BLOCK_TAGS)) {
if (strcmp(name, "br") == 0) {
self->startNewTextBlock(self->currentTextBlock->getStyle());
} else {
self->startNewTextBlock(TextBlock::JUSTIFIED);
}
} else if (matches(name, BOLD_TAGS, NUM_BOLD_TAGS)) {
self->boldUntilDepth = min(self->boldUntilDepth, self->depth);
} else if (matches(name, ITALIC_TAGS, NUM_ITALIC_TAGS)) {
self->italicUntilDepth = min(self->italicUntilDepth, self->depth);
}
self->depth += 1;
}
void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char* s, const int len) {
auto* self = static_cast<ChapterHtmlSlimParser*>(userData);
// Middle of skip
if (self->skipUntilDepth < self->depth) {
return;
}
EpdFontStyle fontStyle = REGULAR;
if (self->boldUntilDepth < self->depth && self->italicUntilDepth < self->depth) {
fontStyle = BOLD_ITALIC;
} else if (self->boldUntilDepth < self->depth) {
fontStyle = BOLD;
} else if (self->italicUntilDepth < self->depth) {
fontStyle = ITALIC;
}
for (int i = 0; i < len; i++) {
if (isWhitespace(s[i])) {
// Currently looking at whitespace, if there's anything in the partWordBuffer, flush it
if (self->partWordBufferIndex > 0) {
self->partWordBuffer[self->partWordBufferIndex] = '\0';
self->currentTextBlock->addWord(std::move(replaceHtmlEntities(self->partWordBuffer)), fontStyle);
self->partWordBufferIndex = 0;
}
// Skip the whitespace char
continue;
}
// If we're about to run out of space, then cut the word off and start a new one
if (self->partWordBufferIndex >= MAX_WORD_SIZE) {
self->partWordBuffer[self->partWordBufferIndex] = '\0';
self->currentTextBlock->addWord(std::move(replaceHtmlEntities(self->partWordBuffer)), fontStyle);
self->partWordBufferIndex = 0;
}
self->partWordBuffer[self->partWordBufferIndex++] = s[i];
}
}
void XMLCALL ChapterHtmlSlimParser::endElement(void* userData, const XML_Char* name) {
auto* self = static_cast<ChapterHtmlSlimParser*>(userData);
(void)name;
if (self->partWordBufferIndex > 0) {
// Only flush out part word buffer if we're closing a block tag or are at the top of the HTML file.
// We don't want to flush out content when closing inline tags like <span>.
// Currently this also flushes out on closing <b> and <i> tags, but they are line tags so that shouldn't happen,
// text styling needs to be overhauled to fix it.
const bool shouldBreakText =
matches(name, BLOCK_TAGS, NUM_BLOCK_TAGS) || matches(name, HEADER_TAGS, NUM_HEADER_TAGS) ||
matches(name, BOLD_TAGS, NUM_BOLD_TAGS) || matches(name, ITALIC_TAGS, NUM_ITALIC_TAGS) || self->depth == 1;
if (shouldBreakText) {
EpdFontStyle fontStyle = REGULAR;
if (self->boldUntilDepth < self->depth && self->italicUntilDepth < self->depth) {
fontStyle = BOLD_ITALIC;
} else if (self->boldUntilDepth < self->depth) {
fontStyle = BOLD;
} else if (self->italicUntilDepth < self->depth) {
fontStyle = ITALIC;
}
self->partWordBuffer[self->partWordBufferIndex] = '\0';
self->currentTextBlock->addWord(std::move(replaceHtmlEntities(self->partWordBuffer)), fontStyle);
self->partWordBufferIndex = 0;
}
}
self->depth -= 1;
// Leaving skip
if (self->skipUntilDepth == self->depth) {
self->skipUntilDepth = INT_MAX;
}
// Leaving bold
if (self->boldUntilDepth == self->depth) {
self->boldUntilDepth = INT_MAX;
}
// Leaving italic
if (self->italicUntilDepth == self->depth) {
self->italicUntilDepth = INT_MAX;
}
}
bool ChapterHtmlSlimParser::parseAndBuildPages() {
startNewTextBlock(TextBlock::JUSTIFIED);
const XML_Parser parser = XML_ParserCreate(nullptr);
int done;
if (!parser) {
Serial.printf("[%lu] [EHP] Couldn't allocate memory for parser\n", millis());
return false;
}
XML_SetUserData(parser, this);
XML_SetElementHandler(parser, startElement, endElement);
XML_SetCharacterDataHandler(parser, characterData);
FILE* file = fopen(filepath, "r");
if (!file) {
Serial.printf("[%lu] [EHP] Couldn't open file %s\n", millis(), filepath);
XML_ParserFree(parser);
return false;
}
do {
void* const buf = XML_GetBuffer(parser, 1024);
if (!buf) {
Serial.printf("[%lu] [EHP] Couldn't allocate memory for buffer\n", millis());
XML_ParserFree(parser);
fclose(file);
return false;
}
const size_t len = fread(buf, 1, 1024, file);
if (ferror(file)) {
Serial.printf("[%lu] [EHP] File read error\n", millis());
XML_ParserFree(parser);
fclose(file);
return false;
}
done = feof(file);
if (XML_ParseBuffer(parser, static_cast<int>(len), done) == XML_STATUS_ERROR) {
Serial.printf("[%lu] [EHP] Parse error at line %lu:\n%s\n", millis(), XML_GetCurrentLineNumber(parser),
XML_ErrorString(XML_GetErrorCode(parser)));
XML_ParserFree(parser);
fclose(file);
return false;
}
} while (!done);
XML_ParserFree(parser);
fclose(file);
// Process last page if there is still text
if (currentTextBlock) {
makePages();
completePageFn(std::move(currentPage));
currentPage.reset();
currentTextBlock.reset();
}
return true;
}
void ChapterHtmlSlimParser::addLineToPage(std::shared_ptr<TextBlock> line) {
const int lineHeight = renderer.getLineHeight(fontId) * lineCompression;
const int pageHeight = GfxRenderer::getScreenHeight() - marginTop - marginBottom;
if (currentPageNextY + lineHeight > pageHeight) {
completePageFn(std::move(currentPage));
currentPage.reset(new Page());
currentPageNextY = marginTop;
}
currentPage->elements.push_back(std::make_shared<PageLine>(line, marginLeft, currentPageNextY));
currentPageNextY += lineHeight;
}
void ChapterHtmlSlimParser::makePages() {
if (!currentTextBlock) {
Serial.printf("[%lu] [EHP] !! No text block to make pages for !!\n", millis());
return;
}
if (!currentPage) {
currentPage.reset(new Page());
currentPageNextY = marginTop;
}
const int lineHeight = renderer.getLineHeight(fontId) * lineCompression;
currentTextBlock->layoutAndExtractLines(
renderer, fontId, marginLeft + marginRight,
[this](const std::shared_ptr<TextBlock>& textBlock) { addLineToPage(textBlock); });
// Extra paragraph spacing if enabled
if (extraParagraphSpacing) {
currentPageNextY += lineHeight / 2;
}
}