#include "ChapterHtmlSlimParser.h"

// NOTE(review): the names of the three angle-bracket project/library headers below were missing from the
// reviewed copy of this file. They are restored from what the code uses (GfxRenderer, Serial, expat's XML_*)
// - confirm against the build.
#include <GfxRenderer.h>
#include <HardwareSerial.h>
#include <expat.h>

#include <climits>
#include <cstdio>
#include <cstring>
#include <memory>

#include "../Page.h"
#include "../htmlEntities.h"

// Tag-name tables used to classify elements while parsing.
const char* HEADER_TAGS[] = {"h1", "h2", "h3", "h4", "h5", "h6"};
constexpr int NUM_HEADER_TAGS = sizeof(HEADER_TAGS) / sizeof(HEADER_TAGS[0]);

const char* BLOCK_TAGS[] = {"p", "li", "div", "br"};
constexpr int NUM_BLOCK_TAGS = sizeof(BLOCK_TAGS) / sizeof(BLOCK_TAGS[0]);

const char* BOLD_TAGS[] = {"b"};
constexpr int NUM_BOLD_TAGS = sizeof(BOLD_TAGS) / sizeof(BOLD_TAGS[0]);

const char* ITALIC_TAGS[] = {"i"};
constexpr int NUM_ITALIC_TAGS = sizeof(ITALIC_TAGS) / sizeof(ITALIC_TAGS[0]);

const char* IMAGE_TAGS[] = {"img"};
constexpr int NUM_IMAGE_TAGS = sizeof(IMAGE_TAGS) / sizeof(IMAGE_TAGS[0]);

const char* SKIP_TAGS[] = {"head", "table"};
constexpr int NUM_SKIP_TAGS = sizeof(SKIP_TAGS) / sizeof(SKIP_TAGS[0]);

namespace {

// Number of bytes requested from expat / read from the file per parse step.
constexpr int kParseChunkSize = 1024;

// Deleter so a std::unique_ptr can own an expat parser handle.
struct XmlParserDeleter {
  void operator()(XML_Parser parser) const { XML_ParserFree(parser); }
};

// Deleter so a std::unique_ptr can own a C stdio FILE handle.
struct FileCloser {
  void operator()(FILE* file) const { fclose(file); }
};

// Returns true when the expat attribute list (name/value pairs terminated by a null entry) contains
// role="doc-pagebreak" or epub:type="pagebreak". A null list has no such attribute.
bool hasPageBreakAttribute(const XML_Char** atts) {
  if (atts == nullptr) {
    return false;
  }

  for (int i = 0; atts[i]; i += 2) {
    const bool isRolePageBreak = (strcmp(atts[i], "role") == 0) && (strcmp(atts[i + 1], "doc-pagebreak") == 0);
    const bool isEpubPageBreak = (strcmp(atts[i], "epub:type") == 0) && (strcmp(atts[i + 1], "pagebreak") == 0);
    if (isRolePageBreak || isEpubPageBreak) {
      return true;
    }
  }
  return false;
}

// Picks the font style for content at `depth`. A style is active while its "until depth" marker is
// smaller than the current depth; an inactive marker holds INT_MAX and therefore never compares smaller.
EpdFontStyle resolveFontStyle(const int depth, const int boldUntilDepth, const int italicUntilDepth) {
  const bool bold = boldUntilDepth < depth;
  const bool italic = italicUntilDepth < depth;

  if (bold && italic) {
    return BOLD_ITALIC;
  }
  if (bold) {
    return BOLD;
  }
  if (italic) {
    return ITALIC;
  }
  return REGULAR;
}

// Terminates the first `length` bytes of `buffer` as a C string, runs them through replaceHtmlEntities and
// appends the result to `block` with the given style. The caller is responsible for resetting its buffer
// index afterwards.
template <typename Block>
void emitWord(Block& block, char* buffer, const int length, const EpdFontStyle style) {
  buffer[length] = '\0';
  auto word = replaceHtmlEntities(buffer);
  block.addWord(std::move(word), style);
}

}  // namespace

// Returns true for the characters treated as word separators: space, CR, LF and tab.
bool isWhitespace(const char c) { return c == ' ' || c == '\r' || c == '\n' || c == '\t'; }

// Returns true when the NUL-terminated tag_name is exactly equal (case-sensitive strcmp) to one of the first
// possible_tag_count entries of possible_tags.
bool matches(const char* tag_name, const char* possible_tags[], const int possible_tag_count) {
  for (int i = 0; i < possible_tag_count; i++) {
    if (strcmp(tag_name, possible_tags[i]) == 0) {
      return true;
    }
  }
  return false;
}

// Start a new text block if needed.
// An existing block that is still empty is reused with the new style; a non-empty one is first laid out into
// pages via makePages() and then replaced.
void ChapterHtmlSlimParser::startNewTextBlock(const TextBlock::BLOCK_STYLE style) {
  if (currentTextBlock) {
    // already have a text block running and it is empty - just reuse it
    if (currentTextBlock->isEmpty()) {
      currentTextBlock->setStyle(style);
      return;
    }

    makePages();
  }
  currentTextBlock.reset(new ParsedText(style, extraParagraphSpacing));
}

// expat start-tag handler. userData is the ChapterHtmlSlimParser registered in parseAndBuildPages().
// Every call increases depth by one. Depending on the tag it may also begin a skipped subtree, start a new
// text block, or mark the start of a bold/italic region.
void XMLCALL ChapterHtmlSlimParser::startElement(void* userData, const XML_Char* name, const XML_Char** atts) {
  auto* self = static_cast<ChapterHtmlSlimParser*>(userData);

  // Middle of skip: only keep the depth bookkeeping in sync.
  if (self->skipUntilDepth < self->depth) {
    self->depth += 1;
    return;
  }

  // Images (TODO: Start processing image tags), the tags in SKIP_TAGS, and elements marked with
  // role="doc-pagebreak" / epub:type="pagebreak" are skipped together with everything nested inside them.
  const bool startsSkip = matches(name, IMAGE_TAGS, NUM_IMAGE_TAGS) || matches(name, SKIP_TAGS, NUM_SKIP_TAGS) ||
                          hasPageBreakAttribute(atts);
  if (startsSkip) {
    self->skipUntilDepth = self->depth;
    self->depth += 1;
    return;
  }

  // Unqualified min is kept exactly as the file had it; it is resolved by the platform headers.
  if (matches(name, HEADER_TAGS, NUM_HEADER_TAGS)) {
    // Headers are centred and rendered bold.
    self->startNewTextBlock(TextBlock::CENTER_ALIGN);
    self->boldUntilDepth = min(self->boldUntilDepth, self->depth);
  } else if (matches(name, BLOCK_TAGS, NUM_BLOCK_TAGS)) {
    if (strcmp(name, "br") == 0) {
      // A line break continues with the style of the block it interrupts. currentTextBlock is non-null here
      // because parseAndBuildPages() creates a block before parsing starts and startNewTextBlock() always
      // leaves one in place.
      self->startNewTextBlock(self->currentTextBlock->getStyle());
    } else {
      self->startNewTextBlock(TextBlock::JUSTIFIED);
    }
  } else if (matches(name, BOLD_TAGS, NUM_BOLD_TAGS)) {
    self->boldUntilDepth = min(self->boldUntilDepth, self->depth);
  } else if (matches(name, ITALIC_TAGS, NUM_ITALIC_TAGS)) {
    self->italicUntilDepth = min(self->italicUntilDepth, self->depth);
  }

  self->depth += 1;
}

// expat character-data handler. Splits the len bytes at s into whitespace-separated words and appends them
// to the current text block. A word may continue across calls: its bytes stay in partWordBuffer until
// whitespace, a full buffer, or a flushing end tag completes it.
void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char* s, const int len) {
  auto* self = static_cast<ChapterHtmlSlimParser*>(userData);

  // Middle of skip
  if (self->skipUntilDepth < self->depth) {
    return;
  }

  const EpdFontStyle fontStyle = resolveFontStyle(self->depth, self->boldUntilDepth, self->italicUntilDepth);

  for (int i = 0; i < len; i++) {
    if (isWhitespace(s[i])) {
      // Currently looking at whitespace, if there's anything in the partWordBuffer, flush it
      if (self->partWordBufferIndex > 0) {
        emitWord(*self->currentTextBlock, self->partWordBuffer, self->partWordBufferIndex, fontStyle);
        self->partWordBufferIndex = 0;
      }
      // Skip the whitespace char
      continue;
    }

    // If we're about to run out of space, then cut the word off and start a new one.
    // NOTE(review): the cut is byte-based, so it could fall inside a multi-byte UTF-8 sequence - confirm
    // whether that matters for rendering.
    if (self->partWordBufferIndex >= MAX_WORD_SIZE) {
      emitWord(*self->currentTextBlock, self->partWordBuffer, self->partWordBufferIndex, fontStyle);
      self->partWordBufferIndex = 0;
    }

    self->partWordBuffer[self->partWordBufferIndex++] = s[i];
  }
}

// expat end-tag handler. Flushes a pending partial word when the closing tag ends a run of text, decreases
// depth by one, and clears any skip/bold/italic marker that was set at the depth being returned to.
void XMLCALL ChapterHtmlSlimParser::endElement(void* userData, const XML_Char* name) {
  auto* self = static_cast<ChapterHtmlSlimParser*>(userData);

  if (self->partWordBufferIndex > 0) {
    // Only flush out part word buffer if we're closing a block tag or are at the top of the HTML file.
    // We don't want to flush out content when closing inline tags like <span>.
    // Currently this also flushes out on closing <b> and <i> tags, but they are inline tags so that
    // shouldn't happen, text styling needs to be overhauled to fix it.
    const bool shouldBreakText = matches(name, BLOCK_TAGS, NUM_BLOCK_TAGS) ||
                                 matches(name, HEADER_TAGS, NUM_HEADER_TAGS) ||
                                 matches(name, BOLD_TAGS, NUM_BOLD_TAGS) ||
                                 matches(name, ITALIC_TAGS, NUM_ITALIC_TAGS) || self->depth == 1;

    if (shouldBreakText) {
      // The style is resolved before depth is decremented, so the word still gets the style of the element
      // that is being closed.
      const EpdFontStyle fontStyle =
          resolveFontStyle(self->depth, self->boldUntilDepth, self->italicUntilDepth);
      emitWord(*self->currentTextBlock, self->partWordBuffer, self->partWordBufferIndex, fontStyle);
      self->partWordBufferIndex = 0;
    }
  }

  self->depth -= 1;

  // Leaving skip
  if (self->skipUntilDepth == self->depth) {
    self->skipUntilDepth = INT_MAX;
  }

  // Leaving bold
  if (self->boldUntilDepth == self->depth) {
    self->boldUntilDepth = INT_MAX;
  }

  // Leaving italic
  if (self->italicUntilDepth == self->depth) {
    self->italicUntilDepth = INT_MAX;
  }
}

// Parses the file at filepath with expat in kParseChunkSize-byte steps and hands each completed page to
// completePageFn.
// Returns false (after logging to Serial) if the parser or its buffer cannot be allocated, the file cannot be
// opened or read, or expat reports a parse error; returns true otherwise.
bool ChapterHtmlSlimParser::parseAndBuildPages() {
  startNewTextBlock(TextBlock::JUSTIFIED);

  // Both handles are owned by unique_ptrs so that every early return below releases them.
  std::unique_ptr<XML_ParserStruct, XmlParserDeleter> parser(XML_ParserCreate(nullptr));
  if (!parser) {
    Serial.printf("[%lu] [EHP] Couldn't allocate memory for parser\n", millis());
    return false;
  }

  XML_SetUserData(parser.get(), this);
  XML_SetElementHandler(parser.get(), startElement, endElement);
  XML_SetCharacterDataHandler(parser.get(), characterData);

  std::unique_ptr<FILE, FileCloser> file(fopen(filepath, "r"));
  if (!file) {
    Serial.printf("[%lu] [EHP] Couldn't open file %s\n", millis(), filepath);
    return false;
  }

  int done = 0;
  do {
    void* const buf = XML_GetBuffer(parser.get(), kParseChunkSize);
    if (!buf) {
      Serial.printf("[%lu] [EHP] Couldn't allocate memory for buffer\n", millis());
      return false;
    }

    const size_t len = fread(buf, 1, kParseChunkSize, file.get());

    if (ferror(file.get())) {
      Serial.printf("[%lu] [EHP] File read error\n", millis());
      return false;
    }

    // Reaching end of file marks this chunk as the final one for expat.
    done = feof(file.get());

    if (XML_ParseBuffer(parser.get(), static_cast<int>(len), done) == XML_STATUS_ERROR) {
      Serial.printf("[%lu] [EHP] Parse error at line %lu:\n%s\n", millis(), XML_GetCurrentLineNumber(parser.get()),
                    XML_ErrorString(XML_GetErrorCode(parser.get())));
      return false;
    }
  } while (!done);

  // Release the parser and the file before the last page is laid out and handed over, as the original
  // sequence did.
  parser.reset();
  file.reset();

  // Process last page if there is still text
  if (currentTextBlock) {
    makePages();
    completePageFn(std::move(currentPage));
    currentPage.reset();
    currentTextBlock.reset();
  }

  return true;
}

// Appends one laid-out line to the current page at the current Y position. If the line does not fit, the
// current page is first handed to completePageFn and a fresh page is started at marginTop.
// Expects currentPage to be non-null; makePages() creates it before any line is added.
// NOTE(review): the element type PageLine and the TextBlock template argument were missing from the reviewed
// copy and are restored here - confirm against the header.
void ChapterHtmlSlimParser::addLineToPage(std::shared_ptr<TextBlock> line) {
  const int lineHeight = static_cast<int>(renderer.getLineHeight(fontId) * lineCompression);
  // NOTE(review): currentPageNextY starts at marginTop, yet marginTop is also subtracted from this limit, so
  // the top margin looks like it is counted twice - confirm whether that is intended.
  const int pageHeight = GfxRenderer::getScreenHeight() - marginTop - marginBottom;

  if (currentPageNextY + lineHeight > pageHeight) {
    completePageFn(std::move(currentPage));
    currentPage.reset(new Page());
    currentPageNextY = marginTop;
  }

  currentPage->elements.push_back(std::make_shared<PageLine>(line, marginLeft, currentPageNextY));
  currentPageNextY += lineHeight;
}

// Lays out the current text block and adds each resulting line to the page via addLineToPage(), creating the
// first page on demand. Logs and returns without doing anything when there is no current text block.
void ChapterHtmlSlimParser::makePages() {
  if (!currentTextBlock) {
    Serial.printf("[%lu] [EHP] !! No text block to make pages for !!\n", millis());
    return;
  }

  if (!currentPage) {
    currentPage.reset(new Page());
    currentPageNextY = marginTop;
  }

  const int lineHeight = static_cast<int>(renderer.getLineHeight(fontId) * lineCompression);
  currentTextBlock->layoutAndExtractLines(
      renderer, fontId, marginLeft + marginRight,
      [this](const std::shared_ptr<TextBlock>& textBlock) { addLineToPage(textBlock); });

  // Extra paragraph spacing if enabled
  if (extraParagraphSpacing) {
    currentPageNextY += lineHeight / 2;
  }
}