Remove tinyxml2 dependency replace with expat parsers (#9)

This commit is contained in:
Dave Allie
2025-12-13 19:36:01 +11:00
committed by GitHub
parent d450f362d1
commit c7a32fe41f
17 changed files with 696 additions and 253 deletions

View File

@@ -0,0 +1,279 @@
#include "ChapterHtmlSlimParser.h"
#include <GfxRenderer.h>
#include <HardwareSerial.h>
#include <expat.h>
#include "../Page.h"
#include "../htmlEntities.h"
const char* HEADER_TAGS[] = {"h1", "h2", "h3", "h4", "h5", "h6"};
constexpr int NUM_HEADER_TAGS = sizeof(HEADER_TAGS) / sizeof(HEADER_TAGS[0]);
const char* BLOCK_TAGS[] = {"p", "li", "div", "br"};
constexpr int NUM_BLOCK_TAGS = sizeof(BLOCK_TAGS) / sizeof(BLOCK_TAGS[0]);
const char* BOLD_TAGS[] = {"b"};
constexpr int NUM_BOLD_TAGS = sizeof(BOLD_TAGS) / sizeof(BOLD_TAGS[0]);
const char* ITALIC_TAGS[] = {"i"};
constexpr int NUM_ITALIC_TAGS = sizeof(ITALIC_TAGS) / sizeof(ITALIC_TAGS[0]);
const char* IMAGE_TAGS[] = {"img"};
constexpr int NUM_IMAGE_TAGS = sizeof(IMAGE_TAGS) / sizeof(IMAGE_TAGS[0]);
const char* SKIP_TAGS[] = {"head", "table"};
constexpr int NUM_SKIP_TAGS = sizeof(SKIP_TAGS) / sizeof(SKIP_TAGS[0]);
bool isWhitespace(const char c) { return c == ' ' || c == '\r' || c == '\n'; }
// given the start and end of a tag, check to see if it matches a known tag
bool matches(const char* tag_name, const char* possible_tags[], const int possible_tag_count) {
for (int i = 0; i < possible_tag_count; i++) {
if (strcmp(tag_name, possible_tags[i]) == 0) {
return true;
}
}
return false;
}
// start a new text block if needed
void ChapterHtmlSlimParser::startNewTextBlock(const TextBlock::BLOCK_STYLE style) {
if (currentTextBlock) {
// already have a text block running and it is empty - just reuse it
if (currentTextBlock->isEmpty()) {
currentTextBlock->setStyle(style);
return;
}
makePages();
}
currentTextBlock.reset(new ParsedText(style));
}
void XMLCALL ChapterHtmlSlimParser::startElement(void* userData, const XML_Char* name, const XML_Char** atts) {
auto* self = static_cast<ChapterHtmlSlimParser*>(userData);
(void)atts;
// Middle of skip
if (self->skipUntilDepth < self->depth) {
self->depth += 1;
return;
}
if (matches(name, IMAGE_TAGS, NUM_IMAGE_TAGS)) {
// TODO: Start processing image tags
self->skipUntilDepth = self->depth;
self->depth += 1;
return;
}
if (matches(name, SKIP_TAGS, NUM_SKIP_TAGS)) {
// start skip
self->skipUntilDepth = self->depth;
self->depth += 1;
return;
}
if (matches(name, HEADER_TAGS, NUM_HEADER_TAGS)) {
self->startNewTextBlock(TextBlock::CENTER_ALIGN);
self->boldUntilDepth = min(self->boldUntilDepth, self->depth);
} else if (matches(name, BLOCK_TAGS, NUM_BLOCK_TAGS)) {
if (strcmp(name, "br") == 0) {
self->startNewTextBlock(self->currentTextBlock->getStyle());
} else {
self->startNewTextBlock(TextBlock::JUSTIFIED);
}
} else if (matches(name, BOLD_TAGS, NUM_BOLD_TAGS)) {
self->boldUntilDepth = min(self->boldUntilDepth, self->depth);
} else if (matches(name, ITALIC_TAGS, NUM_ITALIC_TAGS)) {
self->italicUntilDepth = min(self->italicUntilDepth, self->depth);
}
self->depth += 1;
}
void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char* s, const int len) {
auto* self = static_cast<ChapterHtmlSlimParser*>(userData);
// Middle of skip
if (self->skipUntilDepth < self->depth) {
return;
}
EpdFontStyle fontStyle = REGULAR;
if (self->boldUntilDepth < self->depth && self->italicUntilDepth < self->depth) {
fontStyle = BOLD_ITALIC;
} else if (self->boldUntilDepth < self->depth) {
fontStyle = BOLD;
} else if (self->italicUntilDepth < self->depth) {
fontStyle = ITALIC;
}
for (int i = 0; i < len; i++) {
if (isWhitespace(s[i])) {
// Currently looking at whitespace, if there's anything in the partWordBuffer, flush it
if (self->partWordBufferIndex > 0) {
self->partWordBuffer[self->partWordBufferIndex] = '\0';
self->currentTextBlock->addWord(std::move(replaceHtmlEntities(self->partWordBuffer)), fontStyle);
self->partWordBufferIndex = 0;
}
// Skip the whitespace char
continue;
}
// If we're about to run out of space, then cut the word off and start a new one
if (self->partWordBufferIndex >= MAX_WORD_SIZE) {
self->partWordBuffer[self->partWordBufferIndex] = '\0';
self->currentTextBlock->addWord(std::move(replaceHtmlEntities(self->partWordBuffer)), fontStyle);
self->partWordBufferIndex = 0;
}
self->partWordBuffer[self->partWordBufferIndex++] = s[i];
}
}
void XMLCALL ChapterHtmlSlimParser::endElement(void* userData, const XML_Char* name) {
auto* self = static_cast<ChapterHtmlSlimParser*>(userData);
(void)name;
if (self->partWordBufferIndex > 0) {
// Only flush out part word buffer if we're closing a block tag or are at the top of the HTML file.
// We don't want to flush out content when closing inline tags like <span>.
// Currently this also flushes out on closing <b> and <i> tags, but they are line tags so that shouldn't happen,
// text styling needs to be overhauled to fix it.
const bool shouldBreakText =
matches(name, BLOCK_TAGS, NUM_BLOCK_TAGS) || matches(name, HEADER_TAGS, NUM_HEADER_TAGS) ||
matches(name, BOLD_TAGS, NUM_BOLD_TAGS) || matches(name, ITALIC_TAGS, NUM_ITALIC_TAGS) || self->depth == 1;
if (shouldBreakText) {
EpdFontStyle fontStyle = REGULAR;
if (self->boldUntilDepth < self->depth && self->italicUntilDepth < self->depth) {
fontStyle = BOLD_ITALIC;
} else if (self->boldUntilDepth < self->depth) {
fontStyle = BOLD;
} else if (self->italicUntilDepth < self->depth) {
fontStyle = ITALIC;
}
self->partWordBuffer[self->partWordBufferIndex] = '\0';
self->currentTextBlock->addWord(std::move(replaceHtmlEntities(self->partWordBuffer)), fontStyle);
self->partWordBufferIndex = 0;
}
}
self->depth -= 1;
// Leaving skip
if (self->skipUntilDepth == self->depth) {
self->skipUntilDepth = INT_MAX;
}
// Leaving bold
if (self->boldUntilDepth == self->depth) {
self->boldUntilDepth = INT_MAX;
}
// Leaving italic
if (self->italicUntilDepth == self->depth) {
self->italicUntilDepth = INT_MAX;
}
}
bool ChapterHtmlSlimParser::parseAndBuildPages() {
startNewTextBlock(TextBlock::JUSTIFIED);
const XML_Parser parser = XML_ParserCreate(nullptr);
int done;
if (!parser) {
Serial.printf("[%lu] [EHP] Couldn't allocate memory for parser\n", millis());
return false;
}
XML_SetUserData(parser, this);
XML_SetElementHandler(parser, startElement, endElement);
XML_SetCharacterDataHandler(parser, characterData);
FILE* file = fopen(filepath, "r");
if (!file) {
Serial.printf("[%lu] [EHP] Couldn't open file %s\n", millis(), filepath);
XML_ParserFree(parser);
return false;
}
do {
void* const buf = XML_GetBuffer(parser, 1024);
if (!buf) {
Serial.printf("[%lu] [EHP] Couldn't allocate memory for buffer\n", millis());
XML_ParserFree(parser);
fclose(file);
return false;
}
const size_t len = fread(buf, 1, 1024, file);
if (ferror(file)) {
Serial.printf("[%lu] [EHP] File read error\n", millis());
XML_ParserFree(parser);
fclose(file);
return false;
}
done = feof(file);
if (XML_ParseBuffer(parser, static_cast<int>(len), done) == XML_STATUS_ERROR) {
Serial.printf("[%lu] [EHP] Parse error at line %lu:\n%s\n", millis(), XML_GetCurrentLineNumber(parser),
XML_ErrorString(XML_GetErrorCode(parser)));
XML_ParserFree(parser);
fclose(file);
return false;
}
} while (!done);
XML_ParserFree(parser);
fclose(file);
// Process last page if there is still text
if (currentTextBlock) {
makePages();
completePageFn(std::move(currentPage));
currentPage.reset();
currentTextBlock.reset();
}
return true;
}
void ChapterHtmlSlimParser::makePages() {
if (!currentTextBlock) {
Serial.printf("[%lu] [EHP] !! No text block to make pages for !!\n", millis());
return;
}
if (!currentPage) {
currentPage.reset(new Page());
currentPageNextY = marginTop;
}
const int lineHeight = renderer.getLineHeight(fontId) * lineCompression;
const int pageHeight = GfxRenderer::getScreenHeight() - marginTop - marginBottom;
// Long running task, make sure to let other things happen
vTaskDelay(1);
const auto lines = currentTextBlock->layoutAndExtractLines(renderer, fontId, marginLeft + marginRight);
for (auto&& line : lines) {
if (currentPageNextY + lineHeight > pageHeight) {
completePageFn(std::move(currentPage));
currentPage.reset(new Page());
currentPageNextY = marginTop;
}
currentPage->elements.push_back(std::make_shared<PageLine>(line, marginLeft, currentPageNextY));
currentPageNextY += lineHeight;
}
// add some extra line between blocks
currentPageNextY += lineHeight / 2;
}

View File

@@ -0,0 +1,62 @@
#pragma once
#include <expat.h>
#include <climits>
#include <functional>
#include <memory>
#include "../ParsedText.h"
#include "../blocks/TextBlock.h"
class Page;
class GfxRenderer;
#define MAX_WORD_SIZE 200
class ChapterHtmlSlimParser {
const char* filepath;
GfxRenderer& renderer;
std::function<void(std::unique_ptr<Page>)> completePageFn;
int depth = 0;
int skipUntilDepth = INT_MAX;
int boldUntilDepth = INT_MAX;
int italicUntilDepth = INT_MAX;
// buffer for building up words from characters, will auto break if longer than this
// leave one char at end for null pointer
char partWordBuffer[MAX_WORD_SIZE + 1] = {};
int partWordBufferIndex = 0;
std::unique_ptr<ParsedText> currentTextBlock = nullptr;
std::unique_ptr<Page> currentPage = nullptr;
int16_t currentPageNextY = 0;
int fontId;
float lineCompression;
int marginTop;
int marginRight;
int marginBottom;
int marginLeft;
void startNewTextBlock(TextBlock::BLOCK_STYLE style);
void makePages();
// XML callbacks
static void XMLCALL startElement(void* userData, const XML_Char* name, const XML_Char** atts);
static void XMLCALL characterData(void* userData, const XML_Char* s, int len);
static void XMLCALL endElement(void* userData, const XML_Char* name);
public:
explicit ChapterHtmlSlimParser(const char* filepath, GfxRenderer& renderer, const int fontId,
const float lineCompression, const int marginTop, const int marginRight,
const int marginBottom, const int marginLeft,
const std::function<void(std::unique_ptr<Page>)>& completePageFn)
: filepath(filepath),
renderer(renderer),
fontId(fontId),
lineCompression(lineCompression),
marginTop(marginTop),
marginRight(marginRight),
marginBottom(marginBottom),
marginLeft(marginLeft),
completePageFn(completePageFn) {}
~ChapterHtmlSlimParser() = default;
bool parseAndBuildPages();
};

View File

@@ -0,0 +1,96 @@
#include "ContainerParser.h"
#include <HardwareSerial.h>
bool ContainerParser::setup() {
parser = XML_ParserCreate(nullptr);
if (!parser) {
Serial.printf("[%lu] [CTR] Couldn't allocate memory for parser\n", millis());
return false;
}
XML_SetUserData(parser, this);
XML_SetElementHandler(parser, startElement, endElement);
return true;
}
bool ContainerParser::teardown() {
if (parser) {
XML_ParserFree(parser);
parser = nullptr;
}
return true;
}
size_t ContainerParser::write(const uint8_t data) { return write(&data, 1); }
size_t ContainerParser::write(const uint8_t* buffer, const size_t size) {
if (!parser) return 0;
const uint8_t* currentBufferPos = buffer;
auto remainingInBuffer = size;
while (remainingInBuffer > 0) {
void* const buf = XML_GetBuffer(parser, 1024);
if (!buf) {
Serial.printf("[%lu] [CTR] Couldn't allocate buffer\n", millis());
return 0;
}
const auto toRead = remainingInBuffer < 1024 ? remainingInBuffer : 1024;
memcpy(buf, currentBufferPos, toRead);
if (XML_ParseBuffer(parser, static_cast<int>(toRead), remainingSize == toRead) == XML_STATUS_ERROR) {
Serial.printf("[%lu] [CTR] Parse error: %s\n", millis(), XML_ErrorString(XML_GetErrorCode(parser)));
return 0;
}
currentBufferPos += toRead;
remainingInBuffer -= toRead;
remainingSize -= toRead;
}
return size;
}
void XMLCALL ContainerParser::startElement(void* userData, const XML_Char* name, const XML_Char** atts) {
auto* self = static_cast<ContainerParser*>(userData);
// Simple state tracking to ensure we are looking at the valid schema structure
if (self->state == START && strcmp(name, "container") == 0) {
self->state = IN_CONTAINER;
return;
}
if (self->state == IN_CONTAINER && strcmp(name, "rootfiles") == 0) {
self->state = IN_ROOTFILES;
return;
}
if (self->state == IN_ROOTFILES && strcmp(name, "rootfile") == 0) {
const char* mediaType = nullptr;
const char* path = nullptr;
for (int i = 0; atts[i]; i += 2) {
if (strcmp(atts[i], "media-type") == 0) {
mediaType = atts[i + 1];
} else if (strcmp(atts[i], "full-path") == 0) {
path = atts[i + 1];
}
}
// Check if this is the standard OEBPS package
if (mediaType && path && strcmp(mediaType, "application/oebps-package+xml") == 0) {
self->fullPath = path;
}
}
}
void XMLCALL ContainerParser::endElement(void* userData, const XML_Char* name) {
auto* self = static_cast<ContainerParser*>(userData);
if (self->state == IN_ROOTFILES && strcmp(name, "rootfiles") == 0) {
self->state = IN_CONTAINER;
} else if (self->state == IN_CONTAINER && strcmp(name, "container") == 0) {
self->state = START;
}
}

View File

@@ -0,0 +1,32 @@
#pragma once
#include <Print.h>
#include <string>
#include "expat.h"
class ContainerParser final : public Print {
enum ParserState {
START,
IN_CONTAINER,
IN_ROOTFILES,
};
size_t remainingSize;
XML_Parser parser = nullptr;
ParserState state = START;
static void startElement(void* userData, const XML_Char* name, const XML_Char** atts);
static void endElement(void* userData, const XML_Char* name);
public:
std::string fullPath;
explicit ContainerParser(const size_t xmlSize) : remainingSize(xmlSize) {}
bool setup();
bool teardown();
size_t write(uint8_t) override;
size_t write(const uint8_t* buffer, size_t size) override;
};

View File

@@ -0,0 +1,161 @@
#include "ContentOpfParser.h"
#include <HardwareSerial.h>
#include <ZipFile.h>
bool ContentOpfParser::setup() {
parser = XML_ParserCreate(nullptr);
if (!parser) {
Serial.printf("[%lu] [COF] Couldn't allocate memory for parser\n", millis());
return false;
}
XML_SetUserData(parser, this);
XML_SetElementHandler(parser, startElement, endElement);
XML_SetCharacterDataHandler(parser, characterData);
return true;
}
bool ContentOpfParser::teardown() {
if (parser) {
XML_ParserFree(parser);
parser = nullptr;
}
return true;
}
size_t ContentOpfParser::write(const uint8_t data) { return write(&data, 1); }
size_t ContentOpfParser::write(const uint8_t* buffer, const size_t size) {
if (!parser) return 0;
const uint8_t* currentBufferPos = buffer;
auto remainingInBuffer = size;
while (remainingInBuffer > 0) {
void* const buf = XML_GetBuffer(parser, 1024);
if (!buf) {
Serial.printf("[%lu] [COF] Couldn't allocate memory for buffer\n", millis());
XML_ParserFree(parser);
parser = nullptr;
return 0;
}
const auto toRead = remainingInBuffer < 1024 ? remainingInBuffer : 1024;
memcpy(buf, currentBufferPos, toRead);
if (XML_ParseBuffer(parser, static_cast<int>(toRead), remainingSize == toRead) == XML_STATUS_ERROR) {
Serial.printf("[%lu] [COF] Parse error at line %lu: %s\n", millis(), XML_GetCurrentLineNumber(parser),
XML_ErrorString(XML_GetErrorCode(parser)));
XML_ParserFree(parser);
parser = nullptr;
return 0;
}
currentBufferPos += toRead;
remainingInBuffer -= toRead;
remainingSize -= toRead;
}
return size;
}
void XMLCALL ContentOpfParser::startElement(void* userData, const XML_Char* name, const XML_Char** atts) {
auto* self = static_cast<ContentOpfParser*>(userData);
(void)atts;
if (self->state == START && (strcmp(name, "package") == 0 || strcmp(name, "opf:package") == 0)) {
self->state = IN_PACKAGE;
return;
}
if (self->state == IN_PACKAGE && (strcmp(name, "metadata") == 0 || strcmp(name, "opf:metadata") == 0)) {
self->state = IN_METADATA;
return;
}
if (self->state == IN_METADATA && strcmp(name, "dc:title") == 0) {
self->state = IN_BOOK_TITLE;
return;
}
if (self->state == IN_PACKAGE && (strcmp(name, "manifest") == 0 || strcmp(name, "opf:manifest") == 0)) {
self->state = IN_MANIFEST;
return;
}
if (self->state == IN_PACKAGE && (strcmp(name, "spine") == 0 || strcmp(name, "opf:spine") == 0)) {
self->state = IN_SPINE;
return;
}
// TODO: Support book cover
// if (self->state == IN_METADATA && (strcmp(name, "meta") == 0 || strcmp(name, "opf:meta") == 0)) {
// }
if (self->state == IN_MANIFEST && (strcmp(name, "item") == 0 || strcmp(name, "opf:item") == 0)) {
std::string itemId;
std::string href;
for (int i = 0; atts[i]; i += 2) {
if (strcmp(atts[i], "id") == 0) {
itemId = atts[i + 1];
} else if (strcmp(atts[i], "href") == 0) {
href = self->baseContentPath + atts[i + 1];
}
}
self->items[itemId] = href;
return;
}
if (self->state == IN_SPINE && (strcmp(name, "itemref") == 0 || strcmp(name, "opf:itemref") == 0)) {
for (int i = 0; atts[i]; i += 2) {
if (strcmp(atts[i], "idref") == 0) {
self->spineRefs.emplace_back(atts[i + 1]);
break;
}
}
return;
}
}
void XMLCALL ContentOpfParser::characterData(void* userData, const XML_Char* s, const int len) {
auto* self = static_cast<ContentOpfParser*>(userData);
if (self->state == IN_BOOK_TITLE) {
self->title.append(s, len);
return;
}
}
void XMLCALL ContentOpfParser::endElement(void* userData, const XML_Char* name) {
auto* self = static_cast<ContentOpfParser*>(userData);
(void)name;
if (self->state == IN_SPINE && (strcmp(name, "spine") == 0 || strcmp(name, "opf:spine") == 0)) {
self->state = IN_PACKAGE;
return;
}
if (self->state == IN_MANIFEST && (strcmp(name, "manifest") == 0 || strcmp(name, "opf:manifest") == 0)) {
self->state = IN_PACKAGE;
return;
}
if (self->state == IN_BOOK_TITLE && strcmp(name, "dc:title") == 0) {
self->state = IN_METADATA;
return;
}
if (self->state == IN_METADATA && (strcmp(name, "metadata") == 0 || strcmp(name, "opf:metadata") == 0)) {
self->state = IN_PACKAGE;
return;
}
if (self->state == IN_PACKAGE && (strcmp(name, "package") == 0 || strcmp(name, "opf:package") == 0)) {
self->state = START;
return;
}
}

View File

@@ -0,0 +1,42 @@
#pragma once
#include <Print.h>
#include <map>
#include "Epub.h"
#include "expat.h"
class ContentOpfParser final : public Print {
enum ParserState {
START,
IN_PACKAGE,
IN_METADATA,
IN_BOOK_TITLE,
IN_MANIFEST,
IN_SPINE,
};
const std::string& baseContentPath;
size_t remainingSize;
XML_Parser parser = nullptr;
ParserState state = START;
static void startElement(void* userData, const XML_Char* name, const XML_Char** atts);
static void characterData(void* userData, const XML_Char* s, int len);
static void endElement(void* userData, const XML_Char* name);
public:
std::string title;
std::string tocNcxPath;
std::map<std::string, std::string> items;
std::vector<std::string> spineRefs;
explicit ContentOpfParser(const std::string& baseContentPath, const size_t xmlSize)
: baseContentPath(baseContentPath), remainingSize(xmlSize) {}
bool setup();
bool teardown();
size_t write(uint8_t) override;
size_t write(const uint8_t* buffer, size_t size) override;
};

View File

@@ -0,0 +1,165 @@
#include "TocNcxParser.h"
#include <HardwareSerial.h>
bool TocNcxParser::setup() {
parser = XML_ParserCreate(nullptr);
if (!parser) {
Serial.printf("[%lu] [TOC] Couldn't allocate memory for parser\n", millis());
return false;
}
XML_SetUserData(parser, this);
XML_SetElementHandler(parser, startElement, endElement);
XML_SetCharacterDataHandler(parser, characterData);
return true;
}
bool TocNcxParser::teardown() {
if (parser) {
XML_ParserFree(parser);
parser = nullptr;
}
return true;
}
size_t TocNcxParser::write(const uint8_t data) { return write(&data, 1); }
size_t TocNcxParser::write(const uint8_t* buffer, const size_t size) {
if (!parser) return 0;
const uint8_t* currentBufferPos = buffer;
auto remainingInBuffer = size;
while (remainingInBuffer > 0) {
void* const buf = XML_GetBuffer(parser, 1024);
if (!buf) {
Serial.printf("[%lu] [TOC] Couldn't allocate memory for buffer\n", millis());
return 0;
}
const auto toRead = remainingInBuffer < 1024 ? remainingInBuffer : 1024;
memcpy(buf, currentBufferPos, toRead);
if (XML_ParseBuffer(parser, static_cast<int>(toRead), remainingSize == toRead) == XML_STATUS_ERROR) {
Serial.printf("[%lu] [TOC] Parse error at line %lu: %s\n", millis(), XML_GetCurrentLineNumber(parser),
XML_ErrorString(XML_GetErrorCode(parser)));
return 0;
}
currentBufferPos += toRead;
remainingInBuffer -= toRead;
remainingSize -= toRead;
}
return size;
}
void XMLCALL TocNcxParser::startElement(void* userData, const XML_Char* name, const XML_Char** atts) {
// NOTE: We rely on navPoint label and content coming before any nested navPoints, this will be fine:
// <navPoint>
// <navLabel><text>Chapter 1</text></navLabel>
// <content src="ch1.html"/>
// <navPoint> ...nested... </navPoint>
// </navPoint>
//
// This will NOT:
// <navPoint>
// <navPoint> ...nested... </navPoint>
// <navLabel><text>Chapter 1</text></navLabel>
// <content src="ch1.html"/>
// </navPoint>
auto* self = static_cast<TocNcxParser*>(userData);
if (self->state == START && strcmp(name, "ncx") == 0) {
self->state = IN_NCX;
return;
}
if (self->state == IN_NCX && strcmp(name, "navMap") == 0) {
self->state = IN_NAV_MAP;
return;
}
// Handles both top-level and nested navPoints
if ((self->state == IN_NAV_MAP || self->state == IN_NAV_POINT) && strcmp(name, "navPoint") == 0) {
self->state = IN_NAV_POINT;
self->currentDepth++;
self->currentLabel.clear();
self->currentSrc.clear();
return;
}
if (self->state == IN_NAV_POINT && strcmp(name, "navLabel") == 0) {
self->state = IN_NAV_LABEL;
return;
}
if (self->state == IN_NAV_LABEL && strcmp(name, "text") == 0) {
self->state = IN_NAV_LABEL_TEXT;
return;
}
if (self->state == IN_NAV_POINT && strcmp(name, "content") == 0) {
for (int i = 0; atts[i]; i += 2) {
if (strcmp(atts[i], "src") == 0) {
self->currentSrc = atts[i + 1];
break;
}
}
return;
}
}
void XMLCALL TocNcxParser::characterData(void* userData, const XML_Char* s, const int len) {
auto* self = static_cast<TocNcxParser*>(userData);
if (self->state == IN_NAV_LABEL_TEXT) {
self->currentLabel.append(s, len);
}
}
void XMLCALL TocNcxParser::endElement(void* userData, const XML_Char* name) {
auto* self = static_cast<TocNcxParser*>(userData);
if (self->state == IN_NAV_LABEL_TEXT && strcmp(name, "text") == 0) {
self->state = IN_NAV_LABEL;
return;
}
if (self->state == IN_NAV_LABEL && strcmp(name, "navLabel") == 0) {
self->state = IN_NAV_POINT;
return;
}
if (self->state == IN_NAV_POINT && strcmp(name, "navPoint") == 0) {
self->currentDepth--;
if (self->currentDepth == 0) {
self->state = IN_NAV_MAP;
}
return;
}
if (self->state == IN_NAV_POINT && strcmp(name, "content") == 0) {
// At this point (end of content tag), we likely have both Label (from previous tags) and Src.
// This is the safest place to push the data, assuming <navLabel> always comes before <content>.
// NCX spec says navLabel comes before content.
if (!self->currentLabel.empty() && !self->currentSrc.empty()) {
std::string href = self->baseContentPath + self->currentSrc;
std::string anchor;
const size_t pos = href.find('#');
if (pos != std::string::npos) {
anchor = href.substr(pos + 1);
href = href.substr(0, pos);
}
// Push to vector
self->toc.emplace_back(self->currentLabel, href, anchor, self->currentDepth);
// Clear them so we don't re-add them if there are weird XML structures
self->currentLabel.clear();
self->currentSrc.clear();
}
}
}

View File

@@ -0,0 +1,37 @@
#pragma once
#include <Print.h>
#include <string>
#include <vector>
#include "Epub/EpubTocEntry.h"
#include "expat.h"
class TocNcxParser final : public Print {
enum ParserState { START, IN_NCX, IN_NAV_MAP, IN_NAV_POINT, IN_NAV_LABEL, IN_NAV_LABEL_TEXT, IN_CONTENT };
const std::string& baseContentPath;
size_t remainingSize;
XML_Parser parser = nullptr;
ParserState state = START;
std::string currentLabel;
std::string currentSrc;
size_t currentDepth = 0;
static void startElement(void* userData, const XML_Char* name, const XML_Char** atts);
static void characterData(void* userData, const XML_Char* s, int len);
static void endElement(void* userData, const XML_Char* name);
public:
std::vector<EpubTocEntry> toc;
explicit TocNcxParser(const std::string& baseContentPath, const size_t xmlSize)
: baseContentPath(baseContentPath), remainingSize(xmlSize) {}
bool setup();
bool teardown();
size_t write(uint8_t) override;
size_t write(const uint8_t* buffer, size_t size) override;
};