Merge upstream/master into feature/continue-reading-cover

Resolve conflicts:
- GfxRenderer: Add cropX/cropY params to drawBitmap, keep 1-bit BMP support
- GfxRenderer: Update drawBitmap1Bit to use readNextRow (API change)
- JpegToBmpConverter: Use upstream scaling logic (larger dimension)
- HomeActivity: Use StringUtils::checkFileExtension, add hasOpdsUrl
- HomeActivity: Keep cover image functionality with own buffer management
This commit is contained in:
Eunchurn Park
2026-01-09 23:57:19 +09:00
61 changed files with 3501 additions and 512 deletions

View File

@@ -8,6 +8,7 @@
#include "Epub/parsers/ContainerParser.h"
#include "Epub/parsers/ContentOpfParser.h"
#include "Epub/parsers/TocNavParser.h"
#include "Epub/parsers/TocNcxParser.h"
bool Epub::findContentOpfFile(std::string* contentOpfFile) const {
@@ -80,6 +81,10 @@ bool Epub::parseContentOpf(BookMetadataCache::BookMetadata& bookMetadata) {
tocNcxItem = opfParser.tocNcxPath;
}
if (!opfParser.tocNavPath.empty()) {
tocNavItem = opfParser.tocNavPath;
}
Serial.printf("[%lu] [EBP] Successfully parsed content.opf\n", millis());
return true;
}
@@ -141,6 +146,60 @@ bool Epub::parseTocNcxFile() const {
return true;
}
bool Epub::parseTocNavFile() const {
// the nav file should have been specified in the content.opf file (EPUB 3)
if (tocNavItem.empty()) {
Serial.printf("[%lu] [EBP] No nav file specified\n", millis());
return false;
}
Serial.printf("[%lu] [EBP] Parsing toc nav file: %s\n", millis(), tocNavItem.c_str());
const auto tmpNavPath = getCachePath() + "/toc.nav";
FsFile tempNavFile;
if (!SdMan.openFileForWrite("EBP", tmpNavPath, tempNavFile)) {
return false;
}
readItemContentsToStream(tocNavItem, tempNavFile, 1024);
tempNavFile.close();
if (!SdMan.openFileForRead("EBP", tmpNavPath, tempNavFile)) {
return false;
}
const auto navSize = tempNavFile.size();
TocNavParser navParser(contentBasePath, navSize, bookMetadataCache.get());
if (!navParser.setup()) {
Serial.printf("[%lu] [EBP] Could not setup toc nav parser\n", millis());
return false;
}
const auto navBuffer = static_cast<uint8_t*>(malloc(1024));
if (!navBuffer) {
Serial.printf("[%lu] [EBP] Could not allocate memory for toc nav parser\n", millis());
return false;
}
while (tempNavFile.available()) {
const auto readSize = tempNavFile.read(navBuffer, 1024);
const auto processedSize = navParser.write(navBuffer, readSize);
if (processedSize != readSize) {
Serial.printf("[%lu] [EBP] Could not process all toc nav data\n", millis());
free(navBuffer);
tempNavFile.close();
return false;
}
}
free(navBuffer);
tempNavFile.close();
SdMan.remove(tmpNavPath.c_str());
Serial.printf("[%lu] [EBP] Parsed TOC nav items\n", millis());
return true;
}
// load in the meta data for the epub file
bool Epub::load(const bool buildIfMissing) {
Serial.printf("[%lu] [EBP] Loading ePub: %s\n", millis(), filepath.c_str());
@@ -184,15 +243,31 @@ bool Epub::load(const bool buildIfMissing) {
return false;
}
// TOC Pass
// TOC Pass - try EPUB 3 nav first, fall back to NCX
if (!bookMetadataCache->beginTocPass()) {
Serial.printf("[%lu] [EBP] Could not begin writing toc pass\n", millis());
return false;
}
if (!parseTocNcxFile()) {
Serial.printf("[%lu] [EBP] Could not parse toc\n", millis());
return false;
bool tocParsed = false;
// Try EPUB 3 nav document first (preferred)
if (!tocNavItem.empty()) {
Serial.printf("[%lu] [EBP] Attempting to parse EPUB 3 nav document\n", millis());
tocParsed = parseTocNavFile();
}
// Fall back to NCX if nav parsing failed or wasn't available
if (!tocParsed && !tocNcxItem.empty()) {
Serial.printf("[%lu] [EBP] Falling back to NCX TOC\n", millis());
tocParsed = parseTocNcxFile();
}
if (!tocParsed) {
Serial.printf("[%lu] [EBP] Warning: Could not parse any TOC format\n", millis());
// Continue anyway - book will work without TOC
}
if (!bookMetadataCache->endTocPass()) {
Serial.printf("[%lu] [EBP] Could not end writing toc pass\n", millis());
return false;

View File

@@ -12,8 +12,10 @@
class ZipFile;
class Epub {
// the ncx file
// the ncx file (EPUB 2)
std::string tocNcxItem;
// the nav file (EPUB 3)
std::string tocNavItem;
// where is the EPUBfile?
std::string filepath;
// the base path for items in the EPUB file
@@ -26,6 +28,7 @@ class Epub {
bool findContentOpfFile(std::string* contentOpfFile) const;
bool parseContentOpf(BookMetadataCache::BookMetadata& bookMetadata);
bool parseTocNcxFile() const;
bool parseTocNavFile() const;
public:
explicit Epub(std::string filepath, const std::string& cacheDir) : filepath(std::move(filepath)) {

View File

@@ -7,9 +7,9 @@
#include "parsers/ChapterHtmlSlimParser.h"
namespace {
constexpr uint8_t SECTION_FILE_VERSION = 8;
constexpr uint32_t HEADER_SIZE = sizeof(uint8_t) + sizeof(int) + sizeof(float) + sizeof(bool) + sizeof(uint16_t) +
sizeof(uint16_t) + sizeof(uint16_t) + sizeof(uint32_t);
constexpr uint8_t SECTION_FILE_VERSION = 9;
constexpr uint32_t HEADER_SIZE = sizeof(uint8_t) + sizeof(int) + sizeof(float) + sizeof(bool) + sizeof(uint8_t) +
sizeof(uint16_t) + sizeof(uint16_t) + sizeof(uint16_t) + sizeof(uint32_t);
} // namespace
uint32_t Section::onPageComplete(std::unique_ptr<Page> page) {
@@ -30,19 +30,21 @@ uint32_t Section::onPageComplete(std::unique_ptr<Page> page) {
}
void Section::writeSectionFileHeader(const int fontId, const float lineCompression, const bool extraParagraphSpacing,
const uint16_t viewportWidth, const uint16_t viewportHeight) {
const uint8_t paragraphAlignment, const uint16_t viewportWidth,
const uint16_t viewportHeight) {
if (!file) {
Serial.printf("[%lu] [SCT] File not open for writing header\n", millis());
return;
}
static_assert(HEADER_SIZE == sizeof(SECTION_FILE_VERSION) + sizeof(fontId) + sizeof(lineCompression) +
sizeof(extraParagraphSpacing) + sizeof(viewportWidth) + sizeof(viewportHeight) +
sizeof(pageCount) + sizeof(uint32_t),
sizeof(extraParagraphSpacing) + sizeof(paragraphAlignment) + sizeof(viewportWidth) +
sizeof(viewportHeight) + sizeof(pageCount) + sizeof(uint32_t),
"Header size mismatch");
serialization::writePod(file, SECTION_FILE_VERSION);
serialization::writePod(file, fontId);
serialization::writePod(file, lineCompression);
serialization::writePod(file, extraParagraphSpacing);
serialization::writePod(file, paragraphAlignment);
serialization::writePod(file, viewportWidth);
serialization::writePod(file, viewportHeight);
serialization::writePod(file, pageCount); // Placeholder for page count (will be initially 0 when written)
@@ -50,7 +52,8 @@ void Section::writeSectionFileHeader(const int fontId, const float lineCompressi
}
bool Section::loadSectionFile(const int fontId, const float lineCompression, const bool extraParagraphSpacing,
const uint16_t viewportWidth, const uint16_t viewportHeight) {
const uint8_t paragraphAlignment, const uint16_t viewportWidth,
const uint16_t viewportHeight) {
if (!SdMan.openFileForRead("SCT", filePath, file)) {
return false;
}
@@ -70,15 +73,17 @@ bool Section::loadSectionFile(const int fontId, const float lineCompression, con
uint16_t fileViewportWidth, fileViewportHeight;
float fileLineCompression;
bool fileExtraParagraphSpacing;
uint8_t fileParagraphAlignment;
serialization::readPod(file, fileFontId);
serialization::readPod(file, fileLineCompression);
serialization::readPod(file, fileExtraParagraphSpacing);
serialization::readPod(file, fileParagraphAlignment);
serialization::readPod(file, fileViewportWidth);
serialization::readPod(file, fileViewportHeight);
if (fontId != fileFontId || lineCompression != fileLineCompression ||
extraParagraphSpacing != fileExtraParagraphSpacing || viewportWidth != fileViewportWidth ||
viewportHeight != fileViewportHeight) {
extraParagraphSpacing != fileExtraParagraphSpacing || paragraphAlignment != fileParagraphAlignment ||
viewportWidth != fileViewportWidth || viewportHeight != fileViewportHeight) {
file.close();
Serial.printf("[%lu] [SCT] Deserialization failed: Parameters do not match\n", millis());
clearCache();
@@ -109,8 +114,8 @@ bool Section::clearCache() const {
}
bool Section::createSectionFile(const int fontId, const float lineCompression, const bool extraParagraphSpacing,
const uint16_t viewportWidth, const uint16_t viewportHeight,
const std::function<void()>& progressSetupFn,
const uint8_t paragraphAlignment, const uint16_t viewportWidth,
const uint16_t viewportHeight, const std::function<void()>& progressSetupFn,
const std::function<void(int)>& progressFn) {
constexpr uint32_t MIN_SIZE_FOR_PROGRESS = 50 * 1024; // 50KB
const auto localPath = epub->getSpineItem(spineIndex).href;
@@ -166,11 +171,13 @@ bool Section::createSectionFile(const int fontId, const float lineCompression, c
if (!SdMan.openFileForWrite("SCT", filePath, file)) {
return false;
}
writeSectionFileHeader(fontId, lineCompression, extraParagraphSpacing, viewportWidth, viewportHeight);
writeSectionFileHeader(fontId, lineCompression, extraParagraphSpacing, paragraphAlignment, viewportWidth,
viewportHeight);
std::vector<uint32_t> lut = {};
ChapterHtmlSlimParser visitor(
tmpHtmlPath, renderer, fontId, lineCompression, extraParagraphSpacing, viewportWidth, viewportHeight,
tmpHtmlPath, renderer, fontId, lineCompression, extraParagraphSpacing, paragraphAlignment, viewportWidth,
viewportHeight,
[this, &lut](std::unique_ptr<Page> page) { lut.emplace_back(this->onPageComplete(std::move(page))); },
progressFn);
success = visitor.parseAndBuildPages();

View File

@@ -14,8 +14,8 @@ class Section {
std::string filePath;
FsFile file;
void writeSectionFileHeader(int fontId, float lineCompression, bool extraParagraphSpacing, uint16_t viewportWidth,
uint16_t viewportHeight);
void writeSectionFileHeader(int fontId, float lineCompression, bool extraParagraphSpacing, uint8_t paragraphAlignment,
uint16_t viewportWidth, uint16_t viewportHeight);
uint32_t onPageComplete(std::unique_ptr<Page> page);
public:
@@ -28,11 +28,12 @@ class Section {
renderer(renderer),
filePath(epub->getCachePath() + "/sections/" + std::to_string(spineIndex) + ".bin") {}
~Section() = default;
bool loadSectionFile(int fontId, float lineCompression, bool extraParagraphSpacing, uint16_t viewportWidth,
uint16_t viewportHeight);
bool loadSectionFile(int fontId, float lineCompression, bool extraParagraphSpacing, uint8_t paragraphAlignment,
uint16_t viewportWidth, uint16_t viewportHeight);
bool clearCache() const;
bool createSectionFile(int fontId, float lineCompression, bool extraParagraphSpacing, uint16_t viewportWidth,
uint16_t viewportHeight, const std::function<void()>& progressSetupFn = nullptr,
bool createSectionFile(int fontId, float lineCompression, bool extraParagraphSpacing, uint8_t paragraphAlignment,
uint16_t viewportWidth, uint16_t viewportHeight,
const std::function<void()>& progressSetupFn = nullptr,
const std::function<void(int)>& progressFn = nullptr);
std::unique_ptr<Page> loadPageFromSectionFile();
};

View File

@@ -1,163 +0,0 @@
// from
// https://github.com/atomic14/diy-esp32-epub-reader/blob/2c2f57fdd7e2a788d14a0bcb26b9e845a47aac42/lib/Epub/RubbishHtmlParser/htmlEntities.cpp
#include "htmlEntities.h"
#include <cstring>
#include <unordered_map>
const int MAX_ENTITY_LENGTH = 10;
// Use book: entities_ww2.epub to test this (Page 7: Entities parser test)
// Note the supported keys are only in lowercase
// Store the mappings in a unordered hash map
static std::unordered_map<std::string, std::string> entity_lookup(
{{"&quot;", "\""}, {"&frasl;", ""}, {"&amp;", "&"}, {"&lt;", "<"}, {"&gt;", ">"},
{"&Agrave;", "À"}, {"&Aacute;", "Á"}, {"&Acirc;", "Â"}, {"&Atilde;", "Ã"}, {"&Auml;", "Ä"},
{"&Aring;", "Å"}, {"&AElig;", "Æ"}, {"&Ccedil;", "Ç"}, {"&Egrave;", "È"}, {"&Eacute;", "É"},
{"&Ecirc;", "Ê"}, {"&Euml;", "Ë"}, {"&Igrave;", "Ì"}, {"&Iacute;", "Í"}, {"&Icirc;", "Î"},
{"&Iuml;", "Ï"}, {"&ETH;", "Ð"}, {"&Ntilde;", "Ñ"}, {"&Ograve;", "Ò"}, {"&Oacute;", "Ó"},
{"&Ocirc;", "Ô"}, {"&Otilde;", "Õ"}, {"&Ouml;", "Ö"}, {"&Oslash;", "Ø"}, {"&Ugrave;", "Ù"},
{"&Uacute;", "Ú"}, {"&Ucirc;", "Û"}, {"&Uuml;", "Ü"}, {"&Yacute;", "Ý"}, {"&THORN;", "Þ"},
{"&szlig;", "ß"}, {"&agrave;", "à"}, {"&aacute;", "á"}, {"&acirc;", "â"}, {"&atilde;", "ã"},
{"&auml;", "ä"}, {"&aring;", "å"}, {"&aelig;", "æ"}, {"&ccedil;", "ç"}, {"&egrave;", "è"},
{"&eacute;", "é"}, {"&ecirc;", "ê"}, {"&euml;", "ë"}, {"&igrave;", "ì"}, {"&iacute;", "í"},
{"&icirc;", "î"}, {"&iuml;", "ï"}, {"&eth;", "ð"}, {"&ntilde;", "ñ"}, {"&ograve;", "ò"},
{"&oacute;", "ó"}, {"&ocirc;", "ô"}, {"&otilde;", "õ"}, {"&ouml;", "ö"}, {"&oslash;", "ø"},
{"&ugrave;", "ù"}, {"&uacute;", "ú"}, {"&ucirc;", "û"}, {"&uuml;", "ü"}, {"&yacute;", "ý"},
{"&thorn;", "þ"}, {"&yuml;", "ÿ"}, {"&nbsp;", " "}, {"&iexcl;", "¡"}, {"&cent;", "¢"},
{"&pound;", "£"}, {"&curren;", "¤"}, {"&yen;", "¥"}, {"&brvbar;", "¦"}, {"&sect;", "§"},
{"&uml;", "¨"}, {"&copy;", "©"}, {"&ordf;", "ª"}, {"&laquo;", "«"}, {"&not;", "¬"},
{"&shy;", "­"}, {"&reg;", "®"}, {"&macr;", "¯"}, {"&deg;", "°"}, {"&plusmn;", "±"},
{"&sup2;", "²"}, {"&sup3;", "³"}, {"&acute;", "´"}, {"&micro;", "µ"}, {"&para;", ""},
{"&cedil;", "¸"}, {"&sup1;", "¹"}, {"&ordm;", "º"}, {"&raquo;", "»"}, {"&frac14;", "¼"},
{"&frac12;", "½"}, {"&frac34;", "¾"}, {"&iquest;", "¿"}, {"&times;", "×"}, {"&divide;", "÷"},
{"&forall;", ""}, {"&part;", ""}, {"&exist;", ""}, {"&empty;", ""}, {"&nabla;", ""},
{"&isin;", ""}, {"&notin;", ""}, {"&ni;", ""}, {"&prod;", ""}, {"&sum;", ""},
{"&minus;", ""}, {"&lowast;", ""}, {"&radic;", ""}, {"&prop;", ""}, {"&infin;", ""},
{"&ang;", ""}, {"&and;", ""}, {"&or;", ""}, {"&cap;", ""}, {"&cup;", ""},
{"&int;", ""}, {"&there4;", ""}, {"&sim;", ""}, {"&cong;", ""}, {"&asymp;", ""},
{"&ne;", ""}, {"&equiv;", ""}, {"&le;", ""}, {"&ge;", ""}, {"&sub;", ""},
{"&sup;", ""}, {"&nsub;", ""}, {"&sube;", ""}, {"&supe;", ""}, {"&oplus;", ""},
{"&otimes;", ""}, {"&perp;", ""}, {"&sdot;", ""}, {"&Alpha;", "Α"}, {"&Beta;", "Β"},
{"&Gamma;", "Γ"}, {"&Delta;", "Δ"}, {"&Epsilon;", "Ε"}, {"&Zeta;", "Ζ"}, {"&Eta;", "Η"},
{"&Theta;", "Θ"}, {"&Iota;", "Ι"}, {"&Kappa;", "Κ"}, {"&Lambda;", "Λ"}, {"&Mu;", "Μ"},
{"&Nu;", "Ν"}, {"&Xi;", "Ξ"}, {"&Omicron;", "Ο"}, {"&Pi;", "Π"}, {"&Rho;", "Ρ"},
{"&Sigma;", "Σ"}, {"&Tau;", "Τ"}, {"&Upsilon;", "Υ"}, {"&Phi;", "Φ"}, {"&Chi;", "Χ"},
{"&Psi;", "Ψ"}, {"&Omega;", "Ω"}, {"&alpha;", "α"}, {"&beta;", "β"}, {"&gamma;", "γ"},
{"&delta;", "δ"}, {"&epsilon;", "ε"}, {"&zeta;", "ζ"}, {"&eta;", "η"}, {"&theta;", "θ"},
{"&iota;", "ι"}, {"&kappa;", "κ"}, {"&lambda;", "λ"}, {"&mu;", "μ"}, {"&nu;", "ν"},
{"&xi;", "ξ"}, {"&omicron;", "ο"}, {"&pi;", "π"}, {"&rho;", "ρ"}, {"&sigmaf;", "ς"},
{"&sigma;", "σ"}, {"&tau;", "τ"}, {"&upsilon;", "υ"}, {"&phi;", "φ"}, {"&chi;", "χ"},
{"&psi;", "ψ"}, {"&omega;", "ω"}, {"&thetasym;", "ϑ"}, {"&upsih;", "ϒ"}, {"&piv;", "ϖ"},
{"&OElig;", "Œ"}, {"&oelig;", "œ"}, {"&Scaron;", "Š"}, {"&scaron;", "š"}, {"&Yuml;", "Ÿ"},
{"&fnof;", "ƒ"}, {"&circ;", "ˆ"}, {"&tilde;", "˜"}, {"&ensp;", ""}, {"&emsp;", ""},
{"&thinsp;", ""}, {"&zwnj;", ""}, {"&zwj;", ""}, {"&lrm;", ""}, {"&rlm;", ""},
{"&ndash;", ""}, {"&mdash;", ""}, {"&lsquo;", ""}, {"&rsquo;", ""}, {"&sbquo;", ""},
{"&ldquo;", ""}, {"&rdquo;", ""}, {"&bdquo;", ""}, {"&dagger;", ""}, {"&Dagger;", ""},
{"&bull;", ""}, {"&hellip;", ""}, {"&permil;", ""}, {"&prime;", ""}, {"&Prime;", ""},
{"&lsaquo;", ""}, {"&rsaquo;", ""}, {"&oline;", ""}, {"&euro;", ""}, {"&trade;", ""},
{"&larr;", ""}, {"&uarr;", ""}, {"&rarr;", ""}, {"&darr;", ""}, {"&harr;", ""},
{"&crarr;", ""}, {"&lceil;", ""}, {"&rceil;", ""}, {"&lfloor;", ""}, {"&rfloor;", ""},
{"&loz;", ""}, {"&spades;", ""}, {"&clubs;", ""}, {"&hearts;", ""}, {"&diams;", ""}});
// converts from a unicode code point to the utf8 equivalent
void convert_to_utf8(const int code, std::string& res) {
// convert to a utf8 sequence
if (code < 0x80) {
res += static_cast<char>(code);
} else if (code < 0x800) {
res += static_cast<char>(0xc0 | (code >> 6));
res += static_cast<char>(0x80 | (code & 0x3f));
} else if (code < 0x10000) {
res += static_cast<char>(0xe0 | (code >> 12));
res += static_cast<char>(0x80 | ((code >> 6) & 0x3f));
res += static_cast<char>(0x80 | (code & 0x3f));
} else if (code < 0x200000) {
res += static_cast<char>(0xf0 | (code >> 18));
res += static_cast<char>(0x80 | ((code >> 12) & 0x3f));
res += static_cast<char>(0x80 | ((code >> 6) & 0x3f));
res += static_cast<char>(0x80 | (code & 0x3f));
} else if (code < 0x4000000) {
res += static_cast<char>(0xf8 | (code >> 24));
res += static_cast<char>(0x80 | ((code >> 18) & 0x3f));
res += static_cast<char>(0x80 | ((code >> 12) & 0x3f));
res += static_cast<char>(0x80 | ((code >> 6) & 0x3f));
res += static_cast<char>(0x80 | (code & 0x3f));
} else if (code < 0x80000000) {
res += static_cast<char>(0xfc | (code >> 30));
res += static_cast<char>(0x80 | ((code >> 24) & 0x3f));
res += static_cast<char>(0x80 | ((code >> 18) & 0x3f));
res += static_cast<char>(0x80 | ((code >> 12) & 0x3f));
res += static_cast<char>(0x80 | ((code >> 6) & 0x3f));
}
}
// handles numeric entities - e.g. &#1234; or &#x1234;
bool process_numeric_entity(const std::string& entity, std::string& res) {
int code = 0;
// is it hex?
if (entity[2] == 'x' || entity[2] == 'X') {
// parse the hex code
code = strtol(entity.substr(3, entity.size() - 3).c_str(), nullptr, 16);
} else {
code = strtol(entity.substr(2, entity.size() - 3).c_str(), nullptr, 10);
}
if (code != 0) {
// special handling for nbsp
if (code == 0xA0) {
res += " ";
} else {
convert_to_utf8(code, res);
}
return true;
}
return false;
}
// handles named entities - e.g. &amp;
bool process_string_entity(const std::string& entity, std::string& res) {
// it's a named entity - find it in the lookup table
// find it in the map
const auto it = entity_lookup.find(entity);
if (it != entity_lookup.end()) {
res += it->second;
return true;
}
return false;
}
// replace all the entities in the string
std::string replaceHtmlEntities(const char* text) {
std::string res;
res.reserve(strlen(text));
for (int i = 0; i < strlen(text); ++i) {
bool flag = false;
// do we have a potential entity?
if (text[i] == '&') {
// find the end of the entity
int j = i + 1;
while (j < strlen(text) && text[j] != ';' && j - i < MAX_ENTITY_LENGTH) {
j++;
}
if (j - i > 2) {
char entity[j - i + 1];
strncpy(entity, text + i, j - i);
// is it a numeric code?
if (entity[1] == '#') {
flag = process_numeric_entity(entity, res);
} else {
flag = process_string_entity(entity, res);
}
// skip past the entity if we successfully decoded it
if (flag) {
i = j;
}
}
}
if (!flag) {
res += text[i];
}
}
return res;
}

View File

@@ -1,7 +0,0 @@
// from
// https://github.com/atomic14/diy-esp32-epub-reader/blob/2c2f57fdd7e2a788d14a0bcb26b9e845a47aac42/lib/Epub/RubbishHtmlParser/htmlEntities.cpp
#pragma once
#include <string>
std::string replaceHtmlEntities(const char* text);

View File

@@ -6,7 +6,6 @@
#include <expat.h>
#include "../Page.h"
#include "../htmlEntities.h"
const char* HEADER_TAGS[] = {"h1", "h2", "h3", "h4", "h5", "h6"};
constexpr int NUM_HEADER_TAGS = sizeof(HEADER_TAGS) / sizeof(HEADER_TAGS[0]);
@@ -97,7 +96,7 @@ void XMLCALL ChapterHtmlSlimParser::startElement(void* userData, const XML_Char*
if (strcmp(name, "br") == 0) {
self->startNewTextBlock(self->currentTextBlock->getStyle());
} else {
self->startNewTextBlock(TextBlock::JUSTIFIED);
self->startNewTextBlock((TextBlock::Style)self->paragraphAlignment);
}
} else if (matches(name, BOLD_TAGS, NUM_BOLD_TAGS)) {
self->boldUntilDepth = std::min(self->boldUntilDepth, self->depth);
@@ -130,17 +129,32 @@ void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char
// Currently looking at whitespace, if there's anything in the partWordBuffer, flush it
if (self->partWordBufferIndex > 0) {
self->partWordBuffer[self->partWordBufferIndex] = '\0';
self->currentTextBlock->addWord(std::move(replaceHtmlEntities(self->partWordBuffer)), fontStyle);
self->currentTextBlock->addWord(self->partWordBuffer, fontStyle);
self->partWordBufferIndex = 0;
}
// Skip the whitespace char
continue;
}
// Skip soft-hyphen with UTF-8 representation (U+00AD) = 0xC2 0xAD
const XML_Char SHY_BYTE_1 = static_cast<XML_Char>(0xC2);
const XML_Char SHY_BYTE_2 = static_cast<XML_Char>(0xAD);
// 1. Check for the start of the 2-byte Soft Hyphen sequence
if (s[i] == SHY_BYTE_1) {
// 2. Check if the next byte exists AND if it completes the sequence
// We must check i + 1 < len to prevent reading past the end of the buffer.
if ((i + 1 < len) && (s[i + 1] == SHY_BYTE_2)) {
// Sequence 0xC2 0xAD found!
// Skip the current byte (0xC2) and the next byte (0xAD)
i++; // Increment 'i' one more time to skip the 0xAD byte
continue; // Skip the rest of the loop and move to the next iteration
}
}
// If we're about to run out of space, then cut the word off and start a new one
if (self->partWordBufferIndex >= MAX_WORD_SIZE) {
self->partWordBuffer[self->partWordBufferIndex] = '\0';
self->currentTextBlock->addWord(std::move(replaceHtmlEntities(self->partWordBuffer)), fontStyle);
self->currentTextBlock->addWord(self->partWordBuffer, fontStyle);
self->partWordBufferIndex = 0;
}
@@ -182,7 +196,7 @@ void XMLCALL ChapterHtmlSlimParser::endElement(void* userData, const XML_Char* n
}
self->partWordBuffer[self->partWordBufferIndex] = '\0';
self->currentTextBlock->addWord(std::move(replaceHtmlEntities(self->partWordBuffer)), fontStyle);
self->currentTextBlock->addWord(self->partWordBuffer, fontStyle);
self->partWordBufferIndex = 0;
}
}
@@ -206,7 +220,7 @@ void XMLCALL ChapterHtmlSlimParser::endElement(void* userData, const XML_Char* n
}
bool ChapterHtmlSlimParser::parseAndBuildPages() {
startNewTextBlock(TextBlock::JUSTIFIED);
startNewTextBlock((TextBlock::Style)this->paragraphAlignment);
const XML_Parser parser = XML_ParserCreate(nullptr);
int done;

View File

@@ -33,6 +33,7 @@ class ChapterHtmlSlimParser {
int fontId;
float lineCompression;
bool extraParagraphSpacing;
uint8_t paragraphAlignment;
uint16_t viewportWidth;
uint16_t viewportHeight;
@@ -46,7 +47,8 @@ class ChapterHtmlSlimParser {
public:
explicit ChapterHtmlSlimParser(const std::string& filepath, GfxRenderer& renderer, const int fontId,
const float lineCompression, const bool extraParagraphSpacing,
const uint16_t viewportWidth, const uint16_t viewportHeight,
const uint8_t paragraphAlignment, const uint16_t viewportWidth,
const uint16_t viewportHeight,
const std::function<void(std::unique_ptr<Page>)>& completePageFn,
const std::function<void(int)>& progressFn = nullptr)
: filepath(filepath),
@@ -54,6 +56,7 @@ class ChapterHtmlSlimParser {
fontId(fontId),
lineCompression(lineCompression),
extraParagraphSpacing(extraParagraphSpacing),
paragraphAlignment(paragraphAlignment),
viewportWidth(viewportWidth),
viewportHeight(viewportHeight),
completePageFn(completePageFn),

View File

@@ -161,6 +161,7 @@ void XMLCALL ContentOpfParser::startElement(void* userData, const XML_Char* name
std::string itemId;
std::string href;
std::string mediaType;
std::string properties;
for (int i = 0; atts[i]; i += 2) {
if (strcmp(atts[i], "id") == 0) {
@@ -169,6 +170,8 @@ void XMLCALL ContentOpfParser::startElement(void* userData, const XML_Char* name
href = self->baseContentPath + atts[i + 1];
} else if (strcmp(atts[i], "media-type") == 0) {
mediaType = atts[i + 1];
} else if (strcmp(atts[i], "properties") == 0) {
properties = atts[i + 1];
}
}
@@ -188,6 +191,15 @@ void XMLCALL ContentOpfParser::startElement(void* userData, const XML_Char* name
href.c_str());
}
}
// EPUB 3: Check for nav document (properties contains "nav")
if (!properties.empty() && self->tocNavPath.empty()) {
// Properties is space-separated, check if "nav" is present as a word
if (properties == "nav" || properties.find("nav ") == 0 || properties.find(" nav") != std::string::npos) {
self->tocNavPath = href;
Serial.printf("[%lu] [COF] Found EPUB 3 nav document: %s\n", millis(), href.c_str());
}
}
return;
}

View File

@@ -35,6 +35,7 @@ class ContentOpfParser final : public Print {
std::string title;
std::string author;
std::string tocNcxPath;
std::string tocNavPath; // EPUB 3 nav document path
std::string coverItemHref;
std::string textReferenceHref;

View File

@@ -0,0 +1,184 @@
#include "TocNavParser.h"
#include <HardwareSerial.h>
#include "../BookMetadataCache.h"
bool TocNavParser::setup() {
parser = XML_ParserCreate(nullptr);
if (!parser) {
Serial.printf("[%lu] [NAV] Couldn't allocate memory for parser\n", millis());
return false;
}
XML_SetUserData(parser, this);
XML_SetElementHandler(parser, startElement, endElement);
XML_SetCharacterDataHandler(parser, characterData);
return true;
}
TocNavParser::~TocNavParser() {
if (parser) {
XML_StopParser(parser, XML_FALSE);
XML_SetElementHandler(parser, nullptr, nullptr);
XML_SetCharacterDataHandler(parser, nullptr);
XML_ParserFree(parser);
parser = nullptr;
}
}
size_t TocNavParser::write(const uint8_t data) { return write(&data, 1); }
size_t TocNavParser::write(const uint8_t* buffer, const size_t size) {
if (!parser) return 0;
const uint8_t* currentBufferPos = buffer;
auto remainingInBuffer = size;
while (remainingInBuffer > 0) {
void* const buf = XML_GetBuffer(parser, 1024);
if (!buf) {
Serial.printf("[%lu] [NAV] Couldn't allocate memory for buffer\n", millis());
XML_StopParser(parser, XML_FALSE);
XML_SetElementHandler(parser, nullptr, nullptr);
XML_SetCharacterDataHandler(parser, nullptr);
XML_ParserFree(parser);
parser = nullptr;
return 0;
}
const auto toRead = remainingInBuffer < 1024 ? remainingInBuffer : 1024;
memcpy(buf, currentBufferPos, toRead);
if (XML_ParseBuffer(parser, static_cast<int>(toRead), remainingSize == toRead) == XML_STATUS_ERROR) {
Serial.printf("[%lu] [NAV] Parse error at line %lu: %s\n", millis(), XML_GetCurrentLineNumber(parser),
XML_ErrorString(XML_GetErrorCode(parser)));
XML_StopParser(parser, XML_FALSE);
XML_SetElementHandler(parser, nullptr, nullptr);
XML_SetCharacterDataHandler(parser, nullptr);
XML_ParserFree(parser);
parser = nullptr;
return 0;
}
currentBufferPos += toRead;
remainingInBuffer -= toRead;
remainingSize -= toRead;
}
return size;
}
void XMLCALL TocNavParser::startElement(void* userData, const XML_Char* name, const XML_Char** atts) {
auto* self = static_cast<TocNavParser*>(userData);
// Track HTML structure loosely - we mainly care about finding <nav epub:type="toc">
if (strcmp(name, "html") == 0) {
self->state = IN_HTML;
return;
}
if (self->state == IN_HTML && strcmp(name, "body") == 0) {
self->state = IN_BODY;
return;
}
// Look for <nav epub:type="toc"> anywhere in body (or nested elements)
if (self->state >= IN_BODY && strcmp(name, "nav") == 0) {
for (int i = 0; atts[i]; i += 2) {
if ((strcmp(atts[i], "epub:type") == 0 || strcmp(atts[i], "type") == 0) && strcmp(atts[i + 1], "toc") == 0) {
self->state = IN_NAV_TOC;
Serial.printf("[%lu] [NAV] Found nav toc element\n", millis());
return;
}
}
return;
}
// Only process ol/li/a if we're inside the toc nav
if (self->state < IN_NAV_TOC) {
return;
}
if (strcmp(name, "ol") == 0) {
self->olDepth++;
self->state = IN_OL;
return;
}
if (self->state == IN_OL && strcmp(name, "li") == 0) {
self->state = IN_LI;
self->currentLabel.clear();
self->currentHref.clear();
return;
}
if (self->state == IN_LI && strcmp(name, "a") == 0) {
self->state = IN_ANCHOR;
// Get href attribute
for (int i = 0; atts[i]; i += 2) {
if (strcmp(atts[i], "href") == 0) {
self->currentHref = atts[i + 1];
break;
}
}
return;
}
}
void XMLCALL TocNavParser::characterData(void* userData, const XML_Char* s, const int len) {
auto* self = static_cast<TocNavParser*>(userData);
// Only collect text when inside an anchor within the TOC nav
if (self->state == IN_ANCHOR) {
self->currentLabel.append(s, len);
}
}
void XMLCALL TocNavParser::endElement(void* userData, const XML_Char* name) {
auto* self = static_cast<TocNavParser*>(userData);
if (strcmp(name, "a") == 0 && self->state == IN_ANCHOR) {
// Create TOC entry when closing anchor tag (we have all data now)
if (!self->currentLabel.empty() && !self->currentHref.empty()) {
std::string href = self->baseContentPath + self->currentHref;
std::string anchor;
const size_t pos = href.find('#');
if (pos != std::string::npos) {
anchor = href.substr(pos + 1);
href = href.substr(0, pos);
}
if (self->cache) {
// olDepth gives us the nesting level (1-based from the outer ol)
self->cache->createTocEntry(self->currentLabel, href, anchor, self->olDepth);
}
self->currentLabel.clear();
self->currentHref.clear();
}
self->state = IN_LI;
return;
}
if (strcmp(name, "li") == 0 && (self->state == IN_LI || self->state == IN_OL)) {
self->state = IN_OL;
return;
}
if (strcmp(name, "ol") == 0 && self->state >= IN_NAV_TOC) {
self->olDepth--;
if (self->olDepth == 0) {
self->state = IN_NAV_TOC;
} else {
self->state = IN_LI; // Back to parent li
}
return;
}
if (strcmp(name, "nav") == 0 && self->state >= IN_NAV_TOC) {
self->state = IN_BODY;
Serial.printf("[%lu] [NAV] Finished parsing nav toc\n", millis());
return;
}
}

View File

@@ -0,0 +1,47 @@
#pragma once
#include <Print.h>
#include <expat.h>
#include <string>
class BookMetadataCache;
// Parser for EPUB 3 nav.xhtml navigation documents
// Parses HTML5 nav elements with epub:type="toc" to extract table of contents
class TocNavParser final : public Print {
enum ParserState {
START,
IN_HTML,
IN_BODY,
IN_NAV_TOC, // Inside <nav epub:type="toc">
IN_OL, // Inside <ol>
IN_LI, // Inside <li>
IN_ANCHOR, // Inside <a>
};
const std::string& baseContentPath;
size_t remainingSize;
XML_Parser parser = nullptr;
ParserState state = START;
BookMetadataCache* cache;
// Track nesting depth for <ol> elements to determine TOC depth
uint8_t olDepth = 0;
// Current entry data being collected
std::string currentLabel;
std::string currentHref;
static void startElement(void* userData, const XML_Char* name, const XML_Char** atts);
static void characterData(void* userData, const XML_Char* s, int len);
static void endElement(void* userData, const XML_Char* name);
public:
explicit TocNavParser(const std::string& baseContentPath, const size_t xmlSize, BookMetadataCache* cache)
: baseContentPath(baseContentPath), remainingSize(xmlSize), cache(cache) {}
~TocNavParser() override;
bool setup();
size_t write(uint8_t) override;
size_t write(const uint8_t* buffer, size_t size) override;
};