From f264efdb12a47c6f07c475f8ff734c71658f6833 Mon Sep 17 00:00:00 2001 From: Dave Allie Date: Sun, 21 Dec 2025 17:08:34 +1100 Subject: [PATCH] Extract EPUB TOC into temp file before parsing (#85) ## Summary * Extract EPUB TOC into temp file before parsing * Streaming ZIP -> XML parser uses up a lot of memory as we're allocating inflation buffers while also holding a few copies of the buffer in different forms * Instead, by streaming the inflated file down to the SD card (like we do for HTML parsing), we can lower memory usage ## Additional Context * This should help with https://github.com/daveallie/crosspoint-reader/issues/60 and https://github.com/daveallie/crosspoint-reader/issues/10. It won't remove that class of issues completely, but will allow for many more books to be opened. --- lib/Epub/Epub.cpp | 36 +++++++++++++++++++------- lib/Epub/Epub/EpubTocEntry.h | 7 ++--- lib/Epub/Epub/parsers/TocNcxParser.cpp | 2 +- lib/Epub/Epub/parsers/TocNcxParser.h | 2 +- 4 files changed, 31 insertions(+), 16 deletions(-) diff --git a/lib/Epub/Epub.cpp b/lib/Epub/Epub.cpp index 2df5a3f..cc3bc90 100644 --- a/lib/Epub/Epub.cpp +++ b/lib/Epub/Epub.cpp @@ -93,24 +93,42 @@ bool Epub::parseTocNcxFile() { Serial.printf("[%lu] [EBP] Parsing toc ncx file: %s\n", millis(), tocNcxItem.c_str()); - size_t tocSize; - if (!getItemSize(tocNcxItem, &tocSize)) { - Serial.printf("[%lu] [EBP] Could not get size of toc ncx\n", millis()); - return false; - } + const auto tmpNcxPath = getCachePath() + "/toc.ncx"; + File tempNcxFile = SD.open(tmpNcxPath.c_str(), FILE_WRITE); + readItemContentsToStream(tocNcxItem, tempNcxFile, 1024); + tempNcxFile.close(); + tempNcxFile = SD.open(tmpNcxPath.c_str(), FILE_READ); + const auto ncxSize = tempNcxFile.size(); - TocNcxParser ncxParser(contentBasePath, tocSize); + TocNcxParser ncxParser(contentBasePath, ncxSize); if (!ncxParser.setup()) { Serial.printf("[%lu] [EBP] Could not setup toc ncx parser\n", millis()); return false; } - if 
(!readItemContentsToStream(tocNcxItem, ncxParser, 1024)) { - Serial.printf("[%lu] [EBP] Could not read toc ncx stream\n", millis()); + const auto ncxBuffer = static_cast(malloc(1024)); + if (!ncxBuffer) { + Serial.printf("[%lu] [EBP] Could not allocate memory for toc ncx parser\n", millis()); return false; } + while (tempNcxFile.available()) { + const auto readSize = tempNcxFile.read(ncxBuffer, 1024); + const auto processedSize = ncxParser.write(ncxBuffer, readSize); + + if (processedSize != readSize) { + Serial.printf("[%lu] [EBP] Could not process all toc ncx data\n", millis()); + free(ncxBuffer); + tempNcxFile.close(); + return false; + } + } + + free(ncxBuffer); + tempNcxFile.close(); + SD.remove(tmpNcxPath.c_str()); + this->toc = std::move(ncxParser.toc); Serial.printf("[%lu] [EBP] Parsed %d TOC items\n", millis(), this->toc.size()); @@ -293,7 +311,7 @@ std::string& Epub::getSpineItem(const int spineIndex) { } EpubTocEntry& Epub::getTocItem(const int tocTndex) { - static EpubTocEntry emptyEntry("", "", "", 0); + static EpubTocEntry emptyEntry = {}; if (toc.empty()) { Serial.printf("[%lu] [EBP] getTocItem called but toc is empty\n", millis()); return emptyEntry; diff --git a/lib/Epub/Epub/EpubTocEntry.h b/lib/Epub/Epub/EpubTocEntry.h index 715e4a4..94f0c90 100644 --- a/lib/Epub/Epub/EpubTocEntry.h +++ b/lib/Epub/Epub/EpubTocEntry.h @@ -2,12 +2,9 @@ #include -class EpubTocEntry { - public: +struct EpubTocEntry { std::string title; std::string href; std::string anchor; - int level; - EpubTocEntry(std::string title, std::string href, std::string anchor, const int level) - : title(std::move(title)), href(std::move(href)), anchor(std::move(anchor)), level(level) {} + uint8_t level; }; diff --git a/lib/Epub/Epub/parsers/TocNcxParser.cpp b/lib/Epub/Epub/parsers/TocNcxParser.cpp index 4d541f5..0a613f3 100644 --- a/lib/Epub/Epub/parsers/TocNcxParser.cpp +++ b/lib/Epub/Epub/parsers/TocNcxParser.cpp @@ -155,7 +155,7 @@ void XMLCALL TocNcxParser::endElement(void* userData, 
const XML_Char* name) { } // Push to vector - self->toc.emplace_back(self->currentLabel, href, anchor, self->currentDepth); + self->toc.push_back({std::move(self->currentLabel), std::move(href), std::move(anchor), self->currentDepth}); // Clear them so we don't re-add them if there are weird XML structures self->currentLabel.clear(); diff --git a/lib/Epub/Epub/parsers/TocNcxParser.h b/lib/Epub/Epub/parsers/TocNcxParser.h index 5d5df0b..2f3601a 100644 --- a/lib/Epub/Epub/parsers/TocNcxParser.h +++ b/lib/Epub/Epub/parsers/TocNcxParser.h @@ -17,7 +17,7 @@ class TocNcxParser final : public Print { std::string currentLabel; std::string currentSrc; - size_t currentDepth = 0; + uint8_t currentDepth = 0; static void startElement(void* userData, const XML_Char* name, const XML_Char** atts); static void characterData(void* userData, const XML_Char* s, int len);