diff --git a/lib/Epub/Epub/BookMetadataCache.cpp b/lib/Epub/Epub/BookMetadataCache.cpp
index 8ba7c5b..01dc87e 100644
--- a/lib/Epub/Epub/BookMetadataCache.cpp
+++ b/lib/Epub/Epub/BookMetadataCache.cpp
@@ -61,9 +61,9 @@ bool BookMetadataCache::beginTocPass() {
     spineHrefIndex.push_back(idx);
   }
   std::sort(spineHrefIndex.begin(), spineHrefIndex.end(),
-      [](const SpineHrefIndexEntry& a, const SpineHrefIndexEntry& b) {
-        return a.hrefHash < b.hrefHash || (a.hrefHash == b.hrefHash && a.hrefLen < b.hrefLen);
-      });
+            [](const SpineHrefIndexEntry& a, const SpineHrefIndexEntry& b) {
+              return a.hrefHash < b.hrefHash || (a.hrefHash == b.hrefHash && a.hrefLen < b.hrefLen);
+            });
   spineFile.seek(0);
   useSpineHrefIndex = true;
   Serial.printf("[%lu] [BMC] Using fast index for %d spine items\n", millis(), spineCount);
@@ -186,9 +186,46 @@ bool BookMetadataCache::buildBookBin(const std::string& epubPath, const BookMeta
   // NOTE: We intentionally skip calling loadAllFileStatSlims() here.
   // For large EPUBs (2000+ chapters), pre-loading all ZIP central directory entries
   // into memory causes OOM crashes on ESP32-C3's limited ~380KB RAM.
-  // Instead, we let loadFileStatSlim() do individual lookups per spine item.
-  // This is O(n*m) instead of O(n) for lookups, but avoids memory exhaustion.
+  // Instead, for large books we use a one-pass batch lookup that scans the ZIP
+  // central directory once and matches against spine targets using hash comparison.
+  // This is O((n + m) * log(n)) instead of O(n * m), while avoiding memory exhaustion.
   // See: https://github.com/crosspoint-reader/crosspoint-reader/issues/134
+
+  std::vector<uint32_t> spineSizes;
+  bool useBatchSizes = false;
+
+  if (spineCount >= LARGE_SPINE_THRESHOLD) {
+    Serial.printf("[%lu] [BMC] Using batch size lookup for %d spine items\n", millis(), spineCount);
+
+    std::vector<ZipFile::SizeTarget> targets;
+    targets.reserve(spineCount);
+
+    spineFile.seek(0);
+    for (int i = 0; i < spineCount; i++) {
+      auto entry = readSpineEntry(spineFile);
+      std::string path = FsHelpers::normalisePath(entry.href);
+
+      ZipFile::SizeTarget t;
+      t.hash = ZipFile::fnvHash64(path.c_str(), path.size());
+      t.len = static_cast<uint16_t>(path.size());
+      t.index = static_cast<uint16_t>(i);
+      targets.push_back(t);
+    }
+
+    std::sort(targets.begin(), targets.end(), [](const ZipFile::SizeTarget& a, const ZipFile::SizeTarget& b) {
+      return a.hash < b.hash || (a.hash == b.hash && a.len < b.len);
+    });
+
+    spineSizes.resize(spineCount, 0);
+    int matched = zip.fillUncompressedSizes(targets, spineSizes);
+    Serial.printf("[%lu] [BMC] Batch lookup matched %d/%d spine items\n", millis(), matched, spineCount);
+
+    targets.clear();
+    targets.shrink_to_fit();
+
+    useBatchSizes = true;
+  }
+
   uint32_t cumSize = 0;
   spineFile.seek(0);
   int lastSpineTocIndex = -1;
@@ -207,16 +244,25 @@ bool BookMetadataCache::buildBookBin(const std::string& epubPath, const BookMeta
     }
     lastSpineTocIndex = spineEntry.tocIndex;

-    // Calculate size for cumulative size
     size_t itemSize = 0;
-    const std::string path = FsHelpers::normalisePath(spineEntry.href);
-    if (zip.getInflatedFileSize(path.c_str(), &itemSize)) {
-      cumSize += itemSize;
-      spineEntry.cumulativeSize = cumSize;
+    if (useBatchSizes) {
+      itemSize = spineSizes[i];
+      if (itemSize == 0) {
+        const std::string path = FsHelpers::normalisePath(spineEntry.href);
+        if (!zip.getInflatedFileSize(path.c_str(), &itemSize)) {
+          Serial.printf("[%lu] [BMC] Warning: Could not get size for spine item: %s\n", millis(), path.c_str());
+        }
+      }
     } else {
-      Serial.printf("[%lu] [BMC] Warning: Could not get size for spine item: %s\n", millis(), path.c_str());
+      const std::string path = FsHelpers::normalisePath(spineEntry.href);
+      if (!zip.getInflatedFileSize(path.c_str(), &itemSize)) {
+        Serial.printf("[%lu] [BMC] Warning: Could not get size for spine item: %s\n", millis(), path.c_str());
+      }
     }
+    cumSize += itemSize;
+    spineEntry.cumulativeSize = cumSize;
+
     // Write out spine data to book.bin
     writeSpineEntry(bookFile, spineEntry);
   }

@@ -292,11 +338,11 @@ void BookMetadataCache::createTocEntry(const std::string& title, const std::stri
   uint64_t targetHash = fnvHash64(href);
   uint16_t targetLen = static_cast<uint16_t>(href.size());

-  auto it = std::lower_bound(spineHrefIndex.begin(), spineHrefIndex.end(),
-                             SpineHrefIndexEntry{targetHash, targetLen, 0},
-                             [](const SpineHrefIndexEntry& a, const SpineHrefIndexEntry& b) {
-                               return a.hrefHash < b.hrefHash || (a.hrefHash == b.hrefHash && a.hrefLen < b.hrefLen);
-                             });
+  auto it =
+      std::lower_bound(spineHrefIndex.begin(), spineHrefIndex.end(), SpineHrefIndexEntry{targetHash, targetLen, 0},
+                       [](const SpineHrefIndexEntry& a, const SpineHrefIndexEntry& b) {
+                         return a.hrefHash < b.hrefHash || (a.hrefHash == b.hrefHash && a.hrefLen < b.hrefLen);
+                       });

   while (it != spineHrefIndex.end() && it->hrefHash == targetHash && it->hrefLen == targetLen) {
     spineIndex = it->spineIndex;
diff --git a/lib/Epub/Epub/BookMetadataCache.h b/lib/Epub/Epub/BookMetadataCache.h
index 180f1b3..b5ac938 100644
--- a/lib/Epub/Epub/BookMetadataCache.h
+++ b/lib/Epub/Epub/BookMetadataCache.h
@@ -58,8 +58,8 @@ class BookMetadataCache {

   // Index for fast href→spineIndex lookup (used only for large EPUBs)
   struct SpineHrefIndexEntry {
-    uint64_t hrefHash; // FNV-1a 64-bit hash
-    uint16_t hrefLen; // length for collision reduction
+    uint64_t hrefHash;  // FNV-1a 64-bit hash
+    uint16_t hrefLen;   // length for collision reduction
     int16_t spineIndex;
   };
   std::vector<SpineHrefIndexEntry> spineHrefIndex;
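The `buildBookBin()` change above is the caller side of the new batch API: hash every spine href once, sort the targets, hand them to `ZipFile::fillUncompressedSizes()`, then fall back to `getInflatedFileSize()` for any entry still at zero. A minimal host-side sketch of that flow follows; the stub and the sample paths are illustrative, not from the repository.

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

// Mirrors ZipFile::SizeTarget from this patch.
struct SizeTarget {
  uint64_t hash;
  uint16_t len;
  uint16_t index;
};

// Same constants as ZipFile::fnvHash64 (FNV-1a 64-bit).
static uint64_t fnvHash64(const char* s, size_t len) {
  uint64_t h = 14695981039346656037ull;
  for (size_t i = 0; i < len; i++) {
    h ^= static_cast<uint8_t>(s[i]);
    h *= 1099511628211ull;
  }
  return h;
}

// Stand-in for ZipFile::fillUncompressedSizes(); on device this is the
// single central-directory scan.
static int fillUncompressedSizesStub(const std::vector<SizeTarget>&, std::vector<uint32_t>& sizes) {
  for (auto& s : sizes) s = 1234;  // pretend every entry matched
  return static_cast<int>(sizes.size());
}

int main() {
  // Stand-ins for the normalised spine hrefs read back from spineFile.
  const std::vector<std::string> spine = {"OEBPS/ch0001.xhtml", "OEBPS/ch0002.xhtml"};

  // Pass 1: hash every target path; no archive I/O yet.
  std::vector<SizeTarget> targets;
  targets.reserve(spine.size());
  for (size_t i = 0; i < spine.size(); i++) {
    targets.push_back({fnvHash64(spine[i].c_str(), spine[i].size()),
                       static_cast<uint16_t>(spine[i].size()), static_cast<uint16_t>(i)});
  }

  // fillUncompressedSizes() requires targets sorted by (hash, len).
  std::sort(targets.begin(), targets.end(), [](const SizeTarget& a, const SizeTarget& b) {
    return a.hash < b.hash || (a.hash == b.hash && a.len < b.len);
  });

  // sizes[target.index] is filled during the scan; a zero afterwards means
  // "not matched", which buildBookBin() resolves with a per-item fallback.
  std::vector<uint32_t> sizes(spine.size(), 0);
  int matched = fillUncompressedSizesStub(targets, sizes);
  printf("matched %d of %zu\n", matched, spine.size());
  return 0;
}
```

Note that a legitimately zero-byte spine item also triggers the fallback path, which is harmless: the per-item lookup simply confirms the size.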
diff --git a/lib/Epub/Epub/parsers/ContentOpfParser.cpp b/lib/Epub/Epub/parsers/ContentOpfParser.cpp
index 4b64d7c..c6cdec4 100644
--- a/lib/Epub/Epub/parsers/ContentOpfParser.cpp
+++ b/lib/Epub/Epub/parsers/ContentOpfParser.cpp
@@ -133,13 +133,12 @@ void XMLCALL ContentOpfParser::startElement(void* userData, const XML_Char* name
         "[%lu] [COF] Couldn't open temp items file for reading. This is probably going to be a fatal error.\n",
         millis());
   }
-
+
   // Sort item index for binary search if we have enough items
   if (self->itemIndex.size() >= LARGE_SPINE_THRESHOLD) {
-    std::sort(self->itemIndex.begin(), self->itemIndex.end(),
-              [](const ItemIndexEntry& a, const ItemIndexEntry& b) {
-                return a.idHash < b.idHash || (a.idHash == b.idHash && a.idLen < b.idLen);
-              });
+    std::sort(self->itemIndex.begin(), self->itemIndex.end(), [](const ItemIndexEntry& a, const ItemIndexEntry& b) {
+      return a.idHash < b.idHash || (a.idHash == b.idHash && a.idLen < b.idLen);
+    });
     self->useItemIndex = true;
     Serial.printf("[%lu] [COF] Using fast index for %zu manifest items\n", millis(), self->itemIndex.size());
   }
@@ -252,10 +251,10 @@ void XMLCALL ContentOpfParser::startElement(void* userData, const XML_Char* name
       uint16_t targetLen = static_cast<uint16_t>(idref.size());

       auto it = std::lower_bound(self->itemIndex.begin(), self->itemIndex.end(),
-          ItemIndexEntry{targetHash, targetLen, 0},
-          [](const ItemIndexEntry& a, const ItemIndexEntry& b) {
-            return a.idHash < b.idHash || (a.idHash == b.idHash && a.idLen < b.idLen);
-          });
+                                 ItemIndexEntry{targetHash, targetLen, 0},
+                                 [](const ItemIndexEntry& a, const ItemIndexEntry& b) {
+                                   return a.idHash < b.idHash || (a.idHash == b.idHash && a.idLen < b.idLen);
+                                 });

       // Check for match (may need to check a few due to hash collisions)
       while (it != self->itemIndex.end() && it->idHash == targetHash) {
diff --git a/lib/Epub/Epub/parsers/ContentOpfParser.h b/lib/Epub/Epub/parsers/ContentOpfParser.h
index 2e7e3a4..1253eae 100644
--- a/lib/Epub/Epub/parsers/ContentOpfParser.h
+++ b/lib/Epub/Epub/parsers/ContentOpfParser.h
@@ -1,8 +1,8 @@
 #pragma once

 #include
-#include
 #include
+#include

 #include "Epub.h"
 #include "expat.h"
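This parser and `BookMetadataCache::createTocEntry()` share the same index shape: a sorted vector keyed by the (hash, len) pair, probed with `std::lower_bound`, then a short linear walk over entries with an equal hash in case of 64-bit collisions. A self-contained sketch of the pattern, with illustrative names:

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <string>
#include <vector>

// Shape shared by ItemIndexEntry and SpineHrefIndexEntry: the key is the
// (hash, len) pair rather than the string, so the index stays small in RAM.
struct IndexEntry {
  uint64_t hash;
  uint16_t len;
  int16_t value;  // e.g. a spine index
};

static bool keyLess(const IndexEntry& a, const IndexEntry& b) {
  return a.hash < b.hash || (a.hash == b.hash && a.len < b.len);
}

static uint64_t fnvHash64(const std::string& s) {
  uint64_t h = 14695981039346656037ull;
  for (char c : s) {
    h ^= static_cast<uint8_t>(c);
    h *= 1099511628211ull;
  }
  return h;
}

// Returns the value stored for key, or -1. Entries that collide on the hash
// sit adjacent after sorting, so the walk past lower_bound covers them.
static int16_t lookup(const std::vector<IndexEntry>& index, const std::string& key) {
  const IndexEntry probe{fnvHash64(key), static_cast<uint16_t>(key.size()), 0};
  for (auto it = std::lower_bound(index.begin(), index.end(), probe, keyLess);
       it != index.end() && it->hash == probe.hash; ++it) {
    if (it->len == probe.len) return it->value;
  }
  return -1;
}

int main() {
  std::vector<IndexEntry> index = {
      {fnvHash64("ch1.xhtml"), 9, 0},
      {fnvHash64("ch2.xhtml"), 9, 1},
  };
  std::sort(index.begin(), index.end(), keyLess);
  assert(lookup(index, "ch2.xhtml") == 1);
  assert(lookup(index, "nope.xhtml") == -1);
  return 0;
}
```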
diff --git a/lib/ZipFile/ZipFile.cpp b/lib/ZipFile/ZipFile.cpp
index fec8b9f..a5f65ea 100644
--- a/lib/ZipFile/ZipFile.cpp
+++ b/lib/ZipFile/ZipFile.cpp
@@ -4,6 +4,8 @@
 #include
 #include

+#include <algorithm>
+
 bool inflateOneShot(const uint8_t* inputBuf, const size_t deflatedSize, uint8_t* outputBuf, const size_t inflatedSize) {
   // Setup inflator
   const auto inflator = static_cast<tinfl_decompressor*>(malloc(sizeof(tinfl_decompressor)));
@@ -302,6 +304,80 @@ bool ZipFile::getInflatedFileSize(const char* filename, size_t* size) {
   return true;
 }

+int ZipFile::fillUncompressedSizes(std::vector<SizeTarget>& targets, std::vector<uint32_t>& sizes) {
+  if (targets.empty()) {
+    return 0;
+  }
+
+  const bool wasOpen = isOpen();
+  if (!wasOpen && !open()) {
+    return 0;
+  }
+
+  if (!loadZipDetails()) {
+    if (!wasOpen) {
+      close();
+    }
+    return 0;
+  }
+
+  file.seek(zipDetails.centralDirOffset);
+
+  int matched = 0;
+  uint32_t sig;
+  char itemName[256];
+
+  while (file.available()) {
+    file.read(&sig, 4);
+    if (sig != 0x02014b50) break;
+
+    file.seekCur(6);
+    uint16_t method;
+    file.read(&method, 2);
+    file.seekCur(8);
+    uint32_t compressedSize, uncompressedSize;
+    file.read(&compressedSize, 4);
+    file.read(&uncompressedSize, 4);
+    uint16_t nameLen, m, k;
+    file.read(&nameLen, 2);
+    file.read(&m, 2);
+    file.read(&k, 2);
+    file.seekCur(8);
+    uint32_t localHeaderOffset;
+    file.read(&localHeaderOffset, 4);
+
+    if (nameLen < 256) {
+      file.read(itemName, nameLen);
+      itemName[nameLen] = '\0';
+
+      uint64_t hash = fnvHash64(itemName, nameLen);
+      SizeTarget key = {hash, nameLen, 0};
+
+      auto it = std::lower_bound(targets.begin(), targets.end(), key, [](const SizeTarget& a, const SizeTarget& b) {
+        return a.hash < b.hash || (a.hash == b.hash && a.len < b.len);
+      });
+
+      while (it != targets.end() && it->hash == hash && it->len == nameLen) {
+        if (it->index < sizes.size()) {
+          sizes[it->index] = uncompressedSize;
+          matched++;
+        }
+        ++it;
+      }
+    } else {
+      file.seekCur(nameLen);
+    }
+
+    file.seekCur(m + k);
+  }
+
+  if (!wasOpen) {
+    close();
+  }
+
+  return matched;
+}
+
 uint8_t* ZipFile::readFileToMemory(const char* filename, size_t* size, const bool trailingNullByte) {
   const bool wasOpen = isOpen();
   if (!wasOpen && !open()) {
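`fillUncompressedSizes()` hard-codes its `read()`/`seekCur()` strides against the ZIP central-directory file header layout (PKWARE APPNOTE, section 4.3.12). The following host-compilable reference records those offsets and the arithmetic the skips rely on; the constant names are illustrative, not from the project.

```cpp
#include <cstddef>
#include <cstdint>

// ZIP central-directory file header field offsets (fixed 46-byte prefix).
constexpr uint32_t kCdSig = 0x02014b50;   // loop guard in the patch
constexpr size_t kVersionMadeBy = 4;      // start of the seekCur(6) skip
constexpr size_t kMethod = 10;            // read (2 bytes, currently unused)
constexpr size_t kModTime = 12;           // start of the first seekCur(8) skip
constexpr size_t kCompressedSize = 20;    // read (4 bytes, currently unused)
constexpr size_t kUncompressedSize = 24;  // read (4 bytes) -> sizes[target.index]
constexpr size_t kNameLen = 28;           // read (2 bytes)
constexpr size_t kExtraLen = 30;          // read (2 bytes), 'm' in the patch
constexpr size_t kCommentLen = 32;        // read (2 bytes), 'k' in the patch
constexpr size_t kDiskStart = 34;         // start of the second seekCur(8) skip
constexpr size_t kLocalHeaderOfs = 42;    // read (4 bytes, currently unused)
constexpr size_t kFileName = 46;          // variable tail: name, extra, comment

// The skips in the patch line up with the gaps between the fields it reads.
static_assert(kMethod == kVersionMadeBy + 6, "seekCur(6) spans versions + flags");
static_assert(kCompressedSize == kMethod + 2 + 8, "seekCur(8) spans time, date, crc-32");
static_assert(kLocalHeaderOfs == kDiskStart + 8, "seekCur(8) spans disk no. + attributes");
static_assert(kFileName == kLocalHeaderOfs + 4, "fixed header is 46 bytes");

int main() { return 0; }
```

One behavioural consequence worth noting: names of 256 bytes or longer are skipped rather than truncated, so they can never match a target and always fall through to the per-item lookup.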
diff --git a/lib/ZipFile/ZipFile.h b/lib/ZipFile/ZipFile.h
index b895881..0c82e5a 100644
--- a/lib/ZipFile/ZipFile.h
+++ b/lib/ZipFile/ZipFile.h
@@ -3,6 +3,7 @@

 #include
 #include
+#include <vector>

 class ZipFile {
  public:
@@ -19,6 +20,23 @@ class ZipFile {
     bool isSet;
   };

+  // Target for batch uncompressed size lookup (sorted by hash, then len)
+  struct SizeTarget {
+    uint64_t hash;   // FNV-1a 64-bit hash of normalized path
+    uint16_t len;    // Length of path for collision reduction
+    uint16_t index;  // Caller's index (e.g. spine index)
+  };
+
+  // FNV-1a 64-bit hash computed from char buffer (no std::string allocation)
+  static uint64_t fnvHash64(const char* s, size_t len) {
+    uint64_t hash = 14695981039346656037ull;
+    for (size_t i = 0; i < len; i++) {
+      hash ^= static_cast<uint8_t>(s[i]);
+      hash *= 1099511628211ull;
+    }
+    return hash;
+  }
+
  private:
   const std::string& filePath;
   FsFile file;
@@ -43,6 +61,10 @@ class ZipFile {
   bool close();
   bool loadAllFileStatSlims();
   bool getInflatedFileSize(const char* filename, size_t* size);
+  // Batch lookup: scan ZIP central dir once and fill sizes for matching targets.
+  // targets must be sorted by (hash, len). sizes[target.index] receives uncompressedSize.
+  // Returns number of targets matched.
+  int fillUncompressedSizes(std::vector<SizeTarget>& targets, std::vector<uint32_t>& sizes);
   // Due to the memory required to run each of these, it is recommended to not preopen the zip file for multiple
   // These functions will open and close the zip as needed
   uint8_t* readFileToMemory(const char* filename, size_t* size = nullptr, bool trailingNullByte = false);
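Since `fnvHash64()` is an inline function in the header, it can be sanity-checked off-target with published FNV-1a 64-bit vectors. A host-side check follows, assuming the byte-wise `static_cast<uint8_t>` shown above; the duplicate of the function exists only so the snippet compiles standalone.

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>

// Duplicate of the inline ZipFile::fnvHash64, for standalone compilation.
static uint64_t fnvHash64(const char* s, size_t len) {
  uint64_t hash = 14695981039346656037ull;
  for (size_t i = 0; i < len; i++) {
    hash ^= static_cast<uint8_t>(s[i]);
    hash *= 1099511628211ull;
  }
  return hash;
}

int main() {
  // FNV-1a 64-bit reference vectors (Fowler/Noll/Vo test tables).
  assert(fnvHash64("", 0) == 0xcbf29ce484222325ull);  // offset basis
  assert(fnvHash64("a", 1) == 0xaf63dc4c8601ec8cull);
  // len travels in SizeTarget alongside the hash: a 64-bit collision is
  // unlikely, but a differing length rules a candidate out for free.
  assert(fnvHash64("ch1.xhtml", 9) != fnvHash64("ch1.xhtm", 8));
  return 0;
}
```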