diff --git a/lib/Epub/Epub.cpp b/lib/Epub/Epub.cpp index 932f8b5..a9e501d 100644 --- a/lib/Epub/Epub.cpp +++ b/lib/Epub/Epub.cpp @@ -280,6 +280,8 @@ bool Epub::load(const bool buildIfMissing) { Serial.printf("[%lu] [EBP] Cache not found, building spine/TOC cache\n", millis()); setupCacheDir(); + const uint32_t indexingStart = millis(); + // Begin building cache - stream entries to disk immediately if (!bookMetadataCache->beginWrite()) { Serial.printf("[%lu] [EBP] Could not begin writing cache\n", millis()); @@ -287,6 +289,7 @@ bool Epub::load(const bool buildIfMissing) { } // OPF Pass + const uint32_t opfStart = millis(); BookMetadataCache::BookMetadata bookMetadata; if (!bookMetadataCache->beginContentOpfPass()) { Serial.printf("[%lu] [EBP] Could not begin writing content.opf pass\n", millis()); @@ -300,8 +303,10 @@ bool Epub::load(const bool buildIfMissing) { Serial.printf("[%lu] [EBP] Could not end writing content.opf pass\n", millis()); return false; } + Serial.printf("[%lu] [EBP] OPF pass completed in %lu ms\n", millis(), millis() - opfStart); // TOC Pass - try EPUB 3 nav first, fall back to NCX + const uint32_t tocStart = millis(); if (!bookMetadataCache->beginTocPass()) { Serial.printf("[%lu] [EBP] Could not begin writing toc pass\n", millis()); return false; @@ -330,6 +335,7 @@ bool Epub::load(const bool buildIfMissing) { Serial.printf("[%lu] [EBP] Could not end writing toc pass\n", millis()); return false; } + Serial.printf("[%lu] [EBP] TOC pass completed in %lu ms\n", millis(), millis() - tocStart); // Close the cache files if (!bookMetadataCache->endWrite()) { @@ -338,10 +344,13 @@ bool Epub::load(const bool buildIfMissing) { } // Build final book.bin + const uint32_t buildStart = millis(); if (!bookMetadataCache->buildBookBin(filepath, bookMetadata)) { Serial.printf("[%lu] [EBP] Could not update mappings and sizes\n", millis()); return false; } + Serial.printf("[%lu] [EBP] buildBookBin completed in %lu ms\n", millis(), millis() - buildStart); + Serial.printf("[%lu] [EBP] Total indexing completed in %lu ms\n", millis(), millis() - indexingStart); if (!bookMetadataCache->cleanupTmpFiles()) { Serial.printf("[%lu] [EBP] Could not cleanup tmp files - ignoring\n", millis()); diff --git a/lib/Epub/Epub/BookMetadataCache.cpp b/lib/Epub/Epub/BookMetadataCache.cpp index 98f86c1..8ba7c5b 100644 --- a/lib/Epub/Epub/BookMetadataCache.cpp +++ b/lib/Epub/Epub/BookMetadataCache.cpp @@ -40,7 +40,6 @@ bool BookMetadataCache::endContentOpfPass() { bool BookMetadataCache::beginTocPass() { Serial.printf("[%lu] [BMC] Beginning toc pass\n", millis()); - // Open spine file for reading if (!SdMan.openFileForRead("BMC", cachePath + tmpSpineBinFile, spineFile)) { return false; } @@ -49,12 +48,40 @@ bool BookMetadataCache::beginTocPass() { return false; } + if (spineCount >= LARGE_SPINE_THRESHOLD) { + spineHrefIndex.clear(); + spineHrefIndex.reserve(spineCount); + spineFile.seek(0); + for (int i = 0; i < spineCount; i++) { + auto entry = readSpineEntry(spineFile); + SpineHrefIndexEntry idx; + idx.hrefHash = fnvHash64(entry.href); + idx.hrefLen = static_cast(entry.href.size()); + idx.spineIndex = static_cast(i); + spineHrefIndex.push_back(idx); + } + std::sort(spineHrefIndex.begin(), spineHrefIndex.end(), + [](const SpineHrefIndexEntry& a, const SpineHrefIndexEntry& b) { + return a.hrefHash < b.hrefHash || (a.hrefHash == b.hrefHash && a.hrefLen < b.hrefLen); + }); + spineFile.seek(0); + useSpineHrefIndex = true; + Serial.printf("[%lu] [BMC] Using fast index for %d spine items\n", millis(), spineCount); + } else { + useSpineHrefIndex = false; + } + return true; } bool BookMetadataCache::endTocPass() { tocFile.close(); spineFile.close(); + + spineHrefIndex.clear(); + spineHrefIndex.shrink_to_fit(); + useSpineHrefIndex = false; + return true; } @@ -260,16 +287,37 @@ void BookMetadataCache::createTocEntry(const std::string& title, const std::stri } int16_t spineIndex = -1; - spineFile.seek(0); - for (int i = 0; i < spineCount; i++) { - auto spineEntry = readSpineEntry(spineFile); - if (spineEntry.href == href) { - spineIndex = static_cast(i); + + if (useSpineHrefIndex) { + uint64_t targetHash = fnvHash64(href); + uint16_t targetLen = static_cast(href.size()); + + auto it = std::lower_bound(spineHrefIndex.begin(), spineHrefIndex.end(), + SpineHrefIndexEntry{targetHash, targetLen, 0}, + [](const SpineHrefIndexEntry& a, const SpineHrefIndexEntry& b) { + return a.hrefHash < b.hrefHash || (a.hrefHash == b.hrefHash && a.hrefLen < b.hrefLen); + }); + + while (it != spineHrefIndex.end() && it->hrefHash == targetHash && it->hrefLen == targetLen) { + spineIndex = it->spineIndex; break; } - } - if (spineIndex == -1) { - Serial.printf("[%lu] [BMC] addTocEntry: Could not find spine item for TOC href %s\n", millis(), href.c_str()); + + if (spineIndex == -1) { + Serial.printf("[%lu] [BMC] createTocEntry: Could not find spine item for TOC href %s\n", millis(), href.c_str()); + } + } else { + spineFile.seek(0); + for (int i = 0; i < spineCount; i++) { + auto spineEntry = readSpineEntry(spineFile); + if (spineEntry.href == href) { + spineIndex = static_cast(i); + break; + } + } + if (spineIndex == -1) { + Serial.printf("[%lu] [BMC] createTocEntry: Could not find spine item for TOC href %s\n", millis(), href.c_str()); + } } const TocEntry entry(title, href, anchor, level, spineIndex); diff --git a/lib/Epub/Epub/BookMetadataCache.h b/lib/Epub/Epub/BookMetadataCache.h index c7e9590..180f1b3 100644 --- a/lib/Epub/Epub/BookMetadataCache.h +++ b/lib/Epub/Epub/BookMetadataCache.h @@ -2,6 +2,7 @@ #include +#include #include #include @@ -55,6 +56,27 @@ class BookMetadataCache { FsFile spineFile; FsFile tocFile; + // Index for fast href→spineIndex lookup (used only for large EPUBs) + struct SpineHrefIndexEntry { + uint64_t hrefHash; // FNV-1a 64-bit hash + uint16_t hrefLen; // length for collision reduction + int16_t spineIndex; + }; + std::vector spineHrefIndex; + bool useSpineHrefIndex = false; + + static constexpr uint16_t LARGE_SPINE_THRESHOLD = 400; + + // FNV-1a 64-bit hash function + static uint64_t fnvHash64(const std::string& s) { + uint64_t hash = 14695981039346656037ull; + for (char c : s) { + hash ^= static_cast(c); + hash *= 1099511628211ull; + } + return hash; + } + uint32_t writeSpineEntry(FsFile& file, const SpineEntry& entry) const; uint32_t writeTocEntry(FsFile& file, const TocEntry& entry) const; SpineEntry readSpineEntry(FsFile& file) const; diff --git a/lib/Epub/Epub/parsers/ContentOpfParser.cpp b/lib/Epub/Epub/parsers/ContentOpfParser.cpp index 3a63a30..4b64d7c 100644 --- a/lib/Epub/Epub/parsers/ContentOpfParser.cpp +++ b/lib/Epub/Epub/parsers/ContentOpfParser.cpp @@ -39,6 +39,9 @@ ContentOpfParser::~ContentOpfParser() { if (SdMan.exists((cachePath + itemCacheFile).c_str())) { SdMan.remove((cachePath + itemCacheFile).c_str()); } + itemIndex.clear(); + itemIndex.shrink_to_fit(); + useItemIndex = false; } size_t ContentOpfParser::write(const uint8_t data) { return write(&data, 1); } @@ -130,6 +133,16 @@ void XMLCALL ContentOpfParser::startElement(void* userData, const XML_Char* name "[%lu] [COF] Couldn't open temp items file for reading. This is probably going to be a fatal error.\n", millis()); } + + // Sort item index for binary search if we have enough items + if (self->itemIndex.size() >= LARGE_SPINE_THRESHOLD) { + std::sort(self->itemIndex.begin(), self->itemIndex.end(), + [](const ItemIndexEntry& a, const ItemIndexEntry& b) { + return a.idHash < b.idHash || (a.idHash == b.idHash && a.idLen < b.idLen); + }); + self->useItemIndex = true; + Serial.printf("[%lu] [COF] Using fast index for %zu manifest items\n", millis(), self->itemIndex.size()); + } return; } @@ -181,6 +194,15 @@ void XMLCALL ContentOpfParser::startElement(void* userData, const XML_Char* name } } + // Record index entry for fast lookup later + if (self->tempItemStore) { + ItemIndexEntry entry; + entry.idHash = fnvHash(itemId); + entry.idLen = static_cast(itemId.size()); + entry.fileOffset = static_cast(self->tempItemStore.position()); + self->itemIndex.push_back(entry); + } + // Write items down to SD card serialization::writeString(self->tempItemStore, itemId); serialization::writeString(self->tempItemStore, href); @@ -221,19 +243,50 @@ void XMLCALL ContentOpfParser::startElement(void* userData, const XML_Char* name for (int i = 0; atts[i]; i += 2) { if (strcmp(atts[i], "idref") == 0) { const std::string idref = atts[i + 1]; - // Resolve the idref to href using items map - // TODO: This lookup is slow as need to scan through all items each time. - // It can take up to 200ms per item when getting to 1500 items. - self->tempItemStore.seek(0); - std::string itemId; std::string href; - while (self->tempItemStore.available()) { - serialization::readString(self->tempItemStore, itemId); - serialization::readString(self->tempItemStore, href); - if (itemId == idref) { - self->cache->createSpineEntry(href); - break; + bool found = false; + + if (self->useItemIndex) { + // Fast path: binary search + uint32_t targetHash = fnvHash(idref); + uint16_t targetLen = static_cast(idref.size()); + + auto it = std::lower_bound(self->itemIndex.begin(), self->itemIndex.end(), + ItemIndexEntry{targetHash, targetLen, 0}, + [](const ItemIndexEntry& a, const ItemIndexEntry& b) { + return a.idHash < b.idHash || (a.idHash == b.idHash && a.idLen < b.idLen); + }); + + // Check for match (may need to check a few due to hash collisions) + while (it != self->itemIndex.end() && it->idHash == targetHash) { + self->tempItemStore.seek(it->fileOffset); + std::string itemId; + serialization::readString(self->tempItemStore, itemId); + if (itemId == idref) { + serialization::readString(self->tempItemStore, href); + found = true; + break; + } + ++it; } + } else { + // Slow path: linear scan (for small manifests, keeps original behavior) + // TODO: This lookup is slow as need to scan through all items each time. + // It can take up to 200ms per item when getting to 1500 items. + self->tempItemStore.seek(0); + std::string itemId; + while (self->tempItemStore.available()) { + serialization::readString(self->tempItemStore, itemId); + serialization::readString(self->tempItemStore, href); + if (itemId == idref) { + found = true; + break; + } + } + } + + if (found && self->cache) { + self->cache->createSpineEntry(href); } } } diff --git a/lib/Epub/Epub/parsers/ContentOpfParser.h b/lib/Epub/Epub/parsers/ContentOpfParser.h index ce0039c..2e7e3a4 100644 --- a/lib/Epub/Epub/parsers/ContentOpfParser.h +++ b/lib/Epub/Epub/parsers/ContentOpfParser.h @@ -2,6 +2,7 @@ #include #include +#include #include "Epub.h" #include "expat.h" @@ -30,6 +31,27 @@ class ContentOpfParser final : public Print { FsFile tempItemStore; std::string coverItemId; + // Index for fast idref→href lookup (used only for large EPUBs) + struct ItemIndexEntry { + uint32_t idHash; // FNV-1a hash of itemId + uint16_t idLen; // length for collision reduction + uint32_t fileOffset; // offset in .items.bin + }; + std::vector itemIndex; + bool useItemIndex = false; + + static constexpr uint16_t LARGE_SPINE_THRESHOLD = 400; + + // FNV-1a hash function + static uint32_t fnvHash(const std::string& s) { + uint32_t hash = 2166136261u; + for (char c : s) { + hash ^= static_cast(c); + hash *= 16777619u; + } + return hash; + } + static void startElement(void* userData, const XML_Char* name, const XML_Char** atts); static void characterData(void* userData, const XML_Char* s, int len); static void endElement(void* userData, const XML_Char* name); diff --git a/lib/ZipFile/ZipFile.cpp b/lib/ZipFile/ZipFile.cpp index 2a97858..fec8b9f 100644 --- a/lib/ZipFile/ZipFile.cpp +++ b/lib/ZipFile/ZipFile.cpp @@ -74,6 +74,10 @@ bool ZipFile::loadAllFileStatSlims() { file.seekCur(m + k); } + // Set cursor to start of central directory for sequential access + lastCentralDirPos = zipDetails.centralDirOffset; + lastCentralDirPosValid = true; + if (!wasOpen) { close(); } @@ -102,15 +106,35 @@ bool ZipFile::loadFileStatSlim(const char* filename, FileStatSlim* fileStat) { return false; } - file.seek(zipDetails.centralDirOffset); + // Phase 1: Try scanning from cursor position first + uint32_t startPos = lastCentralDirPosValid ? lastCentralDirPos : zipDetails.centralDirOffset; + uint32_t wrapPos = zipDetails.centralDirOffset; + bool wrapped = false; + bool found = false; + + file.seek(startPos); uint32_t sig; char itemName[256]; - bool found = false; - while (file.available()) { - file.read(&sig, 4); - if (sig != 0x02014b50) break; // End of list + while (true) { + uint32_t entryStart = file.position(); + + if (file.read(&sig, 4) != 4 || sig != 0x02014b50) { + // End of central directory + if (!wrapped && lastCentralDirPosValid && startPos != zipDetails.centralDirOffset) { + // Wrap around to beginning + file.seek(zipDetails.centralDirOffset); + wrapped = true; + continue; + } + break; + } + + // If we've wrapped and reached our start position, stop + if (wrapped && entryStart >= startPos) { + break; + } file.seekCur(6); file.read(&fileStat->method, 2); @@ -123,15 +147,25 @@ bool ZipFile::loadFileStatSlim(const char* filename, FileStatSlim* fileStat) { file.read(&k, 2); file.seekCur(8); file.read(&fileStat->localHeaderOffset, 4); - file.read(itemName, nameLen); - itemName[nameLen] = '\0'; - if (strcmp(itemName, filename) == 0) { - found = true; - break; + if (nameLen < 256) { + file.read(itemName, nameLen); + itemName[nameLen] = '\0'; + + if (strcmp(itemName, filename) == 0) { + // Found it! Update cursor to next entry + file.seekCur(m + k); + lastCentralDirPos = file.position(); + lastCentralDirPosValid = true; + found = true; + break; + } + } else { + // Name too long, skip it + file.seekCur(nameLen); } - // Skip the rest of this entry (extra field + comment) + // Skip extra field + comment file.seekCur(m + k); } @@ -253,6 +287,8 @@ bool ZipFile::close() { if (file) { file.close(); } + lastCentralDirPos = 0; + lastCentralDirPosValid = false; return true; } diff --git a/lib/ZipFile/ZipFile.h b/lib/ZipFile/ZipFile.h index 0144ed4..b895881 100644 --- a/lib/ZipFile/ZipFile.h +++ b/lib/ZipFile/ZipFile.h @@ -25,6 +25,10 @@ class ZipFile { ZipDetails zipDetails = {0, 0, false}; std::unordered_map fileStatSlimCache; + // Cursor for sequential central-dir scanning optimization + uint32_t lastCentralDirPos = 0; + bool lastCentralDirPosValid = false; + bool loadFileStatSlim(const char* filename, FileStatSlim* fileStat); long getDataOffset(const FileStatSlim& fileStat); bool loadZipDetails();