perf: optimize large EPUB indexing from O(n²) to O(n)

Replace O(n²) lookups with O(n) preprocessing:

1. createTocEntry(): Build an href->spineIndex map once in beginTocPass()
   instead of scanning the spine file for every TOC entry

2. buildBookBin(): Build a spineIndex->tocIndex vector in a single pass
   instead of scanning the TOC file for every spine entry

For 2768-chapter EPUBs, this reduces:
- TOC pass: from ~7.6M file reads to ~5.5K reads
- buildBookBin: from ~7.6M file reads to ~5.5K reads

Memory impact: ~80KB for href map (acceptable trade-off for 10x+ speedup)
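
Rough arithmetic behind those figures (assuming ~2768 entries on both the
spine and TOC side; the per-entry byte estimate is an approximation, not
a measured value):

    old:  2768 entries x ~2768 scan reads each              ≈ 7.66M reads per pass
    new:  one 2768-read preprocessing pass + 2768 lookups   ≈ 5.5K reads per pass
    map:  2768 hrefs x (~25 B string + 2 B index + hash overhead) ≈ 80 KB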
Daniel 2026-01-20 12:40:36 -08:00 committed by cottongin
parent 481b8210fb
commit a91bb0b1b8
2 changed files with 33 additions and 22 deletions


@@ -48,12 +48,24 @@ bool BookMetadataCache::beginTocPass() {
         spineFile.close();
         return false;
     }
+    // Build href->spineIndex lookup map for O(1) access during TOC creation
+    hrefToSpineIndex.clear();
+    hrefToSpineIndex.reserve(spineCount);
+    spineFile.seek(0);
+    for (int i = 0; i < spineCount; i++) {
+        auto entry = readSpineEntry(spineFile);
+        hrefToSpineIndex[entry.href] = static_cast<int16_t>(i);
+    }
+    spineFile.seek(0);
     return true;
 }
 
 bool BookMetadataCache::endTocPass() {
     tocFile.close();
     spineFile.close();
+    hrefToSpineIndex.clear();
     return true;
 }
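
For context, these hooks are presumably driven by an indexing loop along
these lines (the caller is not part of this diff, and createTocEntry's full
parameter list is truncated in a later hunk, so the call shape here is
illustrative only):

    BookMetadataCache cache;
    cache.beginTocPass();                    // builds hrefToSpineIndex once: O(spineCount)
    for (const auto& navPoint : navPoints) { // hypothetical TOC iteration
        cache.createTocEntry(navPoint.title, navPoint.href /* , ... */);  // O(1) lookup
    }
    cache.endTocPass();                      // closes temp files, frees the ~80KB map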
@@ -134,6 +146,18 @@ bool BookMetadataCache::buildBookBin(const std::string& epubPath, const BookMeta
     // LUTs complete
     // Loop through spines from spine file matching up TOC indexes, calculating cumulative size and writing to book.bin
+    // Build spineIndex->tocIndex mapping in one pass (O(n) instead of O(n*m))
+    std::vector<int16_t> spineToTocIndex(spineCount, -1);
+    tocFile.seek(0);
+    for (int j = 0; j < tocCount; j++) {
+        auto tocEntry = readTocEntry(tocFile);
+        if (tocEntry.spineIndex >= 0 && tocEntry.spineIndex < spineCount) {
+            if (spineToTocIndex[tocEntry.spineIndex] == -1) {
+                spineToTocIndex[tocEntry.spineIndex] = static_cast<int16_t>(j);
+            }
+        }
+    }
     ZipFile zip(epubPath);
     // Pre-open zip file to speed up size calculations
     if (!zip.open()) {
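
The inner `== -1` guard keeps only the first TOC entry pointing at a given
spine index, preserving the `break` semantics of the per-spine scan removed
in the next hunk. A small hypothetical example of the resulting table:

    // TOC entries: j=0 -> spine 0, j=1 -> spine 2, j=2 -> spine 0 (duplicate)
    // With spineCount = 3: spineToTocIndex = { 0, -1, 1 }
    //   spine 0 -> toc 0  (first match wins; j=2 is ignored)
    //   spine 1 -> -1     (no TOC entry; expected for some spine items)
    //   spine 2 -> toc 1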
@@ -155,14 +179,7 @@ bool BookMetadataCache::buildBookBin(const std::string& epubPath, const BookMeta
     for (int i = 0; i < spineCount; i++) {
         auto spineEntry = readSpineEntry(spineFile);
-        tocFile.seek(0);
-        for (int j = 0; j < tocCount; j++) {
-            auto tocEntry = readTocEntry(tocFile);
-            if (tocEntry.spineIndex == i) {
-                spineEntry.tocIndex = j;
-                break;
-            }
-        }
+        spineEntry.tocIndex = spineToTocIndex[i];
         // Not a huge deal if we don't find a TOC entry for the spine entry, this is expected behaviour for EPUBs
         // Logging here is for debugging
@@ -253,20 +270,11 @@ void BookMetadataCache::createTocEntry(const std::string& title, const std::stri
         return;
     }
-    int spineIndex = -1;
-    // find spine index
-    // TODO: This lookup is slow as need to scan through all items each time. We can't hold it all in memory due to size.
-    // But perhaps we can load just the hrefs in a vector/list to do an index lookup?
-    spineFile.seek(0);
-    for (int i = 0; i < spineCount; i++) {
-        auto spineEntry = readSpineEntry(spineFile);
-        if (spineEntry.href == href) {
-            spineIndex = i;
-            break;
-        }
-    }
-    if (spineIndex == -1) {
+    int16_t spineIndex = -1;
+    auto it = hrefToSpineIndex.find(href);
+    if (it != hrefToSpineIndex.end()) {
+        spineIndex = it->second;
+    } else {
         Serial.printf("[%lu] [BMC] addTocEntry: Could not find spine item for TOC href %s\n", millis(), href.c_str());
     }
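
Stripped of the FsFile/SD specifics, the pattern in both changes is the same:
spend one O(n) pass building an index, then answer each query in O(1). A
minimal standalone sketch in plain C++ (hypothetical data; readSpineEntry and
the real cache types are not used here):

    #include <cstdint>
    #include <cstdio>
    #include <string>
    #include <unordered_map>
    #include <vector>

    int main() {
        // Stand-in for the on-disk spine file: hrefs in spine order
        std::vector<std::string> spineHrefs = {"ch001.xhtml", "ch002.xhtml", "ch003.xhtml"};

        // One pass builds the lookup map (what beginTocPass now does)
        std::unordered_map<std::string, int16_t> hrefToSpineIndex;
        hrefToSpineIndex.reserve(spineHrefs.size());
        for (size_t i = 0; i < spineHrefs.size(); i++) {
            hrefToSpineIndex[spineHrefs[i]] = static_cast<int16_t>(i);
        }

        // Each TOC href now resolves without rescanning the spine
        int16_t spineIndex = -1;
        auto it = hrefToSpineIndex.find("ch002.xhtml");
        if (it != hrefToSpineIndex.end()) {
            spineIndex = it->second;
        }
        std::printf("spineIndex = %d\n", spineIndex);  // prints 1
        return 0;
    }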


@@ -3,6 +3,7 @@
 #include <SDCardManager.h>
 #include <string>
 #include <unordered_map>
+#include <vector>
 
 class BookMetadataCache {
@@ -54,6 +55,8 @@ class BookMetadataCache {
     // Temp file handles during build
     FsFile spineFile;
     FsFile tocFile;
+    // Lookup cache for O(1) href->spineIndex during TOC pass
+    std::unordered_map<std::string, int16_t> hrefToSpineIndex;
 
     uint32_t writeSpineEntry(FsFile& file, const SpineEntry& entry) const;
     uint32_t writeTocEntry(FsFile& file, const TocEntry& entry) const;