perf: optimize large EPUB indexing from O(n²) to O(n)
Replace O(n²) lookups with O(n) preprocessing:

1. createTocEntry(): Build href->spineIndex map once in beginTocPass() instead of scanning the spine file for every TOC entry
2. buildBookBin(): Build spineIndex->tocIndex vector in a single pass instead of scanning the TOC file for every spine entry

For 2768-chapter EPUBs, this reduces:

- TOC pass: from ~7.6M file reads to ~5.5K reads
- buildBookBin: from ~7.6M file reads to ~5.5K reads

Memory impact: ~80KB for href map (acceptable trade-off for 10x+ speedup)
parent 481b8210fb
commit a91bb0b1b8
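Where the quoted numbers come from, assuming spine and TOC counts are both roughly 2768: the old nested scans read about 2768 × 2768 ≈ 7.66M entries per pass, while the new shape is one preprocessing pass plus one main pass, about 2 × 2768 ≈ 5.5K reads. The ~80KB map estimate works out to about 29 bytes per entry, plausible for a short chapter href plus std::unordered_map bookkeeping.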
@@ -48,12 +48,24 @@ bool BookMetadataCache::beginTocPass() {
     spineFile.close();
     return false;
   }
+
+  // Build href->spineIndex lookup map for O(1) access during TOC creation
+  hrefToSpineIndex.clear();
+  hrefToSpineIndex.reserve(spineCount);
+  spineFile.seek(0);
+  for (int i = 0; i < spineCount; i++) {
+    auto entry = readSpineEntry(spineFile);
+    hrefToSpineIndex[entry.href] = static_cast<int16_t>(i);
+  }
+  spineFile.seek(0);
+
   return true;
 }
 
 bool BookMetadataCache::endTocPass() {
   tocFile.close();
   spineFile.close();
+  hrefToSpineIndex.clear();
   return true;
 }
 
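The beginTocPass() change is a one-time inversion of the spine list into a hash map. Below is a minimal standalone sketch of that pattern; the SpineEntry struct and sample hrefs are hypothetical stand-ins for the entries streamed from spineFile:

#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

// Hypothetical in-memory stand-in for the entries streamed from spineFile.
struct SpineEntry {
    std::string href;
};

int main() {
    std::vector<SpineEntry> spine = {{"ch1.xhtml"}, {"ch2.xhtml"}, {"ch3.xhtml"}};

    // One O(n) pass over the spine replaces a full scan per TOC entry.
    std::unordered_map<std::string, int16_t> hrefToSpineIndex;
    hrefToSpineIndex.reserve(spine.size());
    for (size_t i = 0; i < spine.size(); i++) {
        hrefToSpineIndex[spine[i].href] = static_cast<int16_t>(i);
    }

    // Each TOC href now resolves in O(1) instead of O(n).
    auto it = hrefToSpineIndex.find("ch2.xhtml");
    if (it != hrefToSpineIndex.end()) {
        std::cout << "ch2.xhtml -> spine index " << it->second << "\n";
    }
    return 0;
}

The reserve(spineCount) in the real code avoids rehashing while the map is built, and the trailing spineFile.seek(0) presumably rewinds the file for the pass that follows.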
@@ -134,6 +146,18 @@ bool BookMetadataCache::buildBookBin(const std::string& epubPath, const BookMeta
   // LUTs complete
   // Loop through spines from spine file matching up TOC indexes, calculating cumulative size and writing to book.bin
 
+  // Build spineIndex->tocIndex mapping in one pass (O(n) instead of O(n*m))
+  std::vector<int16_t> spineToTocIndex(spineCount, -1);
+  tocFile.seek(0);
+  for (int j = 0; j < tocCount; j++) {
+    auto tocEntry = readTocEntry(tocFile);
+    if (tocEntry.spineIndex >= 0 && tocEntry.spineIndex < spineCount) {
+      if (spineToTocIndex[tocEntry.spineIndex] == -1) {
+        spineToTocIndex[tocEntry.spineIndex] = static_cast<int16_t>(j);
+      }
+    }
+  }
+
   ZipFile zip(epubPath);
   // Pre-open zip file to speed up size calculations
   if (!zip.open()) {
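The spineToTocIndex build is the same inversion in vector form. A sketch with an in-memory TOC; the TocEntry struct and sample values are hypothetical, whereas the real code streams entries from tocFile:

#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical in-memory stand-in for the entries streamed from tocFile.
struct TocEntry {
    int16_t spineIndex;
};

int main() {
    const int spineCount = 5;
    // Two TOC entries reference spine 2; the first one wins, as in the real code.
    std::vector<TocEntry> toc = {{2}, {0}, {2}, {4}};

    std::vector<int16_t> spineToTocIndex(spineCount, -1);
    for (size_t j = 0; j < toc.size(); j++) {
        int16_t s = toc[j].spineIndex;
        if (s >= 0 && s < spineCount && spineToTocIndex[s] == -1) {
            spineToTocIndex[s] = static_cast<int16_t>(j);
        }
    }

    for (int i = 0; i < spineCount; i++) {
        std::cout << "spine " << i << " -> toc " << spineToTocIndex[i] << "\n";
    }
    return 0;
}

The == -1 guard keeps the first TOC entry that references a given spine index, which reproduces the break in the inner loop removed by the next hunk.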
@@ -155,14 +179,7 @@ bool BookMetadataCache::buildBookBin(const std::string& epubPath, const BookMeta
   for (int i = 0; i < spineCount; i++) {
     auto spineEntry = readSpineEntry(spineFile);
 
-    tocFile.seek(0);
-    for (int j = 0; j < tocCount; j++) {
-      auto tocEntry = readTocEntry(tocFile);
-      if (tocEntry.spineIndex == i) {
-        spineEntry.tocIndex = j;
-        break;
-      }
-    }
+    spineEntry.tocIndex = spineToTocIndex[i];
 
     // Not a huge deal if we don't find a TOC entry for the spine entry, this is expected behaviour for EPUBs
     // Logging here is for debugging
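With the vector prebuilt, the per-spine work here is a single index lookup, so buildBookBin performs one linear pass over the TOC file and one over the spine file instead of a full TOC scan per spine entry.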
@@ -253,20 +270,11 @@ void BookMetadataCache::createTocEntry(const std::string& title, const std::stri
     return;
   }
 
-  int spineIndex = -1;
-  // find spine index
-  // TODO: This lookup is slow as need to scan through all items each time. We can't hold it all in memory due to size.
-  // But perhaps we can load just the hrefs in a vector/list to do an index lookup?
-  spineFile.seek(0);
-  for (int i = 0; i < spineCount; i++) {
-    auto spineEntry = readSpineEntry(spineFile);
-    if (spineEntry.href == href) {
-      spineIndex = i;
-      break;
-    }
-  }
-
-  if (spineIndex == -1) {
+  int16_t spineIndex = -1;
+  auto it = hrefToSpineIndex.find(href);
+  if (it != hrefToSpineIndex.end()) {
+    spineIndex = it->second;
+  } else {
     Serial.printf("[%lu] [BMC] addTocEntry: Could not find spine item for TOC href %s\n", millis(), href.c_str());
   }
 
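Two details in this hunk: spineIndex becomes int16_t to match the map's value type (capping indices at 32767, ample for the 2768-chapter case), and a lookup miss still emits the same Serial.printf diagnostic as before, now from the else branch.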
@@ -3,6 +3,7 @@
 #include <SDCardManager.h>
 
 #include <string>
+#include <unordered_map>
 #include <vector>
 
 class BookMetadataCache {
@@ -54,6 +55,8 @@ class BookMetadataCache {
   // Temp file handles during build
   FsFile spineFile;
   FsFile tocFile;
+  // Lookup cache for O(1) href->spineIndex during TOC pass
+  std::unordered_map<std::string, int16_t> hrefToSpineIndex;
 
   uint32_t writeSpineEntry(FsFile& file, const SpineEntry& entry) const;
   uint32_t writeTocEntry(FsFile& file, const TocEntry& entry) const;
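Since endTocPass() clears hrefToSpineIndex, the ~80KB cited in the commit message is held only for the duration of the TOC pass, not for the lifetime of the cache object.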