perf: optimize large EPUB indexing from O(n²) to O(n log n)

Three optimizations for EPUBs with many chapters (e.g. 2768 chapters):

1. OPF idref→href lookup: Build sorted hash index during manifest parsing,
   use binary search during spine resolution. Reduces ~4min to ~30-60s.

2. TOC href→spineIndex lookup: Build sorted hash index in beginTocPass(),
   use binary search in createTocEntry(). Reduces ~4min to ~30-60s.

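3. ZIP central-dir cursor: Resume scanning from the position after the last
   match instead of restarting from the beginning; if the end of the central
   directory is reached without a match, wrap around once and scan up to the
   starting position. Reduces ~8min to ~1-3min.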

All optimizations only activate for large EPUBs (≥400 spine items).
Small books use unchanged code paths.

Memory impact: ~33KB + ~39KB temporary during indexing, freed after.
Expected total: ~17min → ~3-5min for Shadow Slave (2768 chapters).

Also adds phase timing logs for performance measurement.
This commit is contained in:
Daniel
2026-01-20 23:35:54 -08:00
committed by cottongin
parent 51a4faddd4
commit cf16d33710
7 changed files with 225 additions and 31 deletions

View File

@@ -74,6 +74,10 @@ bool ZipFile::loadAllFileStatSlims() {
file.seekCur(m + k);
}
// Set cursor to start of central directory for sequential access
lastCentralDirPos = zipDetails.centralDirOffset;
lastCentralDirPosValid = true;
if (!wasOpen) {
close();
}
@@ -102,15 +106,35 @@ bool ZipFile::loadFileStatSlim(const char* filename, FileStatSlim* fileStat) {
return false;
}
file.seek(zipDetails.centralDirOffset);
// Phase 1: Try scanning from cursor position first
uint32_t startPos = lastCentralDirPosValid ? lastCentralDirPos : zipDetails.centralDirOffset;
uint32_t wrapPos = zipDetails.centralDirOffset;
bool wrapped = false;
bool found = false;
file.seek(startPos);
uint32_t sig;
char itemName[256];
bool found = false;
while (file.available()) {
file.read(&sig, 4);
if (sig != 0x02014b50) break; // End of list
while (true) {
uint32_t entryStart = file.position();
if (file.read(&sig, 4) != 4 || sig != 0x02014b50) {
// End of central directory
if (!wrapped && lastCentralDirPosValid && startPos != zipDetails.centralDirOffset) {
// Wrap around to beginning
file.seek(zipDetails.centralDirOffset);
wrapped = true;
continue;
}
break;
}
// If we've wrapped and reached our start position, stop
if (wrapped && entryStart >= startPos) {
break;
}
file.seekCur(6);
file.read(&fileStat->method, 2);
@@ -123,15 +147,25 @@ bool ZipFile::loadFileStatSlim(const char* filename, FileStatSlim* fileStat) {
file.read(&k, 2);
file.seekCur(8);
file.read(&fileStat->localHeaderOffset, 4);
file.read(itemName, nameLen);
itemName[nameLen] = '\0';
if (strcmp(itemName, filename) == 0) {
found = true;
break;
if (nameLen < 256) {
file.read(itemName, nameLen);
itemName[nameLen] = '\0';
if (strcmp(itemName, filename) == 0) {
// Found it! Update cursor to next entry
file.seekCur(m + k);
lastCentralDirPos = file.position();
lastCentralDirPosValid = true;
found = true;
break;
}
} else {
// Name too long, skip it
file.seekCur(nameLen);
}
// Skip the rest of this entry (extra field + comment)
// Skip extra field + comment
file.seekCur(m + k);
}
@@ -253,6 +287,8 @@ bool ZipFile::close() {
if (file) {
file.close();
}
lastCentralDirPos = 0;
lastCentralDirPosValid = false;
return true;
}