New book.bin spine and table of contents cache (#104)

## Summary

* Use single unified cache file for book spine, table of contents, and
core metadata (title, author, cover image)
* Use new temp item store file in OPF parsing to store items to be
rescaned when parsing spine
  * This avoids us holding these items in memory
* Use new toc.bin.tmp and spine.bin.tmp to build out partial toc / spine
data as part of parsing content.opf and the NCX file
  * These files are re-read multiple times to ultimately build book.bin

## Additional Context

* Spec for file format included below as an image
* This should help with:
  * #10 
  * #60 
  * #99
This commit is contained in:
Dave Allie
2025-12-24 22:36:13 +11:00
committed by GitHub
parent ea0abaf351
commit b6bc1f7ed3
15 changed files with 748 additions and 169 deletions

View File

@@ -0,0 +1,326 @@
#include "BookMetadataCache.h"
#include <HardwareSerial.h>
#include <SD.h>
#include <Serialization.h>
#include <ZipFile.h>
#include <vector>
#include "FsHelpers.h"
namespace {
constexpr uint8_t BOOK_CACHE_VERSION = 1;
constexpr char bookBinFile[] = "/book.bin";
constexpr char tmpSpineBinFile[] = "/spine.bin.tmp";
constexpr char tmpTocBinFile[] = "/toc.bin.tmp";
} // namespace
/* ============= WRITING / BUILDING FUNCTIONS ================ */
bool BookMetadataCache::beginWrite() {
buildMode = true;
spineCount = 0;
tocCount = 0;
Serial.printf("[%lu] [BMC] Entering write mode\n", millis());
return true;
}
bool BookMetadataCache::beginContentOpfPass() {
Serial.printf("[%lu] [BMC] Beginning content opf pass\n", millis());
// Open spine file for writing
return FsHelpers::openFileForWrite("BMC", cachePath + tmpSpineBinFile, spineFile);
}
bool BookMetadataCache::endContentOpfPass() {
spineFile.close();
return true;
}
bool BookMetadataCache::beginTocPass() {
Serial.printf("[%lu] [BMC] Beginning toc pass\n", millis());
// Open spine file for reading
if (!FsHelpers::openFileForRead("BMC", cachePath + tmpSpineBinFile, spineFile)) {
return false;
}
if (!FsHelpers::openFileForWrite("BMC", cachePath + tmpTocBinFile, tocFile)) {
spineFile.close();
return false;
}
return true;
}
bool BookMetadataCache::endTocPass() {
tocFile.close();
spineFile.close();
return true;
}
bool BookMetadataCache::endWrite() {
if (!buildMode) {
Serial.printf("[%lu] [BMC] endWrite called but not in build mode\n", millis());
return false;
}
buildMode = false;
Serial.printf("[%lu] [BMC] Wrote %d spine, %d TOC entries\n", millis(), spineCount, tocCount);
return true;
}
bool BookMetadataCache::buildBookBin(const std::string& epubPath, const BookMetadata& metadata) {
// Open all three files, writing to meta, reading from spine and toc
if (!FsHelpers::openFileForWrite("BMC", cachePath + bookBinFile, bookFile)) {
return false;
}
if (!FsHelpers::openFileForRead("BMC", cachePath + tmpSpineBinFile, spineFile)) {
bookFile.close();
return false;
}
if (!FsHelpers::openFileForRead("BMC", cachePath + tmpTocBinFile, tocFile)) {
bookFile.close();
spineFile.close();
return false;
}
constexpr size_t headerASize =
sizeof(BOOK_CACHE_VERSION) + /* LUT Offset */ sizeof(size_t) + sizeof(spineCount) + sizeof(tocCount);
const size_t metadataSize =
metadata.title.size() + metadata.author.size() + metadata.coverItemHref.size() + sizeof(uint32_t) * 3;
const size_t lutSize = sizeof(size_t) * spineCount + sizeof(size_t) * tocCount;
const size_t lutOffset = headerASize + metadataSize;
// Header A
serialization::writePod(bookFile, BOOK_CACHE_VERSION);
serialization::writePod(bookFile, lutOffset);
serialization::writePod(bookFile, spineCount);
serialization::writePod(bookFile, tocCount);
// Metadata
serialization::writeString(bookFile, metadata.title);
serialization::writeString(bookFile, metadata.author);
serialization::writeString(bookFile, metadata.coverItemHref);
// Loop through spine entries, writing LUT positions
spineFile.seek(0);
for (int i = 0; i < spineCount; i++) {
auto pos = spineFile.position();
auto spineEntry = readSpineEntry(spineFile);
serialization::writePod(bookFile, pos + lutOffset + lutSize);
}
// Loop through toc entries, writing LUT positions
tocFile.seek(0);
for (int i = 0; i < tocCount; i++) {
auto pos = tocFile.position();
auto tocEntry = readTocEntry(tocFile);
serialization::writePod(bookFile, pos + lutOffset + lutSize + spineFile.position());
}
// LUTs complete
// Loop through spines from spine file matching up TOC indexes, calculating cumulative size and writing to book.bin
const ZipFile zip("/sd" + epubPath);
size_t cumSize = 0;
spineFile.seek(0);
for (int i = 0; i < spineCount; i++) {
auto spineEntry = readSpineEntry(spineFile);
tocFile.seek(0);
for (int j = 0; j < tocCount; j++) {
auto tocEntry = readTocEntry(tocFile);
if (tocEntry.spineIndex == i) {
spineEntry.tocIndex = j;
break;
}
}
// Not a huge deal if we don't fine a TOC entry for the spine entry, this is expected behaviour for EPUBs
// Logging here is for debugging
if (spineEntry.tocIndex == -1) {
Serial.printf("[%lu] [BMC] Warning: Could not find TOC entry for spine item %d: %s\n", millis(), i,
spineEntry.href.c_str());
}
// Calculate size for cumulative size
size_t itemSize = 0;
const std::string path = FsHelpers::normalisePath(spineEntry.href);
if (zip.getInflatedFileSize(path.c_str(), &itemSize)) {
cumSize += itemSize;
spineEntry.cumulativeSize = cumSize;
} else {
Serial.printf("[%lu] [BMC] Warning: Could not get size for spine item: %s\n", millis(), path.c_str());
}
// Write out spine data to book.bin
writeSpineEntry(bookFile, spineEntry);
}
// Loop through toc entries from toc file writing to book.bin
tocFile.seek(0);
for (int i = 0; i < tocCount; i++) {
auto tocEntry = readTocEntry(tocFile);
writeTocEntry(bookFile, tocEntry);
}
bookFile.close();
spineFile.close();
tocFile.close();
Serial.printf("[%lu] [BMC] Successfully built book.bin\n", millis());
return true;
}
bool BookMetadataCache::cleanupTmpFiles() const {
if (SD.exists((cachePath + tmpSpineBinFile).c_str())) {
SD.remove((cachePath + tmpSpineBinFile).c_str());
}
if (SD.exists((cachePath + tmpTocBinFile).c_str())) {
SD.remove((cachePath + tmpTocBinFile).c_str());
}
return true;
}
size_t BookMetadataCache::writeSpineEntry(File& file, const SpineEntry& entry) const {
const auto pos = file.position();
serialization::writeString(file, entry.href);
serialization::writePod(file, entry.cumulativeSize);
serialization::writePod(file, entry.tocIndex);
return pos;
}
size_t BookMetadataCache::writeTocEntry(File& file, const TocEntry& entry) const {
const auto pos = file.position();
serialization::writeString(file, entry.title);
serialization::writeString(file, entry.href);
serialization::writeString(file, entry.anchor);
serialization::writePod(file, entry.level);
serialization::writePod(file, entry.spineIndex);
return pos;
}
// Note: for the LUT to be accurate, this **MUST** be called for all spine items before `addTocEntry` is ever called
// this is because in this function we're marking positions of the items
void BookMetadataCache::createSpineEntry(const std::string& href) {
if (!buildMode || !spineFile) {
Serial.printf("[%lu] [BMC] createSpineEntry called but not in build mode\n", millis());
return;
}
const SpineEntry entry(href, 0, -1);
writeSpineEntry(spineFile, entry);
spineCount++;
}
void BookMetadataCache::createTocEntry(const std::string& title, const std::string& href, const std::string& anchor,
const uint8_t level) {
if (!buildMode || !tocFile || !spineFile) {
Serial.printf("[%lu] [BMC] createTocEntry called but not in build mode\n", millis());
return;
}
int spineIndex = -1;
// find spine index
spineFile.seek(0);
for (int i = 0; i < spineCount; i++) {
auto spineEntry = readSpineEntry(spineFile);
if (spineEntry.href == href) {
spineIndex = i;
break;
}
}
if (spineIndex == -1) {
Serial.printf("[%lu] [BMC] addTocEntry: Could not find spine item for TOC href %s\n", millis(), href.c_str());
}
const TocEntry entry(title, href, anchor, level, spineIndex);
writeTocEntry(tocFile, entry);
tocCount++;
}
/* ============= READING / LOADING FUNCTIONS ================ */
bool BookMetadataCache::load() {
if (!FsHelpers::openFileForRead("BMC", cachePath + bookBinFile, bookFile)) {
return false;
}
uint8_t version;
serialization::readPod(bookFile, version);
if (version != BOOK_CACHE_VERSION) {
Serial.printf("[%lu] [BMC] Cache version mismatch: expected %d, got %d\n", millis(), BOOK_CACHE_VERSION, version);
bookFile.close();
return false;
}
serialization::readPod(bookFile, lutOffset);
serialization::readPod(bookFile, spineCount);
serialization::readPod(bookFile, tocCount);
serialization::readString(bookFile, coreMetadata.title);
serialization::readString(bookFile, coreMetadata.author);
serialization::readString(bookFile, coreMetadata.coverItemHref);
loaded = true;
Serial.printf("[%lu] [BMC] Loaded cache data: %d spine, %d TOC entries\n", millis(), spineCount, tocCount);
return true;
}
BookMetadataCache::SpineEntry BookMetadataCache::getSpineEntry(const int index) {
if (!loaded) {
Serial.printf("[%lu] [BMC] getSpineEntry called but cache not loaded\n", millis());
return {};
}
if (index < 0 || index >= static_cast<int>(spineCount)) {
Serial.printf("[%lu] [BMC] getSpineEntry index %d out of range\n", millis(), index);
return {};
}
// Seek to spine LUT item, read from LUT and get out data
bookFile.seek(lutOffset + sizeof(size_t) * index);
size_t spineEntryPos;
serialization::readPod(bookFile, spineEntryPos);
bookFile.seek(spineEntryPos);
return readSpineEntry(bookFile);
}
BookMetadataCache::TocEntry BookMetadataCache::getTocEntry(const int index) {
if (!loaded) {
Serial.printf("[%lu] [BMC] getTocEntry called but cache not loaded\n", millis());
return {};
}
if (index < 0 || index >= static_cast<int>(tocCount)) {
Serial.printf("[%lu] [BMC] getTocEntry index %d out of range\n", millis(), index);
return {};
}
// Seek to TOC LUT item, read from LUT and get out data
bookFile.seek(lutOffset + sizeof(size_t) * spineCount + sizeof(size_t) * index);
size_t tocEntryPos;
serialization::readPod(bookFile, tocEntryPos);
bookFile.seek(tocEntryPos);
return readTocEntry(bookFile);
}
BookMetadataCache::SpineEntry BookMetadataCache::readSpineEntry(File& file) const {
SpineEntry entry;
serialization::readString(file, entry.href);
serialization::readPod(file, entry.cumulativeSize);
serialization::readPod(file, entry.tocIndex);
return entry;
}
BookMetadataCache::TocEntry BookMetadataCache::readTocEntry(File& file) const {
TocEntry entry;
serialization::readString(file, entry.title);
serialization::readString(file, entry.href);
serialization::readString(file, entry.anchor);
serialization::readPod(file, entry.level);
serialization::readPod(file, entry.spineIndex);
return entry;
}

View File

@@ -0,0 +1,87 @@
#pragma once
#include <SD.h>
#include <string>
class BookMetadataCache {
public:
struct BookMetadata {
std::string title;
std::string author;
std::string coverItemHref;
};
struct SpineEntry {
std::string href;
size_t cumulativeSize;
int16_t tocIndex;
SpineEntry() : cumulativeSize(0), tocIndex(-1) {}
SpineEntry(std::string href, const size_t cumulativeSize, const int16_t tocIndex)
: href(std::move(href)), cumulativeSize(cumulativeSize), tocIndex(tocIndex) {}
};
struct TocEntry {
std::string title;
std::string href;
std::string anchor;
uint8_t level;
int16_t spineIndex;
TocEntry() : level(0), spineIndex(-1) {}
TocEntry(std::string title, std::string href, std::string anchor, const uint8_t level, const int16_t spineIndex)
: title(std::move(title)),
href(std::move(href)),
anchor(std::move(anchor)),
level(level),
spineIndex(spineIndex) {}
};
private:
std::string cachePath;
size_t lutOffset;
uint16_t spineCount;
uint16_t tocCount;
bool loaded;
bool buildMode;
File bookFile;
// Temp file handles during build
File spineFile;
File tocFile;
size_t writeSpineEntry(File& file, const SpineEntry& entry) const;
size_t writeTocEntry(File& file, const TocEntry& entry) const;
SpineEntry readSpineEntry(File& file) const;
TocEntry readTocEntry(File& file) const;
public:
BookMetadata coreMetadata;
explicit BookMetadataCache(std::string cachePath)
: cachePath(std::move(cachePath)), lutOffset(0), spineCount(0), tocCount(0), loaded(false), buildMode(false) {}
~BookMetadataCache() = default;
// Building phase (stream to disk immediately)
bool beginWrite();
bool beginContentOpfPass();
void createSpineEntry(const std::string& href);
bool endContentOpfPass();
bool beginTocPass();
void createTocEntry(const std::string& title, const std::string& href, const std::string& anchor, uint8_t level);
bool endTocPass();
bool endWrite();
bool cleanupTmpFiles() const;
// Post-processing to update mappings and sizes
bool buildBookBin(const std::string& epubPath, const BookMetadata& metadata);
// Reading phase (read mode)
bool load();
SpineEntry getSpineEntry(int index);
TocEntry getTocEntry(int index);
int getSpineCount() const { return spineCount; }
int getTocCount() const { return tocCount; }
bool isLoaded() const { return loaded; }
};

View File

@@ -1,10 +0,0 @@
#pragma once
#include <string>
struct EpubTocEntry {
std::string title;
std::string href;
std::string anchor;
uint8_t level;
};

View File

@@ -0,0 +1,92 @@
#include "FsHelpers.h"
#include <SD.h>
#include <vector>
bool FsHelpers::openFileForRead(const char* moduleName, const std::string& path, File& file) {
file = SD.open(path.c_str(), FILE_READ);
if (!file) {
Serial.printf("[%lu] [%s] Failed to open file for reading: %s\n", millis(), moduleName, path.c_str());
return false;
}
return true;
}
bool FsHelpers::openFileForWrite(const char* moduleName, const std::string& path, File& file) {
file = SD.open(path.c_str(), FILE_WRITE, true);
if (!file) {
Serial.printf("[%lu] [%s] Failed to open file for writing: %s\n", millis(), moduleName, path.c_str());
return false;
}
return true;
}
bool FsHelpers::removeDir(const char* path) {
// 1. Open the directory
File dir = SD.open(path);
if (!dir) {
return false;
}
if (!dir.isDirectory()) {
return false;
}
File file = dir.openNextFile();
while (file) {
String filePath = path;
if (!filePath.endsWith("/")) {
filePath += "/";
}
filePath += file.name();
if (file.isDirectory()) {
if (!removeDir(filePath.c_str())) {
return false;
}
} else {
if (!SD.remove(filePath.c_str())) {
return false;
}
}
file = dir.openNextFile();
}
return SD.rmdir(path);
}
std::string FsHelpers::normalisePath(const std::string& path) {
std::vector<std::string> components;
std::string component;
for (const auto c : path) {
if (c == '/') {
if (!component.empty()) {
if (component == "..") {
if (!components.empty()) {
components.pop_back();
}
} else {
components.push_back(component);
}
component.clear();
}
} else {
component += c;
}
}
if (!component.empty()) {
components.push_back(component);
}
std::string result;
for (const auto& c : components) {
if (!result.empty()) {
result += "/";
}
result += c;
}
return result;
}

12
lib/Epub/Epub/FsHelpers.h Normal file
View File

@@ -0,0 +1,12 @@
#pragma once
#include <FS.h>
#include <string>
class FsHelpers {
public:
static bool openFileForRead(const char* moduleName, const std::string& path, File& file);
static bool openFileForWrite(const char* moduleName, const std::string& path, File& file);
static bool removeDir(const char* path);
static std::string normalisePath(const std::string& path);
};

View File

@@ -116,8 +116,7 @@ bool Section::clearCache() const {
bool Section::persistPageDataToSD(const int fontId, const float lineCompression, const int marginTop,
const int marginRight, const int marginBottom, const int marginLeft,
const bool extraParagraphSpacing) {
const auto localPath = epub->getSpineItem(spineIndex);
const auto localPath = epub->getSpineItem(spineIndex).href;
const auto tmpHtmlPath = epub->getCachePath() + "/.tmp_" + std::to_string(spineIndex) + ".html";
File tmpHtml;
if (!FsHelpers::openFileForWrite("SCT", tmpHtmlPath, tmpHtml)) {

View File

@@ -1,11 +1,16 @@
#include "ContentOpfParser.h"
#include <FsHelpers.h>
#include <HardwareSerial.h>
#include <Serialization.h>
#include <ZipFile.h>
#include "../BookMetadataCache.h"
namespace {
constexpr char MEDIA_TYPE_NCX[] = "application/x-dtbncx+xml";
}
constexpr char itemCacheFile[] = "/.items.bin";
} // namespace
bool ContentOpfParser::setup() {
parser = XML_ParserCreate(nullptr);
@@ -28,6 +33,12 @@ ContentOpfParser::~ContentOpfParser() {
XML_ParserFree(parser);
parser = nullptr;
}
if (tempItemStore) {
tempItemStore.close();
}
if (SD.exists((cachePath + itemCacheFile).c_str())) {
SD.remove((cachePath + itemCacheFile).c_str());
}
}
size_t ContentOpfParser::write(const uint8_t data) { return write(&data, 1); }
@@ -94,11 +105,21 @@ void XMLCALL ContentOpfParser::startElement(void* userData, const XML_Char* name
if (self->state == IN_PACKAGE && (strcmp(name, "manifest") == 0 || strcmp(name, "opf:manifest") == 0)) {
self->state = IN_MANIFEST;
if (!FsHelpers::openFileForWrite("COF", self->cachePath + itemCacheFile, self->tempItemStore)) {
Serial.printf(
"[%lu] [COF] Couldn't open temp items file for writing. This is probably going to be a fatal error.\n",
millis());
}
return;
}
if (self->state == IN_PACKAGE && (strcmp(name, "spine") == 0 || strcmp(name, "opf:spine") == 0)) {
self->state = IN_SPINE;
if (!FsHelpers::openFileForRead("COF", self->cachePath + itemCacheFile, self->tempItemStore)) {
Serial.printf(
"[%lu] [COF] Couldn't open temp items file for reading. This is probably going to be a fatal error.\n",
millis());
}
return;
}
@@ -135,7 +156,13 @@ void XMLCALL ContentOpfParser::startElement(void* userData, const XML_Char* name
}
}
self->items[itemId] = href;
// Write items down to SD card
serialization::writeString(self->tempItemStore, itemId);
serialization::writeString(self->tempItemStore, href);
if (itemId == self->coverItemId) {
self->coverItemHref = href;
}
if (mediaType == MEDIA_TYPE_NCX) {
if (self->tocNcxPath.empty()) {
@@ -148,14 +175,29 @@ void XMLCALL ContentOpfParser::startElement(void* userData, const XML_Char* name
return;
}
if (self->state == IN_SPINE && (strcmp(name, "itemref") == 0 || strcmp(name, "opf:itemref") == 0)) {
for (int i = 0; atts[i]; i += 2) {
if (strcmp(atts[i], "idref") == 0) {
self->spineRefs.emplace_back(atts[i + 1]);
break;
// NOTE: This relies on spine appearing after item manifest (which is pretty safe as it's part of the EPUB spec)
// Only run the spine parsing if there's a cache to add it to
if (self->cache) {
if (self->state == IN_SPINE && (strcmp(name, "itemref") == 0 || strcmp(name, "opf:itemref") == 0)) {
for (int i = 0; atts[i]; i += 2) {
if (strcmp(atts[i], "idref") == 0) {
const std::string idref = atts[i + 1];
// Resolve the idref to href using items map
self->tempItemStore.seek(0);
std::string itemId;
std::string href;
while (self->tempItemStore.available()) {
serialization::readString(self->tempItemStore, itemId);
serialization::readString(self->tempItemStore, href);
if (itemId == idref) {
self->cache->createSpineEntry(href);
break;
}
}
}
}
return;
}
return;
}
}
@@ -174,11 +216,13 @@ void XMLCALL ContentOpfParser::endElement(void* userData, const XML_Char* name)
if (self->state == IN_SPINE && (strcmp(name, "spine") == 0 || strcmp(name, "opf:spine") == 0)) {
self->state = IN_PACKAGE;
self->tempItemStore.close();
return;
}
if (self->state == IN_MANIFEST && (strcmp(name, "manifest") == 0 || strcmp(name, "opf:manifest") == 0)) {
self->state = IN_PACKAGE;
self->tempItemStore.close();
return;
}

View File

@@ -1,11 +1,11 @@
#pragma once
#include <Print.h>
#include <map>
#include "Epub.h"
#include "expat.h"
class BookMetadataCache;
class ContentOpfParser final : public Print {
enum ParserState {
START,
@@ -16,10 +16,14 @@ class ContentOpfParser final : public Print {
IN_SPINE,
};
const std::string& cachePath;
const std::string& baseContentPath;
size_t remainingSize;
XML_Parser parser = nullptr;
ParserState state = START;
BookMetadataCache* cache;
File tempItemStore;
std::string coverItemId;
static void startElement(void* userData, const XML_Char* name, const XML_Char** atts);
static void characterData(void* userData, const XML_Char* s, int len);
@@ -28,12 +32,11 @@ class ContentOpfParser final : public Print {
public:
std::string title;
std::string tocNcxPath;
std::string coverItemId;
std::map<std::string, std::string> items;
std::vector<std::string> spineRefs;
std::string coverItemHref;
explicit ContentOpfParser(const std::string& baseContentPath, const size_t xmlSize)
: baseContentPath(baseContentPath), remainingSize(xmlSize) {}
explicit ContentOpfParser(const std::string& cachePath, const std::string& baseContentPath, const size_t xmlSize,
BookMetadataCache* cache)
: cachePath(cachePath), baseContentPath(baseContentPath), remainingSize(xmlSize), cache(cache) {}
~ContentOpfParser() override;
bool setup();

View File

@@ -1,8 +1,9 @@
#include "TocNcxParser.h"
#include <Esp.h>
#include <HardwareSerial.h>
#include "../BookMetadataCache.h"
bool TocNcxParser::setup() {
parser = XML_ParserCreate(nullptr);
if (!parser) {
@@ -167,8 +168,9 @@ void XMLCALL TocNcxParser::endElement(void* userData, const XML_Char* name) {
href = href.substr(0, pos);
}
// Push to vector
self->toc.push_back({std::move(self->currentLabel), std::move(href), std::move(anchor), self->currentDepth});
if (self->cache) {
self->cache->createTocEntry(self->currentLabel, href, anchor, self->currentDepth);
}
// Clear them so we don't re-add them if there are weird XML structures
self->currentLabel.clear();

View File

@@ -1,11 +1,10 @@
#pragma once
#include <Print.h>
#include <expat.h>
#include <string>
#include <vector>
#include "Epub/EpubTocEntry.h"
#include "expat.h"
class BookMetadataCache;
class TocNcxParser final : public Print {
enum ParserState { START, IN_NCX, IN_NAV_MAP, IN_NAV_POINT, IN_NAV_LABEL, IN_NAV_LABEL_TEXT, IN_CONTENT };
@@ -14,6 +13,7 @@ class TocNcxParser final : public Print {
size_t remainingSize;
XML_Parser parser = nullptr;
ParserState state = START;
BookMetadataCache* cache;
std::string currentLabel;
std::string currentSrc;
@@ -24,10 +24,8 @@ class TocNcxParser final : public Print {
static void endElement(void* userData, const XML_Char* name);
public:
std::vector<EpubTocEntry> toc;
explicit TocNcxParser(const std::string& baseContentPath, const size_t xmlSize)
: baseContentPath(baseContentPath), remainingSize(xmlSize) {}
explicit TocNcxParser(const std::string& baseContentPath, const size_t xmlSize, BookMetadataCache* cache)
: baseContentPath(baseContentPath), remainingSize(xmlSize), cache(cache) {}
~TocNcxParser() override;
bool setup();