New book.bin spine and table of contents cache (#104)

## Summary

* Use single unified cache file for book spine, table of contents, and
core metadata (title, author, cover image)
* Use new temp item store file in OPF parsing to store items to be
rescanned when parsing spine
  * This avoids holding these items in memory
* Use new toc.bin.tmp and spine.bin.tmp to build out partial toc / spine
data as part of parsing content.opf and the NCX file
  * These files are re-read multiple times to ultimately build book.bin

## Additional Context

* Spec for file format included below as an image
* This should help with:
  * #10 
  * #60 
  * #99
This commit is contained in:
Dave Allie
2025-12-24 22:36:13 +11:00
committed by GitHub
parent ea0abaf351
commit b6bc1f7ed3
15 changed files with 748 additions and 169 deletions

View File

@@ -1,11 +1,16 @@
#include "ContentOpfParser.h"
#include <FsHelpers.h>
#include <HardwareSerial.h>
#include <Serialization.h>
#include <ZipFile.h>
#include "../BookMetadataCache.h"
namespace {
constexpr char MEDIA_TYPE_NCX[] = "application/x-dtbncx+xml";
}
constexpr char itemCacheFile[] = "/.items.bin";
} // namespace
bool ContentOpfParser::setup() {
parser = XML_ParserCreate(nullptr);
@@ -28,6 +33,12 @@ ContentOpfParser::~ContentOpfParser() {
XML_ParserFree(parser);
parser = nullptr;
}
if (tempItemStore) {
tempItemStore.close();
}
if (SD.exists((cachePath + itemCacheFile).c_str())) {
SD.remove((cachePath + itemCacheFile).c_str());
}
}
// Single-byte write overload: forwards the byte to the buffer-based
// write(const uint8_t*, size_t) overload as a one-element buffer and
// returns whatever that overload reports.
size_t ContentOpfParser::write(const uint8_t data) {
  const uint8_t* const singleByte = &data;
  return write(singleByte, 1);
}
@@ -94,11 +105,21 @@ void XMLCALL ContentOpfParser::startElement(void* userData, const XML_Char* name
if (self->state == IN_PACKAGE && (strcmp(name, "manifest") == 0 || strcmp(name, "opf:manifest") == 0)) {
self->state = IN_MANIFEST;
if (!FsHelpers::openFileForWrite("COF", self->cachePath + itemCacheFile, self->tempItemStore)) {
Serial.printf(
"[%lu] [COF] Couldn't open temp items file for writing. This is probably going to be a fatal error.\n",
millis());
}
return;
}
if (self->state == IN_PACKAGE && (strcmp(name, "spine") == 0 || strcmp(name, "opf:spine") == 0)) {
self->state = IN_SPINE;
if (!FsHelpers::openFileForRead("COF", self->cachePath + itemCacheFile, self->tempItemStore)) {
Serial.printf(
"[%lu] [COF] Couldn't open temp items file for reading. This is probably going to be a fatal error.\n",
millis());
}
return;
}
@@ -135,7 +156,13 @@ void XMLCALL ContentOpfParser::startElement(void* userData, const XML_Char* name
}
}
self->items[itemId] = href;
// Write items down to SD card
serialization::writeString(self->tempItemStore, itemId);
serialization::writeString(self->tempItemStore, href);
if (itemId == self->coverItemId) {
self->coverItemHref = href;
}
if (mediaType == MEDIA_TYPE_NCX) {
if (self->tocNcxPath.empty()) {
@@ -148,14 +175,29 @@ void XMLCALL ContentOpfParser::startElement(void* userData, const XML_Char* name
return;
}
if (self->state == IN_SPINE && (strcmp(name, "itemref") == 0 || strcmp(name, "opf:itemref") == 0)) {
for (int i = 0; atts[i]; i += 2) {
if (strcmp(atts[i], "idref") == 0) {
self->spineRefs.emplace_back(atts[i + 1]);
break;
// NOTE: This relies on spine appearing after item manifest (which is pretty safe as it's part of the EPUB spec)
// Only run the spine parsing if there's a cache to add it to
if (self->cache) {
if (self->state == IN_SPINE && (strcmp(name, "itemref") == 0 || strcmp(name, "opf:itemref") == 0)) {
for (int i = 0; atts[i]; i += 2) {
if (strcmp(atts[i], "idref") == 0) {
const std::string idref = atts[i + 1];
// Resolve the idref to an href by scanning the temp item store from the start
self->tempItemStore.seek(0);
std::string itemId;
std::string href;
while (self->tempItemStore.available()) {
serialization::readString(self->tempItemStore, itemId);
serialization::readString(self->tempItemStore, href);
if (itemId == idref) {
self->cache->createSpineEntry(href);
break;
}
}
}
}
return;
}
return;
}
}
@@ -174,11 +216,13 @@ void XMLCALL ContentOpfParser::endElement(void* userData, const XML_Char* name)
if (self->state == IN_SPINE && (strcmp(name, "spine") == 0 || strcmp(name, "opf:spine") == 0)) {
self->state = IN_PACKAGE;
self->tempItemStore.close();
return;
}
if (self->state == IN_MANIFEST && (strcmp(name, "manifest") == 0 || strcmp(name, "opf:manifest") == 0)) {
self->state = IN_PACKAGE;
self->tempItemStore.close();
return;
}

View File

@@ -1,11 +1,11 @@
#pragma once
#include <Print.h>
#include <map>
#include "Epub.h"
#include "expat.h"
class BookMetadataCache;
class ContentOpfParser final : public Print {
enum ParserState {
START,
@@ -16,10 +16,14 @@ class ContentOpfParser final : public Print {
IN_SPINE,
};
const std::string& cachePath;
const std::string& baseContentPath;
size_t remainingSize;
XML_Parser parser = nullptr;
ParserState state = START;
BookMetadataCache* cache;
File tempItemStore;
std::string coverItemId;
static void startElement(void* userData, const XML_Char* name, const XML_Char** atts);
static void characterData(void* userData, const XML_Char* s, int len);
@@ -28,12 +32,11 @@ class ContentOpfParser final : public Print {
public:
std::string title;
std::string tocNcxPath;
std::string coverItemId;
std::map<std::string, std::string> items;
std::vector<std::string> spineRefs;
std::string coverItemHref;
explicit ContentOpfParser(const std::string& baseContentPath, const size_t xmlSize)
: baseContentPath(baseContentPath), remainingSize(xmlSize) {}
explicit ContentOpfParser(const std::string& cachePath, const std::string& baseContentPath, const size_t xmlSize,
BookMetadataCache* cache)
: cachePath(cachePath), baseContentPath(baseContentPath), remainingSize(xmlSize), cache(cache) {}
~ContentOpfParser() override;
bool setup();

View File

@@ -1,8 +1,9 @@
#include "TocNcxParser.h"
#include <Esp.h>
#include <HardwareSerial.h>
#include "../BookMetadataCache.h"
bool TocNcxParser::setup() {
parser = XML_ParserCreate(nullptr);
if (!parser) {
@@ -167,8 +168,9 @@ void XMLCALL TocNcxParser::endElement(void* userData, const XML_Char* name) {
href = href.substr(0, pos);
}
// Push to vector
self->toc.push_back({std::move(self->currentLabel), std::move(href), std::move(anchor), self->currentDepth});
if (self->cache) {
self->cache->createTocEntry(self->currentLabel, href, anchor, self->currentDepth);
}
// Clear them so we don't re-add them if there are weird XML structures
self->currentLabel.clear();

View File

@@ -1,11 +1,10 @@
#pragma once
#include <Print.h>
#include <expat.h>
#include <string>
#include <vector>
#include "Epub/EpubTocEntry.h"
#include "expat.h"
class BookMetadataCache;
class TocNcxParser final : public Print {
enum ParserState { START, IN_NCX, IN_NAV_MAP, IN_NAV_POINT, IN_NAV_LABEL, IN_NAV_LABEL_TEXT, IN_CONTENT };
@@ -14,6 +13,7 @@ class TocNcxParser final : public Print {
size_t remainingSize;
XML_Parser parser = nullptr;
ParserState state = START;
BookMetadataCache* cache;
std::string currentLabel;
std::string currentSrc;
@@ -24,10 +24,8 @@ class TocNcxParser final : public Print {
static void endElement(void* userData, const XML_Char* name);
public:
std::vector<EpubTocEntry> toc;
explicit TocNcxParser(const std::string& baseContentPath, const size_t xmlSize)
: baseContentPath(baseContentPath), remainingSize(xmlSize) {}
explicit TocNcxParser(const std::string& baseContentPath, const size_t xmlSize, BookMetadataCache* cache)
: baseContentPath(baseContentPath), remainingSize(xmlSize), cache(cache) {}
~TocNcxParser() override;
bool setup();