#include "ContentOpfParser.h" #include #include #include #include "../BookMetadataCache.h" namespace { constexpr char MEDIA_TYPE_NCX[] = "application/x-dtbncx+xml"; constexpr char MEDIA_TYPE_CSS[] = "text/css"; constexpr char itemCacheFile[] = "/.items.bin"; } // namespace bool ContentOpfParser::setup() { parser = XML_ParserCreate(nullptr); if (!parser) { LOG_DBG("COF", "Couldn't allocate memory for parser"); return false; } XML_SetUserData(parser, this); XML_SetElementHandler(parser, startElement, endElement); XML_SetCharacterDataHandler(parser, characterData); return true; } ContentOpfParser::~ContentOpfParser() { if (parser) { XML_StopParser(parser, XML_FALSE); // Stop any pending processing XML_SetElementHandler(parser, nullptr, nullptr); // Clear callbacks XML_SetCharacterDataHandler(parser, nullptr); XML_ParserFree(parser); parser = nullptr; } if (tempItemStore) { tempItemStore.close(); } if (Storage.exists((cachePath + itemCacheFile).c_str())) { Storage.remove((cachePath + itemCacheFile).c_str()); } itemIndex.clear(); itemIndex.shrink_to_fit(); useItemIndex = false; } size_t ContentOpfParser::write(const uint8_t data) { return write(&data, 1); } size_t ContentOpfParser::write(const uint8_t* buffer, const size_t size) { if (!parser) return 0; const uint8_t* currentBufferPos = buffer; auto remainingInBuffer = size; while (remainingInBuffer > 0) { void* const buf = XML_GetBuffer(parser, 1024); if (!buf) { LOG_ERR("COF", "Couldn't allocate memory for buffer"); XML_StopParser(parser, XML_FALSE); // Stop any pending processing XML_SetElementHandler(parser, nullptr, nullptr); // Clear callbacks XML_SetCharacterDataHandler(parser, nullptr); XML_ParserFree(parser); parser = nullptr; return 0; } const auto toRead = remainingInBuffer < 1024 ? remainingInBuffer : 1024; memcpy(buf, currentBufferPos, toRead); if (XML_ParseBuffer(parser, static_cast(toRead), remainingSize == toRead) == XML_STATUS_ERROR) { LOG_DBG("COF", "Parse error at line %lu: %s", XML_GetCurrentLineNumber(parser), XML_ErrorString(XML_GetErrorCode(parser))); XML_StopParser(parser, XML_FALSE); // Stop any pending processing XML_SetElementHandler(parser, nullptr, nullptr); // Clear callbacks XML_SetCharacterDataHandler(parser, nullptr); XML_ParserFree(parser); parser = nullptr; return 0; } currentBufferPos += toRead; remainingInBuffer -= toRead; remainingSize -= toRead; } return size; } void XMLCALL ContentOpfParser::startElement(void* userData, const XML_Char* name, const XML_Char** atts) { auto* self = static_cast(userData); (void)atts; if (self->state == START && (strcmp(name, "package") == 0 || strcmp(name, "opf:package") == 0)) { self->state = IN_PACKAGE; return; } if (self->state == IN_PACKAGE && (strcmp(name, "metadata") == 0 || strcmp(name, "opf:metadata") == 0)) { self->state = IN_METADATA; return; } if (self->state == IN_METADATA && strcmp(name, "dc:title") == 0) { self->state = IN_BOOK_TITLE; return; } if (self->state == IN_METADATA && strcmp(name, "dc:creator") == 0) { self->state = IN_BOOK_AUTHOR; return; } if (self->state == IN_METADATA && strcmp(name, "dc:language") == 0) { self->state = IN_BOOK_LANGUAGE; return; } if (self->state == IN_PACKAGE && (strcmp(name, "manifest") == 0 || strcmp(name, "opf:manifest") == 0)) { self->state = IN_MANIFEST; if (!Storage.openFileForWrite("COF", self->cachePath + itemCacheFile, self->tempItemStore)) { LOG_ERR("COF", "Couldn't open temp items file for writing. This is probably going to be a fatal error."); } return; } if (self->state == IN_PACKAGE && (strcmp(name, "spine") == 0 || strcmp(name, "opf:spine") == 0)) { self->state = IN_SPINE; if (!Storage.openFileForRead("COF", self->cachePath + itemCacheFile, self->tempItemStore)) { LOG_ERR("COF", "Couldn't open temp items file for reading. This is probably going to be a fatal error."); } // Sort item index for binary search if we have enough items if (self->itemIndex.size() >= LARGE_SPINE_THRESHOLD) { std::sort(self->itemIndex.begin(), self->itemIndex.end(), [](const ItemIndexEntry& a, const ItemIndexEntry& b) { return a.idHash < b.idHash || (a.idHash == b.idHash && a.idLen < b.idLen); }); self->useItemIndex = true; LOG_DBG("COF", "Using fast index for %zu manifest items", self->itemIndex.size()); } return; } if (self->state == IN_PACKAGE && (strcmp(name, "guide") == 0 || strcmp(name, "opf:guide") == 0)) { self->state = IN_GUIDE; // TODO Remove print LOG_DBG("COF", "Entering guide state."); if (!Storage.openFileForRead("COF", self->cachePath + itemCacheFile, self->tempItemStore)) { LOG_ERR("COF", "Couldn't open temp items file for reading. This is probably going to be a fatal error."); } return; } if (self->state == IN_METADATA && (strcmp(name, "meta") == 0 || strcmp(name, "opf:meta") == 0)) { bool isCover = false; std::string coverItemId; for (int i = 0; atts[i]; i += 2) { if (strcmp(atts[i], "name") == 0 && strcmp(atts[i + 1], "cover") == 0) { isCover = true; } else if (strcmp(atts[i], "content") == 0) { coverItemId = atts[i + 1]; } } if (isCover) { self->coverItemId = coverItemId; } return; } if (self->state == IN_MANIFEST && (strcmp(name, "item") == 0 || strcmp(name, "opf:item") == 0)) { std::string itemId; std::string href; std::string mediaType; std::string properties; for (int i = 0; atts[i]; i += 2) { if (strcmp(atts[i], "id") == 0) { itemId = atts[i + 1]; } else if (strcmp(atts[i], "href") == 0) { href = FsHelpers::normalisePath(self->baseContentPath + atts[i + 1]); } else if (strcmp(atts[i], "media-type") == 0) { mediaType = atts[i + 1]; } else if (strcmp(atts[i], "properties") == 0) { properties = atts[i + 1]; } } // Record index entry for fast lookup later if (self->tempItemStore) { ItemIndexEntry entry; entry.idHash = fnvHash(itemId); entry.idLen = static_cast(itemId.size()); entry.fileOffset = static_cast(self->tempItemStore.position()); self->itemIndex.push_back(entry); } // Write items down to SD card serialization::writeString(self->tempItemStore, itemId); serialization::writeString(self->tempItemStore, href); if (itemId == self->coverItemId) { self->coverItemHref = href; } if (mediaType == MEDIA_TYPE_NCX) { if (self->tocNcxPath.empty()) { self->tocNcxPath = href; } else { LOG_DBG("COF", "Warning: Multiple NCX files found in manifest. Ignoring duplicate: %s", href.c_str()); } } // Collect CSS files if (mediaType == MEDIA_TYPE_CSS) { self->cssFiles.push_back(href); } // EPUB 3: Check for nav document (properties contains "nav") if (!properties.empty() && self->tocNavPath.empty()) { // Properties is space-separated, check if "nav" is present as a word if (properties == "nav" || properties.find("nav ") == 0 || properties.find(" nav") != std::string::npos) { self->tocNavPath = href; LOG_DBG("COF", "Found EPUB 3 nav document: %s", href.c_str()); } } // EPUB 3: Check for cover image (properties contains "cover-image") if (!properties.empty() && self->coverItemHref.empty()) { if (properties == "cover-image" || properties.find("cover-image ") == 0 || properties.find(" cover-image") != std::string::npos) { self->coverItemHref = href; } } return; } // NOTE: This relies on spine appearing after item manifest (which is pretty safe as it's part of the EPUB spec) // Only run the spine parsing if there's a cache to add it to if (self->cache) { if (self->state == IN_SPINE && (strcmp(name, "itemref") == 0 || strcmp(name, "opf:itemref") == 0)) { for (int i = 0; atts[i]; i += 2) { if (strcmp(atts[i], "idref") == 0) { const std::string idref = atts[i + 1]; std::string href; bool found = false; if (self->useItemIndex) { // Fast path: binary search uint32_t targetHash = fnvHash(idref); uint16_t targetLen = static_cast(idref.size()); auto it = std::lower_bound(self->itemIndex.begin(), self->itemIndex.end(), ItemIndexEntry{targetHash, targetLen, 0}, [](const ItemIndexEntry& a, const ItemIndexEntry& b) { return a.idHash < b.idHash || (a.idHash == b.idHash && a.idLen < b.idLen); }); // Check for match (may need to check a few due to hash collisions) while (it != self->itemIndex.end() && it->idHash == targetHash) { self->tempItemStore.seek(it->fileOffset); std::string itemId; serialization::readString(self->tempItemStore, itemId); if (itemId == idref) { serialization::readString(self->tempItemStore, href); found = true; break; } ++it; } } else { // Slow path: linear scan (for small manifests, keeps original behavior) // TODO: This lookup is slow as need to scan through all items each time. // It can take up to 200ms per item when getting to 1500 items. self->tempItemStore.seek(0); std::string itemId; while (self->tempItemStore.available()) { serialization::readString(self->tempItemStore, itemId); serialization::readString(self->tempItemStore, href); if (itemId == idref) { found = true; break; } } } if (found && self->cache) { self->cache->createSpineEntry(href); } } } return; } } // parse the guide if (self->state == IN_GUIDE && (strcmp(name, "reference") == 0 || strcmp(name, "opf:reference") == 0)) { std::string type; std::string textHref; for (int i = 0; atts[i]; i += 2) { if (strcmp(atts[i], "type") == 0) { type = atts[i + 1]; if (type == "text" || type == "start") { continue; } else { LOG_DBG("COF", "Skipping non-text reference in guide: %s", type.c_str()); break; } } else if (strcmp(atts[i], "href") == 0) { textHref = FsHelpers::normalisePath(self->baseContentPath + atts[i + 1]); } } if ((type == "text" || (type == "start" && !self->textReferenceHref.empty())) && (textHref.length() > 0)) { LOG_DBG("COF", "Found %s reference in guide: %s.", type.c_str(), textHref.c_str()); self->textReferenceHref = textHref; } return; } } void XMLCALL ContentOpfParser::characterData(void* userData, const XML_Char* s, const int len) { auto* self = static_cast(userData); if (self->state == IN_BOOK_TITLE) { self->title.append(s, len); return; } if (self->state == IN_BOOK_AUTHOR) { self->author.append(s, len); return; } if (self->state == IN_BOOK_LANGUAGE) { self->language.append(s, len); return; } } void XMLCALL ContentOpfParser::endElement(void* userData, const XML_Char* name) { auto* self = static_cast(userData); (void)name; if (self->state == IN_SPINE && (strcmp(name, "spine") == 0 || strcmp(name, "opf:spine") == 0)) { self->state = IN_PACKAGE; self->tempItemStore.close(); return; } if (self->state == IN_GUIDE && (strcmp(name, "guide") == 0 || strcmp(name, "opf:guide") == 0)) { self->state = IN_PACKAGE; self->tempItemStore.close(); return; } if (self->state == IN_MANIFEST && (strcmp(name, "manifest") == 0 || strcmp(name, "opf:manifest") == 0)) { self->state = IN_PACKAGE; self->tempItemStore.close(); return; } if (self->state == IN_BOOK_TITLE && strcmp(name, "dc:title") == 0) { self->state = IN_METADATA; return; } if (self->state == IN_BOOK_AUTHOR && strcmp(name, "dc:creator") == 0) { self->state = IN_METADATA; return; } if (self->state == IN_BOOK_LANGUAGE && strcmp(name, "dc:language") == 0) { self->state = IN_METADATA; return; } if (self->state == IN_METADATA && (strcmp(name, "metadata") == 0 || strcmp(name, "opf:metadata") == 0)) { self->state = IN_PACKAGE; return; } if (self->state == IN_PACKAGE && (strcmp(name, "package") == 0 || strcmp(name, "opf:package") == 0)) { self->state = START; return; } }