From 3628d8eb3727f8f1a00ffe883fa0a4a69fd8fa7d Mon Sep 17 00:00:00 2001 From: cottongin Date: Mon, 2 Mar 2026 05:19:14 -0500 Subject: [PATCH] feat: port upstream KOReader sync PRs (#1185, #1217, #1090) Port three unmerged upstream PRs with adaptations for the fork's callback-based ActivityWithSubactivity architecture: - PR #1185: Cache KOReader document hash using mtime fingerprint + file size validation to avoid repeated MD5 computation on sync. - PR #1217: Proper KOReader XPath synchronisation via new ChapterXPathIndexer (Expat-based on-demand XHTML parsing) with XPath-first mapping and percentage fallback in ProgressMapper. - PR #1090: Push Progress & Sleep menu option with PUSH_ONLY sync mode. Adapted to fork's callback pattern with deferFinish() for thread-safe completion. Modified to sleep silently on any failure (hash, upload, no credentials) rather than returning to reader. Made-with: Cursor --- .../koreader-sync-xpath-mapping.md | 99 ++++ lib/I18n/I18nKeys.h | 1 + lib/I18n/translations/english.yaml | 1 + lib/KOReaderSync/ChapterXPathIndexer.cpp | 497 ++++++++++++++++++ lib/KOReaderSync/ChapterXPathIndexer.h | 67 +++ lib/KOReaderSync/KOReaderDocumentId.cpp | 152 ++++++ lib/KOReaderSync/KOReaderDocumentId.h | 27 + lib/KOReaderSync/ProgressMapper.cpp | 109 ++-- lib/KOReaderSync/ProgressMapper.h | 23 +- src/activities/reader/EpubReaderActivity.cpp | 36 ++ src/activities/reader/EpubReaderActivity.h | 1 + .../reader/EpubReaderMenuActivity.h | 2 + .../reader/KOReaderSyncActivity.cpp | 45 ++ src/activities/reader/KOReaderSyncActivity.h | 15 +- 14 files changed, 1031 insertions(+), 44 deletions(-) create mode 100644 docs/contributing/koreader-sync-xpath-mapping.md create mode 100644 lib/KOReaderSync/ChapterXPathIndexer.cpp create mode 100644 lib/KOReaderSync/ChapterXPathIndexer.h diff --git a/docs/contributing/koreader-sync-xpath-mapping.md b/docs/contributing/koreader-sync-xpath-mapping.md new file mode 100644 index 00000000..3e80ebaf --- /dev/null +++ 
b/docs/contributing/koreader-sync-xpath-mapping.md @@ -0,0 +1,99 @@ +# KOReader Sync XPath Mapping + +This note documents how CrossPoint maps reading positions to and from KOReader sync payloads. + +## Problem + +CrossPoint internally stores position as: + +- `spineIndex` (chapter index, 0-based) +- `pageNumber` + `totalPages` + +KOReader sync payload stores: + +- `progress` (XPath-like location) +- `percentage` (overall progress) + +A direct 1:1 mapping is not guaranteed because page layout differs between engines/devices. + +## DocFragment Index Convention + +KOReader uses **1-based** XPath predicates throughout, following standard XPath conventions. +The first EPUB spine item is `DocFragment[1]`, the second is `DocFragment[2]`, and so on. + +CrossPoint stores spine items as 0-based indices internally. The conversion is: + +- **Generating XPath (to KOReader):** `DocFragment[spineIndex + 1]` +- **Parsing XPath (from KOReader):** `spineIndex = DocFragment[N] - 1` + +Reference: [koreader/koreader#11585](https://github.com/koreader/koreader/issues/11585) confirms this +via a KOReader contributor mapping spine items to DocFragment numbers. + +## Current Strategy + +### CrossPoint -> KOReader + +Implemented in `ProgressMapper::toKOReader`. + +1. Compute overall `percentage` from chapter/page. +2. Attempt to compute a real element-level XPath via `ChapterXPathIndexer::findXPathForProgress`. +3. If XPath extraction fails, fallback to synthetic chapter path: + - `/body/DocFragment[spineIndex + 1]/body` + +### KOReader -> CrossPoint + +Implemented in `ProgressMapper::toCrossPoint`. + +1. Attempt to parse `DocFragment[N]` from incoming XPath; convert N to 0-based `spineIndex = N - 1`. +2. If valid, attempt XPath-to-offset mapping via `ChapterXPathIndexer::findProgressForXPath`. +3. Convert resolved intra-spine progress to page estimate. +4. If XPath path is invalid/unresolvable, fallback to percentage-based chapter/page estimation. 
+ +## ChapterXPathIndexer Design + +The module reparses **one spine XHTML** on demand using Expat and builds temporary anchors: + +Source-of-truth note: XPath anchors are built from the original EPUB spine XHTML bytes (zip item contents), not from CrossPoint's distilled section render cache. This is intentional to preserve KOReader XPath compatibility. + +- anchor: `<xpath, textOffset>` +- `textOffset` counts non-whitespace bytes +- When multiple anchors exist for the same path, the one with the **smallest** textOffset is used + (start of element), not the latest periodic anchor. + +Forward lookup (CrossPoint → XPath): uses `upper_bound` to find the last anchor at or before the +target text offset, ensuring the returned XPath corresponds to the element the user is currently +inside rather than the next element. + +Matching for reverse lookup: + +1. exact path match — reported as `exact=yes` +2. index-insensitive path match (`div[2]` vs `div[3]` tolerated) — reported as `exact=no` +3. ancestor fallback — reported as `exact=no` + +If no match is found, the caller must fall back to percentage. + +## Memory / Safety Constraints (ESP32-C3) + +The implementation intentionally avoids full DOM storage. + +- Parse one chapter only. +- Keep anchors in transient vectors only for duration of call. +- Free XML parser and chapter byte buffer on all success/failure paths. +- No persistent cache structures are introduced by this module. + +## Known Limitations + +- Page number on reverse mapping is still an estimate (renderer differences). +- XPath mapping intentionally uses original spine XHTML while pagination comes from distilled renderer output, so minor roundtrip page drift is expected. +- Image-only/low-text chapters may yield coarse anchors. +- Extremely malformed XHTML can force fallback behavior. 
+ +## Operational Logging + +`ProgressMapper` logs mapping source in reverse direction: + +- `xpath` when XPath mapping path was used +- `percentage` when fallback path was used + +It also logs exactness (`exact=yes/no`) for XPath matches. Note that `exact=yes` is only set for +a full path match with correct indices; index-insensitive and ancestor matches always log `exact=no`. diff --git a/lib/I18n/I18nKeys.h b/lib/I18n/I18nKeys.h index 6d1a39e3..a3fa4adc 100644 --- a/lib/I18n/I18nKeys.h +++ b/lib/I18n/I18nKeys.h @@ -321,6 +321,7 @@ enum class StrId : uint16_t { STR_GO_TO_PERCENT, STR_GO_HOME_BUTTON, STR_SYNC_PROGRESS, + STR_PUSH_AND_SLEEP, STR_DELETE_CACHE, STR_CHAPTER_PREFIX, STR_PAGES_SEPARATOR, diff --git a/lib/I18n/translations/english.yaml b/lib/I18n/translations/english.yaml index 79482b5f..dcf51c6f 100644 --- a/lib/I18n/translations/english.yaml +++ b/lib/I18n/translations/english.yaml @@ -285,6 +285,7 @@ STR_HW_RIGHT_LABEL: "Right (4th button)" STR_GO_TO_PERCENT: "Go to %" STR_GO_HOME_BUTTON: "Go Home" STR_SYNC_PROGRESS: "Sync Reading Progress" +STR_PUSH_AND_SLEEP: "Push Progress & Sleep" STR_DELETE_CACHE: "Delete Book Cache" STR_CHAPTER_PREFIX: "Chapter: " STR_PAGES_SEPARATOR: " pages | " diff --git a/lib/KOReaderSync/ChapterXPathIndexer.cpp b/lib/KOReaderSync/ChapterXPathIndexer.cpp new file mode 100644 index 00000000..32909913 --- /dev/null +++ b/lib/KOReaderSync/ChapterXPathIndexer.cpp @@ -0,0 +1,497 @@ +#include "ChapterXPathIndexer.h" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace { + +// Anchor used for both mapping directions. +// textOffset is counted as visible (non-whitespace) bytes from chapter start. +// xpath points to the nearest element path at/near that offset. 
+ +struct XPathAnchor { + size_t textOffset = 0; + std::string xpath; + std::string xpathNoIndex; // precomputed removeIndices(xpath) +}; + +struct StackNode { + std::string tag; + int index = 1; + bool hasTextAnchor = false; +}; + +// ParserState is intentionally ephemeral and created per lookup call. +// It holds only one spine parse worth of data to avoid retaining structures +// that would increase long-lived heap usage on the ESP32-C3. +struct ParserState { + explicit ParserState(const int spineIndex) : spineIndex(spineIndex) { siblingCounters.emplace_back(); } + + int spineIndex = 0; + int skipDepth = -1; + size_t totalTextBytes = 0; + + std::vector stack; + std::vector> siblingCounters; + std::vector anchors; + + std::string baseXPath() const { return "/body/DocFragment[" + std::to_string(spineIndex + 1) + "]/body"; } + + // Canonicalize incoming KOReader XPath before matching: + // - remove all whitespace + // - lowercase tags + // - strip optional trailing /text() + // - strip trailing slash + static std::string normalizeXPath(const std::string& input) { + if (input.empty()) { + return ""; + } + + std::string out; + out.reserve(input.size()); + for (char c : input) { + const unsigned char uc = static_cast(c); + if (std::isspace(uc)) { + continue; + } + out.push_back(static_cast(std::tolower(uc))); + } + + const std::string textSuffix = "/text()"; + const size_t textPos = out.rfind(textSuffix); + if (textPos != std::string::npos && textPos + textSuffix.size() == out.size()) { + out.erase(textPos); + } + + while (!out.empty() && out.back() == '/') { + out.pop_back(); + } + + return out; + } + + // Remove bracketed numeric predicates so paths can be compared even when + // index counters differ between parser implementations. 
+ static std::string removeIndices(const std::string& xpath) { + std::string out; + out.reserve(xpath.size()); + + bool inBracket = false; + for (char c : xpath) { + if (c == '[') { + inBracket = true; + continue; + } + if (c == ']') { + inBracket = false; + continue; + } + if (!inBracket) { + out.push_back(c); + } + } + return out; + } + + // Count '/' separators as a measure of how deeply nested the path is. + static int pathDepth(const std::string& xpath) { + int depth = 0; + for (char c : xpath) { + if (c == '/') { + depth++; + } + } + return depth; + } + + // Case-insensitive string equality (per-character std::tolower). + // chooseProgressForXPath() lowercases the incoming path via normalizeXPath(), + // while anchor paths keep the mixed-case "DocFragment" segment produced by + // baseXPath(); a case-sensitive comparison would therefore never match. + static bool equalsIgnoreCase(const std::string& lhs, const std::string& rhs) { + if (lhs.size() != rhs.size()) { + return false; + } + for (size_t i = 0; i < lhs.size(); i++) { + if (std::tolower(static_cast<unsigned char>(lhs[i])) != std::tolower(static_cast<unsigned char>(rhs[i]))) { + return false; + } + } + return true; + } + + // Resolve a path to the best anchor offset. + // If exact node path is not found, progressively trim trailing segments and + // match ancestors to obtain a stable approximate location. + // Returns false when nothing matches; otherwise writes the matched offset to + // outTextOffset and sets outExact to true only if the untrimmed path matched. + bool pickBestAnchorByPath(const std::string& targetPath, const bool ignoreIndices, size_t& outTextOffset, + bool& outExact) const { + if (targetPath.empty() || anchors.empty()) { + return false; + } + + const std::string normalizedTarget = ignoreIndices ? removeIndices(targetPath) : targetPath; + std::string probe = normalizedTarget; + bool exactProbe = true; + + while (!probe.empty()) { + int bestDepth = -1; + size_t bestOffset = 0; + bool found = false; + + for (const auto& anchor : anchors) { + const std::string& anchorPath = ignoreIndices ? anchor.xpathNoIndex : anchor.xpath; + // Compare ignoring case so the lowercased probe can match "DocFragment". + // All matches in one pass have the same depth, so the tie-break below + // effectively keeps the smallest textOffset (start of the element). + if (equalsIgnoreCase(anchorPath, probe)) { + const int depth = pathDepth(anchorPath); + if (!found || depth > bestDepth || (depth == bestDepth && anchor.textOffset < bestOffset)) { + found = true; + bestDepth = depth; + bestOffset = anchor.textOffset; + } + } + } + + if (found) { + outTextOffset = bestOffset; + outExact = exactProbe; + return true; + } + + // No match at this level: drop the last path segment and retry on the ancestor. + const size_t lastSlash = probe.find_last_of('/'); + if (lastSlash == std::string::npos || lastSlash == 0) { + break; + } + probe.erase(lastSlash); + exactProbe = false; + } + + return false; + } +
+ // Return a copy of value lowercased with std::tolower. + static std::string toLower(std::string value) { + for (char& c : value) { + c = static_cast<char>(std::tolower(static_cast<unsigned char>(c))); + } + return value; + } + + // Elements that should not contribute text position anchors. + static bool isSkippableTag(const std::string& tag) { return tag == "head" || tag == "script" || tag == "style"; } + + // True when every one of the len characters is whitespace per std::isspace. + static bool isWhitespaceOnly(const XML_Char* text, const int len) { + for (int i = 0; i < len; i++) { + if (!std::isspace(static_cast<unsigned char>(text[i]))) { + return false; + } + } + return true; + } + + // Count non-whitespace bytes to keep offsets stable against formatting-only + // differences and indentation in source XHTML. 
+ static size_t countVisibleBytes(const XML_Char* text, const int len) { + size_t count = 0; + for (int i = 0; i < len; i++) { + if (!std::isspace(static_cast<unsigned char>(text[i]))) { + count++; + } + } + return count; + } + + // Index in the element stack of the innermost open "body" element, or -1. + int bodyDepth() const { + for (int i = static_cast<int>(stack.size()) - 1; i >= 0; i--) { + if (stack[i].tag == "body") { + return i; + } + } + return -1; + } + + bool insideBody() const { return bodyDepth() >= 0; } + + // Build the XPath of the current element: baseXPath() followed by one + // "/tag[index]" segment per open element below "body". + std::string currentXPath() const { + const int bodyIdx = bodyDepth(); + if (bodyIdx < 0) { + return baseXPath(); + } + + std::string xpath = baseXPath(); + for (size_t i = static_cast<size_t>(bodyIdx + 1); i < stack.size(); i++) { + xpath += "/" + stack[i].tag + "[" + std::to_string(stack[i].index) + "]"; + } + return xpath; + } + + // Adds first anchor for an element when text begins and periodic anchors in + // longer runs so matching has sufficient granularity without exploding memory. + // A periodic anchor is only added once at least 192 visible bytes have passed + // since the last anchor and the current path differs from that anchor's path. + void addAnchorIfNeeded() { + if (!insideBody() || stack.empty()) { + return; + } + + if (!stack.back().hasTextAnchor) { + const std::string xpath = currentXPath(); + anchors.push_back({totalTextBytes, xpath, removeIndices(xpath)}); + stack.back().hasTextAnchor = true; + } else if (anchors.empty() || totalTextBytes - anchors.back().textOffset >= 192) { + const std::string xpath = currentXPath(); + if (anchors.empty() || anchors.back().xpath != xpath) { + anchors.push_back({totalTextBytes, xpath, removeIndices(xpath)}); + } + } + } +
+ // Push the lowercased element onto the stack with its 1-based index among + // same-named siblings, and start skipping text if it is a skippable tag. + void onStartElement(const XML_Char* rawName) { + std::string name = toLower(rawName ? rawName : ""); + const size_t depth = stack.size(); + + if (siblingCounters.size() <= depth) { + siblingCounters.resize(depth + 1); + } + const int siblingIndex = ++siblingCounters[depth][name]; + + stack.push_back({name, siblingIndex, false}); + // Fresh counter set for the children of the element just opened. + siblingCounters.emplace_back(); + + if (skipDepth < 0 && isSkippableTag(name)) { + skipDepth = static_cast<int>(stack.size()) - 1; + } + } + + // Pop the current element and its child counters; leaving the element that + // started a skip region ends the skip. + void onEndElement() { + if (stack.empty()) { + return; + } + + if (skipDepth == static_cast<int>(stack.size()) - 1) { + skipDepth = -1; + } + + stack.pop_back(); + if (!siblingCounters.empty()) { + siblingCounters.pop_back(); + } + } + + // Account for a run of text: the anchor is recorded before the offset + // advances, so it marks where this text starts. + void onCharacterData(const XML_Char* text, const int len) { + if (skipDepth >= 0 || len <= 0 || !insideBody() || isWhitespaceOnly(text, len)) { + return; + } + + addAnchorIfNeeded(); + totalTextBytes += countVisibleBytes(text, len); + } + + // Convert progress ratio -> path. Returns baseXPath() when the chapter + // produced no anchors, and the first anchor when it has no visible text. + std::string chooseXPath(const float intraSpineProgress) const { + if (anchors.empty()) { + return baseXPath(); + } + if (totalTextBytes == 0) { + return anchors.front().xpath; + } + + const float clampedProgress = std::max(0.0f, std::min(1.0f, intraSpineProgress)); + const size_t target = static_cast<size_t>(clampedProgress * static_cast<float>(totalTextBytes)); + + // upper_bound returns the first anchor strictly after target; step back to get + // the last anchor at-or-before target (the element the user is currently inside). + auto it = std::upper_bound(anchors.begin(), anchors.end(), target, + [](const size_t value, const XPathAnchor& anchor) { return value < anchor.textOffset; }); + if (it != anchors.begin()) { + --it; + } + return it->xpath; + } + + // Convert path -> progress ratio by matching to nearest available anchor. 
+ bool chooseProgressForXPath(const std::string& xpath, float& outIntraSpineProgress, bool& outExactMatch) const { + if (anchors.empty()) { + return false; + } + + const std::string normalized = normalizeXPath(xpath); + if (normalized.empty()) { + return false; + } + + size_t matchedOffset = 0; + bool exact = false; + const char* matchTier = nullptr; + + bool matched = pickBestAnchorByPath(normalized, false, matchedOffset, exact); + if (matched) { + matchTier = exact ? "exact" : "ancestor"; + } else { + bool exactRaw = false; + matched = pickBestAnchorByPath(normalized, true, matchedOffset, exactRaw); + if (matched) { + exact = false; + matchTier = exactRaw ? "index-insensitive" : "index-insensitive-ancestor"; + } + } + + if (!matched) { + LOG_DBG("KOX", "Reverse: spine=%d no anchor match for '%s' (%zu anchors)", spineIndex, normalized.c_str(), + anchors.size()); + return false; + } + + outExactMatch = exact; + if (totalTextBytes == 0) { + outIntraSpineProgress = 0.0f; + LOG_DBG("KOX", "Reverse: spine=%d %s match offset=%zu -> progress=0.0 (no text)", spineIndex, matchTier, + matchedOffset); + return true; + } + + outIntraSpineProgress = static_cast(matchedOffset) / static_cast(totalTextBytes); + outIntraSpineProgress = std::max(0.0f, std::min(1.0f, outIntraSpineProgress)); + LOG_DBG("KOX", "Reverse: spine=%d %s match offset=%zu/%zu -> progress=%.3f", spineIndex, matchTier, matchedOffset, + totalTextBytes, outIntraSpineProgress); + return true; + } +}; + +void XMLCALL onStartElement(void* userData, const XML_Char* name, const XML_Char**) { + auto* state = static_cast(userData); + state->onStartElement(name); +} + +void XMLCALL onEndElement(void* userData, const XML_Char*) { + auto* state = static_cast(userData); + state->onEndElement(); +} + +void XMLCALL onCharacterData(void* userData, const XML_Char* text, const int len) { + auto* state = static_cast(userData); + state->onCharacterData(text, len); +} + +void XMLCALL onDefaultHandlerExpand(void* userData, const 
XML_Char* text, const int len) { + // The default handler fires for comments, PIs, DOCTYPE, and entity references. + // Only forward entity references (&..;) to avoid skewing text offsets with + // non-visible markup. + if (len < 3 || text[0] != '&' || text[len - 1] != ';') { + return; + } + for (int i = 1; i < len - 1; ++i) { + if (text[i] == '<' || text[i] == '>') { + return; + } + } + auto* state = static_cast(userData); + state->onCharacterData(text, len); +} + +// Parse one spine item and return a fully populated ParserState. +// Returns std::nullopt if validation, I/O, or XML parse fails. +static std::optional parseSpineItem(const std::shared_ptr& epub, const int spineIndex) { + if (!epub || spineIndex < 0 || spineIndex >= epub->getSpineItemsCount()) { + return std::nullopt; + } + + const auto spineItem = epub->getSpineItem(spineIndex); + if (spineItem.href.empty()) { + return std::nullopt; + } + + size_t chapterSize = 0; + uint8_t* chapterBytes = epub->readItemContentsToBytes(spineItem.href, &chapterSize, false); + if (!chapterBytes || chapterSize == 0) { + free(chapterBytes); + return std::nullopt; + } + + ParserState state(spineIndex); + + XML_Parser parser = XML_ParserCreate(nullptr); + if (!parser) { + free(chapterBytes); + LOG_ERR("KOX", "Failed to allocate XML parser for spine=%d", spineIndex); + return std::nullopt; + } + + XML_SetUserData(parser, &state); + XML_SetElementHandler(parser, onStartElement, onEndElement); + XML_SetCharacterDataHandler(parser, onCharacterData); + XML_SetDefaultHandlerExpand(parser, onDefaultHandlerExpand); + + const bool parseOk = XML_Parse(parser, reinterpret_cast(chapterBytes), static_cast(chapterSize), + XML_TRUE) != XML_STATUS_ERROR; + + if (!parseOk) { + LOG_ERR("KOX", "XPath parse failed for spine=%d at line %lu: %s", spineIndex, XML_GetCurrentLineNumber(parser), + XML_ErrorString(XML_GetErrorCode(parser))); + } + + XML_ParserFree(parser); + free(chapterBytes); + + if (!parseOk) { + return std::nullopt; + } + + 
return state; +} + +} // namespace + +std::string ChapterXPathIndexer::findXPathForProgress(const std::shared_ptr& epub, const int spineIndex, + const float intraSpineProgress) { + const auto state = parseSpineItem(epub, spineIndex); + if (!state) { + return ""; + } + + const std::string result = state->chooseXPath(intraSpineProgress); + LOG_DBG("KOX", "Forward: spine=%d progress=%.3f anchors=%zu textBytes=%zu -> %s", spineIndex, intraSpineProgress, + state->anchors.size(), state->totalTextBytes, result.c_str()); + return result; +} + +bool ChapterXPathIndexer::findProgressForXPath(const std::shared_ptr& epub, const int spineIndex, + const std::string& xpath, float& outIntraSpineProgress, + bool& outExactMatch) { + outIntraSpineProgress = 0.0f; + outExactMatch = false; + + if (xpath.empty()) { + return false; + } + + const auto state = parseSpineItem(epub, spineIndex); + if (!state) { + return false; + } + + LOG_DBG("KOX", "Reverse: spine=%d anchors=%zu textBytes=%zu for '%s'", spineIndex, state->anchors.size(), + state->totalTextBytes, xpath.c_str()); + return state->chooseProgressForXPath(xpath, outIntraSpineProgress, outExactMatch); +} + +bool ChapterXPathIndexer::tryExtractSpineIndexFromXPath(const std::string& xpath, int& outSpineIndex) { + outSpineIndex = -1; + if (xpath.empty()) { + return false; + } + + const std::string normalized = ParserState::normalizeXPath(xpath); + const std::string key = "/docfragment["; + const size_t pos = normalized.find(key); + if (pos == std::string::npos) { + LOG_DBG("KOX", "No DocFragment in xpath: '%s'", xpath.c_str()); + return false; + } + + const size_t start = pos + key.size(); + size_t end = start; + while (end < normalized.size() && std::isdigit(static_cast(normalized[end]))) { + end++; + } + + if (end == start || end >= normalized.size() || normalized[end] != ']') { + return false; + } + + const std::string value = normalized.substr(start, end - start); + const long parsed = std::strtol(value.c_str(), nullptr, 10); + 
// KOReader uses 1-based DocFragment indices; convert to 0-based spine index. + if (parsed < 1 || parsed > std::numeric_limits::max()) { + return false; + } + + outSpineIndex = static_cast(parsed) - 1; + return true; +} diff --git a/lib/KOReaderSync/ChapterXPathIndexer.h b/lib/KOReaderSync/ChapterXPathIndexer.h new file mode 100644 index 00000000..246fdecd --- /dev/null +++ b/lib/KOReaderSync/ChapterXPathIndexer.h @@ -0,0 +1,67 @@ +#pragma once + +#include + +#include +#include + +/** + * Lightweight XPath/progress bridge for KOReader sync. + * + * Why this exists: + * - CrossPoint stores reading position as chapter/page. + * - KOReader sync uses XPath + percentage. + * + * This utility reparses exactly one spine XHTML item with Expat and builds + * transient text anchors (<xpath, textOffset>) so we can translate in both + * directions without keeping a full DOM in memory. + * + * Design constraints (ESP32-C3): + * - No persistent full-book structures. + * - Parse-on-demand and free memory immediately. + * - Keep fallback behavior deterministic if parsing/matching fails. + */ +class ChapterXPathIndexer { + public: + /** + * Convert an intra-spine progress ratio to the nearest element-level XPath. + * + * @param epub Loaded EPUB instance + * @param spineIndex Current spine item index + * @param intraSpineProgress Position within the spine item [0.0, 1.0] + * @return Best matching XPath for KOReader, or empty string on failure + */ + static std::string findXPathForProgress(const std::shared_ptr& epub, int spineIndex, float intraSpineProgress); + + /** + * Resolve a KOReader XPath to an intra-spine progress ratio. + * + * Matching strategy: + * 1) exact anchor path match, + * 2) index-insensitive path match, + * 3) ancestor fallback. 
+ * + * @param epub Loaded EPUB instance + * @param spineIndex Spine item index to parse + * @param xpath Incoming KOReader XPath + * @param outIntraSpineProgress Resolved position within spine [0.0, 1.0] + * @param outExactMatch True only for full exact path match + * @return true if any match was resolved; false means caller should fallback + */ + static bool findProgressForXPath(const std::shared_ptr& epub, int spineIndex, const std::string& xpath, + float& outIntraSpineProgress, bool& outExactMatch); + + /** + * Parse DocFragment index from KOReader-style path segment: + * /body/DocFragment[N]/body/... + * + * KOReader uses 1-based DocFragment indices; N is converted to the 0-based + * spine index stored in outSpineIndex (i.e. outSpineIndex = N - 1). + * + * @param xpath KOReader XPath + * @param outSpineIndex 0-based spine index derived from DocFragment[N] + * @return true when DocFragment[N] exists and N is a valid integer >= 1 + * (converted to 0-based outSpineIndex); false otherwise + */ + static bool tryExtractSpineIndexFromXPath(const std::string& xpath, int& outSpineIndex); +}; diff --git a/lib/KOReaderSync/KOReaderDocumentId.cpp b/lib/KOReaderSync/KOReaderDocumentId.cpp index efb18d1b..0d5ea9b3 100644 --- a/lib/KOReaderSync/KOReaderDocumentId.cpp +++ b/lib/KOReaderSync/KOReaderDocumentId.cpp @@ -4,6 +4,8 @@ #include #include +#include + namespace { // Extract filename from path (everything after last '/') std::string getFilename(const std::string& path) { @@ -15,6 +17,130 @@ std::string getFilename(const std::string& path) { } } // namespace +std::string KOReaderDocumentId::getCacheFilePath(const std::string& filePath) { + // Mirror the Epub cache directory convention so the hash file shares the + // same per-book folder as other cached data. 
+ return std::string("/.crosspoint/epub_") + std::to_string(std::hash<std::string>{}(filePath)) + "/koreader_docid.txt"; +} + +// Returns the cached hash, or an empty string when the cache file is missing, +// malformed, or does not match the given file size and fingerprint. +std::string KOReaderDocumentId::loadCachedHash(const std::string& cacheFilePath, const size_t fileSize, + const std::string& currentFingerprint) { + if (!Storage.exists(cacheFilePath.c_str())) { + return ""; + } + + const String content = Storage.readFile(cacheFilePath.c_str()); + if (content.isEmpty()) { + return ""; + } + + // Format: "<filesize>:<fingerprint>\n<32-char-hex-hash>" + const int newlinePos = content.indexOf('\n'); + if (newlinePos < 0) { + return ""; + } + + const String header = content.substring(0, newlinePos); + const int colonPos = header.indexOf(':'); + if (colonPos < 0) { + LOG_DBG("KODoc", "Hash cache invalidated: header missing fingerprint"); + return ""; + } + + const String sizeTok = header.substring(0, colonPos); + const String fpTok = header.substring(colonPos + 1); + + // Validate the filesize token – it must consist of ASCII digits and parse + // correctly to the expected size. 
+ // An empty token is rejected too: it would otherwise parse as 0 and validate + // a malformed header against a zero-byte file. + bool digitsOnly = sizeTok.length() > 0; + for (size_t i = 0; i < sizeTok.length(); ++i) { + const char ch = sizeTok[i]; + if (ch < '0' || ch > '9') { + digitsOnly = false; + break; + } + } + if (!digitsOnly) { + LOG_DBG("KODoc", "Hash cache invalidated: size token not numeric ('%s')", sizeTok.c_str()); + return ""; + } + + const long parsed = sizeTok.toInt(); + if (parsed < 0) { + LOG_DBG("KODoc", "Hash cache invalidated: size token parse error ('%s')", sizeTok.c_str()); + return ""; + } + const size_t cachedSize = static_cast<size_t>(parsed); + if (cachedSize != fileSize) { + // Only the size has been compared at this point; the fingerprint is checked below. + LOG_DBG("KODoc", "Hash cache invalidated: file size changed (%zu -> %zu)", cachedSize, fileSize); + return ""; + } + + // Validate stored fingerprint format (8 hex characters) + if (fpTok.length() != 8) { + LOG_DBG("KODoc", "Hash cache invalidated: bad fingerprint length (%zu)", fpTok.length()); + return ""; + } + for (size_t i = 0; i < fpTok.length(); ++i) { + char c = fpTok[i]; + bool hex = (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'); + if (!hex) { + LOG_DBG("KODoc", "Hash cache invalidated: non-hex character '%c' in fingerprint", c); + return ""; + } + } + + { + String currentFpStr(currentFingerprint.c_str()); + if (fpTok != currentFpStr) { + LOG_DBG("KODoc", "Hash cache invalidated: fingerprint changed (%s != %s)", fpTok.c_str(), + currentFingerprint.c_str()); + return ""; + } + } + + std::string hash = content.substring(newlinePos + 1).c_str(); + // Trim any trailing whitespace / line endings + while (!hash.empty() && (hash.back() == '\n' || hash.back() == '\r' || hash.back() == ' ')) { + hash.pop_back(); + } + + // Hash must be exactly 32 hex characters. 
+ if (hash.size() != 32) { + LOG_DBG("KODoc", "Hash cache invalidated: wrong hash length (%zu)", hash.size()); + return ""; + } + for (char c : hash) { + if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'))) { + LOG_DBG("KODoc", "Hash cache invalidated: non-hex character '%c' in hash", c); + return ""; + } + } + + LOG_DBG("KODoc", "Hash cache hit: %s", hash.c_str()); + return hash; +} + +// Writes the size, fingerprint and hash to the cache file; a failed write is +// only logged. +void KOReaderDocumentId::saveCachedHash(const std::string& cacheFilePath, const size_t fileSize, + const std::string& fingerprint, const std::string& hash) { + // Ensure the book's cache directory exists before writing + const size_t lastSlash = cacheFilePath.rfind('/'); + if (lastSlash != std::string::npos) { + Storage.ensureDirectoryExists(cacheFilePath.substr(0, lastSlash).c_str()); + } + + // Format: "<filesize>:<fingerprint>\n<hash>" + String content(std::to_string(fileSize).c_str()); + content += ':'; + content += fingerprint.c_str(); + content += '\n'; + content += hash.c_str(); + + if (!Storage.writeFile(cacheFilePath.c_str(), content)) { + LOG_DBG("KODoc", "Failed to write hash cache to %s", cacheFilePath.c_str()); + } +} + std::string KOReaderDocumentId::calculateFromFilename(const std::string& filePath) { const std::string filename = getFilename(filePath); if (filename.empty()) { @@ -49,6 +175,30 @@ std::string KOReaderDocumentId::calculate(const std::string& filePath) { } const size_t fileSize = file.fileSize(); + + // Compute a lightweight fingerprint from the file's modification time. + // The underlying FsFile API provides getModifyDateTime which returns two + // packed 16-bit values (date and time). Concatenate these as eight hex + // digits to produce the token stored in the cache header. + uint16_t date = 0, time = 0; + if (!file.getModifyDateTime(&date, &time)) { + // If timestamp isn't available for some reason, fall back to a sentinel. 
+ date = 0; + time = 0; + } + char fpBuf[9]; + // two 16-bit numbers => 4 hex digits each + sprintf(fpBuf, "%04x%04x", date, time); + const std::string fingerprintTok(fpBuf); + + // Return persisted hash if the file size and fingerprint haven't changed. + const std::string cacheFilePath = getCacheFilePath(filePath); + const std::string cached = loadCachedHash(cacheFilePath, fileSize, fingerprintTok); + if (!cached.empty()) { + file.close(); + return cached; + } + LOG_DBG("KODoc", "Calculating hash for file: %s (size: %zu)", filePath.c_str(), fileSize); // Initialize MD5 builder @@ -92,5 +242,7 @@ std::string KOReaderDocumentId::calculate(const std::string& filePath) { LOG_DBG("KODoc", "Hash calculated: %s (from %zu bytes)", result.c_str(), totalBytesRead); + saveCachedHash(cacheFilePath, fileSize, fingerprintTok, result); + return result; } diff --git a/lib/KOReaderSync/KOReaderDocumentId.h b/lib/KOReaderSync/KOReaderDocumentId.h index 2b6189e2..5f226eb5 100644 --- a/lib/KOReaderSync/KOReaderDocumentId.h +++ b/lib/KOReaderSync/KOReaderDocumentId.h @@ -42,4 +42,31 @@ class KOReaderDocumentId { // Calculate offset for index i: 1024 << (2*i) static size_t getOffset(int i); + + // Hash cache helpers + // Returns the path to the per-book cache file that stores the precomputed hash. + // Uses the same directory convention as the Epub cache (/.crosspoint/epub_/). + static std::string getCacheFilePath(const std::string& filePath); + + // Returns the cached hash if the file size and fingerprint match, or empty + // string on miss/invalidation. + // + // The fingerprint is derived from the file's modification timestamp. We + // call `FsFile::getModifyDateTime` to retrieve two 16‑bit packed values + // supplied by the filesystem: one for the date and one for the time. These + // are concatenated and represented as eight hexadecimal digits in the form + //