Port three unmerged upstream PRs with adaptations for the fork's callback-based ActivityWithSubactivity architecture: - PR #1185: Cache KOReader document hash using mtime fingerprint + file size validation to avoid repeated MD5 computation on sync. - PR #1217: Proper KOReader XPath synchronisation via new ChapterXPathIndexer (Expat-based on-demand XHTML parsing) with XPath-first mapping and percentage fallback in ProgressMapper. - PR #1090: Push Progress & Sleep menu option with PUSH_ONLY sync mode. Adapted to fork's callback pattern with deferFinish() for thread-safe completion. Modified to sleep silently on any failure (hash, upload, no credentials) rather than returning to reader. Made-with: Cursor
This commit is contained in:
497
lib/KOReaderSync/ChapterXPathIndexer.cpp
Normal file
497
lib/KOReaderSync/ChapterXPathIndexer.cpp
Normal file
@@ -0,0 +1,497 @@
|
||||
#include "ChapterXPathIndexer.h"
|
||||
|
||||
#include <Logging.h>
|
||||
#include <expat.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cctype>
|
||||
#include <cstdlib>
|
||||
#include <limits>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
namespace {
|
||||
|
||||
// A single sample point used by both mapping directions
// (progress -> XPath and XPath -> progress).
// Members are filled positionally with brace initialisation, so their
// order must not change.
struct XPathAnchor {
  // Number of visible (non-whitespace) bytes seen in the chapter before this anchor.
  size_t textOffset{0};
  // Element path at, or nearest to, that offset.
  std::string xpath{};
  // Precomputed removeIndices(xpath), cached so lookups do not rebuild it.
  std::string xpathNoIndex{};
};
|
||||
|
||||
// One currently open element on the parse stack.
// Members are filled positionally with brace initialisation, so their
// order must not change.
struct StackNode {
  // Element name as stored by the start-element handler.
  std::string tag{};
  // Number placed inside the "[N]" predicate when a path is rendered.
  int index{1};
  // Becomes true once an anchor has been recorded for this element.
  bool hasTextAnchor{false};
};
|
||||
|
||||
// ParserState is intentionally ephemeral and created per lookup call.
|
||||
// It holds only one spine parse worth of data to avoid retaining structures
|
||||
// that would increase long-lived heap usage on the ESP32-C3.
|
||||
struct ParserState {
|
||||
explicit ParserState(const int spineIndex) : spineIndex(spineIndex) { siblingCounters.emplace_back(); }
|
||||
|
||||
int spineIndex = 0;
|
||||
int skipDepth = -1;
|
||||
size_t totalTextBytes = 0;
|
||||
|
||||
std::vector<StackNode> stack;
|
||||
std::vector<std::unordered_map<std::string, int>> siblingCounters;
|
||||
std::vector<XPathAnchor> anchors;
|
||||
|
||||
// Path prefix shared by every anchor of this chapter. The DocFragment
// number is the spine index shifted to start at 1.
std::string baseXPath() const {
  std::string prefix = "/body/DocFragment[";
  prefix += std::to_string(spineIndex + 1);
  prefix += "]/body";
  return prefix;
}
|
||||
|
||||
// Canonicalize an incoming KOReader XPath/XPointer before matching:
// - remove all whitespace
// - lowercase every character
// - strip a trailing ".<digits>" character offset (e.g. "img.0")
// - strip a trailing /text() step, with or without a numeric predicate
//   (e.g. "/text()" or "/text()[2]")
// - strip trailing slashes
//
// Without the offset/predicate stripping a pointer such as
// ".../p[3]/text().45" or ".../p[3].0" can never equal an anchor path and is
// only resolved through the coarser ancestor fallback (or not at all).
//
// Returns an empty string for empty or whitespace-only input.
static std::string normalizeXPath(const std::string& input) {
  std::string out;
  out.reserve(input.size());
  for (const char c : input) {
    const unsigned char uc = static_cast<unsigned char>(c);
    if (!std::isspace(uc)) {
      out.push_back(static_cast<char>(std::tolower(uc)));
    }
  }

  const auto isDigitAt = [&out](const size_t i) {
    return std::isdigit(static_cast<unsigned char>(out[i])) != 0;
  };

  // Trailing ".<digits>": the in-node offset, irrelevant for element matching.
  size_t digitsBegin = out.size();
  while (digitsBegin > 0 && isDigitAt(digitsBegin - 1)) {
    --digitsBegin;
  }
  if (digitsBegin > 0 && digitsBegin < out.size() && out[digitsBegin - 1] == '.') {
    out.erase(digitsBegin - 1);
  }

  // Trailing "/text()" or "/text()[N]". stepEnd marks where a text() step
  // would have to end; it moves in front of a numeric predicate if present.
  const std::string textStep = "/text()";
  size_t stepEnd = out.size();
  if (stepEnd >= 3 && out[stepEnd - 1] == ']') {
    size_t firstDigit = stepEnd - 1;
    while (firstDigit > 0 && isDigitAt(firstDigit - 1)) {
      --firstDigit;
    }
    if (firstDigit < stepEnd - 1 && firstDigit > 0 && out[firstDigit - 1] == '[') {
      stepEnd = firstDigit - 1;
    }
  }
  // Only erase when the text() step is really there; an ordinary element
  // predicate such as "p[2]" is left untouched.
  if (stepEnd >= textStep.size() && out.compare(stepEnd - textStep.size(), textStep.size(), textStep) == 0) {
    out.erase(stepEnd - textStep.size());
  }

  while (!out.empty() && out.back() == '/') {
    out.pop_back();
  }

  return out;
}
|
||||
|
||||
// Strips every "[...]" predicate, brackets included, so that two paths can
// be compared even when their sibling counters disagree.
static std::string removeIndices(const std::string& xpath) {
  std::string stripped;
  stripped.reserve(xpath.size());

  bool skipping = false;
  for (const char ch : xpath) {
    switch (ch) {
      case '[':
        skipping = true;
        break;
      case ']':
        skipping = false;
        break;
      default:
        if (!skipping) {
          stripped += ch;
        }
        break;
    }
  }
  return stripped;
}
|
||||
|
||||
// Depth of a path, measured as the number of '/' separators it contains.
static int pathDepth(const std::string& xpath) {
  return static_cast<int>(std::count(xpath.begin(), xpath.end(), '/'));
}
|
||||
|
||||
// Resolve a path to the best anchor offset.
|
||||
// If exact node path is not found, progressively trim trailing segments and
|
||||
// match ancestors to obtain a stable approximate location.
|
||||
bool pickBestAnchorByPath(const std::string& targetPath, const bool ignoreIndices, size_t& outTextOffset,
|
||||
bool& outExact) const {
|
||||
if (targetPath.empty() || anchors.empty()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const std::string normalizedTarget = ignoreIndices ? removeIndices(targetPath) : targetPath;
|
||||
std::string probe = normalizedTarget;
|
||||
bool exactProbe = true;
|
||||
|
||||
while (!probe.empty()) {
|
||||
int bestDepth = -1;
|
||||
size_t bestOffset = 0;
|
||||
bool found = false;
|
||||
|
||||
for (const auto& anchor : anchors) {
|
||||
const std::string& anchorPath = ignoreIndices ? anchor.xpathNoIndex : anchor.xpath;
|
||||
if (anchorPath == probe) {
|
||||
const int depth = pathDepth(anchorPath);
|
||||
if (!found || depth > bestDepth || (depth == bestDepth && anchor.textOffset < bestOffset)) {
|
||||
found = true;
|
||||
bestDepth = depth;
|
||||
bestOffset = anchor.textOffset;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (found) {
|
||||
outTextOffset = bestOffset;
|
||||
outExact = exactProbe;
|
||||
return true;
|
||||
}
|
||||
|
||||
const size_t lastSlash = probe.find_last_of('/');
|
||||
if (lastSlash == std::string::npos || lastSlash == 0) {
|
||||
break;
|
||||
}
|
||||
probe.erase(lastSlash);
|
||||
exactProbe = false;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
// Returns a copy of value with every character passed through std::tolower.
static std::string toLower(std::string value) {
  std::transform(value.begin(), value.end(), value.begin(),
                 [](const unsigned char ch) { return static_cast<char>(std::tolower(ch)); });
  return value;
}
|
||||
|
||||
// Tags whose content must not produce text position anchors.
// The comparison is exact, so callers must pass an already lowercased name.
static bool isSkippableTag(const std::string& tag) {
  for (const char* skipped : {"head", "script", "style"}) {
    if (tag == skipped) {
      return true;
    }
  }
  return false;
}
|
||||
|
||||
// True when none of the first len characters of text is a non-space
// character. A len of zero or less yields true.
static bool isWhitespaceOnly(const XML_Char* text, const int len) {
  int idx = 0;
  while (idx < len && std::isspace(static_cast<unsigned char>(text[idx]))) {
    ++idx;
  }
  // The scan stops early only on a non-whitespace character.
  return idx >= len;
}
|
||||
|
||||
// Count non-whitespace bytes to keep offsets stable against formatting-only
|
||||
// differences and indentation in source XHTML.
|
||||
static size_t countVisibleBytes(const XML_Char* text, const int len) {
|
||||
size_t count = 0;
|
||||
for (int i = 0; i < len; i++) {
|
||||
if (!std::isspace(static_cast<unsigned char>(text[i]))) {
|
||||
count++;
|
||||
}
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
int bodyDepth() const {
|
||||
for (int i = static_cast<int>(stack.size()) - 1; i >= 0; i--) {
|
||||
if (stack[i].tag == "body") {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
// True while a "body" element is open somewhere on the stack.
bool insideBody() const {
  const int depth = bodyDepth();
  return depth >= 0;
}
|
||||
|
||||
std::string currentXPath() const {
|
||||
const int bodyIdx = bodyDepth();
|
||||
if (bodyIdx < 0) {
|
||||
return baseXPath();
|
||||
}
|
||||
|
||||
std::string xpath = baseXPath();
|
||||
for (size_t i = static_cast<size_t>(bodyIdx + 1); i < stack.size(); i++) {
|
||||
xpath += "/" + stack[i].tag + "[" + std::to_string(stack[i].index) + "]";
|
||||
}
|
||||
return xpath;
|
||||
}
|
||||
|
||||
// Adds first anchor for an element when text begins and periodic anchors in
|
||||
// longer runs so matching has sufficient granularity without exploding memory.
|
||||
void addAnchorIfNeeded() {
|
||||
if (!insideBody() || stack.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (!stack.back().hasTextAnchor) {
|
||||
const std::string xpath = currentXPath();
|
||||
anchors.push_back({totalTextBytes, xpath, removeIndices(xpath)});
|
||||
stack.back().hasTextAnchor = true;
|
||||
} else if (anchors.empty() || totalTextBytes - anchors.back().textOffset >= 192) {
|
||||
const std::string xpath = currentXPath();
|
||||
if (anchors.empty() || anchors.back().xpath != xpath) {
|
||||
anchors.push_back({totalTextBytes, xpath, removeIndices(xpath)});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void onStartElement(const XML_Char* rawName) {
|
||||
std::string name = toLower(rawName ? rawName : "");
|
||||
const size_t depth = stack.size();
|
||||
|
||||
if (siblingCounters.size() <= depth) {
|
||||
siblingCounters.resize(depth + 1);
|
||||
}
|
||||
const int siblingIndex = ++siblingCounters[depth][name];
|
||||
|
||||
stack.push_back({name, siblingIndex, false});
|
||||
siblingCounters.emplace_back();
|
||||
|
||||
if (skipDepth < 0 && isSkippableTag(name)) {
|
||||
skipDepth = static_cast<int>(stack.size()) - 1;
|
||||
}
|
||||
}
|
||||
|
||||
void onEndElement() {
|
||||
if (stack.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (skipDepth == static_cast<int>(stack.size()) - 1) {
|
||||
skipDepth = -1;
|
||||
}
|
||||
|
||||
stack.pop_back();
|
||||
if (!siblingCounters.empty()) {
|
||||
siblingCounters.pop_back();
|
||||
}
|
||||
}
|
||||
|
||||
void onCharacterData(const XML_Char* text, const int len) {
|
||||
if (skipDepth >= 0 || len <= 0 || !insideBody() || isWhitespaceOnly(text, len)) {
|
||||
return;
|
||||
}
|
||||
|
||||
addAnchorIfNeeded();
|
||||
totalTextBytes += countVisibleBytes(text, len);
|
||||
}
|
||||
|
||||
std::string chooseXPath(const float intraSpineProgress) const {
|
||||
if (anchors.empty()) {
|
||||
return baseXPath();
|
||||
}
|
||||
if (totalTextBytes == 0) {
|
||||
return anchors.front().xpath;
|
||||
}
|
||||
|
||||
const float clampedProgress = std::max(0.0f, std::min(1.0f, intraSpineProgress));
|
||||
const size_t target = static_cast<size_t>(clampedProgress * static_cast<float>(totalTextBytes));
|
||||
|
||||
// upper_bound returns the first anchor strictly after target; step back to get
|
||||
// the last anchor at-or-before target (the element the user is currently inside).
|
||||
auto it = std::upper_bound(anchors.begin(), anchors.end(), target,
|
||||
[](const size_t value, const XPathAnchor& anchor) { return value < anchor.textOffset; });
|
||||
if (it != anchors.begin()) {
|
||||
--it;
|
||||
}
|
||||
return it->xpath;
|
||||
}
|
||||
|
||||
// Convert path -> progress ratio by matching to nearest available anchor.
|
||||
bool chooseProgressForXPath(const std::string& xpath, float& outIntraSpineProgress, bool& outExactMatch) const {
|
||||
if (anchors.empty()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const std::string normalized = normalizeXPath(xpath);
|
||||
if (normalized.empty()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
size_t matchedOffset = 0;
|
||||
bool exact = false;
|
||||
const char* matchTier = nullptr;
|
||||
|
||||
bool matched = pickBestAnchorByPath(normalized, false, matchedOffset, exact);
|
||||
if (matched) {
|
||||
matchTier = exact ? "exact" : "ancestor";
|
||||
} else {
|
||||
bool exactRaw = false;
|
||||
matched = pickBestAnchorByPath(normalized, true, matchedOffset, exactRaw);
|
||||
if (matched) {
|
||||
exact = false;
|
||||
matchTier = exactRaw ? "index-insensitive" : "index-insensitive-ancestor";
|
||||
}
|
||||
}
|
||||
|
||||
if (!matched) {
|
||||
LOG_DBG("KOX", "Reverse: spine=%d no anchor match for '%s' (%zu anchors)", spineIndex, normalized.c_str(),
|
||||
anchors.size());
|
||||
return false;
|
||||
}
|
||||
|
||||
outExactMatch = exact;
|
||||
if (totalTextBytes == 0) {
|
||||
outIntraSpineProgress = 0.0f;
|
||||
LOG_DBG("KOX", "Reverse: spine=%d %s match offset=%zu -> progress=0.0 (no text)", spineIndex, matchTier,
|
||||
matchedOffset);
|
||||
return true;
|
||||
}
|
||||
|
||||
outIntraSpineProgress = static_cast<float>(matchedOffset) / static_cast<float>(totalTextBytes);
|
||||
outIntraSpineProgress = std::max(0.0f, std::min(1.0f, outIntraSpineProgress));
|
||||
LOG_DBG("KOX", "Reverse: spine=%d %s match offset=%zu/%zu -> progress=%.3f", spineIndex, matchTier, matchedOffset,
|
||||
totalTextBytes, outIntraSpineProgress);
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
// Expat start-element callback: forwards to the ParserState passed as user
// data. Attributes are ignored.
void XMLCALL onStartElement(void* userData, const XML_Char* name, const XML_Char**) {
  static_cast<ParserState*>(userData)->onStartElement(name);
}
|
||||
|
||||
// Expat end-element callback: forwards to the ParserState passed as user
// data. The element name is not needed.
void XMLCALL onEndElement(void* userData, const XML_Char*) {
  static_cast<ParserState*>(userData)->onEndElement();
}
|
||||
|
||||
// Expat character-data callback: forwards the text run to the ParserState
// passed as user data.
void XMLCALL onCharacterData(void* userData, const XML_Char* text, const int len) {
  static_cast<ParserState*>(userData)->onCharacterData(text, len);
}
|
||||
|
||||
// Expat default-handler callback. It receives comments, PIs, DOCTYPE and
// entity references; only chunks shaped like an entity reference ("&...;"
// with no angle bracket in between) are forwarded as character data, so
// non-visible markup does not skew text offsets.
void XMLCALL onDefaultHandlerExpand(void* userData, const XML_Char* text, const int len) {
  const bool entityShaped = len >= 3 && text[0] == '&' && text[len - 1] == ';';
  if (!entityShaped) {
    return;
  }

  // Inspect the characters between '&' and ';'.
  const XML_Char* const innerBegin = text + 1;
  const XML_Char* const innerEnd = text + (len - 1);
  const bool hasAngleBracket =
      std::any_of(innerBegin, innerEnd, [](const XML_Char ch) { return ch == '<' || ch == '>'; });
  if (hasAngleBracket) {
    return;
  }

  static_cast<ParserState*>(userData)->onCharacterData(text, len);
}
|
||||
|
||||
// Parse one spine item and return a fully populated ParserState.
|
||||
// Returns std::nullopt if validation, I/O, or XML parse fails.
|
||||
static std::optional<ParserState> parseSpineItem(const std::shared_ptr<Epub>& epub, const int spineIndex) {
|
||||
if (!epub || spineIndex < 0 || spineIndex >= epub->getSpineItemsCount()) {
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
const auto spineItem = epub->getSpineItem(spineIndex);
|
||||
if (spineItem.href.empty()) {
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
size_t chapterSize = 0;
|
||||
uint8_t* chapterBytes = epub->readItemContentsToBytes(spineItem.href, &chapterSize, false);
|
||||
if (!chapterBytes || chapterSize == 0) {
|
||||
free(chapterBytes);
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
ParserState state(spineIndex);
|
||||
|
||||
XML_Parser parser = XML_ParserCreate(nullptr);
|
||||
if (!parser) {
|
||||
free(chapterBytes);
|
||||
LOG_ERR("KOX", "Failed to allocate XML parser for spine=%d", spineIndex);
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
XML_SetUserData(parser, &state);
|
||||
XML_SetElementHandler(parser, onStartElement, onEndElement);
|
||||
XML_SetCharacterDataHandler(parser, onCharacterData);
|
||||
XML_SetDefaultHandlerExpand(parser, onDefaultHandlerExpand);
|
||||
|
||||
const bool parseOk = XML_Parse(parser, reinterpret_cast<const char*>(chapterBytes), static_cast<int>(chapterSize),
|
||||
XML_TRUE) != XML_STATUS_ERROR;
|
||||
|
||||
if (!parseOk) {
|
||||
LOG_ERR("KOX", "XPath parse failed for spine=%d at line %lu: %s", spineIndex, XML_GetCurrentLineNumber(parser),
|
||||
XML_ErrorString(XML_GetErrorCode(parser)));
|
||||
}
|
||||
|
||||
XML_ParserFree(parser);
|
||||
free(chapterBytes);
|
||||
|
||||
if (!parseOk) {
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
return state;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
std::string ChapterXPathIndexer::findXPathForProgress(const std::shared_ptr<Epub>& epub, const int spineIndex,
|
||||
const float intraSpineProgress) {
|
||||
const auto state = parseSpineItem(epub, spineIndex);
|
||||
if (!state) {
|
||||
return "";
|
||||
}
|
||||
|
||||
const std::string result = state->chooseXPath(intraSpineProgress);
|
||||
LOG_DBG("KOX", "Forward: spine=%d progress=%.3f anchors=%zu textBytes=%zu -> %s", spineIndex, intraSpineProgress,
|
||||
state->anchors.size(), state->totalTextBytes, result.c_str());
|
||||
return result;
|
||||
}
|
||||
|
||||
bool ChapterXPathIndexer::findProgressForXPath(const std::shared_ptr<Epub>& epub, const int spineIndex,
|
||||
const std::string& xpath, float& outIntraSpineProgress,
|
||||
bool& outExactMatch) {
|
||||
outIntraSpineProgress = 0.0f;
|
||||
outExactMatch = false;
|
||||
|
||||
if (xpath.empty()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto state = parseSpineItem(epub, spineIndex);
|
||||
if (!state) {
|
||||
return false;
|
||||
}
|
||||
|
||||
LOG_DBG("KOX", "Reverse: spine=%d anchors=%zu textBytes=%zu for '%s'", spineIndex, state->anchors.size(),
|
||||
state->totalTextBytes, xpath.c_str());
|
||||
return state->chooseProgressForXPath(xpath, outIntraSpineProgress, outExactMatch);
|
||||
}
|
||||
|
||||
bool ChapterXPathIndexer::tryExtractSpineIndexFromXPath(const std::string& xpath, int& outSpineIndex) {
|
||||
outSpineIndex = -1;
|
||||
if (xpath.empty()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const std::string normalized = ParserState::normalizeXPath(xpath);
|
||||
const std::string key = "/docfragment[";
|
||||
const size_t pos = normalized.find(key);
|
||||
if (pos == std::string::npos) {
|
||||
LOG_DBG("KOX", "No DocFragment in xpath: '%s'", xpath.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
const size_t start = pos + key.size();
|
||||
size_t end = start;
|
||||
while (end < normalized.size() && std::isdigit(static_cast<unsigned char>(normalized[end]))) {
|
||||
end++;
|
||||
}
|
||||
|
||||
if (end == start || end >= normalized.size() || normalized[end] != ']') {
|
||||
return false;
|
||||
}
|
||||
|
||||
const std::string value = normalized.substr(start, end - start);
|
||||
const long parsed = std::strtol(value.c_str(), nullptr, 10);
|
||||
// KOReader uses 1-based DocFragment indices; convert to 0-based spine index.
|
||||
if (parsed < 1 || parsed > std::numeric_limits<int>::max()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
outSpineIndex = static_cast<int>(parsed) - 1;
|
||||
return true;
|
||||
}
|
||||
67
lib/KOReaderSync/ChapterXPathIndexer.h
Normal file
67
lib/KOReaderSync/ChapterXPathIndexer.h
Normal file
@@ -0,0 +1,67 @@
|
||||
#pragma once
|
||||
|
||||
#include <Epub.h>
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
/**
|
||||
* Lightweight XPath/progress bridge for KOReader sync.
|
||||
*
|
||||
* Why this exists:
|
||||
* - CrossPoint stores reading position as chapter/page.
|
||||
* - KOReader sync uses XPath + percentage.
|
||||
*
|
||||
* This utility reparses exactly one spine XHTML item with Expat and builds
|
||||
* transient text anchors (<xpath, textOffset>) so we can translate in both
|
||||
* directions without keeping a full DOM in memory.
|
||||
*
|
||||
* Design constraints (ESP32-C3):
|
||||
* - No persistent full-book structures.
|
||||
* - Parse-on-demand and free memory immediately.
|
||||
* - Keep fallback behavior deterministic if parsing/matching fails.
|
||||
*/
|
||||
class ChapterXPathIndexer {
|
||||
public:
|
||||
/**
|
||||
* Convert an intra-spine progress ratio to the nearest element-level XPath.
|
||||
*
|
||||
* @param epub Loaded EPUB instance
|
||||
* @param spineIndex Current spine item index
|
||||
* @param intraSpineProgress Position within the spine item [0.0, 1.0]
|
||||
* @return Best matching XPath for KOReader, or empty string on failure
|
||||
*/
|
||||
static std::string findXPathForProgress(const std::shared_ptr<Epub>& epub, int spineIndex, float intraSpineProgress);
|
||||
|
||||
/**
|
||||
* Resolve a KOReader XPath to an intra-spine progress ratio.
|
||||
*
|
||||
* Matching strategy (the first tier that finds an anchor wins):
* 1) exact anchor path match,
* 2) ancestor match with indices (trailing path segments trimmed),
* 3) index-insensitive path match, then index-insensitive ancestors.
|
||||
*
|
||||
* @param epub Loaded EPUB instance
|
||||
* @param spineIndex Spine item index to parse
|
||||
* @param xpath Incoming KOReader XPath
|
||||
* @param outIntraSpineProgress Resolved position within spine [0.0, 1.0]
|
||||
* @param outExactMatch True only for full exact path match
|
||||
* @return true if any match was resolved; false means caller should fallback
|
||||
*/
|
||||
static bool findProgressForXPath(const std::shared_ptr<Epub>& epub, int spineIndex, const std::string& xpath,
|
||||
float& outIntraSpineProgress, bool& outExactMatch);
|
||||
|
||||
/**
|
||||
* Parse DocFragment index from KOReader-style path segment:
|
||||
* /body/DocFragment[N]/body/...
|
||||
*
|
||||
* KOReader uses 1-based DocFragment indices; N is converted to the 0-based
|
||||
* spine index stored in outSpineIndex (i.e. outSpineIndex = N - 1).
|
||||
*
|
||||
* @param xpath KOReader XPath
|
||||
* @param outSpineIndex 0-based spine index derived from DocFragment[N]
|
||||
* @return true when DocFragment[N] exists and N is a valid integer >= 1
|
||||
* (converted to 0-based outSpineIndex); false otherwise
|
||||
*/
|
||||
static bool tryExtractSpineIndexFromXPath(const std::string& xpath, int& outSpineIndex);
|
||||
};
|
||||
@@ -4,6 +4,8 @@
|
||||
#include <Logging.h>
|
||||
#include <MD5Builder.h>
|
||||
|
||||
#include <functional>
|
||||
|
||||
namespace {
|
||||
// Extract filename from path (everything after last '/')
|
||||
std::string getFilename(const std::string& path) {
|
||||
@@ -15,6 +17,130 @@ std::string getFilename(const std::string& path) {
|
||||
}
|
||||
} // namespace
|
||||
|
||||
std::string KOReaderDocumentId::getCacheFilePath(const std::string& filePath) {
  // The hash file lives in the per-book folder that follows the Epub cache
  // directory convention (/.crosspoint/epub_<hash of the book path>/), next
  // to the other cached data of that book.
  const size_t pathHash = std::hash<std::string>{}(filePath);

  std::string cachePath("/.crosspoint/epub_");
  cachePath += std::to_string(pathHash);
  cachePath += "/koreader_docid.txt";
  return cachePath;
}
|
||||
|
||||
std::string KOReaderDocumentId::loadCachedHash(const std::string& cacheFilePath, const size_t fileSize,
|
||||
const std::string& currentFingerprint) {
|
||||
if (!Storage.exists(cacheFilePath.c_str())) {
|
||||
return "";
|
||||
}
|
||||
|
||||
const String content = Storage.readFile(cacheFilePath.c_str());
|
||||
if (content.isEmpty()) {
|
||||
return "";
|
||||
}
|
||||
|
||||
// Format: "<filesize>:<fingerprint>\n<32-char-hex-hash>"
|
||||
const int newlinePos = content.indexOf('\n');
|
||||
if (newlinePos < 0) {
|
||||
return "";
|
||||
}
|
||||
|
||||
const String header = content.substring(0, newlinePos);
|
||||
const int colonPos = header.indexOf(':');
|
||||
if (colonPos < 0) {
|
||||
LOG_DBG("KODoc", "Hash cache invalidated: header missing fingerprint");
|
||||
return "";
|
||||
}
|
||||
|
||||
const String sizeTok = header.substring(0, colonPos);
|
||||
const String fpTok = header.substring(colonPos + 1);
|
||||
|
||||
// Validate the filesize token – it must consist of ASCII digits and parse
|
||||
// correctly to the expected size.
|
||||
bool digitsOnly = true;
|
||||
for (size_t i = 0; i < sizeTok.length(); ++i) {
|
||||
const char ch = sizeTok[i];
|
||||
if (ch < '0' || ch > '9') {
|
||||
digitsOnly = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!digitsOnly) {
|
||||
LOG_DBG("KODoc", "Hash cache invalidated: size token not numeric ('%s')", sizeTok.c_str());
|
||||
return "";
|
||||
}
|
||||
|
||||
const long parsed = sizeTok.toInt();
|
||||
if (parsed < 0) {
|
||||
LOG_DBG("KODoc", "Hash cache invalidated: size token parse error ('%s')", sizeTok.c_str());
|
||||
return "";
|
||||
}
|
||||
const size_t cachedSize = static_cast<size_t>(parsed);
|
||||
if (cachedSize != fileSize) {
|
||||
LOG_DBG("KODoc", "Hash cache invalidated: file size or fingerprint changed (%zu -> %zu)", cachedSize, fileSize);
|
||||
return "";
|
||||
}
|
||||
|
||||
// Validate stored fingerprint format (8 hex characters)
|
||||
if (fpTok.length() != 8) {
|
||||
LOG_DBG("KODoc", "Hash cache invalidated: bad fingerprint length (%zu)", fpTok.length());
|
||||
return "";
|
||||
}
|
||||
for (size_t i = 0; i < fpTok.length(); ++i) {
|
||||
char c = fpTok[i];
|
||||
bool hex = (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
|
||||
if (!hex) {
|
||||
LOG_DBG("KODoc", "Hash cache invalidated: non-hex character '%c' in fingerprint", c);
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
String currentFpStr(currentFingerprint.c_str());
|
||||
if (fpTok != currentFpStr) {
|
||||
LOG_DBG("KODoc", "Hash cache invalidated: fingerprint changed (%s != %s)", fpTok.c_str(),
|
||||
currentFingerprint.c_str());
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
std::string hash = content.substring(newlinePos + 1).c_str();
|
||||
// Trim any trailing whitespace / line endings
|
||||
while (!hash.empty() && (hash.back() == '\n' || hash.back() == '\r' || hash.back() == ' ')) {
|
||||
hash.pop_back();
|
||||
}
|
||||
|
||||
// Hash must be exactly 32 hex characters.
|
||||
if (hash.size() != 32) {
|
||||
LOG_DBG("KODoc", "Hash cache invalidated: wrong hash length (%zu)", hash.size());
|
||||
return "";
|
||||
}
|
||||
for (char c : hash) {
|
||||
if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'))) {
|
||||
LOG_DBG("KODoc", "Hash cache invalidated: non-hex character '%c' in hash", c);
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
LOG_DBG("KODoc", "Hash cache hit: %s", hash.c_str());
|
||||
return hash;
|
||||
}
|
||||
|
||||
void KOReaderDocumentId::saveCachedHash(const std::string& cacheFilePath, const size_t fileSize,
|
||||
const std::string& fingerprint, const std::string& hash) {
|
||||
// Ensure the book's cache directory exists before writing
|
||||
const size_t lastSlash = cacheFilePath.rfind('/');
|
||||
if (lastSlash != std::string::npos) {
|
||||
Storage.ensureDirectoryExists(cacheFilePath.substr(0, lastSlash).c_str());
|
||||
}
|
||||
|
||||
// Format: "<filesize>:<fingerprint>\n<hash>"
|
||||
String content(std::to_string(fileSize).c_str());
|
||||
content += ':';
|
||||
content += fingerprint.c_str();
|
||||
content += '\n';
|
||||
content += hash.c_str();
|
||||
|
||||
if (!Storage.writeFile(cacheFilePath.c_str(), content)) {
|
||||
LOG_DBG("KODoc", "Failed to write hash cache to %s", cacheFilePath.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
std::string KOReaderDocumentId::calculateFromFilename(const std::string& filePath) {
|
||||
const std::string filename = getFilename(filePath);
|
||||
if (filename.empty()) {
|
||||
@@ -49,6 +175,30 @@ std::string KOReaderDocumentId::calculate(const std::string& filePath) {
|
||||
}
|
||||
|
||||
const size_t fileSize = file.fileSize();
|
||||
|
||||
// Compute a lightweight fingerprint from the file's modification time.
|
||||
// The underlying FsFile API provides getModifyDateTime which returns two
|
||||
// packed 16-bit values (date and time). Concatenate these as eight hex
|
||||
// digits to produce the token stored in the cache header.
|
||||
uint16_t date = 0, time = 0;
|
||||
if (!file.getModifyDateTime(&date, &time)) {
|
||||
// If timestamp isn't available for some reason, fall back to a sentinel.
|
||||
date = 0;
|
||||
time = 0;
|
||||
}
|
||||
char fpBuf[9];
|
||||
// two 16-bit numbers => 4 hex digits each
|
||||
sprintf(fpBuf, "%04x%04x", date, time);
|
||||
const std::string fingerprintTok(fpBuf);
|
||||
|
||||
// Return persisted hash if the file size and fingerprint haven't changed.
|
||||
const std::string cacheFilePath = getCacheFilePath(filePath);
|
||||
const std::string cached = loadCachedHash(cacheFilePath, fileSize, fingerprintTok);
|
||||
if (!cached.empty()) {
|
||||
file.close();
|
||||
return cached;
|
||||
}
|
||||
|
||||
LOG_DBG("KODoc", "Calculating hash for file: %s (size: %zu)", filePath.c_str(), fileSize);
|
||||
|
||||
// Initialize MD5 builder
|
||||
@@ -92,5 +242,7 @@ std::string KOReaderDocumentId::calculate(const std::string& filePath) {
|
||||
|
||||
LOG_DBG("KODoc", "Hash calculated: %s (from %zu bytes)", result.c_str(), totalBytesRead);
|
||||
|
||||
saveCachedHash(cacheFilePath, fileSize, fingerprintTok, result);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
@@ -42,4 +42,31 @@ class KOReaderDocumentId {
|
||||
|
||||
// Calculate offset for index i: 1024 << (2*i)
|
||||
static size_t getOffset(int i);
|
||||
|
||||
// Hash cache helpers
|
||||
// Returns the path to the per-book cache file that stores the precomputed hash.
|
||||
// Uses the same directory convention as the Epub cache (/.crosspoint/epub_<hash>/).
|
||||
static std::string getCacheFilePath(const std::string& filePath);
|
||||
|
||||
// Returns the cached hash if the file size and fingerprint match, or empty
|
||||
// string on miss/invalidation.
|
||||
//
|
||||
// The fingerprint is derived from the file's modification timestamp. We
|
||||
// call `FsFile::getModifyDateTime` to retrieve two 16‑bit packed values
|
||||
// supplied by the filesystem: one for the date and one for the time. These
|
||||
// are concatenated and represented as eight hexadecimal digits in the form
|
||||
// <date><time> (high 16 bits = packed date, low 16 bits = packed time).
|
||||
//
|
||||
// The resulting string serves as a lightweight change signal; any modification
|
||||
// to the file's mtime will alter the packed date/time combo and invalidate
|
||||
// the cache entry. Since the full document hash is expensive to compute,
|
||||
// using the packed timestamp gives us a quick way to detect modifications
|
||||
// without reading file contents.
|
||||
static std::string loadCachedHash(const std::string& cacheFilePath, size_t fileSize,
|
||||
const std::string& currentFingerprint);
|
||||
|
||||
// Persists the computed hash alongside the file size and fingerprint (the
|
||||
// modification-timestamp token) used to generate it.
|
||||
static void saveCachedHash(const std::string& cacheFilePath, size_t fileSize, const std::string& fingerprint,
|
||||
const std::string& hash);
|
||||
};
|
||||
|
||||
@@ -2,8 +2,11 @@
|
||||
|
||||
#include <Logging.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
|
||||
#include "ChapterXPathIndexer.h"
|
||||
|
||||
KOReaderPosition ProgressMapper::toKOReader(const std::shared_ptr<Epub>& epub, const CrossPointPosition& pos) {
|
||||
KOReaderPosition result;
|
||||
|
||||
@@ -16,8 +19,13 @@ KOReaderPosition ProgressMapper::toKOReader(const std::shared_ptr<Epub>& epub, c
|
||||
// Calculate overall book progress (0.0-1.0)
|
||||
result.percentage = epub->calculateProgress(pos.spineIndex, intraSpineProgress);
|
||||
|
||||
// Generate XPath with estimated paragraph position based on page
|
||||
result.xpath = generateXPath(pos.spineIndex, pos.pageNumber, pos.totalPages);
|
||||
// Generate the best available XPath for the current chapter position.
|
||||
// Prefer element-level XPaths from a lightweight XHTML reparse; fall back
|
||||
// to a synthetic chapter-level path if parsing fails.
|
||||
result.xpath = ChapterXPathIndexer::findXPathForProgress(epub, pos.spineIndex, intraSpineProgress);
|
||||
if (result.xpath.empty()) {
|
||||
result.xpath = generateXPath(pos.spineIndex);
|
||||
}
|
||||
|
||||
// Get chapter info for logging
|
||||
const int tocIndex = epub->getTocIndexForSpineIndex(pos.spineIndex);
|
||||
@@ -36,34 +44,69 @@ CrossPointPosition ProgressMapper::toCrossPoint(const std::shared_ptr<Epub>& epu
|
||||
result.pageNumber = 0;
|
||||
result.totalPages = 0;
|
||||
|
||||
const size_t bookSize = epub->getBookSize();
|
||||
if (bookSize == 0) {
|
||||
if (!epub || epub->getSpineItemsCount() <= 0) {
|
||||
return result;
|
||||
}
|
||||
|
||||
// Use percentage-based lookup for both spine and page positioning
|
||||
// XPath parsing is unreliable since CrossPoint doesn't preserve detailed HTML structure
|
||||
const size_t targetBytes = static_cast<size_t>(bookSize * koPos.percentage);
|
||||
|
||||
// Find the spine item that contains this byte position
|
||||
const int spineCount = epub->getSpineItemsCount();
|
||||
bool spineFound = false;
|
||||
for (int i = 0; i < spineCount; i++) {
|
||||
const size_t cumulativeSize = epub->getCumulativeSpineItemSize(i);
|
||||
if (cumulativeSize >= targetBytes) {
|
||||
result.spineIndex = i;
|
||||
spineFound = true;
|
||||
break;
|
||||
|
||||
float resolvedIntraSpineProgress = -1.0f;
|
||||
bool xpathExactMatch = false;
|
||||
bool usedXPathMapping = false;
|
||||
|
||||
int xpathSpineIndex = -1;
|
||||
if (ChapterXPathIndexer::tryExtractSpineIndexFromXPath(koPos.xpath, xpathSpineIndex) && xpathSpineIndex >= 0 &&
|
||||
xpathSpineIndex < spineCount) {
|
||||
float intraFromXPath = 0.0f;
|
||||
if (ChapterXPathIndexer::findProgressForXPath(epub, xpathSpineIndex, koPos.xpath, intraFromXPath,
|
||||
xpathExactMatch)) {
|
||||
result.spineIndex = xpathSpineIndex;
|
||||
resolvedIntraSpineProgress = intraFromXPath;
|
||||
usedXPathMapping = true;
|
||||
}
|
||||
}
|
||||
|
||||
// If no spine item was found (e.g., targetBytes beyond last cumulative size),
|
||||
// default to the last spine item so we map to the end of the book instead of the beginning.
|
||||
if (!spineFound && spineCount > 0) {
|
||||
result.spineIndex = spineCount - 1;
|
||||
if (!usedXPathMapping) {
|
||||
const size_t bookSize = epub->getBookSize();
|
||||
if (bookSize == 0) {
|
||||
return result;
|
||||
}
|
||||
|
||||
if (!std::isfinite(koPos.percentage)) {
|
||||
return result;
|
||||
}
|
||||
|
||||
const float sanitizedPercentage = std::clamp(koPos.percentage, 0.0f, 1.0f);
|
||||
const size_t targetBytes = static_cast<size_t>(bookSize * sanitizedPercentage);
|
||||
|
||||
bool spineFound = false;
|
||||
for (int i = 0; i < spineCount; i++) {
|
||||
const size_t cumulativeSize = epub->getCumulativeSpineItemSize(i);
|
||||
if (cumulativeSize >= targetBytes) {
|
||||
result.spineIndex = i;
|
||||
spineFound = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!spineFound && spineCount > 0) {
|
||||
result.spineIndex = spineCount - 1;
|
||||
}
|
||||
|
||||
if (result.spineIndex < epub->getSpineItemsCount()) {
|
||||
const size_t prevCumSize = (result.spineIndex > 0) ? epub->getCumulativeSpineItemSize(result.spineIndex - 1) : 0;
|
||||
const size_t currentCumSize = epub->getCumulativeSpineItemSize(result.spineIndex);
|
||||
const size_t spineSize = currentCumSize - prevCumSize;
|
||||
|
||||
if (spineSize > 0) {
|
||||
const size_t bytesIntoSpine = (targetBytes > prevCumSize) ? (targetBytes - prevCumSize) : 0;
|
||||
resolvedIntraSpineProgress = static_cast<float>(bytesIntoSpine) / static_cast<float>(spineSize);
|
||||
resolvedIntraSpineProgress = std::max(0.0f, std::min(1.0f, resolvedIntraSpineProgress));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Estimate page number within the spine item using percentage
|
||||
// Estimate page number within the selected spine item
|
||||
if (result.spineIndex < epub->getSpineItemsCount()) {
|
||||
const size_t prevCumSize = (result.spineIndex > 0) ? epub->getCumulativeSpineItemSize(result.spineIndex - 1) : 0;
|
||||
const size_t currentCumSize = epub->getCumulativeSpineItemSize(result.spineIndex);
|
||||
@@ -91,24 +134,24 @@ CrossPointPosition ProgressMapper::toCrossPoint(const std::shared_ptr<Epub>& epu
|
||||
|
||||
result.totalPages = estimatedTotalPages;
|
||||
|
||||
if (spineSize > 0 && estimatedTotalPages > 0) {
|
||||
const size_t bytesIntoSpine = (targetBytes > prevCumSize) ? (targetBytes - prevCumSize) : 0;
|
||||
const float intraSpineProgress = static_cast<float>(bytesIntoSpine) / static_cast<float>(spineSize);
|
||||
const float clampedProgress = std::max(0.0f, std::min(1.0f, intraSpineProgress));
|
||||
result.pageNumber = static_cast<int>(clampedProgress * estimatedTotalPages);
|
||||
if (estimatedTotalPages > 0 && resolvedIntraSpineProgress >= 0.0f) {
|
||||
const float clampedProgress = std::max(0.0f, std::min(1.0f, resolvedIntraSpineProgress));
|
||||
result.pageNumber = static_cast<int>(clampedProgress * static_cast<float>(estimatedTotalPages));
|
||||
result.pageNumber = std::max(0, std::min(result.pageNumber, estimatedTotalPages - 1));
|
||||
} else if (spineSize > 0 && estimatedTotalPages > 0) {
|
||||
result.pageNumber = 0;
|
||||
}
|
||||
}
|
||||
|
||||
LOG_DBG("ProgressMapper", "KOReader -> CrossPoint: %.2f%% at %s -> spine=%d, page=%d", koPos.percentage * 100,
|
||||
koPos.xpath.c_str(), result.spineIndex, result.pageNumber);
|
||||
LOG_DBG("ProgressMapper", "KOReader -> CrossPoint: %.2f%% at %s -> spine=%d, page=%d (%s, exact=%s)",
|
||||
koPos.percentage * 100, koPos.xpath.c_str(), result.spineIndex, result.pageNumber,
|
||||
usedXPathMapping ? "xpath" : "percentage", xpathExactMatch ? "yes" : "no");
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
std::string ProgressMapper::generateXPath(int spineIndex, int pageNumber, int totalPages) {
|
||||
// Use 0-based DocFragment indices for KOReader
|
||||
// Use a simple xpath pointing to the DocFragment - KOReader will use the percentage for fine positioning within it
|
||||
// Avoid specifying paragraph numbers as they may not exist in the target document
|
||||
return "/body/DocFragment[" + std::to_string(spineIndex) + "]/body";
|
||||
std::string ProgressMapper::generateXPath(int spineIndex) {
  // Chapter-level fallback, used when no element-level XPath could be
  // extracted for the current position.
  // spineIndex is 0-based internally, while KOReader expects a 1-based
  // XPath predicate, hence the +1 on the DocFragment index.
  std::string chapterPath = "/body/DocFragment[";
  chapterPath += std::to_string(spineIndex + 1);
  chapterPath += "]/body";
  return chapterPath;
}
|
||||
|
||||
@@ -27,9 +27,16 @@ struct KOReaderPosition {
|
||||
* CrossPoint tracks position as (spineIndex, pageNumber).
|
||||
* KOReader uses XPath-like strings + percentage.
|
||||
*
|
||||
* Since CrossPoint discards HTML structure during parsing, we generate
|
||||
* synthetic XPath strings based on spine index, using percentage as the
|
||||
* primary sync mechanism.
|
||||
* Forward mapping (CrossPoint -> KOReader):
|
||||
* - Prefer element-level XPath extracted from current spine XHTML.
|
||||
* - Fallback to synthetic chapter XPath if extraction fails.
|
||||
*
|
||||
* Reverse mapping (KOReader -> CrossPoint):
|
||||
* - Prefer incoming XPath (DocFragment + element path) when resolvable.
|
||||
* - Fallback to percentage-based approximation when XPath is missing/invalid.
|
||||
*
|
||||
* This keeps behavior stable on low-memory devices while improving round-trip
|
||||
* sync precision when KOReader provides detailed paths.
|
||||
*/
|
||||
class ProgressMapper {
|
||||
public:
|
||||
@@ -45,8 +52,9 @@ class ProgressMapper {
|
||||
/**
|
||||
* Convert KOReader position to CrossPoint format.
|
||||
*
|
||||
* Note: The returned pageNumber may be approximate since different
|
||||
* rendering settings produce different page counts.
|
||||
* Uses XPath-first resolution when possible and percentage fallback otherwise.
|
||||
* Returned pageNumber can still be approximate because page counts differ
|
||||
* across renderer/font/layout settings.
|
||||
*
|
||||
* @param epub The EPUB book
|
||||
* @param koPos KOReader position
|
||||
@@ -60,8 +68,7 @@ class ProgressMapper {
|
||||
private:
|
||||
/**
|
||||
* Generate XPath for KOReader compatibility.
|
||||
* Format: /body/DocFragment[spineIndex+1]/body
|
||||
* Since CrossPoint doesn't preserve HTML structure, we rely on percentage for positioning.
|
||||
* Fallback format: /body/DocFragment[spineIndex + 1]/body
|
||||
*/
|
||||
static std::string generateXPath(int spineIndex, int pageNumber, int totalPages);
|
||||
static std::string generateXPath(int spineIndex);
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user