#include "ChapterXPathIndexer.h"

// NOTE(review): the angle-bracket header names were missing from the reviewed
// copy of this file. The list below is reconstructed from the symbols this
// translation unit uses; confirm the two project/library headers (logging
// macros and Expat) against the build before merging.
#include <Logging.h>
#include <expat.h>

#include <algorithm>
#include <cctype>
#include <cstdint>
#include <cstdlib>
#include <limits>
#include <memory>
#include <optional>
#include <string>
#include <unordered_map>
#include <vector>

namespace {

// Anchor used for both mapping directions.
// textOffset is counted as visible (non-whitespace) bytes from chapter start.
// xpath points to the nearest element path at/near that offset.
struct XPathAnchor {
  size_t textOffset = 0;
  std::string xpath;
  std::string xpathNoIndex;  // precomputed removeIndices(xpath)
};

// One currently-open element on the parser stack.
struct StackNode {
  std::string tag;             // lowercased element name
  int index = 1;               // 1-based position among same-named siblings
  bool hasTextAnchor = false;  // true once the first anchor for this element was recorded
};

// ParserState is intentionally ephemeral and created per lookup call.
// It holds only one spine parse worth of data to avoid retaining structures
// that would increase long-lived heap usage on the ESP32-C3.
struct ParserState {
  explicit ParserState(const int spineIndex) : spineIndex(spineIndex) {
    // Invariant maintained by onStartElement/onEndElement:
    // siblingCounters.size() == stack.size() + 1.
    siblingCounters.emplace_back();
  }

  // Minimum number of visible bytes between two periodic anchors that are
  // emitted inside an element which already has its first anchor.
  static constexpr size_t kAnchorIntervalBytes = 192;

  int spineIndex = 0;
  int skipDepth = -1;  // stack index of the open skippable element, or -1 when not skipping
  size_t totalTextBytes = 0;
  std::vector<StackNode> stack;
  // Per-depth counters of how many children with a given tag were opened so far.
  // NOTE(review): the map type was lost in the reviewed copy; any map from
  // std::string to int with operator[] satisfies the usage below - confirm.
  std::vector<std::unordered_map<std::string, int>> siblingCounters;
  std::vector<XPathAnchor> anchors;

  // Root path that every anchor of this spine item starts with.
  std::string baseXPath() const { return "/body/DocFragment[" + std::to_string(spineIndex + 1) + "]/body"; }

  // Canonicalize incoming KOReader XPath before matching:
  // - remove all whitespace
  // - lowercase tags
  // - strip optional trailing /text()
  // - strip trailing slash
  static std::string normalizeXPath(const std::string& input) {
    if (input.empty()) {
      return "";
    }

    std::string out;
    out.reserve(input.size());
    for (const char c : input) {
      // <cctype> functions require a value representable as unsigned char.
      const auto uc = static_cast<unsigned char>(c);
      if (std::isspace(uc)) {
        continue;
      }
      out.push_back(static_cast<char>(std::tolower(uc)));
    }

    const std::string textSuffix = "/text()";
    if (out.size() >= textSuffix.size() &&
        out.compare(out.size() - textSuffix.size(), textSuffix.size(), textSuffix) == 0) {
      out.erase(out.size() - textSuffix.size());
    }

    while (!out.empty() && out.back() == '/') {
      out.pop_back();
    }
    return out;
  }

  // Remove bracketed numeric predicates so paths can be compared even when
  // index counters differ between parser implementations.
  static std::string removeIndices(const std::string& xpath) {
    std::string out;
    out.reserve(xpath.size());
    bool inBracket = false;
    for (const char c : xpath) {
      if (c == '[') {
        inBracket = true;
      } else if (c == ']') {
        inBracket = false;
      } else if (!inBracket) {
        out.push_back(c);
      }
    }
    return out;
  }

  // Resolve a path to the best anchor offset.
  // If exact node path is not found, progressively trim trailing segments and
  // match ancestors to obtain a stable approximate location.
  //
  // targetPath     already-normalized path to look up
  // ignoreIndices  compare with [n] predicates stripped on both sides
  // outTextOffset  smallest textOffset among the anchors matching the probe (set on success)
  // outExact       true when the untrimmed path matched, false for an ancestor match
  // Returns false (outputs untouched) when nothing matches.
  bool pickBestAnchorByPath(const std::string& targetPath, const bool ignoreIndices, size_t& outTextOffset,
                            bool& outExact) const {
    if (targetPath.empty() || anchors.empty()) {
      return false;
    }

    std::string probe = ignoreIndices ? removeIndices(targetPath) : targetPath;
    bool exactProbe = true;

    while (!probe.empty()) {
      // Every anchor equal to `probe` has the same depth, so the only
      // tie-break needed is the earliest text offset.
      bool found = false;
      size_t bestOffset = 0;
      for (const auto& anchor : anchors) {
        const std::string& anchorPath = ignoreIndices ? anchor.xpathNoIndex : anchor.xpath;
        if (anchorPath != probe) {
          continue;
        }
        if (!found || anchor.textOffset < bestOffset) {
          found = true;
          bestOffset = anchor.textOffset;
        }
      }

      if (found) {
        outTextOffset = bestOffset;
        outExact = exactProbe;
        return true;
      }

      // Drop the last path segment and retry against the parent element.
      const size_t lastSlash = probe.find_last_of('/');
      if (lastSlash == std::string::npos || lastSlash == 0) {
        break;
      }
      probe.erase(lastSlash);
      exactProbe = false;
    }

    return false;
  }

  // Returns an ASCII-lowercased copy of value.
  static std::string toLower(std::string value) {
    for (char& c : value) {
      c = static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
    }
    return value;
  }

  // Elements that should not contribute text position anchors.
  static bool isSkippableTag(const std::string& tag) { return tag == "head" || tag == "script" || tag == "style"; }

  // Count non-whitespace bytes to keep offsets stable against formatting-only
  // differences and indentation in source XHTML.
  static size_t countVisibleBytes(const XML_Char* text, const int len) {
    size_t count = 0;
    for (int i = 0; i < len; i++) {
      if (!std::isspace(static_cast<unsigned char>(text[i]))) {
        count++;
      }
    }
    return count;
  }

  // Stack index of the innermost open <body> element, or -1 when none is open.
  int bodyDepth() const {
    for (int i = static_cast<int>(stack.size()) - 1; i >= 0; i--) {
      if (stack[i].tag == "body") {
        return i;
      }
    }
    return -1;
  }

  bool insideBody() const { return bodyDepth() >= 0; }

  // Path of the innermost open element, expressed relative to baseXPath().
  // Only elements below the innermost <body> contribute segments.
  std::string currentXPath() const {
    std::string xpath = baseXPath();
    const int bodyIdx = bodyDepth();
    if (bodyIdx < 0) {
      return xpath;
    }
    for (size_t i = static_cast<size_t>(bodyIdx + 1); i < stack.size(); i++) {
      xpath += "/" + stack[i].tag + "[" + std::to_string(stack[i].index) + "]";
    }
    return xpath;
  }

  // Adds first anchor for an element when text begins and periodic anchors in
  // longer runs so matching has sufficient granularity without exploding memory.
  void addAnchorIfNeeded() {
    if (!insideBody() || stack.empty()) {
      return;
    }

    if (!stack.back().hasTextAnchor) {
      const std::string xpath = currentXPath();
      anchors.push_back({totalTextBytes, xpath, removeIndices(xpath)});
      stack.back().hasTextAnchor = true;
      return;
    }

    if (anchors.empty() || totalTextBytes - anchors.back().textOffset >= kAnchorIntervalBytes) {
      const std::string xpath = currentXPath();
      // Avoid stacking several anchors with the same path back to back.
      if (anchors.empty() || anchors.back().xpath != xpath) {
        anchors.push_back({totalTextBytes, xpath, removeIndices(xpath)});
      }
    }
  }

  // Pushes the element on the stack and assigns its 1-based index among
  // same-named siblings; starts skipping when a skippable element opens.
  void onStartElement(const XML_Char* rawName) {
    std::string name = toLower(rawName ? rawName : "");

    const size_t depth = stack.size();
    if (siblingCounters.size() <= depth) {
      siblingCounters.resize(depth + 1);
    }
    const int siblingIndex = ++siblingCounters[depth][name];

    stack.push_back({name, siblingIndex, false});
    siblingCounters.emplace_back();  // fresh counters for this element's children

    if (skipDepth < 0 && isSkippableTag(name)) {
      skipDepth = static_cast<int>(stack.size()) - 1;
    }
  }

  // Pops the innermost element and discards the counters of its children;
  // ends skipping when the element that started it closes.
  void onEndElement() {
    if (stack.empty()) {
      return;
    }

    if (skipDepth == static_cast<int>(stack.size()) - 1) {
      skipDepth = -1;
    }

    stack.pop_back();
    if (!siblingCounters.empty()) {
      siblingCounters.pop_back();
    }
  }

  // Accounts for a chunk of character data: records an anchor if needed and
  // advances totalTextBytes by the number of non-whitespace bytes.
  // Chunks inside skipped elements, outside <body>, or whitespace-only are ignored.
  void onCharacterData(const XML_Char* text, const int len) {
    if (skipDepth >= 0 || len <= 0 || !insideBody()) {
      return;
    }

    const size_t visibleBytes = countVisibleBytes(text, len);
    if (visibleBytes == 0) {
      return;  // whitespace-only chunk
    }

    // The anchor is placed at the offset where this chunk starts.
    addAnchorIfNeeded();
    totalTextBytes += visibleBytes;
  }

  // Convert progress ratio -> path. intraSpineProgress is clamped to [0, 1].
  // Returns baseXPath() when no anchor was recorded.
  std::string chooseXPath(const float intraSpineProgress) const {
    if (anchors.empty()) {
      return baseXPath();
    }
    if (totalTextBytes == 0) {
      return anchors.front().xpath;
    }

    const float clampedProgress = std::max(0.0f, std::min(1.0f, intraSpineProgress));
    const size_t target = static_cast<size_t>(clampedProgress * static_cast<float>(totalTextBytes));

    // upper_bound returns the first anchor strictly after target; step back to get
    // the last anchor at-or-before target (the element the user is currently inside).
    auto it = std::upper_bound(anchors.begin(), anchors.end(), target,
                               [](const size_t value, const XPathAnchor& anchor) { return value < anchor.textOffset; });
    if (it != anchors.begin()) {
      --it;
    }
    return it->xpath;
  }

  // Convert path -> progress ratio by matching to nearest available anchor.
  // Tries, in order: exact/ancestor match with indices, then exact/ancestor
  // match with indices removed. outExactMatch is true only for an exact,
  // index-sensitive match. Outputs are written only when true is returned.
  bool chooseProgressForXPath(const std::string& xpath, float& outIntraSpineProgress, bool& outExactMatch) const {
    if (anchors.empty()) {
      return false;
    }

    const std::string normalized = normalizeXPath(xpath);
    if (normalized.empty()) {
      return false;
    }

    size_t matchedOffset = 0;
    bool exact = false;
    const char* matchTier = nullptr;

    bool matched = pickBestAnchorByPath(normalized, false, matchedOffset, exact);
    if (matched) {
      matchTier = exact ? "exact" : "ancestor";
    } else {
      bool exactRaw = false;
      matched = pickBestAnchorByPath(normalized, true, matchedOffset, exactRaw);
      if (matched) {
        exact = false;
        matchTier = exactRaw ? "index-insensitive" : "index-insensitive-ancestor";
      }
    }

    if (!matched) {
      LOG_DBG("KOX", "Reverse: spine=%d no anchor match for '%s' (%zu anchors)", spineIndex, normalized.c_str(),
              anchors.size());
      return false;
    }

    outExactMatch = exact;
    if (totalTextBytes == 0) {
      outIntraSpineProgress = 0.0f;
      LOG_DBG("KOX", "Reverse: spine=%d %s match offset=%zu -> progress=0.0 (no text)", spineIndex, matchTier,
              matchedOffset);
      return true;
    }

    outIntraSpineProgress = static_cast<float>(matchedOffset) / static_cast<float>(totalTextBytes);
    outIntraSpineProgress = std::max(0.0f, std::min(1.0f, outIntraSpineProgress));
    LOG_DBG("KOX", "Reverse: spine=%d %s match offset=%zu/%zu -> progress=%.3f", spineIndex, matchTier, matchedOffset,
            totalTextBytes, outIntraSpineProgress);
    return true;
  }
};

// Expat trampolines: userData is the ParserState registered in parseSpineItem.
void XMLCALL onStartElement(void* userData, const XML_Char* name, const XML_Char**) {
  auto* state = static_cast<ParserState*>(userData);
  state->onStartElement(name);
}

void XMLCALL onEndElement(void* userData, const XML_Char*) {
  auto* state = static_cast<ParserState*>(userData);
  state->onEndElement();
}

void XMLCALL onCharacterData(void* userData, const XML_Char* text, const int len) {
  auto* state = static_cast<ParserState*>(userData);
  state->onCharacterData(text, len);
}

void XMLCALL onDefaultHandlerExpand(void* userData, const XML_Char* text, const int len) {
  // The default handler fires for comments, PIs, DOCTYPE, and entity references.
  // Only forward entity references (&..;) to avoid skewing text offsets with
  // non-visible markup.
  if (len < 3 || text[0] != '&' || text[len - 1] != ';') {
    return;
  }
  for (int i = 1; i < len - 1; ++i) {
    if (text[i] == '<' || text[i] == '>') {
      return;
    }
  }

  auto* state = static_cast<ParserState*>(userData);
  state->onCharacterData(text, len);
}

// Parse one spine item and return a fully populated ParserState.
// Returns std::nullopt if validation, I/O, or XML parse fails.
static std::optional<ParserState> parseSpineItem(const std::shared_ptr<Epub>& epub, const int spineIndex) {
  if (!epub || spineIndex < 0 || spineIndex >= epub->getSpineItemsCount()) {
    return std::nullopt;
  }

  const auto spineItem = epub->getSpineItem(spineIndex);
  if (spineItem.href.empty()) {
    return std::nullopt;
  }

  size_t chapterSize = 0;
  uint8_t* chapterBytes = epub->readItemContentsToBytes(spineItem.href, &chapterSize, false);
  // XML_Parse takes the buffer length as int, so reject sizes that do not fit
  // instead of silently truncating them.
  if (!chapterBytes || chapterSize == 0 || chapterSize > static_cast<size_t>(std::numeric_limits<int>::max())) {
    free(chapterBytes);  // free(nullptr) is a no-op
    return std::nullopt;
  }

  ParserState state(spineIndex);

  XML_Parser parser = XML_ParserCreate(nullptr);
  if (!parser) {
    free(chapterBytes);
    LOG_ERR("KOX", "Failed to allocate XML parser for spine=%d", spineIndex);
    return std::nullopt;
  }

  XML_SetUserData(parser, &state);
  XML_SetElementHandler(parser, onStartElement, onEndElement);
  XML_SetCharacterDataHandler(parser, onCharacterData);
  XML_SetDefaultHandlerExpand(parser, onDefaultHandlerExpand);

  const bool parseOk = XML_Parse(parser, reinterpret_cast<const char*>(chapterBytes), static_cast<int>(chapterSize),
                                 XML_TRUE) != XML_STATUS_ERROR;

  if (!parseOk) {
    // Cast so the argument matches %lu regardless of how XML_Size is configured.
    LOG_ERR("KOX", "XPath parse failed for spine=%d at line %lu: %s", spineIndex,
            static_cast<unsigned long>(XML_GetCurrentLineNumber(parser)), XML_ErrorString(XML_GetErrorCode(parser)));
  }

  XML_ParserFree(parser);
  free(chapterBytes);

  if (!parseOk) {
    return std::nullopt;
  }
  return state;
}

}  // namespace

// Maps a progress ratio inside one spine item to an element XPath.
// Returns an empty string when the spine item cannot be loaded or parsed.
std::string ChapterXPathIndexer::findXPathForProgress(const std::shared_ptr<Epub>& epub, const int spineIndex,
                                                      const float intraSpineProgress) {
  const auto state = parseSpineItem(epub, spineIndex);
  if (!state) {
    return "";
  }

  const std::string result = state->chooseXPath(intraSpineProgress);
  LOG_DBG("KOX", "Forward: spine=%d progress=%.3f anchors=%zu textBytes=%zu -> %s", spineIndex, intraSpineProgress,
          state->anchors.size(), state->totalTextBytes, result.c_str());
  return result;
}

// Maps an XPath to a progress ratio inside one spine item.
// Outputs are reset to 0.0 / false first, so they hold defined values even
// when false is returned (empty xpath, load/parse failure, or no anchor match).
bool ChapterXPathIndexer::findProgressForXPath(const std::shared_ptr<Epub>& epub, const int spineIndex,
                                               const std::string& xpath, float& outIntraSpineProgress,
                                               bool& outExactMatch) {
  outIntraSpineProgress = 0.0f;
  outExactMatch = false;

  if (xpath.empty()) {
    return false;
  }

  const auto state = parseSpineItem(epub, spineIndex);
  if (!state) {
    return false;
  }

  LOG_DBG("KOX", "Reverse: spine=%d anchors=%zu textBytes=%zu for '%s'", spineIndex, state->anchors.size(),
          state->totalTextBytes, xpath.c_str());
  return state->chooseProgressForXPath(xpath, outIntraSpineProgress, outExactMatch);
}

// Extracts the 0-based spine index from the first "/DocFragment[N]" segment
// of xpath (matched case-insensitively after normalization).
// outSpineIndex is set to -1 on every failure path.
bool ChapterXPathIndexer::tryExtractSpineIndexFromXPath(const std::string& xpath, int& outSpineIndex) {
  outSpineIndex = -1;
  if (xpath.empty()) {
    return false;
  }

  const std::string normalized = ParserState::normalizeXPath(xpath);
  const std::string key = "/docfragment[";
  const size_t pos = normalized.find(key);
  if (pos == std::string::npos) {
    LOG_DBG("KOX", "No DocFragment in xpath: '%s'", xpath.c_str());
    return false;
  }

  // Scan the run of digits that must be followed directly by ']'.
  const size_t start = pos + key.size();
  size_t end = start;
  while (end < normalized.size() && std::isdigit(static_cast<unsigned char>(normalized[end]))) {
    end++;
  }

  if (end == start || end >= normalized.size() || normalized[end] != ']') {
    return false;
  }

  const std::string value = normalized.substr(start, end - start);
  // Parse as long long: where long is 32 bits, strtol saturates to
  // LONG_MAX == INT_MAX on overflow and an oversized index would pass the
  // range check below.
  const long long parsed = std::strtoll(value.c_str(), nullptr, 10);

  // KOReader uses 1-based DocFragment indices; convert to 0-based spine index.
  if (parsed < 1 || parsed > std::numeric_limits<int>::max()) {
    return false;
  }

  outSpineIndex = static_cast<int>(parsed) - 1;
  return true;
}