granular position tracking

This commit is contained in:
cottongin
2026-01-25 00:24:54 -05:00
parent fedc14bcb4
commit 91c8cc67ce
9 changed files with 345 additions and 61 deletions

View File

@@ -40,6 +40,12 @@ class Page {
public:
// the list of block index and line numbers on this page
std::vector<std::shared_ptr<PageElement>> elements;
// Byte offset in source HTML where this page's content begins
// Used for restoring reading position after re-indexing due to font/setting changes
// This is stored in the Section file's LUT, not in Page serialization
uint32_t firstContentOffset = 0;
void render(GfxRenderer& renderer, int fontId, int xOffset, int yOffset) const;
bool serialize(FsFile& file) const;
static std::unique_ptr<Page> deserialize(FsFile& file);

View File

@@ -8,10 +8,15 @@
#include "parsers/ChapterHtmlSlimParser.h"
namespace {
constexpr uint8_t SECTION_FILE_VERSION = 11;
// Version 12: Added content offsets to LUT for position restoration after re-indexing
constexpr uint8_t SECTION_FILE_VERSION = 12;
// Size in bytes of the fixed section-file header. The trailing uint32_t is the slot
// readers seek to (HEADER_SIZE - sizeof(uint32_t)) in order to fetch the LUT offset.
// NOTE(review): the remaining terms presumably mirror, field by field, what
// writeSectionFileHeader emits - confirm against that function before changing either.
constexpr uint32_t HEADER_SIZE = sizeof(uint8_t) + sizeof(int) + sizeof(float) + sizeof(bool) + sizeof(uint8_t) +
sizeof(uint16_t) + sizeof(uint16_t) + sizeof(uint16_t) + sizeof(bool) +
sizeof(uint32_t);
// LUT entry structure: { filePosition, contentOffset }
// Each entry is 8 bytes (2 x uint32_t)
// Page N's entry therefore lives at lutOffset + LUT_ENTRY_SIZE * N.
constexpr size_t LUT_ENTRY_SIZE = sizeof(uint32_t) * 2;
} // namespace
uint32_t Section::onPageComplete(std::unique_ptr<Page> page) {
@@ -181,12 +186,23 @@ bool Section::createSectionFile(const int fontId, const float lineCompression, c
}
writeSectionFileHeader(fontId, lineCompression, extraParagraphSpacing, paragraphAlignment, viewportWidth,
viewportHeight, hyphenationEnabled);
std::vector<uint32_t> lut = {};
// LUT entries: { filePosition, contentOffset } pairs
struct LutEntry {
uint32_t filePos;
uint32_t contentOffset;
};
std::vector<LutEntry> lut = {};
ChapterHtmlSlimParser visitor(
tmpHtmlPath, renderer, fontId, lineCompression, extraParagraphSpacing, paragraphAlignment, viewportWidth,
viewportHeight, hyphenationEnabled,
[this, &lut](std::unique_ptr<Page> page) { lut.emplace_back(this->onPageComplete(std::move(page))); }, progressFn,
[this, &lut](std::unique_ptr<Page> page) {
// Capture content offset before processing
const uint32_t contentOffset = page->firstContentOffset;
const uint32_t filePos = this->onPageComplete(std::move(page));
lut.push_back({filePos, contentOffset});
}, progressFn,
epub->getCssParser());
Hyphenator::setPreferredLanguage(epub->getLanguage());
success = visitor.parseAndBuildPages();
@@ -197,8 +213,10 @@ bool Section::createSectionFile(const int fontId, const float lineCompression, c
// Create a placeholder page for malformed chapters instead of failing entirely
// This allows the book to continue loading with chapters that do parse successfully
auto placeholderPage = std::unique_ptr<Page>(new Page());
placeholderPage->firstContentOffset = 0;
// Add placeholder to LUT
lut.emplace_back(this->onPageComplete(std::move(placeholderPage)));
const uint32_t filePos = this->onPageComplete(std::move(placeholderPage));
lut.push_back({filePos, 0});
// If we still have no pages, the placeholder creation failed
if (pageCount == 0) {
@@ -211,13 +229,14 @@ bool Section::createSectionFile(const int fontId, const float lineCompression, c
const uint32_t lutOffset = file.position();
bool hasFailedLutRecords = false;
// Write LUT
for (const uint32_t& pos : lut) {
if (pos == 0) {
// Write LUT with both file position and content offset
for (const auto& entry : lut) {
if (entry.filePos == 0) {
hasFailedLutRecords = true;
break;
}
serialization::writePod(file, pos);
serialization::writePod(file, entry.filePos);
serialization::writePod(file, entry.contentOffset);
}
if (hasFailedLutRecords) {
@@ -243,12 +262,106 @@ std::unique_ptr<Page> Section::loadPageFromSectionFile() {
file.seek(HEADER_SIZE - sizeof(uint32_t));
uint32_t lutOffset;
serialization::readPod(file, lutOffset);
file.seek(lutOffset + sizeof(uint32_t) * currentPage);
// LUT entries are now 8 bytes each: { filePos (4), contentOffset (4) }
file.seek(lutOffset + LUT_ENTRY_SIZE * currentPage);
uint32_t pagePos;
serialization::readPod(file, pagePos);
// Skip contentOffset for now - we don't need it when just loading the page
file.seek(pagePos);
auto page = Page::deserialize(file);
file.close();
return page;
}
int Section::findPageForContentOffset(uint32_t targetOffset) const {
if (pageCount == 0) {
return 0;
}
FsFile f;
if (!SdMan.openFileForRead("SCT", filePath, f)) {
Serial.printf("[%lu] [SCT] findPageForContentOffset: Failed to open file\n", millis());
return 0;
}
// Read LUT offset from header
f.seek(HEADER_SIZE - sizeof(uint32_t));
uint32_t lutOffset;
serialization::readPod(f, lutOffset);
// Binary search through the LUT to find the page containing targetOffset
// We want the largest contentOffset that is <= targetOffset
int left = 0;
int right = pageCount - 1;
int result = 0;
while (left <= right) {
const int mid = left + (right - left) / 2;
// Read content offset for page 'mid'
// LUT entry format: { filePos (4), contentOffset (4) }
f.seek(lutOffset + LUT_ENTRY_SIZE * mid + sizeof(uint32_t)); // Skip filePos
uint32_t midOffset;
serialization::readPod(f, midOffset);
if (midOffset <= targetOffset) {
result = mid; // This page could be the answer
left = mid + 1; // Look for a later page that might also qualify
} else {
right = mid - 1; // Look for an earlier page
}
}
// When multiple pages share the same content offset (e.g., a large text
// block spanning multiple pages), scan backward to find the FIRST page
// with that offset, not the last
if (result > 0) {
f.seek(lutOffset + LUT_ENTRY_SIZE * result + sizeof(uint32_t));
uint32_t resultOffset;
serialization::readPod(f, resultOffset);
while (result > 0) {
f.seek(lutOffset + LUT_ENTRY_SIZE * (result - 1) + sizeof(uint32_t));
uint32_t prevOffset;
serialization::readPod(f, prevOffset);
if (prevOffset == resultOffset) {
result--;
} else {
break;
}
}
}
f.close();
Serial.printf("[%lu] [SCT] findPageForContentOffset: offset %u -> page %d\n", millis(), targetOffset, result);
return result;
}
uint32_t Section::getContentOffsetForPage(int pageIndex) const {
if (pageCount == 0 || pageIndex < 0 || pageIndex >= pageCount) {
return 0;
}
FsFile f;
if (!SdMan.openFileForRead("SCT", filePath, f)) {
Serial.printf("[%lu] [SCT] getContentOffsetForPage: Failed to open file\n", millis());
return 0;
}
// Read LUT offset from header
f.seek(HEADER_SIZE - sizeof(uint32_t));
uint32_t lutOffset;
serialization::readPod(f, lutOffset);
// Read content offset for the specified page
// LUT entry format: { filePos (4), contentOffset (4) }
f.seek(lutOffset + LUT_ENTRY_SIZE * pageIndex + sizeof(uint32_t)); // Skip filePos
uint32_t contentOffset;
serialization::readPod(f, contentOffset);
f.close();
return contentOffset;
}

View File

@@ -36,4 +36,9 @@ class Section {
const std::function<void()>& progressSetupFn = nullptr,
const std::function<void(int)>& progressFn = nullptr);
std::unique_ptr<Page> loadPageFromSectionFile();
// Methods for content offset-based position tracking
// Used to restore reading position after re-indexing due to font/setting changes
int findPageForContentOffset(uint32_t targetOffset) const;
uint32_t getContentOffsetForPage(int pageIndex) const;
};

View File

@@ -332,6 +332,11 @@ void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char
if (self->skipUntilDepth < self->depth) {
return;
}
// Capture byte offset of this character data for page position tracking
if (self->xmlParser) {
self->lastCharDataOffset = XML_GetCurrentByteIndex(self->xmlParser);
}
// Determine font style from depth-based tracking and CSS effective style
const bool isBold = self->boldUntilDepth < self->depth || self->effectiveBold;
@@ -477,17 +482,18 @@ void XMLCALL ChapterHtmlSlimParser::endElement(void* userData, const XML_Char* n
bool ChapterHtmlSlimParser::parseAndBuildPages() {
startNewTextBlock((TextBlock::Style)this->paragraphAlignment);
const XML_Parser parser = XML_ParserCreate(nullptr);
xmlParser = XML_ParserCreate(nullptr);
int done;
if (!parser) {
if (!xmlParser) {
Serial.printf("[%lu] [EHP] Couldn't allocate memory for parser\n", millis());
return false;
}
FsFile file;
if (!SdMan.openFileForRead("EHP", filepath, file)) {
XML_ParserFree(parser);
XML_ParserFree(xmlParser);
xmlParser = nullptr;
return false;
}
@@ -495,19 +501,24 @@ bool ChapterHtmlSlimParser::parseAndBuildPages() {
const size_t totalSize = file.size();
size_t bytesRead = 0;
int lastProgress = -1;
// Initialize offset tracking - first page starts at offset 0
currentPageStartOffset = 0;
lastCharDataOffset = 0;
XML_SetUserData(parser, this);
XML_SetElementHandler(parser, startElement, endElement);
XML_SetCharacterDataHandler(parser, characterData);
XML_SetUserData(xmlParser, this);
XML_SetElementHandler(xmlParser, startElement, endElement);
XML_SetCharacterDataHandler(xmlParser, characterData);
do {
void* const buf = XML_GetBuffer(parser, 1024);
void* const buf = XML_GetBuffer(xmlParser, 1024);
if (!buf) {
Serial.printf("[%lu] [EHP] Couldn't allocate memory for buffer\n", millis());
XML_StopParser(parser, XML_FALSE); // Stop any pending processing
XML_SetElementHandler(parser, nullptr, nullptr); // Clear callbacks
XML_SetCharacterDataHandler(parser, nullptr);
XML_ParserFree(parser);
XML_StopParser(xmlParser, XML_FALSE); // Stop any pending processing
XML_SetElementHandler(xmlParser, nullptr, nullptr); // Clear callbacks
XML_SetCharacterDataHandler(xmlParser, nullptr);
XML_ParserFree(xmlParser);
xmlParser = nullptr;
file.close();
return false;
}
@@ -516,10 +527,11 @@ bool ChapterHtmlSlimParser::parseAndBuildPages() {
if (len == 0 && file.available() > 0) {
Serial.printf("[%lu] [EHP] File read error\n", millis());
XML_StopParser(parser, XML_FALSE); // Stop any pending processing
XML_SetElementHandler(parser, nullptr, nullptr); // Clear callbacks
XML_SetCharacterDataHandler(parser, nullptr);
XML_ParserFree(parser);
XML_StopParser(xmlParser, XML_FALSE); // Stop any pending processing
XML_SetElementHandler(xmlParser, nullptr, nullptr); // Clear callbacks
XML_SetCharacterDataHandler(xmlParser, nullptr);
XML_ParserFree(xmlParser);
xmlParser = nullptr;
file.close();
return false;
}
@@ -537,27 +549,33 @@ bool ChapterHtmlSlimParser::parseAndBuildPages() {
done = file.available() == 0;
if (XML_ParseBuffer(parser, static_cast<int>(len), done) == XML_STATUS_ERROR) {
Serial.printf("[%lu] [EHP] Parse error at line %lu:\n%s\n", millis(), XML_GetCurrentLineNumber(parser),
XML_ErrorString(XML_GetErrorCode(parser)));
XML_StopParser(parser, XML_FALSE); // Stop any pending processing
XML_SetElementHandler(parser, nullptr, nullptr); // Clear callbacks
XML_SetCharacterDataHandler(parser, nullptr);
XML_ParserFree(parser);
if (XML_ParseBuffer(xmlParser, static_cast<int>(len), done) == XML_STATUS_ERROR) {
Serial.printf("[%lu] [EHP] Parse error at line %lu:\n%s\n", millis(), XML_GetCurrentLineNumber(xmlParser),
XML_ErrorString(XML_GetErrorCode(xmlParser)));
XML_StopParser(xmlParser, XML_FALSE); // Stop any pending processing
XML_SetElementHandler(xmlParser, nullptr, nullptr); // Clear callbacks
XML_SetCharacterDataHandler(xmlParser, nullptr);
XML_ParserFree(xmlParser);
xmlParser = nullptr;
file.close();
return false;
}
} while (!done);
XML_StopParser(parser, XML_FALSE); // Stop any pending processing
XML_SetElementHandler(parser, nullptr, nullptr); // Clear callbacks
XML_SetCharacterDataHandler(parser, nullptr);
XML_ParserFree(parser);
XML_StopParser(xmlParser, XML_FALSE); // Stop any pending processing
XML_SetElementHandler(xmlParser, nullptr, nullptr); // Clear callbacks
XML_SetCharacterDataHandler(xmlParser, nullptr);
XML_ParserFree(xmlParser);
xmlParser = nullptr;
file.close();
// Process last page if there is still text
if (currentTextBlock) {
makePages();
// Set the content offset for the final page
if (currentPage) {
currentPage->firstContentOffset = static_cast<uint32_t>(currentPageStartOffset);
}
completePageFn(std::move(currentPage));
currentPage.reset();
currentTextBlock.reset();
@@ -570,8 +588,15 @@ void ChapterHtmlSlimParser::addLineToPage(std::shared_ptr<TextBlock> line) {
const int lineHeight = renderer.getLineHeight(fontId) * lineCompression;
if (currentPageNextY + lineHeight > viewportHeight) {
// Set the content offset for the page being completed
if (currentPage) {
currentPage->firstContentOffset = static_cast<uint32_t>(currentPageStartOffset);
}
completePageFn(std::move(currentPage));
// Start new page - offset will be set when first content is added
currentPage.reset(new Page());
currentPageStartOffset = lastCharDataOffset; // Use offset from when content was parsed
currentPageNextY = 0;
}
@@ -587,6 +612,8 @@ void ChapterHtmlSlimParser::makePages() {
if (!currentPage) {
currentPage.reset(new Page());
// Use offset captured during character data parsing
currentPageStartOffset = lastCharDataOffset;
currentPageNextY = 0;
}

View File

@@ -54,6 +54,11 @@ class ChapterHtmlSlimParser {
bool effectiveBold = false;
bool effectiveItalic = false;
bool effectiveUnderline = false;
// Byte offset tracking for position restoration after re-indexing
XML_Parser xmlParser = nullptr; // Store parser for getting current byte index
size_t currentPageStartOffset = 0; // Byte offset when current page was started
size_t lastCharDataOffset = 0; // Byte offset of last character data (captured during parsing)
void updateEffectiveInlineStyle();
void startNewTextBlock(TextBlock::Style style);