diff --git a/lib/Epub/Epub/Page.cpp b/lib/Epub/Epub/Page.cpp index cf9206e3..29ed3a5b 100644 --- a/lib/Epub/Epub/Page.cpp +++ b/lib/Epub/Epub/Page.cpp @@ -1,8 +1,17 @@ #include "Page.h" +#include #include #include +// Cell padding in pixels (must match TABLE_CELL_PAD_* in ChapterHtmlSlimParser.cpp) +static constexpr int TABLE_CELL_PADDING_X = 4; +static constexpr int TABLE_CELL_PADDING_TOP = 1; + +// --------------------------------------------------------------------------- +// PageLine +// --------------------------------------------------------------------------- + void PageLine::render(GfxRenderer& renderer, const int fontId, const int xOffset, const int yOffset) { block->render(renderer, fontId, xPos + xOffset, yPos + yOffset); } @@ -25,6 +34,115 @@ std::unique_ptr PageLine::deserialize(FsFile& file) { return std::unique_ptr(new PageLine(std::move(tb), xPos, yPos)); } +// --------------------------------------------------------------------------- +// PageTableRow +// --------------------------------------------------------------------------- + +void PageTableRow::render(GfxRenderer& renderer, const int fontId, const int xOffset, const int yOffset) { + const int baseX = xPos + xOffset; + const int baseY = yPos + yOffset; + + // Draw horizontal borders (top and bottom of this row) + renderer.drawLine(baseX, baseY, baseX + totalWidth, baseY); + renderer.drawLine(baseX, baseY + rowHeight, baseX + totalWidth, baseY + rowHeight); + + // Draw vertical borders and render cell contents + // Left edge + renderer.drawLine(baseX, baseY, baseX, baseY + rowHeight); + + for (const auto& cell : cells) { + // Right vertical border for this cell + const int cellRightX = baseX + cell.xOffset + cell.columnWidth; + renderer.drawLine(cellRightX, baseY, cellRightX, baseY + rowHeight); + + // Render each text line within the cell + const int cellTextX = baseX + cell.xOffset + TABLE_CELL_PADDING_X; + int cellLineY = baseY + 1 + TABLE_CELL_PADDING_TOP; // 1px border + top padding + + 
for (const auto& line : cell.lines) { + line->render(renderer, fontId, cellTextX, cellLineY); + cellLineY += lineHeight; + } + } +} + +bool PageTableRow::serialize(FsFile& file) { + serialization::writePod(file, xPos); + serialization::writePod(file, yPos); + serialization::writePod(file, rowHeight); + serialization::writePod(file, totalWidth); + serialization::writePod(file, lineHeight); + + const uint16_t cellCount = static_cast(cells.size()); + serialization::writePod(file, cellCount); + + for (const auto& cell : cells) { + serialization::writePod(file, cell.xOffset); + serialization::writePod(file, cell.columnWidth); + + const uint16_t lineCount = static_cast(cell.lines.size()); + serialization::writePod(file, lineCount); + + for (const auto& line : cell.lines) { + if (!line->serialize(file)) { + return false; + } + } + } + + return true; +} + +std::unique_ptr PageTableRow::deserialize(FsFile& file) { + int16_t xPos, yPos, rowHeight, totalWidth, lineHeight; + serialization::readPod(file, xPos); + serialization::readPod(file, yPos); + serialization::readPod(file, rowHeight); + serialization::readPod(file, totalWidth); + serialization::readPod(file, lineHeight); + + uint16_t cellCount; + serialization::readPod(file, cellCount); + + // Sanity check + if (cellCount > 100) { + LOG_ERR("PTR", "Deserialization failed: cell count %u exceeds maximum", cellCount); + return nullptr; + } + + std::vector cells; + cells.resize(cellCount); + + for (uint16_t c = 0; c < cellCount; ++c) { + serialization::readPod(file, cells[c].xOffset); + serialization::readPod(file, cells[c].columnWidth); + + uint16_t lineCount; + serialization::readPod(file, lineCount); + + if (lineCount > 1000) { + LOG_ERR("PTR", "Deserialization failed: line count %u in cell %u exceeds maximum", lineCount, c); + return nullptr; + } + + cells[c].lines.reserve(lineCount); + for (uint16_t l = 0; l < lineCount; ++l) { + auto tb = TextBlock::deserialize(file); + if (!tb) { + return nullptr; + } + 
cells[c].lines.push_back(std::move(tb)); + } + } + + return std::unique_ptr( + new PageTableRow(std::move(cells), rowHeight, totalWidth, lineHeight, xPos, yPos)); +} + +// --------------------------------------------------------------------------- +// Page +// --------------------------------------------------------------------------- + void Page::render(GfxRenderer& renderer, const int fontId, const int xOffset, const int yOffset) const { for (auto& element : elements) { element->render(renderer, fontId, xOffset, yOffset); @@ -36,8 +154,7 @@ bool Page::serialize(FsFile& file) const { serialization::writePod(file, count); for (const auto& el : elements) { - // Only PageLine exists currently - serialization::writePod(file, static_cast(TAG_PageLine)); + serialization::writePod(file, static_cast(el->getTag())); if (!el->serialize(file)) { return false; } @@ -59,6 +176,13 @@ std::unique_ptr Page::deserialize(FsFile& file) { if (tag == TAG_PageLine) { auto pl = PageLine::deserialize(file); page->elements.push_back(std::move(pl)); + } else if (tag == TAG_PageTableRow) { + auto tr = PageTableRow::deserialize(file); + if (!tr) { + LOG_ERR("PGE", "Deserialization failed for PageTableRow at element %u", i); + return nullptr; + } + page->elements.push_back(std::move(tr)); } else { LOG_ERR("PGE", "Deserialization failed: Unknown tag %u", tag); return nullptr; diff --git a/lib/Epub/Epub/Page.h b/lib/Epub/Epub/Page.h index 41e1db90..7fc009bf 100644 --- a/lib/Epub/Epub/Page.h +++ b/lib/Epub/Epub/Page.h @@ -8,6 +8,7 @@ enum PageElementTag : uint8_t { TAG_PageLine = 1, + TAG_PageTableRow = 2, }; // represents something that has been added to a page @@ -17,6 +18,7 @@ class PageElement { int16_t yPos; explicit PageElement(const int16_t xPos, const int16_t yPos) : xPos(xPos), yPos(yPos) {} virtual ~PageElement() = default; + virtual PageElementTag getTag() const = 0; virtual void render(GfxRenderer& renderer, int fontId, int xOffset, int yOffset) = 0; virtual bool serialize(FsFile& 
file) = 0; }; @@ -29,11 +31,42 @@ class PageLine final : public PageElement { PageLine(std::shared_ptr block, const int16_t xPos, const int16_t yPos) : PageElement(xPos, yPos), block(std::move(block)) {} const std::shared_ptr& getBlock() const { return block; } + PageElementTag getTag() const override { return TAG_PageLine; } void render(GfxRenderer& renderer, int fontId, int xOffset, int yOffset) override; bool serialize(FsFile& file) override; static std::unique_ptr deserialize(FsFile& file); }; +/// Data for a single cell within a PageTableRow. +struct PageTableCellData { + std::vector> lines; // Laid-out text lines for this cell + uint16_t columnWidth = 0; // Width of this column in pixels + uint16_t xOffset = 0; // X offset of this cell within the row +}; + +/// A table row element that renders cells in a column-aligned grid with borders. +class PageTableRow final : public PageElement { + std::vector cells; + int16_t rowHeight; // Total row height in pixels + int16_t totalWidth; // Total table width in pixels + int16_t lineHeight; // Height of one text line (for vertical positioning of cell lines) + + public: + PageTableRow(std::vector cells, int16_t rowHeight, int16_t totalWidth, int16_t lineHeight, + int16_t xPos, int16_t yPos) + : PageElement(xPos, yPos), + cells(std::move(cells)), + rowHeight(rowHeight), + totalWidth(totalWidth), + lineHeight(lineHeight) {} + + int16_t getHeight() const { return rowHeight; } + PageElementTag getTag() const override { return TAG_PageTableRow; } + void render(GfxRenderer& renderer, int fontId, int xOffset, int yOffset) override; + bool serialize(FsFile& file) override; + static std::unique_ptr deserialize(FsFile& file); +}; + class Page { public: // the list of block index and line numbers on this page diff --git a/lib/Epub/Epub/ParsedText.cpp b/lib/Epub/Epub/ParsedText.cpp index 7020cc92..f4a0b51c 100644 --- a/lib/Epub/Epub/ParsedText.cpp +++ b/lib/Epub/Epub/ParsedText.cpp @@ -62,6 +62,13 @@ void 
ParsedText::addWord(std::string word, const EpdFontFamily::Style fontStyle, } wordStyles.push_back(combinedStyle); wordContinues.push_back(attachToPrevious); + forceBreakAfter.push_back(false); +} + +void ParsedText::addLineBreak() { + if (!words.empty()) { + forceBreakAfter.back() = true; + } } // Consumes data to minimize memory usage @@ -148,6 +155,11 @@ std::vector ParsedText::computeLineBreaks(const GfxRenderer& renderer, c const int effectivePageWidth = i == 0 ? pageWidth - firstLineIndent : pageWidth; for (size_t j = i; j < totalWordCount; ++j) { + // If the previous word has a forced line break, this line cannot include word j + if (j > static_cast(i) && !forceBreakAfter.empty() && forceBreakAfter[j - 1]) { + break; + } + // Add space before word j, unless it's the first word on the line or a continuation const int gap = j > static_cast(i) && !continuesVec[j] ? spaceWidth : 0; currlen += wordWidths[j] + gap; @@ -156,8 +168,11 @@ std::vector ParsedText::computeLineBreaks(const GfxRenderer& renderer, c break; } - // Cannot break after word j if the next word attaches to it (continuation group) - if (j + 1 < totalWordCount && continuesVec[j + 1]) { + // Forced line break after word j overrides continuation (must end line here) + const bool mustBreakHere = !forceBreakAfter.empty() && forceBreakAfter[j]; + + // Cannot break after word j if the next word attaches to it (unless forced) + if (!mustBreakHere && j + 1 < totalWordCount && continuesVec[j + 1]) { continue; } @@ -180,6 +195,11 @@ std::vector ParsedText::computeLineBreaks(const GfxRenderer& renderer, c dp[i] = cost; ans[i] = j; // j is the index of the last word in this optimal line } + + // After evaluating cost, enforce forced break - no more words on this line + if (mustBreakHere) { + break; + } } // Handle oversized word: if no valid configuration found, force single-word line @@ -254,6 +274,11 @@ std::vector ParsedText::computeHyphenatedLineBreaks(const GfxRenderer& r // Consume as many words as 
possible for current line, splitting when prefixes fit while (currentIndex < wordWidths.size()) { + // If the previous word has a forced line break, stop - this word starts a new line + if (currentIndex > lineStart && !forceBreakAfter.empty() && forceBreakAfter[currentIndex - 1]) { + break; + } + const bool isFirstWord = currentIndex == lineStart; const int spacing = isFirstWord || continuesVec[currentIndex] ? 0 : spaceWidth; const int candidateWidth = spacing + wordWidths[currentIndex]; @@ -262,6 +287,11 @@ std::vector ParsedText::computeHyphenatedLineBreaks(const GfxRenderer& r if (lineWidth + candidateWidth <= effectivePageWidth) { lineWidth += candidateWidth; ++currentIndex; + + // If the word we just added has a forced break, end this line now + if (!forceBreakAfter.empty() && forceBreakAfter[currentIndex - 1]) { + break; + } continue; } @@ -287,7 +317,12 @@ std::vector ParsedText::computeHyphenatedLineBreaks(const GfxRenderer& r // Don't break before a continuation word (e.g., orphaned "?" after "question"). // Backtrack to the start of the continuation group so the whole group moves to the next line. + // But don't backtrack past a forced break point. while (currentIndex > lineStart + 1 && currentIndex < wordWidths.size() && continuesVec[currentIndex]) { + // Don't backtrack past a forced break + if (!forceBreakAfter.empty() && forceBreakAfter[currentIndex - 1]) { + break; + } --currentIndex; } @@ -361,6 +396,13 @@ bool ParsedText::hyphenateWordAtIndex(const size_t wordIndex, const int availabl wordContinues[wordIndex] = false; wordContinues.insert(wordContinues.begin() + wordIndex + 1, originalContinuedToNext); + // Forced break belongs to the original whole word; transfer it to the remainder (last part). 
+ if (!forceBreakAfter.empty()) { + const bool originalForceBreak = forceBreakAfter[wordIndex]; + forceBreakAfter[wordIndex] = false; // prefix doesn't force break + forceBreakAfter.insert(forceBreakAfter.begin() + wordIndex + 1, originalForceBreak); + } + // Update cached widths to reflect the new prefix/remainder pairing. wordWidths[wordIndex] = static_cast(chosenWidth); const uint16_t remainderWidth = measureWordWidth(renderer, fontId, remainder, style); @@ -447,3 +489,22 @@ void ParsedText::extractLine(const size_t breakIndex, const int pageWidth, const processLine( std::make_shared(std::move(lineWords), std::move(lineXPos), std::move(lineWordStyles), blockStyle)); } + +uint16_t ParsedText::getNaturalWidth(const GfxRenderer& renderer, const int fontId) const { + if (words.empty()) { + return 0; + } + + const int spaceWidth = renderer.getSpaceWidth(fontId); + int totalWidth = 0; + + for (size_t i = 0; i < words.size(); ++i) { + totalWidth += measureWordWidth(renderer, fontId, words[i], wordStyles[i]); + // Add a space before this word unless it's the first word or a continuation + if (i > 0 && !wordContinues[i]) { + totalWidth += spaceWidth; + } + } + + return static_cast(std::min(totalWidth, static_cast(UINT16_MAX))); +} diff --git a/lib/Epub/Epub/ParsedText.h b/lib/Epub/Epub/ParsedText.h index 39e6bfe8..92823f60 100644 --- a/lib/Epub/Epub/ParsedText.h +++ b/lib/Epub/Epub/ParsedText.h @@ -15,7 +15,8 @@ class GfxRenderer; class ParsedText { std::vector words; std::vector wordStyles; - std::vector wordContinues; // true = word attaches to previous (no space before it) + std::vector wordContinues; // true = word attaches to previous (no space before it) + std::vector forceBreakAfter; // true = mandatory line break after this word (e.g.
in table cells) BlockStyle blockStyle; bool extraParagraphSpacing; bool hyphenationEnabled; @@ -40,6 +41,10 @@ class ParsedText { ~ParsedText() = default; void addWord(std::string word, EpdFontFamily::Style fontStyle, bool underline = false, bool attachToPrevious = false); + + /// Mark a forced line break after the last word (e.g. for
within table cells). + /// If no words have been added yet, this is a no-op. + void addLineBreak(); void setBlockStyle(const BlockStyle& blockStyle) { this->blockStyle = blockStyle; } BlockStyle& getBlockStyle() { return blockStyle; } size_t size() const { return words.size(); } @@ -47,4 +52,9 @@ class ParsedText { void layoutAndExtractLines(const GfxRenderer& renderer, int fontId, uint16_t viewportWidth, const std::function)>& processLine, bool includeLastLine = true); + + /// Returns the "natural" width of the content if it were laid out on a single line + /// (sum of word widths + space widths between non-continuation words). + /// Used by table layout to determine column widths before line-breaking. + uint16_t getNaturalWidth(const GfxRenderer& renderer, int fontId) const; }; \ No newline at end of file diff --git a/lib/Epub/Epub/TableData.h b/lib/Epub/Epub/TableData.h new file mode 100644 index 00000000..c1b89b70 --- /dev/null +++ b/lib/Epub/Epub/TableData.h @@ -0,0 +1,29 @@ +#pragma once + +#include +#include + +#include "ParsedText.h" +#include "css/CssStyle.h" + +/// A single cell in a table row. +struct TableCell { + std::unique_ptr content; + bool isHeader = false; // true for , false for + int colspan = 1; // number of logical columns this cell spans + CssLength widthHint; // width hint from HTML attribute or CSS (if hasWidthHint) + bool hasWidthHint = false; +}; + +/// A single row in a table. +struct TableRow { + std::vector cells; +}; + +/// Buffered table data collected during SAX parsing. +/// The entire table must be buffered before layout because column widths +/// depend on content across all rows. 
+struct TableData { + std::vector rows; + std::vector colWidthHints; // width hints from tags, indexed by logical column +}; diff --git a/lib/Epub/Epub/css/CssParser.cpp b/lib/Epub/Epub/css/CssParser.cpp index 0a11e80e..023b05bb 100644 --- a/lib/Epub/Epub/css/CssParser.cpp +++ b/lib/Epub/Epub/css/CssParser.cpp @@ -413,6 +413,9 @@ CssStyle CssParser::parseDeclarations(const std::string& declBlock) { style.defined.paddingTop = style.defined.paddingRight = style.defined.paddingBottom = style.defined.paddingLeft = 1; } + } else if (propName == "width") { + style.width = interpretLength(propValue); + style.defined.width = 1; } } diff --git a/lib/Epub/Epub/css/CssStyle.h b/lib/Epub/Epub/css/CssStyle.h index b90fa7ab..d2c2f47d 100644 --- a/lib/Epub/Epub/css/CssStyle.h +++ b/lib/Epub/Epub/css/CssStyle.h @@ -69,6 +69,7 @@ struct CssPropertyFlags { uint16_t paddingBottom : 1; uint16_t paddingLeft : 1; uint16_t paddingRight : 1; + uint16_t width : 1; CssPropertyFlags() : textAlign(0), @@ -83,17 +84,19 @@ struct CssPropertyFlags { paddingTop(0), paddingBottom(0), paddingLeft(0), - paddingRight(0) {} + paddingRight(0), + width(0) {} [[nodiscard]] bool anySet() const { return textAlign || fontStyle || fontWeight || textDecoration || textIndent || marginTop || marginBottom || - marginLeft || marginRight || paddingTop || paddingBottom || paddingLeft || paddingRight; + marginLeft || marginRight || paddingTop || paddingBottom || paddingLeft || paddingRight || width; } void clearAll() { textAlign = fontStyle = fontWeight = textDecoration = textIndent = 0; marginTop = marginBottom = marginLeft = marginRight = 0; paddingTop = paddingBottom = paddingLeft = paddingRight = 0; + width = 0; } }; @@ -115,6 +118,7 @@ struct CssStyle { CssLength paddingBottom; // Padding after CssLength paddingLeft; // Padding left CssLength paddingRight; // Padding right + CssLength width; // Element width (used for table columns/cells) CssPropertyFlags defined; // Tracks which properties were explicitly set 
@@ -173,6 +177,10 @@ struct CssStyle { paddingRight = base.paddingRight; defined.paddingRight = 1; } + if (base.hasWidth()) { + width = base.width; + defined.width = 1; + } } [[nodiscard]] bool hasTextAlign() const { return defined.textAlign; } @@ -188,6 +196,7 @@ struct CssStyle { [[nodiscard]] bool hasPaddingBottom() const { return defined.paddingBottom; } [[nodiscard]] bool hasPaddingLeft() const { return defined.paddingLeft; } [[nodiscard]] bool hasPaddingRight() const { return defined.paddingRight; } + [[nodiscard]] bool hasWidth() const { return defined.width; } void reset() { textAlign = CssTextAlign::Left; @@ -197,6 +206,7 @@ struct CssStyle { textIndent = CssLength{}; marginTop = marginBottom = marginLeft = marginRight = CssLength{}; paddingTop = paddingBottom = paddingLeft = paddingRight = CssLength{}; + width = CssLength{}; defined.clearAll(); } }; diff --git a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp index e5512472..29210ba9 100644 --- a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp +++ b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp @@ -5,6 +5,8 @@ #include #include +#include + #include "../Page.h" #include "../htmlEntities.h" @@ -32,8 +34,30 @@ constexpr int NUM_IMAGE_TAGS = sizeof(IMAGE_TAGS) / sizeof(IMAGE_TAGS[0]); const char* SKIP_TAGS[] = {"head"}; constexpr int NUM_SKIP_TAGS = sizeof(SKIP_TAGS) / sizeof(SKIP_TAGS[0]); +// Table tags that are transparent containers (just depth tracking, no special handling) +const char* TABLE_TRANSPARENT_TAGS[] = {"thead", "tbody", "tfoot", "colgroup"}; +constexpr int NUM_TABLE_TRANSPARENT_TAGS = sizeof(TABLE_TRANSPARENT_TAGS) / sizeof(TABLE_TRANSPARENT_TAGS[0]); + +// Table tags to skip entirely (their children produce no useful output) +const char* TABLE_SKIP_TAGS[] = {"caption"}; +constexpr int NUM_TABLE_SKIP_TAGS = sizeof(TABLE_SKIP_TAGS) / sizeof(TABLE_SKIP_TAGS[0]); + bool isWhitespace(const char c) { return c == ' ' || c == '\r' || c == '\n' || c 
== '\t'; } +// Parse an HTML width attribute value into a CssLength. +// "200" -> 200px, "50%" -> 50 percent. Returns false if the value can't be parsed. +static bool parseHtmlWidthAttr(const char* value, CssLength& out) { + char* end = nullptr; + const float num = strtof(value, &end); + if (end == value || num < 0) return false; + if (*end == '%') { + out = CssLength(num, CssUnit::Percent); + } else { + out = CssLength(num, CssUnit::Pixels); + } + return true; +} + // given the start and end of a tag, check to see if it matches a known tag bool matches(const char* tag_name, const char* possible_tags[], const int possible_tag_count) { for (int i = 0; i < possible_tag_count; i++) { @@ -91,13 +115,37 @@ void ChapterHtmlSlimParser::flushPartWordBuffer() { // flush the buffer partWordBuffer[partWordBufferIndex] = '\0'; - currentTextBlock->addWord(partWordBuffer, fontStyle, false, nextWordContinues); + + // Handle double-encoded   entities (e.g. &nbsp; in source -> literal " " after + // XML parsing). Common in Wikipedia and other generated EPUBs. Replace with a space so the text + // renders cleanly. The space stays within the word, preserving non-breaking behavior. + std::string flushedWord(partWordBuffer); + size_t entityPos = 0; + while ((entityPos = flushedWord.find(" ", entityPos)) != std::string::npos) { + flushedWord.replace(entityPos, 6, " "); + entityPos += 1; + } + + currentTextBlock->addWord(flushedWord, fontStyle, false, nextWordContinues); partWordBufferIndex = 0; nextWordContinues = false; } // start a new text block if needed void ChapterHtmlSlimParser::startNewTextBlock(const BlockStyle& blockStyle) { + // When inside a table cell, don't lay out to the page -- insert a forced line break + // within the cell's ParsedText so that block elements (p, div, br) create visual breaks. 
+ if (inTable) { + if (partWordBufferIndex > 0) { + flushPartWordBuffer(); + } + if (currentTextBlock && !currentTextBlock->isEmpty()) { + currentTextBlock->addLineBreak(); + } + nextWordContinues = false; + return; + } + nextWordContinues = false; // New block = new paragraph, no continuation if (currentTextBlock) { // already have a text block running and it is empty - just reuse it @@ -140,21 +188,184 @@ void XMLCALL ChapterHtmlSlimParser::startElement(void* userData, const XML_Char* centeredBlockStyle.textAlignDefined = true; centeredBlockStyle.alignment = CssTextAlign::Center; - // Special handling for tables - show placeholder text instead of dropping silently + // --- Table handling --- if (strcmp(name, "table") == 0) { - // Add placeholder text - self->startNewTextBlock(centeredBlockStyle); + if (self->inTable) { + // Nested table: skip it entirely for v1 + self->skipUntilDepth = self->depth; + self->depth += 1; + return; + } + + // Flush any pending content before the table + if (self->currentTextBlock && !self->currentTextBlock->isEmpty()) { + self->makePages(); + } + + self->inTable = true; + self->tableData.reset(new TableData()); + + // Create a safe empty currentTextBlock so character data outside cells + // (e.g. 
whitespace between tags) doesn't crash + auto tableBlockStyle = BlockStyle(); + tableBlockStyle.alignment = CssTextAlign::Left; + self->currentTextBlock.reset(new ParsedText(self->extraParagraphSpacing, self->hyphenationEnabled, tableBlockStyle)); - self->italicUntilDepth = min(self->italicUntilDepth, self->depth); - // Advance depth before processing character data (like you would for an element with text) self->depth += 1; - self->characterData(userData, "[Table omitted]", strlen("[Table omitted]")); - - // Skip table contents (skip until parent as we pre-advanced depth above) - self->skipUntilDepth = self->depth - 1; return; } + // Table structure tags (only when inside a table) + if (self->inTable) { + if (strcmp(name, "tr") == 0) { + self->tableData->rows.push_back(TableRow()); + self->depth += 1; + return; + } + + // — capture width hint for column sizing + if (strcmp(name, "col") == 0) { + CssLength widthHint; + bool hasHint = false; + + // Parse HTML width attribute + if (atts != nullptr) { + for (int i = 0; atts[i]; i += 2) { + if (strcmp(atts[i], "width") == 0) { + hasHint = parseHtmlWidthAttr(atts[i + 1], widthHint); + break; + } + } + } + + // CSS width (inline style) overrides HTML attribute + if (self->cssParser) { + std::string styleAttr; + if (atts != nullptr) { + for (int i = 0; atts[i]; i += 2) { + if (strcmp(atts[i], "style") == 0) { + styleAttr = atts[i + 1]; + break; + } + } + } + if (!styleAttr.empty()) { + CssStyle inlineStyle = CssParser::parseInlineStyle(styleAttr); + if (inlineStyle.hasWidth()) { + widthHint = inlineStyle.width; + hasHint = true; + } + } + } + + if (hasHint) { + self->tableData->colWidthHints.push_back(widthHint); + } else { + // Push a zero-value placeholder to maintain index alignment + self->tableData->colWidthHints.push_back(CssLength()); + } + + self->depth += 1; + return; + } + + if (strcmp(name, "td") == 0 || strcmp(name, "th") == 0) { + const bool isHeader = strcmp(name, "th") == 0; + + // Parse colspan and width 
attributes + int colspan = 1; + CssLength cellWidthHint; + bool hasCellWidthHint = false; + std::string cellStyleAttr; + + if (atts != nullptr) { + for (int i = 0; atts[i]; i += 2) { + if (strcmp(atts[i], "colspan") == 0) { + colspan = atoi(atts[i + 1]); + if (colspan < 1) colspan = 1; + } else if (strcmp(atts[i], "width") == 0) { + hasCellWidthHint = parseHtmlWidthAttr(atts[i + 1], cellWidthHint); + } else if (strcmp(atts[i], "style") == 0) { + cellStyleAttr = atts[i + 1]; + } + } + } + + // CSS width (inline style or stylesheet) overrides HTML attribute + if (self->cssParser) { + std::string classAttr; + if (atts != nullptr) { + for (int i = 0; atts[i]; i += 2) { + if (strcmp(atts[i], "class") == 0) { + classAttr = atts[i + 1]; + break; + } + } + } + CssStyle cellCssStyle = self->cssParser->resolveStyle(name, classAttr); + if (!cellStyleAttr.empty()) { + CssStyle inlineStyle = CssParser::parseInlineStyle(cellStyleAttr); + cellCssStyle.applyOver(inlineStyle); + } + if (cellCssStyle.hasWidth()) { + cellWidthHint = cellCssStyle.width; + hasCellWidthHint = true; + } + } + + // Ensure there's a row to add cells to + if (self->tableData->rows.empty()) { + self->tableData->rows.push_back(TableRow()); + } + + // Create a new ParsedText for this cell (characterData will flow into it) + auto cellBlockStyle = BlockStyle(); + cellBlockStyle.alignment = CssTextAlign::Left; + cellBlockStyle.textAlignDefined = true; + // Explicitly disable paragraph indent for table cells + cellBlockStyle.textIndent = 0; + cellBlockStyle.textIndentDefined = true; + self->currentTextBlock.reset( + new ParsedText(self->extraParagraphSpacing, self->hyphenationEnabled, cellBlockStyle)); + self->nextWordContinues = false; + + // Track the cell + auto& currentRow = self->tableData->rows.back(); + currentRow.cells.push_back(TableCell()); + currentRow.cells.back().isHeader = isHeader; + currentRow.cells.back().colspan = colspan; + if (hasCellWidthHint) { + currentRow.cells.back().widthHint = 
cellWidthHint; + currentRow.cells.back().hasWidthHint = true; + } + + // Apply bold for header cells + if (isHeader) { + self->boldUntilDepth = std::min(self->boldUntilDepth, self->depth); + self->updateEffectiveInlineStyle(); + } + + self->depth += 1; + return; + } + + // Transparent table container tags + if (matches(name, TABLE_TRANSPARENT_TAGS, NUM_TABLE_TRANSPARENT_TAGS)) { + self->depth += 1; + return; + } + + // Skip colgroup, col, caption + if (matches(name, TABLE_SKIP_TAGS, NUM_TABLE_SKIP_TAGS)) { + self->skipUntilDepth = self->depth; + self->depth += 1; + return; + } + + // Other tags inside table cells (p, div, span, b, i, etc.) fall through + // to the normal handling below. startNewTextBlock is a no-op when inTable. + } + if (matches(name, IMAGE_TAGS, NUM_IMAGE_TAGS)) { // TODO: Start processing image tags std::string alt = "[Image]"; @@ -408,7 +619,8 @@ void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char // There should be enough here to build out 1-2 full pages and doing this will free up a lot of // memory. // Spotted when reading Intermezzo, there are some really long text blocks in there. - if (self->currentTextBlock->size() > 750) { + // Skip this when inside a table - cell content is buffered for later layout. 
+ if (!self->inTable && self->currentTextBlock->size() > 750) { LOG_DBG("EHP", "Text block too long, splitting into multiple pages"); self->currentTextBlock->layoutAndExtractLines( self->renderer, self->fontId, self->viewportWidth, @@ -446,15 +658,17 @@ void XMLCALL ChapterHtmlSlimParser::endElement(void* userData, const XML_Char* n const bool styleWillChange = willPopStyleStack || willClearBold || willClearItalic || willClearUnderline; const bool headerOrBlockTag = isHeaderOrBlock(name); + const bool isTableCellTag = strcmp(name, "td") == 0 || strcmp(name, "th") == 0; + const bool isTableTag = strcmp(name, "table") == 0; // Flush buffer with current style BEFORE any style changes if (self->partWordBufferIndex > 0) { // Flush if style will change OR if we're closing a block/structural element - const bool isInlineTag = !headerOrBlockTag && strcmp(name, "table") != 0 && + const bool isInlineTag = !headerOrBlockTag && !isTableTag && !isTableCellTag && !matches(name, IMAGE_TAGS, NUM_IMAGE_TAGS) && self->depth != 1; const bool shouldFlush = styleWillChange || headerOrBlockTag || matches(name, BOLD_TAGS, NUM_BOLD_TAGS) || matches(name, ITALIC_TAGS, NUM_ITALIC_TAGS) || - matches(name, UNDERLINE_TAGS, NUM_UNDERLINE_TAGS) || strcmp(name, "table") == 0 || + matches(name, UNDERLINE_TAGS, NUM_UNDERLINE_TAGS) || isTableTag || isTableCellTag || matches(name, IMAGE_TAGS, NUM_IMAGE_TAGS) || self->depth == 1; if (shouldFlush) { @@ -466,6 +680,57 @@ void XMLCALL ChapterHtmlSlimParser::endElement(void* userData, const XML_Char* n } } + // --- Table cell/row/table close handling --- + if (self->inTable) { + if (isTableCellTag) { + // Save the current cell content into the table data + if (self->tableData && !self->tableData->rows.empty()) { + auto& currentRow = self->tableData->rows.back(); + if (!currentRow.cells.empty()) { + currentRow.cells.back().content = std::move(self->currentTextBlock); + } + } + + // Create a safe empty ParsedText so character data between cells doesn't 
crash + auto safeBlockStyle = BlockStyle(); + safeBlockStyle.alignment = CssTextAlign::Left; + self->currentTextBlock.reset( + new ParsedText(self->extraParagraphSpacing, self->hyphenationEnabled, safeBlockStyle)); + self->nextWordContinues = false; + } + + if (isTableTag) { + // Process the entire buffered table + self->depth -= 1; + + // Clean up style state for this depth + if (self->skipUntilDepth == self->depth) self->skipUntilDepth = INT_MAX; + if (self->boldUntilDepth == self->depth) self->boldUntilDepth = INT_MAX; + if (self->italicUntilDepth == self->depth) self->italicUntilDepth = INT_MAX; + if (self->underlineUntilDepth == self->depth) self->underlineUntilDepth = INT_MAX; + if (!self->inlineStyleStack.empty() && self->inlineStyleStack.back().depth == self->depth) { + self->inlineStyleStack.pop_back(); + self->updateEffectiveInlineStyle(); + } + + self->processTable(); + + self->inTable = false; + self->tableData.reset(); + + // Restore a fresh text block for content after the table + auto paragraphAlignmentBlockStyle = BlockStyle(); + paragraphAlignmentBlockStyle.textAlignDefined = true; + const auto align = (self->paragraphAlignment == static_cast(CssTextAlign::None)) + ? 
CssTextAlign::Justify + : static_cast(self->paragraphAlignment); + paragraphAlignmentBlockStyle.alignment = align; + self->currentTextBlock.reset( + new ParsedText(self->extraParagraphSpacing, self->hyphenationEnabled, paragraphAlignmentBlockStyle)); + return; // depth already decremented, skip the normal endElement cleanup + } + } + self->depth -= 1; // Leaving skip @@ -653,3 +918,335 @@ void ChapterHtmlSlimParser::makePages() { currentPageNextY += lineHeight / 2; } } + +// --------------------------------------------------------------------------- +// Table processing +// --------------------------------------------------------------------------- + +// Cell padding in pixels (horizontal space between grid line and cell text) +static constexpr int TABLE_CELL_PAD_X = 4; +// Vertical cell padding — asymmetric because font metrics include internal leading (whitespace +// above glyphs), so the top already has built-in visual space. Less explicit padding on top, +// more on bottom, produces visually balanced results. 
static constexpr int TABLE_CELL_PAD_TOP = 1;
static constexpr int TABLE_CELL_PAD_BOTTOM = 3;
// Minimum usable column width in pixels (below this text is unreadable)
static constexpr int TABLE_MIN_COL_WIDTH = 30;
// Grid line width in pixels
static constexpr int TABLE_GRID_LINE_PX = 1;

// Appends one laid-out table row to the page under construction.
//
// The row is positioned at x = 0 and at the current vertical cursor (currentPageNextY), and the
// cursor then advances by the row height. When the row does not fit in the space left below the
// cursor, the current page is handed to completePageFn and the row opens a fresh page.
// A null row is ignored.
void ChapterHtmlSlimParser::addTableRowToPage(std::shared_ptr<PageTableRow> row) {
  if (!row) {
    return;
  }

  if (!currentPage) {
    currentPage.reset(new Page());
    currentPageNextY = 0;
  }

  const int16_t rowH = row->getHeight();

  // Break the page only when the cursor has already moved down. A row taller than the viewport
  // gains nothing from a new page when it already starts at y == 0; breaking there would only
  // emit a blank page in front of it.
  if (currentPageNextY > 0 && currentPageNextY + rowH > viewportHeight) {
    completePageFn(std::move(currentPage));
    currentPage.reset(new Page());
    currentPageNextY = 0;
  }

  row->xPos = 0;
  row->yPos = currentPageNextY;
  currentPage->elements.push_back(std::move(row));
  currentPageNextY += rowH;
}

// Lays out the buffered table (tableData) and appends it to the page stream row by row.
//
// Steps:
//   1. Determine the logical column count from the per-row sum of colspans.
//   2. Measure the natural width of every column (colspan == 1 cells only).
//   3. Resolve width hints and distribute the viewport width across the columns.
//   4. Wrap each cell's text to its column width, build a PageTableRow per table row and add
//      it with addTableRowToPage().
// Does nothing when there is no table data, no rows, or no columns.
void ChapterHtmlSlimParser::processTable() {
  if (!tableData || tableData->rows.empty()) {
    return;
  }

  if (!currentPage) {
    currentPage.reset(new Page());
    currentPageNextY = 0;
  }

  const int lh = static_cast<int>(renderer.getLineHeight(fontId) * lineCompression);

  // A colspan below 1 is treated as 1. Cast to size_t unclamped, 0 would leave the column cursor
  // stuck (cells drawn on top of each other) and a negative value would wrap to a huge column
  // count. NOTE(review): the element parser may already clamp this; the guard is then a no-op.
  const auto spanOf = [](const int colspan) { return colspan < 1 ? 1 : colspan; };

  // 1. Determine logical column count using colspan.
  //    Each cell occupies colspan logical columns. The total for a row is the sum of colspans.
  size_t numCols = 0;
  for (const auto& row : tableData->rows) {
    size_t rowLogicalCols = 0;
    for (const auto& cell : row.cells) {
      rowLogicalCols += static_cast<size_t>(spanOf(cell.colspan));
    }
    numCols = std::max(numCols, rowLogicalCols);
  }

  if (numCols == 0) {
    return;
  }

  // 2. Measure natural width of each cell and compute per-column max natural width.
  //    Only non-spanning cells (colspan == 1) contribute to individual column widths.
  //    Spanning cells use the combined width of their spanned columns.
  std::vector<uint16_t> colNaturalWidth(numCols, 0);

  for (const auto& row : tableData->rows) {
    size_t logicalCol = 0;
    for (const auto& cell : row.cells) {
      const int cs = spanOf(cell.colspan);
      if (cs == 1 && logicalCol < numCols && cell.content && !cell.content->isEmpty()) {
        const uint16_t w = cell.content->getNaturalWidth(renderer, fontId);
        if (w > colNaturalWidth[logicalCol]) {
          colNaturalWidth[logicalCol] = w;
        }
      }
      logicalCol += static_cast<size_t>(cs);
    }
  }

  // 3. Calculate column widths to fit viewport.
  //    Available width = viewport - outer borders - internal column borders - cell padding
  const int totalGridLines = static_cast<int>(numCols) + 1;  // left + between columns + right
  const int totalPadding = static_cast<int>(numCols) * TABLE_CELL_PAD_X * 2;
  const int availableForContent = viewportWidth - totalGridLines * TABLE_GRID_LINE_PX - totalPadding;

  // 3a. Resolve width hints per column.
  //     Priority: column-level hints (tableData->colWidthHints) > max cell hint (colspan == 1 only).
  //     Percentages are relative to availableForContent.
  const float emSize = static_cast<float>(lh);
  const float containerW = static_cast<float>(std::max(availableForContent, 0));

  std::vector<int> colHintedWidth(numCols, -1);  // -1 = no hint

  // Column-level hints
  for (size_t c = 0; c < numCols && c < tableData->colWidthHints.size(); ++c) {
    const auto& hint = tableData->colWidthHints[c];
    if (hint.value > 0) {
      const int px = static_cast<int>(hint.toPixels(emSize, containerW));
      if (px > 0) {
        colHintedWidth[c] = std::max(px, TABLE_MIN_COL_WIDTH);
      }
    }
  }

  // Snapshot of the column-level hints. The per-cell pass below must be able to tell "hinted at
  // column level" (never overridden) apart from "hinted by an earlier cell" (may still grow).
  const std::vector<int> colLevelHint = colHintedWidth;

  // Per-cell width hints: only for columns without a column-level hint. The widest hint across
  // all rows wins, and hints that resolve to zero or less are ignored so that an empty hint
  // cannot pin the column to the minimum width.
  for (const auto& row : tableData->rows) {
    size_t logicalCol = 0;
    for (const auto& cell : row.cells) {
      const int cs = spanOf(cell.colspan);
      if (cs == 1 && cell.hasWidthHint && logicalCol < numCols && colLevelHint[logicalCol] < 0) {
        const int px = static_cast<int>(cell.widthHint.toPixels(emSize, containerW));
        if (px > 0) {
          colHintedWidth[logicalCol] = std::max(colHintedWidth[logicalCol], std::max(px, TABLE_MIN_COL_WIDTH));
        }
      }
      logicalCol += static_cast<size_t>(cs);
    }
  }

  // 3b. Distribute column widths: hinted columns get their hint, unhinted use auto-sizing.
  std::vector<uint16_t> colWidths(numCols, 0);

  if (availableForContent <= 0) {
    // Borders and padding alone already exceed the viewport: fall back to equal columns.
    const uint16_t equalWidth = static_cast<uint16_t>(viewportWidth / numCols);
    for (size_t c = 0; c < numCols; ++c) {
      colWidths[c] = equalWidth;
    }
  } else {
    // First, assign hinted columns and track how much space they consume
    int hintedSpaceUsed = 0;
    size_t unhintedCount = 0;
    for (size_t c = 0; c < numCols; ++c) {
      if (colHintedWidth[c] > 0) {
        hintedSpaceUsed += colHintedWidth[c];
      } else {
        unhintedCount++;
      }
    }

    // If hinted columns exceed available space, scale them down proportionally
    if (hintedSpaceUsed > availableForContent && hintedSpaceUsed > 0) {
      for (size_t c = 0; c < numCols; ++c) {
        if (colHintedWidth[c] > 0) {
          colHintedWidth[c] = colHintedWidth[c] * availableForContent / hintedSpaceUsed;
          colHintedWidth[c] = std::max(colHintedWidth[c], TABLE_MIN_COL_WIDTH);
        }
      }
      // Recalculate, because the minimum-width clamp can change the total
      hintedSpaceUsed = 0;
      for (size_t c = 0; c < numCols; ++c) {
        if (colHintedWidth[c] > 0) {
          hintedSpaceUsed += colHintedWidth[c];
        }
      }
    }

    // Assign hinted columns
    for (size_t c = 0; c < numCols; ++c) {
      if (colHintedWidth[c] > 0) {
        colWidths[c] = static_cast<uint16_t>(colHintedWidth[c]);
      }
    }

    // Distribute remaining space among unhinted columns
    const int remainingForUnhinted = std::max(availableForContent - hintedSpaceUsed, 0);

    if (unhintedCount > 0 && remainingForUnhinted > 0) {
      // Compute total natural width of unhinted columns
      int totalNaturalUnhinted = 0;
      for (size_t c = 0; c < numCols; ++c) {
        if (colHintedWidth[c] <= 0) {
          totalNaturalUnhinted += colNaturalWidth[c];
        }
      }

      if (totalNaturalUnhinted <= remainingForUnhinted) {
        // All unhinted content fits: distribute extra space equally among unhinted columns
        const int extraSpace = remainingForUnhinted - totalNaturalUnhinted;
        const int perColExtra = extraSpace / static_cast<int>(unhintedCount);
        for (size_t c = 0; c < numCols; ++c) {
          if (colHintedWidth[c] <= 0) {
            colWidths[c] = static_cast<uint16_t>(colNaturalWidth[c] + perColExtra);
          }
        }
      } else {
        // Unhinted content exceeds remaining space: two-pass fair-share among unhinted columns.
        // Pass 1 gives every column that fits within an equal share its natural width;
        // pass 2 splits what is left among the wide columns in proportion to their natural width.
        const int equalShare = remainingForUnhinted / static_cast<int>(unhintedCount);

        int spaceUsedByFitting = 0;
        int naturalOfWide = 0;
        size_t wideCount = 0;

        for (size_t c = 0; c < numCols; ++c) {
          if (colHintedWidth[c] <= 0) {
            if (static_cast<int>(colNaturalWidth[c]) <= equalShare) {
              colWidths[c] = colNaturalWidth[c];
              spaceUsedByFitting += colNaturalWidth[c];
            } else {
              naturalOfWide += colNaturalWidth[c];
              wideCount++;
            }
          }
        }

        const int wideSpace = remainingForUnhinted - spaceUsedByFitting;
        for (size_t c = 0; c < numCols; ++c) {
          if (colHintedWidth[c] <= 0 && static_cast<int>(colNaturalWidth[c]) > equalShare) {
            if (naturalOfWide > 0 && wideCount > 1) {
              const int proportional = static_cast<int>(colNaturalWidth[c]) * wideSpace / naturalOfWide;
              colWidths[c] = static_cast<uint16_t>(std::max(proportional, TABLE_MIN_COL_WIDTH));
            } else {
              colWidths[c] = static_cast<uint16_t>(std::max(wideSpace, TABLE_MIN_COL_WIDTH));
            }
          }
        }
      }
    } else if (unhintedCount > 0) {
      // No remaining space for unhinted columns: give them minimum width
      for (size_t c = 0; c < numCols; ++c) {
        if (colHintedWidth[c] <= 0) {
          colWidths[c] = static_cast<uint16_t>(TABLE_MIN_COL_WIDTH);
        }
      }
    }
  }

  // Compute column x-offsets (cumulative: border + padding + content width + padding + border ...)
  std::vector<uint16_t> colXOffsets(numCols, 0);
  int xAccum = TABLE_GRID_LINE_PX;  // start after left border
  for (size_t c = 0; c < numCols; ++c) {
    colXOffsets[c] = static_cast<uint16_t>(xAccum);
    xAccum += TABLE_CELL_PAD_X + colWidths[c] + TABLE_CELL_PAD_X + TABLE_GRID_LINE_PX;
  }
  const int16_t totalTableWidth = static_cast<int16_t>(xAccum);

  // Helper: compute the combined content width for a cell spanning multiple columns.
  // This includes the content widths plus the internal grid lines and padding between spanned columns.
  auto spanContentWidth = [&](size_t startCol, int colspan) -> uint16_t {
    int width = 0;
    for (int s = 0; s < colspan && startCol + s < numCols; ++s) {
      width += colWidths[startCol + s];
      if (s > 0) {
        // Add internal padding and grid line between spanned columns
        width += TABLE_CELL_PAD_X * 2 + TABLE_GRID_LINE_PX;
      }
    }
    return static_cast<uint16_t>(std::max(width, 0));
  };

  // Helper: compute the full cell width (including padding on both sides) for a spanning cell.
  auto spanFullCellWidth = [&](size_t startCol, int colspan) -> uint16_t {
    if (colspan <= 0 || startCol >= numCols) return 0;
    const size_t endCol = std::min(startCol + static_cast<size_t>(colspan), numCols) - 1;
    // From the left edge of startCol's cell to the right edge of endCol's cell
    const int leftEdge = colXOffsets[startCol];
    const int rightEdge = colXOffsets[endCol] + TABLE_CELL_PAD_X + colWidths[endCol] + TABLE_CELL_PAD_X;
    return static_cast<uint16_t>(rightEdge - leftEdge);
  };

  // 4. Lay out each row: map cells to logical columns, create PageTableRow
  for (auto& row : tableData->rows) {
    // Build cell data for this row, one entry per CELL (not per logical column).
    // Each PageTableCellData gets the correct x-offset and combined column width.
    std::vector<PageTableCellData> cellDataVec;
    size_t maxLinesInRow = 1;
    size_t logicalCol = 0;

    for (size_t ci = 0; ci < row.cells.size() && logicalCol < numCols; ++ci) {
      auto& cell = row.cells[ci];
      const int cs = spanOf(cell.colspan);

      PageTableCellData cellData;
      cellData.xOffset = colXOffsets[logicalCol];
      cellData.columnWidth = spanFullCellWidth(logicalCol, cs);

      if (cell.content && !cell.content->isEmpty()) {
        // Center-align cells that span the full table width (common for section headers/titles)
        if (cs >= static_cast<int>(numCols)) {
          BlockStyle centeredStyle = cell.content->getBlockStyle();
          centeredStyle.alignment = CssTextAlign::Center;
          centeredStyle.textAlignDefined = true;
          cell.content->setBlockStyle(centeredStyle);
        }

        const uint16_t contentWidth = spanContentWidth(logicalCol, cs);
        std::vector<std::shared_ptr<TextBlock>> cellLines;

        cell.content->layoutAndExtractLines(
            renderer, fontId, contentWidth,
            [&cellLines](const std::shared_ptr<TextBlock>& textBlock) { cellLines.push_back(textBlock); });

        if (cellLines.size() > maxLinesInRow) {
          maxLinesInRow = cellLines.size();
        }
        cellData.lines = std::move(cellLines);
      }

      cellDataVec.push_back(std::move(cellData));
      logicalCol += static_cast<size_t>(cs);
    }

    // Fill remaining logical columns with empty cells (rows shorter than numCols)
    while (logicalCol < numCols) {
      PageTableCellData emptyCell;
      emptyCell.xOffset = colXOffsets[logicalCol];
      emptyCell.columnWidth = static_cast<uint16_t>(TABLE_CELL_PAD_X + colWidths[logicalCol] + TABLE_CELL_PAD_X);
      cellDataVec.push_back(std::move(emptyCell));
      logicalCol++;
    }

    // Row height = max lines * lineHeight + top/bottom border + asymmetric vertical padding
    const int16_t rowHeight = static_cast<int16_t>(static_cast<int>(maxLinesInRow) * lh + 2 * TABLE_GRID_LINE_PX +
                                                   TABLE_CELL_PAD_TOP + TABLE_CELL_PAD_BOTTOM);

    auto pageTableRow = std::make_shared<PageTableRow>(std::move(cellDataVec), rowHeight, totalTableWidth,
                                                       static_cast<int16_t>(lh), 0, 0);

    addTableRowToPage(std::move(pageTableRow));
  }

  // Add a small gap after the table
  if (extraParagraphSpacing) {
    currentPageNextY += lh / 2;
  }
}
diff --git a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.h b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.h index 761ee1d5..c42b9348 100644 --- a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.h +++ b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.h @@ -7,11 +7,13 @@ #include #include "../ParsedText.h" +#include "../TableData.h" #include "../blocks/TextBlock.h" #include "../css/CssParser.h" #include "../css/CssStyle.h" class Page; +class PageTableRow; class GfxRenderer; #define MAX_WORD_SIZE 200 @@ -57,10 +59,16 @@ class ChapterHtmlSlimParser { bool effectiveItalic = false; bool effectiveUnderline = false; + // Table buffering state + bool inTable = false; + std::unique_ptr tableData; + void updateEffectiveInlineStyle(); void startNewTextBlock(const BlockStyle& blockStyle); void flushPartWordBuffer(); void makePages(); + void processTable(); + void addTableRowToPage(std::shared_ptr row); // XML callbacks static void XMLCALL startElement(void* userData, const XML_Char* name, const XML_Char** atts); static void XMLCALL characterData(void* userData, const XML_Char* s, int len);