diff --git a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp index da8bbf57..7bada8f2 100644 --- a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp +++ b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp @@ -53,6 +53,10 @@ bool isHeaderOrBlock(const char* name) { return matches(name, HEADER_TAGS, NUM_HEADER_TAGS) || matches(name, BLOCK_TAGS, NUM_BLOCK_TAGS); } +bool isTableStructuralTag(const char* name) { + return strcmp(name, "table") == 0 || strcmp(name, "tr") == 0 || strcmp(name, "td") == 0 || strcmp(name, "th") == 0; +} + // Update effective bold/italic/underline based on block style and inline style stack void ChapterHtmlSlimParser::updateEffectiveInlineStyle() { // Start with block-level styles @@ -145,18 +149,66 @@ void XMLCALL ChapterHtmlSlimParser::startElement(void* userData, const XML_Char* centeredBlockStyle.textAlignDefined = true; centeredBlockStyle.alignment = CssTextAlign::Center; - // Special handling for tables - show placeholder text instead of dropping silently + // Special handling for tables/cells: flatten into per-cell paragraphs with a prefixed header. 
if (strcmp(name, "table") == 0) { - // Add placeholder text - self->startNewTextBlock(centeredBlockStyle); + // skip nested tables + if (self->tableDepth > 0) { + self->tableDepth += 1; + return; + } - self->italicUntilDepth = min(self->italicUntilDepth, self->depth); - // Advance depth before processing character data (like you would for an element with text) + if (self->partWordBufferIndex > 0) { + self->flushPartWordBuffer(); + } + self->tableDepth += 1; + self->tableRowIndex = 0; + self->tableColIndex = 0; self->depth += 1; - self->characterData(userData, "[Table omitted]", strlen("[Table omitted]")); + return; + } - // Skip table contents (skip until parent as we pre-advanced depth above) - self->skipUntilDepth = self->depth - 1; + if (self->tableDepth == 1 && strcmp(name, "tr") == 0) { + self->tableRowIndex += 1; + self->tableColIndex = 0; + self->depth += 1; + return; + } + + if (self->tableDepth == 1 && (strcmp(name, "td") == 0 || strcmp(name, "th") == 0)) { + if (self->partWordBufferIndex > 0) { + self->flushPartWordBuffer(); + } + self->tableColIndex += 1; + + auto tableCellBlockStyle = BlockStyle(); + tableCellBlockStyle.textAlignDefined = true; + const auto align = (self->paragraphAlignment == static_cast<uint8_t>(CssTextAlign::None)) + ? 
CssTextAlign::Justify + : static_cast<CssTextAlign>(self->paragraphAlignment); + tableCellBlockStyle.alignment = align; + self->startNewTextBlock(tableCellBlockStyle); + + const std::string headerText = + "Tab Row " + std::to_string(self->tableRowIndex) + ", Cell " + std::to_string(self->tableColIndex) + ":"; + StyleStackEntry headerStyle; + headerStyle.depth = self->depth; + headerStyle.hasBold = true; + headerStyle.bold = false; + headerStyle.hasItalic = true; + headerStyle.italic = true; + headerStyle.hasUnderline = true; + headerStyle.underline = false; + self->inlineStyleStack.push_back(headerStyle); + self->updateEffectiveInlineStyle(); + self->characterData(userData, headerText.c_str(), static_cast<int>(headerText.length())); + if (self->partWordBufferIndex > 0) { + self->flushPartWordBuffer(); + } + self->nextWordContinues = false; + self->inlineStyleStack.pop_back(); + self->updateEffectiveInlineStyle(); + + self->depth += 1; return; } @@ -445,6 +497,11 @@ void XMLCALL ChapterHtmlSlimParser::startElement(void* userData, const XML_Char* void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char* s, const int len) { auto* self = static_cast<ChapterHtmlSlimParser*>(userData); + // Skip content of nested table + if (self->tableDepth > 1) { + return; + } + // Middle of skip if (self->skipUntilDepth < self->depth) { return; @@ -548,15 +605,24 @@ void XMLCALL ChapterHtmlSlimParser::endElement(void* userData, const XML_Char* n const bool styleWillChange = willPopStyleStack || willClearBold || willClearItalic || willClearUnderline; const bool headerOrBlockTag = isHeaderOrBlock(name); + const bool tableStructuralTag = isTableStructuralTag(name); + + if (self->tableDepth > 1 && strcmp(name, "table") == 0) { + // get rid of all text inside the nested table + self->partWordBufferIndex = 0; + self->tableDepth -= 1; + LOG_DBG("EHP", "nested table detected, get rid of its content"); + return; + } // Flush buffer with current style BEFORE any style changes if (self->partWordBufferIndex > 0) { 
// Flush if style will change OR if we're closing a block/structural element - const bool isInlineTag = !headerOrBlockTag && strcmp(name, "table") != 0 && - !matches(name, IMAGE_TAGS, NUM_IMAGE_TAGS) && self->depth != 1; + const bool isInlineTag = + !headerOrBlockTag && !tableStructuralTag && !matches(name, IMAGE_TAGS, NUM_IMAGE_TAGS) && self->depth != 1; const bool shouldFlush = styleWillChange || headerOrBlockTag || matches(name, BOLD_TAGS, NUM_BOLD_TAGS) || matches(name, ITALIC_TAGS, NUM_ITALIC_TAGS) || - matches(name, UNDERLINE_TAGS, NUM_UNDERLINE_TAGS) || strcmp(name, "table") == 0 || + matches(name, UNDERLINE_TAGS, NUM_UNDERLINE_TAGS) || tableStructuralTag || matches(name, IMAGE_TAGS, NUM_IMAGE_TAGS) || self->depth == 1; if (shouldFlush) { @@ -575,6 +641,21 @@ void XMLCALL ChapterHtmlSlimParser::endElement(void* userData, const XML_Char* n self->skipUntilDepth = INT_MAX; } + if (self->tableDepth == 1 && (strcmp(name, "td") == 0 || strcmp(name, "th") == 0)) { + self->nextWordContinues = false; + } + + if (self->tableDepth == 1 && (strcmp(name, "tr") == 0)) { + self->nextWordContinues = false; + } + + if (self->tableDepth == 1 && strcmp(name, "table") == 0) { + self->tableDepth -= 1; + self->tableRowIndex = 0; + self->tableColIndex = 0; + self->nextWordContinues = false; + } + // Leaving bold tag if (self->boldUntilDepth == self->depth) { self->boldUntilDepth = INT_MAX; diff --git a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.h b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.h index f9d622ba..a496e5c2 100644 --- a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.h +++ b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.h @@ -62,6 +62,9 @@ class ChapterHtmlSlimParser { bool effectiveBold = false; bool effectiveItalic = false; bool effectiveUnderline = false; + int tableDepth = 0; + int tableRowIndex = 0; + int tableColIndex = 0; void updateEffectiveInlineStyle(); void startNewTextBlock(const BlockStyle& blockStyle); diff --git a/test/epubs/test_tables.epub 
b/test/epubs/test_tables.epub new file mode 100644 index 00000000..a7a4d311 Binary files /dev/null and b/test/epubs/test_tables.epub differ