From 6e51afb9776529c67c0e79f5f8d4eacb09a413d1 Mon Sep 17 00:00:00 2001 From: Jake Kenneally Date: Fri, 13 Feb 2026 09:46:46 -0500 Subject: [PATCH] fix: Account for `nbsp;` character as non-breaking space (#757) ## Summary Closes #743. **What is the goal of this PR?** - Add back handling for HTML entities in expat. This was originally part of the code that got removed [here](https://github.com/crosspoint-reader/crosspoint-reader/pull/274) - Handle `&nbsp;` characters to resolve issue #743 **What changes are included?** - Brought back HTML entity table from previous commit and refactored it to use a static const char * table with linear lookup to reduce heap allocations. - Used `XML_SetDefaultHandlerExpand` in expat to parse out the entities correctly, without needing them defined in DOCTYPE - Added handling for `&nbsp;` so that the text stays together and doesn't break onto a new line with text separated by an `&nbsp;` ## Additional Context - This supersedes [this PR](https://github.com/crosspoint-reader/crosspoint-reader/pull/751) that simply handled `&nbsp;` as whitespace. Instead, we want that character to serve its true purpose and affect the line-breaking algorithm. - Updated my test EPUB [here](https://github.com/jdk2pq/css-test-epub) with `&nbsp;` character examples at the end of the book --- ### AI Usage While CrossPoint doesn't have restrictions on AI tools in contributing, please be transparent about their usage as it helps set the right context for reviewers. Did you use AI tools to help write this code? 
_**YES**_, Claude Code --- lib/Epub/Epub/ParsedText.cpp | 3 + lib/Epub/Epub/htmlEntities.cpp | 76 +++++++++++++++++++ lib/Epub/Epub/htmlEntities.h | 9 +++ .../Epub/parsers/ChapterHtmlSlimParser.cpp | 43 +++++++++++ lib/Epub/Epub/parsers/ChapterHtmlSlimParser.h | 1 + 5 files changed, 132 insertions(+) create mode 100644 lib/Epub/Epub/htmlEntities.cpp create mode 100644 lib/Epub/Epub/htmlEntities.h diff --git a/lib/Epub/Epub/ParsedText.cpp b/lib/Epub/Epub/ParsedText.cpp index 82ddaecd..82d0db61 100644 --- a/lib/Epub/Epub/ParsedText.cpp +++ b/lib/Epub/Epub/ParsedText.cpp @@ -32,6 +32,9 @@ void stripSoftHyphensInPlace(std::string& word) { // Returns the rendered width for a word while ignoring soft hyphen glyphs and optionally appending a visible hyphen. uint16_t measureWordWidth(const GfxRenderer& renderer, const int fontId, const std::string& word, const EpdFontFamily::Style style, const bool appendHyphen = false) { + if (word.size() == 1 && word[0] == ' ' && !appendHyphen) { + return renderer.getSpaceWidth(fontId); + } const bool hasSoftHyphen = containsSoftHyphen(word); if (!hasSoftHyphen && !appendHyphen) { return renderer.getTextWidth(fontId, word.c_str(), style); diff --git a/lib/Epub/Epub/htmlEntities.cpp b/lib/Epub/Epub/htmlEntities.cpp new file mode 100644 index 00000000..82d3819a --- /dev/null +++ b/lib/Epub/Epub/htmlEntities.cpp @@ -0,0 +1,76 @@ +// from +// https://github.com/atomic14/diy-esp32-epub-reader/blob/2c2f57fdd7e2a788d14a0bcb26b9e845a47aac42/lib/Epub/RubbishHtmlParser/htmlEntities.cpp + +#include "htmlEntities.h" + +#include + +struct EntityPair { + const char* key; + const char* value; +}; + +static const EntityPair ENTITY_LOOKUP[] = { + {""", "\""}, {"⁄", "⁄"}, {"&", "&"}, {"<", "<"}, {">", ">"}, + {"À", "À"}, {"Á", "Á"}, {"Â", "Â"}, {"Ã", "Ã"}, {"Ä", "Ä"}, + {"Å", "Å"}, {"Æ", "Æ"}, {"Ç", "Ç"}, {"È", "È"}, {"É", "É"}, + {"Ê", "Ê"}, {"Ë", "Ë"}, {"Ì", "Ì"}, {"Í", "Í"}, {"Î", "Î"}, + {"Ï", "Ï"}, {"Ð", "Ð"}, {"Ñ", "Ñ"}, {"Ò", "Ò"}, {"Ó", "Ó"}, + 
{"Ô", "Ô"}, {"Õ", "Õ"}, {"Ö", "Ö"}, {"Ø", "Ø"}, {"Ù", "Ù"}, + {"Ú", "Ú"}, {"Û", "Û"}, {"Ü", "Ü"}, {"Ý", "Ý"}, {"Þ", "Þ"}, + {"ß", "ß"}, {"à", "à"}, {"á", "á"}, {"â", "â"}, {"ã", "ã"}, + {"ä", "ä"}, {"å", "å"}, {"æ", "æ"}, {"ç", "ç"}, {"è", "è"}, + {"é", "é"}, {"ê", "ê"}, {"ë", "ë"}, {"ì", "ì"}, {"í", "í"}, + {"î", "î"}, {"ï", "ï"}, {"ð", "ð"}, {"ñ", "ñ"}, {"ò", "ò"}, + {"ó", "ó"}, {"ô", "ô"}, {"õ", "õ"}, {"ö", "ö"}, {"ø", "ø"}, + {"ù", "ù"}, {"ú", "ú"}, {"û", "û"}, {"ü", "ü"}, {"ý", "ý"}, + {"þ", "þ"}, {"ÿ", "ÿ"}, {" ", "\xC2\xA0"}, {"¡", "¡"}, {"¢", "¢"}, + {"£", "£"}, {"¤", "¤"}, {"¥", "¥"}, {"¦", "¦"}, {"§", "§"}, + {"¨", "¨"}, {"©", "©"}, {"ª", "ª"}, {"«", "«"}, {"¬", "¬"}, + {"­", "­"}, {"®", "®"}, {"¯", "¯"}, {"°", "°"}, {"±", "±"}, + {"²", "²"}, {"³", "³"}, {"´", "´"}, {"µ", "µ"}, {"¶", "¶"}, + {"¸", "¸"}, {"¹", "¹"}, {"º", "º"}, {"»", "»"}, {"¼", "¼"}, + {"½", "½"}, {"¾", "¾"}, {"¿", "¿"}, {"×", "×"}, {"÷", "÷"}, + {"∀", "∀"}, {"∂", "∂"}, {"∃", "∃"}, {"∅", "∅"}, {"∇", "∇"}, + {"∈", "∈"}, {"∉", "∉"}, {"∋", "∋"}, {"∏", "∏"}, {"∑", "∑"}, + {"−", "−"}, {"∗", "∗"}, {"√", "√"}, {"∝", "∝"}, {"∞", "∞"}, + {"∠", "∠"}, {"∧", "∧"}, {"∨", "∨"}, {"∩", "∩"}, {"∪", "∪"}, + {"∫", "∫"}, {"∴", "∴"}, {"∼", "∼"}, {"≅", "≅"}, {"≈", "≈"}, + {"≠", "≠"}, {"≡", "≡"}, {"≤", "≤"}, {"≥", "≥"}, {"⊂", "⊂"}, + {"⊃", "⊃"}, {"⊄", "⊄"}, {"⊆", "⊆"}, {"⊇", "⊇"}, {"⊕", "⊕"}, + {"⊗", "⊗"}, {"⊥", "⊥"}, {"⋅", "⋅"}, {"Α", "Α"}, {"Β", "Β"}, + {"Γ", "Γ"}, {"Δ", "Δ"}, {"Ε", "Ε"}, {"Ζ", "Ζ"}, {"Η", "Η"}, + {"Θ", "Θ"}, {"Ι", "Ι"}, {"Κ", "Κ"}, {"Λ", "Λ"}, {"Μ", "Μ"}, + {"Ν", "Ν"}, {"Ξ", "Ξ"}, {"Ο", "Ο"}, {"Π", "Π"}, {"Ρ", "Ρ"}, + {"Σ", "Σ"}, {"Τ", "Τ"}, {"Υ", "Υ"}, {"Φ", "Φ"}, {"Χ", "Χ"}, + {"Ψ", "Ψ"}, {"Ω", "Ω"}, {"α", "α"}, {"β", "β"}, {"γ", "γ"}, + {"δ", "δ"}, {"ε", "ε"}, {"ζ", "ζ"}, {"η", "η"}, {"θ", "θ"}, + {"ι", "ι"}, {"κ", "κ"}, {"λ", "λ"}, {"μ", "μ"}, {"ν", "ν"}, + {"ξ", "ξ"}, {"ο", "ο"}, {"π", "π"}, {"ρ", "ρ"}, {"ς", "ς"}, + {"σ", "σ"}, {"τ", "τ"}, {"υ", "υ"}, {"φ", "φ"}, {"χ", "χ"}, + {"ψ", 
"ψ"}, {"ω", "ω"}, {"ϑ", "ϑ"}, {"ϒ", "ϒ"}, {"ϖ", "ϖ"}, + {"Œ", "Œ"}, {"œ", "œ"}, {"Š", "Š"}, {"š", "š"}, {"Ÿ", "Ÿ"}, + {"ƒ", "ƒ"}, {"ˆ", "ˆ"}, {"˜", "˜"}, {" ", " "}, {" ", " "}, + {" ", " "}, {"‌", "‌"}, {"‍", "‍"}, {"‎", "‎"}, {"‏", "‏"}, + {"–", "–"}, {"—", "—"}, {"‘", "‘"}, {"’", "’"}, {"‚", "‚"}, + {"“", "“"}, {"”", "”"}, {"„", "„"}, {"†", "†"}, {"‡", "‡"}, + {"•", "•"}, {"…", "…"}, {"‰", "‰"}, {"′", "′"}, {"″", "″"}, + {"‹", "‹"}, {"›", "›"}, {"‾", "‾"}, {"€", "€"}, {"™", "™"}, + {"←", "←"}, {"↑", "↑"}, {"→", "→"}, {"↓", "↓"}, {"↔", "↔"}, + {"↵", "↵"}, {"⌈", "⌈"}, {"⌉", "⌉"}, {"⌊", "⌊"}, {"⌋", "⌋"}, + {"◊", "◊"}, {"♠", "♠"}, {"♣", "♣"}, {"♥", "♥"}, {"♦", "♦"}}; + +static const size_t ENTITY_LOOKUP_COUNT = sizeof(ENTITY_LOOKUP) / sizeof(ENTITY_LOOKUP[0]); + +// Lookup a single HTML entity and return its UTF-8 value +const char* lookupHtmlEntity(const char* entity, int len) { + for (size_t i = 0; i < ENTITY_LOOKUP_COUNT; i++) { + const char* key = ENTITY_LOOKUP[i].key; + const size_t keyLen = strlen(key); + if (static_cast(len) == keyLen && memcmp(entity, key, keyLen) == 0) { + return ENTITY_LOOKUP[i].value; + } + } + + return nullptr; // Entity not found +} diff --git a/lib/Epub/Epub/htmlEntities.h b/lib/Epub/Epub/htmlEntities.h new file mode 100644 index 00000000..0221195f --- /dev/null +++ b/lib/Epub/Epub/htmlEntities.h @@ -0,0 +1,9 @@ +// from +// https://github.com/atomic14/diy-esp32-epub-reader/blob/2c2f57fdd7e2a788d14a0bcb26b9e845a47aac42/lib/Epub/RubbishHtmlParser/htmlEntities.cpp + +#pragma once +#include + +// Lookup a single HTML entity (including & and ;) and return its UTF-8 value +// Returns nullptr if entity is not found +const char* lookupHtmlEntity(const char* entity, int len); diff --git a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp index 3df222a0..e5512472 100644 --- a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp +++ b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp @@ -6,6 +6,7 @@ 
#include #include "../Page.h" +#include "../htmlEntities.h" const char* HEADER_TAGS[] = {"h1", "h2", "h3", "h4", "h5", "h6"}; constexpr int NUM_HEADER_TAGS = sizeof(HEADER_TAGS) / sizeof(HEADER_TAGS[0]); @@ -359,6 +360,28 @@ void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char continue; } + // Detect U+00A0 (non-breaking space): UTF-8 encoding is 0xC2 0xA0 + // Render a visible space without allowing a line break around it. + if (static_cast(s[i]) == 0xC2 && i + 1 < len && static_cast(s[i + 1]) == 0xA0) { + // Flush any pending text so style is applied correctly. + if (self->partWordBufferIndex > 0) { + self->flushPartWordBuffer(); + } + + // Add a standalone space that attaches to the previous word. + self->partWordBuffer[0] = ' '; + self->partWordBuffer[1] = '\0'; + self->partWordBufferIndex = 1; + self->nextWordContinues = true; // Attach space to previous word (no break). + self->flushPartWordBuffer(); + + // Ensure the next real word attaches to this space (no break). 
+ self->nextWordContinues = true; + + i++; // Skip the second byte (0xA0) + continue; + } + // Skip Zero Width No-Break Space / BOM (U+FEFF) = 0xEF 0xBB 0xBF const XML_Char FEFF_BYTE_1 = static_cast(0xEF); const XML_Char FEFF_BYTE_2 = static_cast(0xBB); @@ -393,6 +416,22 @@ void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char } } +void XMLCALL ChapterHtmlSlimParser::defaultHandlerExpand(void* userData, const XML_Char* s, const int len) { + // Check if this looks like an entity reference (&...;) + if (len >= 3 && s[0] == '&' && s[len - 1] == ';') { + const char* utf8Value = lookupHtmlEntity(s, len); + if (utf8Value != nullptr) { + // Known entity: expand to its UTF-8 value + characterData(userData, utf8Value, strlen(utf8Value)); + return; + } + // Unknown entity: preserve original &...; sequence + characterData(userData, s, len); + return; + } + // Not an entity we recognize - skip it +} + void XMLCALL ChapterHtmlSlimParser::endElement(void* userData, const XML_Char* name) { auto* self = static_cast(userData); @@ -481,6 +520,10 @@ bool ChapterHtmlSlimParser::parseAndBuildPages() { return false; } + // Handle HTML entities (like  ) that aren't in XML spec or DTD + // Using DefaultHandlerExpand preserves normal entity expansion from DOCTYPE + XML_SetDefaultHandlerExpand(parser, defaultHandlerExpand); + FsFile file; if (!Storage.openFileForRead("EHP", filepath, file)) { XML_ParserFree(parser); diff --git a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.h b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.h index 909913b1..761ee1d5 100644 --- a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.h +++ b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.h @@ -64,6 +64,7 @@ class ChapterHtmlSlimParser { // XML callbacks static void XMLCALL startElement(void* userData, const XML_Char* name, const XML_Char** atts); static void XMLCALL characterData(void* userData, const XML_Char* s, int len); + static void XMLCALL defaultHandlerExpand(void* userData, const XML_Char* 
s, int len); static void XMLCALL endElement(void* userData, const XML_Char* name); public: