diff --git a/lib/Epub/Epub/ParsedText.cpp b/lib/Epub/Epub/ParsedText.cpp index 82ddaecd..82d0db61 100644 --- a/lib/Epub/Epub/ParsedText.cpp +++ b/lib/Epub/Epub/ParsedText.cpp @@ -32,6 +32,9 @@ void stripSoftHyphensInPlace(std::string& word) { // Returns the rendered width for a word while ignoring soft hyphen glyphs and optionally appending a visible hyphen. uint16_t measureWordWidth(const GfxRenderer& renderer, const int fontId, const std::string& word, const EpdFontFamily::Style style, const bool appendHyphen = false) { + if (word.size() == 1 && word[0] == ' ' && !appendHyphen) { + return renderer.getSpaceWidth(fontId); + } const bool hasSoftHyphen = containsSoftHyphen(word); if (!hasSoftHyphen && !appendHyphen) { return renderer.getTextWidth(fontId, word.c_str(), style); diff --git a/lib/Epub/Epub/htmlEntities.cpp b/lib/Epub/Epub/htmlEntities.cpp new file mode 100644 index 00000000..82d3819a --- /dev/null +++ b/lib/Epub/Epub/htmlEntities.cpp @@ -0,0 +1,76 @@ +// from +// https://github.com/atomic14/diy-esp32-epub-reader/blob/2c2f57fdd7e2a788d14a0bcb26b9e845a47aac42/lib/Epub/RubbishHtmlParser/htmlEntities.cpp + +#include "htmlEntities.h" + +#include + +struct EntityPair { + const char* key; + const char* value; +}; + +static const EntityPair ENTITY_LOOKUP[] = { + {""", "\""}, {"⁄", "⁄"}, {"&", "&"}, {"<", "<"}, {">", ">"}, + {"À", "À"}, {"Á", "Á"}, {"Â", "Â"}, {"Ã", "Ã"}, {"Ä", "Ä"}, + {"Å", "Å"}, {"Æ", "Æ"}, {"Ç", "Ç"}, {"È", "È"}, {"É", "É"}, + {"Ê", "Ê"}, {"Ë", "Ë"}, {"Ì", "Ì"}, {"Í", "Í"}, {"Î", "Î"}, + {"Ï", "Ï"}, {"Ð", "Ð"}, {"Ñ", "Ñ"}, {"Ò", "Ò"}, {"Ó", "Ó"}, + {"Ô", "Ô"}, {"Õ", "Õ"}, {"Ö", "Ö"}, {"Ø", "Ø"}, {"Ù", "Ù"}, + {"Ú", "Ú"}, {"Û", "Û"}, {"Ü", "Ü"}, {"Ý", "Ý"}, {"Þ", "Þ"}, + {"ß", "ß"}, {"à", "à"}, {"á", "á"}, {"â", "â"}, {"ã", "ã"}, + {"ä", "ä"}, {"å", "å"}, {"æ", "æ"}, {"ç", "ç"}, {"è", "è"}, + {"é", "é"}, {"ê", "ê"}, {"ë", "ë"}, {"ì", "ì"}, {"í", "í"}, + {"î", "î"}, {"ï", "ï"}, {"ð", "ð"}, {"ñ", "ñ"}, {"ò", "ò"}, + {"ó", "ó"}, {"ô", "ô"}, {"õ", "õ"}, {"ö", "ö"}, {"ø", "ø"}, + {"ù", "ù"}, {"ú", "ú"}, {"û", "û"}, {"ü", "ü"}, {"ý", "ý"}, + {"þ", "þ"}, {"ÿ", "ÿ"}, {" ", "\xC2\xA0"}, {"¡", "¡"}, {"¢", "¢"}, + {"£", "£"}, {"¤", "¤"}, {"¥", "¥"}, {"¦", "¦"}, {"§", "§"}, + {"¨", "¨"}, {"©", "©"}, {"ª", "ª"}, {"«", "«"}, {"¬", "¬"}, + {"­", "­"}, {"®", "®"}, {"¯", "¯"}, {"°", "°"}, {"±", "±"}, + {"²", "²"}, {"³", "³"}, {"´", "´"}, {"µ", "µ"}, {"¶", "¶"}, + {"¸", "¸"}, {"¹", "¹"}, {"º", "º"}, {"»", "»"}, {"¼", "¼"}, + {"½", "½"}, {"¾", "¾"}, {"¿", "¿"}, {"×", "×"}, {"÷", "÷"}, + {"∀", "∀"}, {"∂", "∂"}, {"∃", "∃"}, {"∅", "∅"}, {"∇", "∇"}, + {"∈", "∈"}, {"∉", "∉"}, {"∋", "∋"}, {"∏", "∏"}, {"∑", "∑"}, + {"−", "−"}, {"∗", "∗"}, {"√", "√"}, {"∝", "∝"}, {"∞", "∞"}, + {"∠", "∠"}, {"∧", "∧"}, {"∨", "∨"}, {"∩", "∩"}, {"∪", "∪"}, + {"∫", "∫"}, {"∴", "∴"}, {"∼", "∼"}, {"≅", "≅"}, {"≈", "≈"}, + {"≠", "≠"}, {"≡", "≡"}, {"≤", "≤"}, {"≥", "≥"}, {"⊂", "⊂"}, + {"⊃", "⊃"}, {"⊄", "⊄"}, {"⊆", "⊆"}, {"⊇", "⊇"}, {"⊕", "⊕"}, + {"⊗", "⊗"}, {"⊥", "⊥"}, {"⋅", "⋅"}, {"Α", "Α"}, {"Β", "Β"}, + {"Γ", "Γ"}, {"Δ", "Δ"}, {"Ε", "Ε"}, {"Ζ", "Ζ"}, {"Η", "Η"}, + {"Θ", "Θ"}, {"Ι", "Ι"}, {"Κ", "Κ"}, {"Λ", "Λ"}, {"Μ", "Μ"}, + {"Ν", "Ν"}, {"Ξ", "Ξ"}, {"Ο", "Ο"}, {"Π", "Π"}, {"Ρ", "Ρ"}, + {"Σ", "Σ"}, {"Τ", "Τ"}, {"Υ", "Υ"}, {"Φ", "Φ"}, {"Χ", "Χ"}, + {"Ψ", "Ψ"}, {"Ω", "Ω"}, {"α", "α"}, {"β", "β"}, {"γ", "γ"}, + {"δ", "δ"}, {"ε", "ε"}, {"ζ", "ζ"}, {"η", "η"}, {"θ", "θ"}, + {"ι", "ι"}, {"κ", "κ"}, {"λ", "λ"}, {"μ", "μ"}, {"ν", "ν"}, + {"ξ", "ξ"}, {"ο", "ο"}, {"π", "π"}, {"ρ", "ρ"}, {"ς", "ς"}, + {"σ", "σ"}, {"τ", "τ"}, {"υ", "υ"}, {"φ", "φ"}, {"χ", "χ"}, + {"ψ", "ψ"}, {"ω", "ω"}, {"ϑ", "ϑ"}, {"ϒ", "ϒ"}, {"ϖ", "ϖ"}, + {"Œ", "Œ"}, {"œ", "œ"}, {"Š", "Š"}, {"š", "š"}, {"Ÿ", "Ÿ"}, + {"ƒ", "ƒ"}, {"ˆ", "ˆ"}, {"˜", "˜"}, {" ", " "}, {" ", " "}, + {" ", " "}, {"‌", "‌"}, {"‍", "‍"}, {"‎", "‎"}, {"‏", "‏"}, + {"–", "–"}, {"—", "—"}, {"‘", "‘"}, {"’", "’"}, {"‚", "‚"}, + {"“", "“"}, {"”", "”"}, {"„", "„"}, {"†", "†"}, {"‡", "‡"}, + {"•", "•"}, {"…", "…"}, {"‰", "‰"}, {"′", "′"}, {"″", "″"}, + {"‹", "‹"}, {"›", "›"}, {"‾", "‾"}, {"€", "€"}, {"™", "™"}, + {"←", "←"}, {"↑", "↑"}, {"→", "→"}, {"↓", "↓"}, {"↔", "↔"}, + {"↵", "↵"}, {"⌈", "⌈"}, {"⌉", "⌉"}, {"⌊", "⌊"}, {"⌋", "⌋"}, + {"◊", "◊"}, {"♠", "♠"}, {"♣", "♣"}, {"♥", "♥"}, {"♦", "♦"}}; + +static const size_t ENTITY_LOOKUP_COUNT = sizeof(ENTITY_LOOKUP) / sizeof(ENTITY_LOOKUP[0]); + +// Lookup a single HTML entity and return its UTF-8 value +const char* lookupHtmlEntity(const char* entity, int len) { + for (size_t i = 0; i < ENTITY_LOOKUP_COUNT; i++) { + const char* key = ENTITY_LOOKUP[i].key; + const size_t keyLen = strlen(key); + if (static_cast(len) == keyLen && memcmp(entity, key, keyLen) == 0) { + return ENTITY_LOOKUP[i].value; + } + } + + return nullptr; // Entity not found +} diff --git a/lib/Epub/Epub/htmlEntities.h b/lib/Epub/Epub/htmlEntities.h new file mode 100644 index 00000000..0221195f --- /dev/null +++ b/lib/Epub/Epub/htmlEntities.h @@ -0,0 +1,9 @@ +// from +// https://github.com/atomic14/diy-esp32-epub-reader/blob/2c2f57fdd7e2a788d14a0bcb26b9e845a47aac42/lib/Epub/RubbishHtmlParser/htmlEntities.cpp + +#pragma once +#include + +// Lookup a single HTML entity (including & and ;) and return its UTF-8 value +// Returns nullptr if entity is not found +const char* lookupHtmlEntity(const char* entity, int len); diff --git a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp index 3df222a0..e5512472 100644 --- a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp +++ b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp @@ -6,6 +6,7 @@ #include #include "../Page.h" +#include "../htmlEntities.h" const char* HEADER_TAGS[] = {"h1", "h2", "h3", "h4", "h5", "h6"}; constexpr int NUM_HEADER_TAGS = sizeof(HEADER_TAGS) / sizeof(HEADER_TAGS[0]); @@ -359,6 +360,28 @@ void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char continue; } + // Detect U+00A0 (non-breaking space): UTF-8 encoding is 0xC2 0xA0 + // Render a visible space without allowing a line break around it. + if (static_cast(s[i]) == 0xC2 && i + 1 < len && static_cast(s[i + 1]) == 0xA0) { + // Flush any pending text so style is applied correctly. + if (self->partWordBufferIndex > 0) { + self->flushPartWordBuffer(); + } + + // Add a standalone space that attaches to the previous word. + self->partWordBuffer[0] = ' '; + self->partWordBuffer[1] = '\0'; + self->partWordBufferIndex = 1; + self->nextWordContinues = true; // Attach space to previous word (no break). + self->flushPartWordBuffer(); + + // Ensure the next real word attaches to this space (no break). + self->nextWordContinues = true; + + i++; // Skip the second byte (0xA0) + continue; + } + // Skip Zero Width No-Break Space / BOM (U+FEFF) = 0xEF 0xBB 0xBF const XML_Char FEFF_BYTE_1 = static_cast(0xEF); const XML_Char FEFF_BYTE_2 = static_cast(0xBB); @@ -393,6 +416,22 @@ void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char } } +void XMLCALL ChapterHtmlSlimParser::defaultHandlerExpand(void* userData, const XML_Char* s, const int len) { + // Check if this looks like an entity reference (&...;) + if (len >= 3 && s[0] == '&' && s[len - 1] == ';') { + const char* utf8Value = lookupHtmlEntity(s, len); + if (utf8Value != nullptr) { + // Known entity: expand to its UTF-8 value + characterData(userData, utf8Value, strlen(utf8Value)); + return; + } + // Unknown entity: preserve original &...; sequence + characterData(userData, s, len); + return; + } + // Not an entity we recognize - skip it +} + void XMLCALL ChapterHtmlSlimParser::endElement(void* userData, const XML_Char* name) { auto* self = static_cast(userData); @@ -481,6 +520,10 @@ bool ChapterHtmlSlimParser::parseAndBuildPages() { return false; } + // Handle HTML entities (like  ) that aren't in XML spec or DTD + // Using DefaultHandlerExpand preserves normal entity expansion from DOCTYPE + XML_SetDefaultHandlerExpand(parser, defaultHandlerExpand); + FsFile file; if (!Storage.openFileForRead("EHP", filepath, file)) { XML_ParserFree(parser); diff --git a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.h b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.h index 909913b1..761ee1d5 100644 --- a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.h +++ b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.h @@ -64,6 +64,7 @@ class ChapterHtmlSlimParser { // XML callbacks static void XMLCALL startElement(void* userData, const XML_Char* name, const XML_Char** atts); static void XMLCALL characterData(void* userData, const XML_Char* s, int len); + static void XMLCALL defaultHandlerExpand(void* userData, const XML_Char* s, int len); static void XMLCALL endElement(void* userData, const XML_Char* name); public: