fix: Account for &nbsp; character as non-breaking space (#757)
## Summary Closes #743. **What is the goal of this PR?** - Add back handling for HTML entities in expat. This was originally part of the code that got removed [here](https://github.com/crosspoint-reader/crosspoint-reader/pull/274) - Handle `&nbsp;` characters to resolve issue #743 **What changes are included?** - Brought back HTML entity table from previous commit and refactored it to use a static const char * table with linear lookup to reduce heap allocations. - Used `XML_SetDefaultHandlerExpand` in expat to parse out the entities correctly, without needing them defined in DOCTYPE - Added handling for `&nbsp;` so that the text stays together and doesn't break onto a new line with text separated by an `&nbsp;` ## Additional Context - This supersedes [this PR](https://github.com/crosspoint-reader/crosspoint-reader/pull/751) that simply handled `&nbsp;` as whitespace. Instead, we want that character to serve its true purpose and affect the line-breaking algorithm. - Updated my test EPUB [here](https://github.com/jdk2pq/css-test-epub) with examples of `&nbsp;` characters at the end of the book --- ### AI Usage While CrossPoint doesn't have restrictions on AI tools in contributing, please be transparent about their usage as it helps set the right context for reviewers. Did you use AI tools to help write this code? _**YES**_, Claude Code
This commit is contained in:
@@ -32,6 +32,9 @@ void stripSoftHyphensInPlace(std::string& word) {
|
|||||||
// Returns the rendered width for a word while ignoring soft hyphen glyphs and optionally appending a visible hyphen.
|
// Returns the rendered width for a word while ignoring soft hyphen glyphs and optionally appending a visible hyphen.
|
||||||
uint16_t measureWordWidth(const GfxRenderer& renderer, const int fontId, const std::string& word,
|
uint16_t measureWordWidth(const GfxRenderer& renderer, const int fontId, const std::string& word,
|
||||||
const EpdFontFamily::Style style, const bool appendHyphen = false) {
|
const EpdFontFamily::Style style, const bool appendHyphen = false) {
|
||||||
|
if (word.size() == 1 && word[0] == ' ' && !appendHyphen) {
|
||||||
|
return renderer.getSpaceWidth(fontId);
|
||||||
|
}
|
||||||
const bool hasSoftHyphen = containsSoftHyphen(word);
|
const bool hasSoftHyphen = containsSoftHyphen(word);
|
||||||
if (!hasSoftHyphen && !appendHyphen) {
|
if (!hasSoftHyphen && !appendHyphen) {
|
||||||
return renderer.getTextWidth(fontId, word.c_str(), style);
|
return renderer.getTextWidth(fontId, word.c_str(), style);
|
||||||
|
|||||||
76
lib/Epub/Epub/htmlEntities.cpp
Normal file
76
lib/Epub/Epub/htmlEntities.cpp
Normal file
@@ -0,0 +1,76 @@
|
|||||||
|
// from
|
||||||
|
// https://github.com/atomic14/diy-esp32-epub-reader/blob/2c2f57fdd7e2a788d14a0bcb26b9e845a47aac42/lib/Epub/RubbishHtmlParser/htmlEntities.cpp
|
||||||
|
|
||||||
|
#include "htmlEntities.h"
|
||||||
|
|
||||||
|
#include <cstring>
|
||||||
|
|
||||||
|
// One row of the entity table: the full entity reference text (including the
// leading '&' and trailing ';') and the UTF-8 bytes it expands to.
struct EntityPair {
  const char* key;
  const char* value;
};

// Named HTML character references and their UTF-8 expansions.
// Keys are case-sensitive (e.g. "&dagger;" and "&Dagger;" are distinct rows).
// Invisible/whitespace values are written as hex escapes so they stay visible in editors.
// NOTE(review): "&shy;", "&zwnj;", "&zwj;", "&lrm;" and "&rlm;" expand to the empty
// string, i.e. the reference is dropped from the text - confirm this is intended
// rather than invisible literals that were lost at some point.
static const EntityPair ENTITY_LOOKUP[] = {
    {"&quot;", "\""},   {"&frasl;", "⁄"},    {"&amp;", "&"},            {"&lt;", "<"},       {"&gt;", ">"},
    {"&Agrave;", "À"},  {"&Aacute;", "Á"},   {"&Acirc;", "Â"},          {"&Atilde;", "Ã"},   {"&Auml;", "Ä"},
    {"&Aring;", "Å"},   {"&AElig;", "Æ"},    {"&Ccedil;", "Ç"},         {"&Egrave;", "È"},   {"&Eacute;", "É"},
    {"&Ecirc;", "Ê"},   {"&Euml;", "Ë"},     {"&Igrave;", "Ì"},         {"&Iacute;", "Í"},   {"&Icirc;", "Î"},
    {"&Iuml;", "Ï"},    {"&ETH;", "Ð"},      {"&Ntilde;", "Ñ"},         {"&Ograve;", "Ò"},   {"&Oacute;", "Ó"},
    {"&Ocirc;", "Ô"},   {"&Otilde;", "Õ"},   {"&Ouml;", "Ö"},           {"&Oslash;", "Ø"},   {"&Ugrave;", "Ù"},
    {"&Uacute;", "Ú"},  {"&Ucirc;", "Û"},    {"&Uuml;", "Ü"},           {"&Yacute;", "Ý"},   {"&THORN;", "Þ"},
    {"&szlig;", "ß"},   {"&agrave;", "à"},   {"&aacute;", "á"},         {"&acirc;", "â"},    {"&atilde;", "ã"},
    {"&auml;", "ä"},    {"&aring;", "å"},    {"&aelig;", "æ"},          {"&ccedil;", "ç"},   {"&egrave;", "è"},
    {"&eacute;", "é"},  {"&ecirc;", "ê"},    {"&euml;", "ë"},           {"&igrave;", "ì"},   {"&iacute;", "í"},
    {"&icirc;", "î"},   {"&iuml;", "ï"},     {"&eth;", "ð"},            {"&ntilde;", "ñ"},   {"&ograve;", "ò"},
    {"&oacute;", "ó"},  {"&ocirc;", "ô"},    {"&otilde;", "õ"},         {"&ouml;", "ö"},     {"&oslash;", "ø"},
    {"&ugrave;", "ù"},  {"&uacute;", "ú"},   {"&ucirc;", "û"},          {"&uuml;", "ü"},     {"&yacute;", "ý"},
    {"&thorn;", "þ"},   {"&yuml;", "ÿ"},     {"&nbsp;", "\xC2\xA0"},    {"&iexcl;", "¡"},    {"&cent;", "¢"},
    {"&pound;", "£"},   {"&curren;", "¤"},   {"&yen;", "¥"},            {"&brvbar;", "¦"},   {"&sect;", "§"},
    {"&uml;", "¨"},     {"&copy;", "©"},     {"&ordf;", "ª"},           {"&laquo;", "«"},    {"&not;", "¬"},
    {"&shy;", ""},      {"&reg;", "®"},      {"&macr;", "¯"},           {"&deg;", "°"},      {"&plusmn;", "±"},
    {"&sup2;", "²"},    {"&sup3;", "³"},     {"&acute;", "´"},          {"&micro;", "µ"},    {"&para;", "¶"},
    {"&cedil;", "¸"},   {"&sup1;", "¹"},     {"&ordm;", "º"},           {"&raquo;", "»"},    {"&frac14;", "¼"},
    {"&frac12;", "½"},  {"&frac34;", "¾"},   {"&iquest;", "¿"},         {"&times;", "×"},    {"&divide;", "÷"},
    {"&forall;", "∀"},  {"&part;", "∂"},     {"&exist;", "∃"},          {"&empty;", "∅"},    {"&nabla;", "∇"},
    {"&isin;", "∈"},    {"&notin;", "∉"},    {"&ni;", "∋"},             {"&prod;", "∏"},     {"&sum;", "∑"},
    {"&minus;", "−"},   {"&lowast;", "∗"},   {"&radic;", "√"},          {"&prop;", "∝"},     {"&infin;", "∞"},
    {"&ang;", "∠"},     {"&and;", "∧"},      {"&or;", "∨"},             {"&cap;", "∩"},      {"&cup;", "∪"},
    {"&int;", "∫"},     {"&there4;", "∴"},   {"&sim;", "∼"},            {"&cong;", "≅"},     {"&asymp;", "≈"},
    {"&ne;", "≠"},      {"&equiv;", "≡"},    {"&le;", "≤"},             {"&ge;", "≥"},       {"&sub;", "⊂"},
    {"&sup;", "⊃"},     {"&nsub;", "⊄"},     {"&sube;", "⊆"},           {"&supe;", "⊇"},     {"&oplus;", "⊕"},
    {"&otimes;", "⊗"},  {"&perp;", "⊥"},     {"&sdot;", "⋅"},           {"&Alpha;", "Α"},    {"&Beta;", "Β"},
    {"&Gamma;", "Γ"},   {"&Delta;", "Δ"},    {"&Epsilon;", "Ε"},        {"&Zeta;", "Ζ"},     {"&Eta;", "Η"},
    {"&Theta;", "Θ"},   {"&Iota;", "Ι"},     {"&Kappa;", "Κ"},          {"&Lambda;", "Λ"},   {"&Mu;", "Μ"},
    {"&Nu;", "Ν"},      {"&Xi;", "Ξ"},       {"&Omicron;", "Ο"},        {"&Pi;", "Π"},       {"&Rho;", "Ρ"},
    {"&Sigma;", "Σ"},   {"&Tau;", "Τ"},      {"&Upsilon;", "Υ"},        {"&Phi;", "Φ"},      {"&Chi;", "Χ"},
    {"&Psi;", "Ψ"},     {"&Omega;", "Ω"},    {"&alpha;", "α"},          {"&beta;", "β"},     {"&gamma;", "γ"},
    {"&delta;", "δ"},   {"&epsilon;", "ε"},  {"&zeta;", "ζ"},           {"&eta;", "η"},      {"&theta;", "θ"},
    {"&iota;", "ι"},    {"&kappa;", "κ"},    {"&lambda;", "λ"},         {"&mu;", "μ"},       {"&nu;", "ν"},
    {"&xi;", "ξ"},      {"&omicron;", "ο"},  {"&pi;", "π"},             {"&rho;", "ρ"},      {"&sigmaf;", "ς"},
    {"&sigma;", "σ"},   {"&tau;", "τ"},      {"&upsilon;", "υ"},        {"&phi;", "φ"},      {"&chi;", "χ"},
    {"&psi;", "ψ"},     {"&omega;", "ω"},    {"&thetasym;", "ϑ"},       {"&upsih;", "ϒ"},    {"&piv;", "ϖ"},
    {"&OElig;", "Œ"},   {"&oelig;", "œ"},    {"&Scaron;", "Š"},         {"&scaron;", "š"},   {"&Yuml;", "Ÿ"},
    {"&fnof;", "ƒ"},    {"&circ;", "ˆ"},     {"&tilde;", "˜"},          {"&ensp;", "\xE2\x80\x82"}, {"&emsp;", "\xE2\x80\x83"},
    {"&thinsp;", "\xE2\x80\x89"}, {"&zwnj;", ""}, {"&zwj;", ""},        {"&lrm;", ""},       {"&rlm;", ""},
    {"&ndash;", "–"},   {"&mdash;", "—"},    {"&lsquo;", "‘"},          {"&rsquo;", "’"},    {"&sbquo;", "‚"},
    {"&ldquo;", "“"},   {"&rdquo;", "”"},    {"&bdquo;", "„"},          {"&dagger;", "†"},   {"&Dagger;", "‡"},
    {"&bull;", "•"},    {"&hellip;", "…"},   {"&permil;", "‰"},         {"&prime;", "′"},    {"&Prime;", "″"},
    {"&lsaquo;", "‹"},  {"&rsaquo;", "›"},   {"&oline;", "‾"},          {"&euro;", "€"},     {"&trade;", "™"},
    {"&larr;", "←"},    {"&uarr;", "↑"},     {"&rarr;", "→"},           {"&darr;", "↓"},     {"&harr;", "↔"},
    {"&crarr;", "↵"},   {"&lceil;", "⌈"},    {"&rceil;", "⌉"},          {"&lfloor;", "⌊"},   {"&rfloor;", "⌋"},
    {"&loz;", "◊"},     {"&spades;", "♠"},   {"&clubs;", "♣"},          {"&hearts;", "♥"},   {"&diams;", "♦"}};

// Number of rows in ENTITY_LOOKUP.
static const size_t ENTITY_LOOKUP_COUNT = sizeof(ENTITY_LOOKUP) / sizeof(ENTITY_LOOKUP[0]);

// Look up a single HTML entity and return its UTF-8 value.
//
// entity: pointer to the entity text, starting at '&' and ending at ';'.
//         It does not need to be NUL-terminated; only the first `len` bytes are read.
// len:    number of bytes in the entity text, including '&' and ';'.
//
// Returns a pointer to a static NUL-terminated UTF-8 string (possibly empty),
// or nullptr when the entity is not in the table or the arguments are invalid.
// The match is exact and case-sensitive; the table is scanned linearly.
const char* lookupHtmlEntity(const char* entity, int len) {
  // Reject inputs that cannot match any key before touching the buffer.
  if (entity == nullptr || len <= 0) {
    return nullptr;
  }
  const size_t entityLen = static_cast<size_t>(len);

  for (size_t i = 0; i < ENTITY_LOOKUP_COUNT; i++) {
    const char* key = ENTITY_LOOKUP[i].key;
    // Compare lengths first so memcmp never reads past the caller's `len` bytes.
    if (std::strlen(key) == entityLen && std::memcmp(entity, key, entityLen) == 0) {
      return ENTITY_LOOKUP[i].value;
    }
  }

  return nullptr;  // Entity not found
}
|
||||||
9
lib/Epub/Epub/htmlEntities.h
Normal file
9
lib/Epub/Epub/htmlEntities.h
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
// from
|
||||||
|
// https://github.com/atomic14/diy-esp32-epub-reader/blob/2c2f57fdd7e2a788d14a0bcb26b9e845a47aac42/lib/Epub/RubbishHtmlParser/htmlEntities.cpp
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
// Lookup a single HTML entity (including & and ;) and return its UTF-8 value
|
||||||
|
// Returns nullptr if entity is not found
|
||||||
|
const char* lookupHtmlEntity(const char* entity, int len);
|
||||||
@@ -6,6 +6,7 @@
|
|||||||
#include <expat.h>
|
#include <expat.h>
|
||||||
|
|
||||||
#include "../Page.h"
|
#include "../Page.h"
|
||||||
|
#include "../htmlEntities.h"
|
||||||
|
|
||||||
const char* HEADER_TAGS[] = {"h1", "h2", "h3", "h4", "h5", "h6"};
|
const char* HEADER_TAGS[] = {"h1", "h2", "h3", "h4", "h5", "h6"};
|
||||||
constexpr int NUM_HEADER_TAGS = sizeof(HEADER_TAGS) / sizeof(HEADER_TAGS[0]);
|
constexpr int NUM_HEADER_TAGS = sizeof(HEADER_TAGS) / sizeof(HEADER_TAGS[0]);
|
||||||
@@ -359,6 +360,28 @@ void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Detect U+00A0 (non-breaking space): UTF-8 encoding is 0xC2 0xA0
|
||||||
|
// Render a visible space without allowing a line break around it.
|
||||||
|
if (static_cast<uint8_t>(s[i]) == 0xC2 && i + 1 < len && static_cast<uint8_t>(s[i + 1]) == 0xA0) {
|
||||||
|
// Flush any pending text so style is applied correctly.
|
||||||
|
if (self->partWordBufferIndex > 0) {
|
||||||
|
self->flushPartWordBuffer();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add a standalone space that attaches to the previous word.
|
||||||
|
self->partWordBuffer[0] = ' ';
|
||||||
|
self->partWordBuffer[1] = '\0';
|
||||||
|
self->partWordBufferIndex = 1;
|
||||||
|
self->nextWordContinues = true; // Attach space to previous word (no break).
|
||||||
|
self->flushPartWordBuffer();
|
||||||
|
|
||||||
|
// Ensure the next real word attaches to this space (no break).
|
||||||
|
self->nextWordContinues = true;
|
||||||
|
|
||||||
|
i++; // Skip the second byte (0xA0)
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
// Skip Zero Width No-Break Space / BOM (U+FEFF) = 0xEF 0xBB 0xBF
|
// Skip Zero Width No-Break Space / BOM (U+FEFF) = 0xEF 0xBB 0xBF
|
||||||
const XML_Char FEFF_BYTE_1 = static_cast<XML_Char>(0xEF);
|
const XML_Char FEFF_BYTE_1 = static_cast<XML_Char>(0xEF);
|
||||||
const XML_Char FEFF_BYTE_2 = static_cast<XML_Char>(0xBB);
|
const XML_Char FEFF_BYTE_2 = static_cast<XML_Char>(0xBB);
|
||||||
@@ -393,6 +416,22 @@ void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Default-handler callback with the expat handler signature (userData, data, length).
//
// Data shaped like an entity reference ("&...;", at least 3 bytes) is forwarded to
// characterData(): as its UTF-8 expansion when lookupHtmlEntity() knows it, or as the
// original "&...;" text when it does not. Any other data is ignored.
void XMLCALL ChapterHtmlSlimParser::defaultHandlerExpand(void* userData, const XML_Char* s, const int len) {
  const bool looksLikeEntity = len >= 3 && s[0] == '&' && s[len - 1] == ';';
  if (!looksLikeEntity) {
    // Not an entity reference - nothing to emit.
    return;
  }

  const char* const replacement = lookupHtmlEntity(s, len);
  if (replacement == nullptr) {
    // Unknown entity: keep the literal &...; sequence in the text.
    characterData(userData, s, len);
    return;
  }

  // Known entity: feed its UTF-8 bytes through the normal text path.
  characterData(userData, replacement, strlen(replacement));
}
|
||||||
|
|
||||||
void XMLCALL ChapterHtmlSlimParser::endElement(void* userData, const XML_Char* name) {
|
void XMLCALL ChapterHtmlSlimParser::endElement(void* userData, const XML_Char* name) {
|
||||||
auto* self = static_cast<ChapterHtmlSlimParser*>(userData);
|
auto* self = static_cast<ChapterHtmlSlimParser*>(userData);
|
||||||
|
|
||||||
@@ -481,6 +520,10 @@ bool ChapterHtmlSlimParser::parseAndBuildPages() {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Handle HTML entities (like &nbsp;) that aren't in XML spec or DTD
|
||||||
|
// Using DefaultHandlerExpand preserves normal entity expansion from DOCTYPE
|
||||||
|
XML_SetDefaultHandlerExpand(parser, defaultHandlerExpand);
|
||||||
|
|
||||||
FsFile file;
|
FsFile file;
|
||||||
if (!Storage.openFileForRead("EHP", filepath, file)) {
|
if (!Storage.openFileForRead("EHP", filepath, file)) {
|
||||||
XML_ParserFree(parser);
|
XML_ParserFree(parser);
|
||||||
|
|||||||
@@ -64,6 +64,7 @@ class ChapterHtmlSlimParser {
|
|||||||
// XML callbacks
|
// XML callbacks
|
||||||
static void XMLCALL startElement(void* userData, const XML_Char* name, const XML_Char** atts);
|
static void XMLCALL startElement(void* userData, const XML_Char* name, const XML_Char** atts);
|
||||||
static void XMLCALL characterData(void* userData, const XML_Char* s, int len);
|
static void XMLCALL characterData(void* userData, const XML_Char* s, int len);
|
||||||
|
static void XMLCALL defaultHandlerExpand(void* userData, const XML_Char* s, int len);
|
||||||
static void XMLCALL endElement(void* userData, const XML_Char* name);
|
static void XMLCALL endElement(void* userData, const XML_Char* name);
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|||||||
Reference in New Issue
Block a user