From 6e51afb9776529c67c0e79f5f8d4eacb09a413d1 Mon Sep 17 00:00:00 2001
From: Jake Kenneally <jakekenneally@gmail.com>
Date: Fri, 13 Feb 2026 09:46:46 -0500
Subject: [PATCH] fix: Account for `&nbsp;` entity as non-breaking space
 (#757)

## Summary

Closes #743.

**What is the goal of this PR?**

- Add back handling for HTML entities in expat. This was originally part
of the code that was removed
[here](https://github.com/crosspoint-reader/crosspoint-reader/pull/274).
- Handle `&nbsp;` characters to resolve issue #743.

**What changes are included?**

- Brought back the HTML entity table from a previous commit and refactored
it to use a static `const char*` table with linear lookup to reduce heap
allocations.
- Used `XML_SetDefaultHandlerExpand` in expat so that the entities are
expanded correctly without needing to be defined in a DOCTYPE.
- Added handling for `&nbsp;` so that words joined by an `&nbsp;` stay
together and are not split across lines at that position.

## Additional Context

- This supersedes [this
PR](https://github.com/crosspoint-reader/crosspoint-reader/pull/751),
which simply handled `&nbsp;` as whitespace. Instead, we want that
character to serve its true purpose and affect the line-breaking
algorithm.
- Updated my test EPUB [here](https://github.com/jdk2pq/css-test-epub)
with `&nbsp;` character examples at the end of the book.

---

### AI Usage

While CrossPoint doesn't have restrictions on AI tools in contributing,
please be transparent about their usage as it
helps set the right context for reviewers.

Did you use AI tools to help write this code? _**YES**_, Claude Code
---
 lib/Epub/Epub/ParsedText.cpp                  |  3 +
 lib/Epub/Epub/htmlEntities.cpp                | 76 +++++++++++++++++++
 lib/Epub/Epub/htmlEntities.h                  |  9 +++
 .../Epub/parsers/ChapterHtmlSlimParser.cpp    | 43 +++++++++++
 lib/Epub/Epub/parsers/ChapterHtmlSlimParser.h |  1 +
 5 files changed, 132 insertions(+)
 create mode 100644 lib/Epub/Epub/htmlEntities.cpp
 create mode 100644 lib/Epub/Epub/htmlEntities.h
diff --git a/lib/Epub/Epub/ParsedText.cpp b/lib/Epub/Epub/ParsedText.cpp
index 82ddaecd..82d0db61 100644
--- a/lib/Epub/Epub/ParsedText.cpp
+++ b/lib/Epub/Epub/ParsedText.cpp
@@ -32,6 +32,9 @@ void stripSoftHyphensInPlace(std::string& word) {
 // Returns the rendered width for a word while ignoring soft hyphen glyphs and optionally appending a visible hyphen.
 uint16_t measureWordWidth(const GfxRenderer& renderer, const int fontId, const std::string& word,
                           const EpdFontFamily::Style style, const bool appendHyphen = false) {
+  if (word.size() == 1 && word[0] == ' ' && !appendHyphen) {
+    return renderer.getSpaceWidth(fontId);
+  }
   const bool hasSoftHyphen = containsSoftHyphen(word);
   if (!hasSoftHyphen && !appendHyphen) {
     return renderer.getTextWidth(fontId, word.c_str(), style);
diff --git a/lib/Epub/Epub/htmlEntities.cpp b/lib/Epub/Epub/htmlEntities.cpp
new file mode 100644
index 00000000..82d3819a
--- /dev/null
+++ b/lib/Epub/Epub/htmlEntities.cpp
@@ -0,0 +1,76 @@
+// from
+// https://github.com/atomic14/diy-esp32-epub-reader/blob/2c2f57fdd7e2a788d14a0bcb26b9e845a47aac42/lib/Epub/RubbishHtmlParser/htmlEntities.cpp
+
+#include "htmlEntities.h"
+
+#include <cstring>
+
+struct EntityPair {  // one row of ENTITY_LOOKUP
+  const char* key;    // entity text including the leading '&' and trailing ';'
+  const char* value;  // UTF-8 text the entity expands to
+};
+
+static const EntityPair ENTITY_LOOKUP[] = {
+    {"&quot;", "\""},  {"&frasl;", "⁄"},   {"&amp;", "&"},         {"&lt;", "<"},     {"&gt;", ">"},
+    {"&Agrave;", "À"}, {"&Aacute;", "Á"},  {"&Acirc;", "Â"},       {"&Atilde;", "Ã"}, {"&Auml;", "Ä"},
+    {"&Aring;", "Å"},  {"&AElig;", "Æ"},   {"&Ccedil;", "Ç"},      {"&Egrave;", "È"}, {"&Eacute;", "É"},
+    {"&Ecirc;", "Ê"},  {"&Euml;", "Ë"},    {"&Igrave;", "Ì"},      {"&Iacute;", "Í"}, {"&Icirc;", "Î"},
+    {"&Iuml;", "Ï"},   {"&ETH;", "Ð"},     {"&Ntilde;", "Ñ"},      {"&Ograve;", "Ò"}, {"&Oacute;", "Ó"},
+    {"&Ocirc;", "Ô"},  {"&Otilde;", "Õ"},  {"&Ouml;", "Ö"},        {"&Oslash;", "Ø"}, {"&Ugrave;", "Ù"},
+    {"&Uacute;", "Ú"}, {"&Ucirc;", "Û"},   {"&Uuml;", "Ü"},        {"&Yacute;", "Ý"}, {"&THORN;", "Þ"},
+    {"&szlig;", "ß"},  {"&agrave;", "à"},  {"&aacute;", "á"},      {"&acirc;", "â"},  {"&atilde;", "ã"},
+    {"&auml;", "ä"},   {"&aring;", "å"},   {"&aelig;", "æ"},       {"&ccedil;", "ç"}, {"&egrave;", "è"},
+    {"&eacute;", "é"}, {"&ecirc;", "ê"},   {"&euml;", "ë"},        {"&igrave;", "ì"}, {"&iacute;", "í"},
+    {"&icirc;", "î"},  {"&iuml;", "ï"},    {"&eth;", "ð"},         {"&ntilde;", "ñ"}, {"&ograve;", "ò"},
+    {"&oacute;", "ó"}, {"&ocirc;", "ô"},   {"&otilde;", "õ"},      {"&ouml;", "ö"},   {"&oslash;", "ø"},
+    {"&ugrave;", "ù"}, {"&uacute;", "ú"},  {"&ucirc;", "û"},       {"&uuml;", "ü"},   {"&yacute;", "ý"},
+    {"&thorn;", "þ"},  {"&yuml;", "ÿ"},    {"&nbsp;", "\xC2\xA0"}, {"&iexcl;", "¡"},  {"&cent;", "¢"},
+    {"&pound;", "£"},  {"&curren;", "¤"},  {"&yen;", "¥"},         {"&brvbar;", "¦"}, {"&sect;", "§"},
+    {"&uml;", "¨"},    {"&copy;", "©"},    {"&ordf;", "ª"},        {"&laquo;", "«"},  {"&not;", "¬"},
+    {"&shy;", "­"},    {"&reg;", "®"},     {"&macr;", "¯"},        {"&deg;", "°"},    {"&plusmn;", "±"},
+    {"&sup2;", "²"},   {"&sup3;", "³"},    {"&acute;", "´"},       {"&micro;", "µ"},  {"&para;", "¶"},
+    {"&cedil;", "¸"},  {"&sup1;", "¹"},    {"&ordm;", "º"},        {"&raquo;", "»"},  {"&frac14;", "¼"},
+    {"&frac12;", "½"}, {"&frac34;", "¾"},  {"&iquest;", "¿"},      {"&times;", "×"},  {"&divide;", "÷"},
+    {"&forall;", "∀"}, {"&part;", "∂"},    {"&exist;", "∃"},       {"&empty;", "∅"},  {"&nabla;", "∇"},
+    {"&isin;", "∈"},   {"&notin;", "∉"},   {"&ni;", "∋"},          {"&prod;", "∏"},   {"&sum;", "∑"},
+    {"&minus;", "−"},  {"&lowast;", "∗"},  {"&radic;", "√"},       {"&prop;", "∝"},   {"&infin;", "∞"},
+    {"&ang;", "∠"},    {"&and;", "∧"},     {"&or;", "∨"},          {"&cap;", "∩"},    {"&cup;", "∪"},
+    {"&int;", "∫"},    {"&there4;", "∴"},  {"&sim;", "∼"},         {"&cong;", "≅"},   {"&asymp;", "≈"},
+    {"&ne;", "≠"},     {"&equiv;", "≡"},   {"&le;", "≤"},          {"&ge;", "≥"},     {"&sub;", "⊂"},
+    {"&sup;", "⊃"},    {"&nsub;", "⊄"},    {"&sube;", "⊆"},        {"&supe;", "⊇"},   {"&oplus;", "⊕"},
+    {"&otimes;", "⊗"}, {"&perp;", "⊥"},    {"&sdot;", "⋅"},        {"&Alpha;", "Α"},  {"&Beta;", "Β"},
+    {"&Gamma;", "Γ"},  {"&Delta;", "Δ"},   {"&Epsilon;", "Ε"},     {"&Zeta;", "Ζ"},   {"&Eta;", "Η"},
+    {"&Theta;", "Θ"},  {"&Iota;", "Ι"},    {"&Kappa;", "Κ"},       {"&Lambda;", "Λ"}, {"&Mu;", "Μ"},
+    {"&Nu;", "Ν"},     {"&Xi;", "Ξ"},      {"&Omicron;", "Ο"},     {"&Pi;", "Π"},     {"&Rho;", "Ρ"},
+    {"&Sigma;", "Σ"},  {"&Tau;", "Τ"},     {"&Upsilon;", "Υ"},     {"&Phi;", "Φ"},    {"&Chi;", "Χ"},
+    {"&Psi;", "Ψ"},    {"&Omega;", "Ω"},   {"&alpha;", "α"},       {"&beta;", "β"},   {"&gamma;", "γ"},
+    {"&delta;", "δ"},  {"&epsilon;", "ε"}, {"&zeta;", "ζ"},        {"&eta;", "η"},    {"&theta;", "θ"},
+    {"&iota;", "ι"},   {"&kappa;", "κ"},   {"&lambda;", "λ"},      {"&mu;", "μ"},     {"&nu;", "ν"},
+    {"&xi;", "ξ"},     {"&omicron;", "ο"}, {"&pi;", "π"},          {"&rho;", "ρ"},    {"&sigmaf;", "ς"},
+    {"&sigma;", "σ"},  {"&tau;", "τ"},     {"&upsilon;", "υ"},     {"&phi;", "φ"},    {"&chi;", "χ"},
+    {"&psi;", "ψ"},    {"&omega;", "ω"},   {"&thetasym;", "ϑ"},    {"&upsih;", "ϒ"},  {"&piv;", "ϖ"},
+    {"&OElig;", "Œ"},  {"&oelig;", "œ"},   {"&Scaron;", "Š"},      {"&scaron;", "š"}, {"&Yuml;", "Ÿ"},
+    {"&fnof;", "ƒ"},   {"&circ;", "ˆ"},    {"&tilde;", "˜"},       {"&ensp;", " "},   {"&emsp;", " "},
+    {"&thinsp;", " "}, {"&zwnj;", "‌"},  {"&zwj;", "‍"},       {"&lrm;", "‎"},  {"&rlm;", "‏"},
+    {"&ndash;", "–"},  {"&mdash;", "—"},   {"&lsquo;", "‘"},       {"&rsquo;", "’"},  {"&sbquo;", "‚"},
+    {"&ldquo;", "“"},  {"&rdquo;", "”"},   {"&bdquo;", "„"},       {"&dagger;", "†"}, {"&Dagger;", "‡"},
+    {"&bull;", "•"},   {"&hellip;", "…"},  {"&permil;", "‰"},      {"&prime;", "′"},  {"&Prime;", "″"},
+    {"&lsaquo;", "‹"}, {"&rsaquo;", "›"},  {"&oline;", "‾"},       {"&euro;", "€"},   {"&trade;", "™"},
+    {"&larr;", "←"},   {"&uarr;", "↑"},    {"&rarr;", "→"},        {"&darr;", "↓"},   {"&harr;", "↔"},
+    {"&crarr;", "↵"},  {"&lceil;", "⌈"},   {"&rceil;", "⌉"},       {"&lfloor;", "⌊"}, {"&rfloor;", "⌋"},
+    {"&loz;", "◊"},    {"&spades;", "♠"},  {"&clubs;", "♣"},       {"&hearts;", "♥"}, {"&diams;", "♦"}};
+
+static const size_t ENTITY_LOOKUP_COUNT = sizeof(ENTITY_LOOKUP) / sizeof(ENTITY_LOOKUP[0]);
+
+// Look up one HTML entity (`len` bytes at `entity`, '&' and ';' included); returns its UTF-8 value or nullptr
+const char* lookupHtmlEntity(const char* entity, int len) {
+  for (size_t i = 0; i < ENTITY_LOOKUP_COUNT; i++) {  // linear scan over the whole table
+    const char* key = ENTITY_LOOKUP[i].key;
+    const size_t keyLen = strlen(key);
+    if (static_cast<size_t>(len) == keyLen && memcmp(entity, key, keyLen) == 0) {  // exact, case-sensitive match
+      return ENTITY_LOOKUP[i].value;
+    }
+  }
+
+  return nullptr;  // No table entry matched
+}
diff --git a/lib/Epub/Epub/htmlEntities.h b/lib/Epub/Epub/htmlEntities.h
new file mode 100644
index 00000000..0221195f
--- /dev/null
+++ b/lib/Epub/Epub/htmlEntities.h
@@ -0,0 +1,9 @@
+// from
+// https://github.com/atomic14/diy-esp32-epub-reader/blob/2c2f57fdd7e2a788d14a0bcb26b9e845a47aac42/lib/Epub/RubbishHtmlParser/htmlEntities.cpp
+
+#pragma once
+#include <string>
+
+// Look up a single HTML entity given as `len` bytes at `entity` (including & and ;) and return its UTF-8 value.
+// Returns nullptr if the entity is not found.
+const char* lookupHtmlEntity(const char* entity, int len);
diff --git a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp
index 3df222a0..e5512472 100644
--- a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp
+++ b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp
@@ -6,6 +6,7 @@
 #include <expat.h>
 
 #include "../Page.h"
+#include "../htmlEntities.h"
 
 const char* HEADER_TAGS[] = {"h1", "h2", "h3", "h4", "h5", "h6"};
 constexpr int NUM_HEADER_TAGS = sizeof(HEADER_TAGS) / sizeof(HEADER_TAGS[0]);
@@ -359,6 +360,28 @@ void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char
       continue;
     }
 
+    // Detect U+00A0 (non-breaking space): UTF-8 encoding is 0xC2 0xA0
+    // Render a visible space without allowing a line break around it.
+    if (static_cast<uint8_t>(s[i]) == 0xC2 && i + 1 < len && static_cast<uint8_t>(s[i + 1]) == 0xA0) {
+      // Flush any pending text so style is applied correctly.
+      if (self->partWordBufferIndex > 0) {
+        self->flushPartWordBuffer();
+      }
+
+      // Add a standalone space that attaches to the previous word.
+      self->partWordBuffer[0] = ' ';
+      self->partWordBuffer[1] = '\0';
+      self->partWordBufferIndex = 1;
+      self->nextWordContinues = true;  // Attach space to previous word (no break).
+      self->flushPartWordBuffer();
+
+      // Ensure the next real word attaches to this space (no break).
+      self->nextWordContinues = true;
+
+      i++;  // Skip the second byte (0xA0)
+      continue;
+    }
+
     // Skip Zero Width No-Break Space / BOM (U+FEFF) = 0xEF 0xBB 0xBF
     const XML_Char FEFF_BYTE_1 = static_cast<XML_Char>(0xEF);
     const XML_Char FEFF_BYTE_2 = static_cast<XML_Char>(0xBB);
@@ -393,6 +416,22 @@ void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char
   }
 }
 
+void XMLCALL ChapterHtmlSlimParser::defaultHandlerExpand(void* userData, const XML_Char* s, const int len) {
+  // Registered via XML_SetDefaultHandlerExpand; only input shaped like an entity reference (&...;) is handled
+  if (len >= 3 && s[0] == '&' && s[len - 1] == ';') {  // shortest possible form is "&x;"
+    const char* utf8Value = lookupHtmlEntity(s, len);
+    if (utf8Value != nullptr) {
+      // Known entity: feed its UTF-8 value through the normal text path
+      characterData(userData, utf8Value, strlen(utf8Value));
+      return;
+    }
+    // Unknown entity: forward the original &...; sequence as literal text
+    characterData(userData, s, len);
+    return;
+  }
+  // Not shaped like an entity reference - drop it (nothing is forwarded to characterData)
+}
+
 void XMLCALL ChapterHtmlSlimParser::endElement(void* userData, const XML_Char* name) {
   auto* self = static_cast<ChapterHtmlSlimParser*>(userData);
 
@@ -481,6 +520,10 @@ bool ChapterHtmlSlimParser::parseAndBuildPages() {
     return false;
   }
 
+  // Handle HTML entities (like &nbsp;) that aren't in XML spec or DTD
+  // Using DefaultHandlerExpand preserves normal entity expansion from DOCTYPE
+  XML_SetDefaultHandlerExpand(parser, defaultHandlerExpand);
+
   FsFile file;
   if (!Storage.openFileForRead("EHP", filepath, file)) {
     XML_ParserFree(parser);
diff --git a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.h b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.h
index 909913b1..761ee1d5 100644
--- a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.h
+++ b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.h
@@ -64,6 +64,7 @@ class ChapterHtmlSlimParser {
   // XML callbacks
   static void XMLCALL startElement(void* userData, const XML_Char* name, const XML_Char** atts);
   static void XMLCALL characterData(void* userData, const XML_Char* s, int len);
+  static void XMLCALL defaultHandlerExpand(void* userData, const XML_Char* s, int len);
   static void XMLCALL endElement(void* userData, const XML_Char* name);
 
  public: