diff --git a/lib/Epub/Epub/htmlEntities.cpp b/lib/Epub/Epub/htmlEntities.cpp
index 82d3819a..6fdcb71c 100644
--- a/lib/Epub/Epub/htmlEntities.cpp
+++ b/lib/Epub/Epub/htmlEntities.cpp
@@ -1,4 +1,4 @@
-// from
+// based on
// https://github.com/atomic14/diy-esp32-epub-reader/blob/2c2f57fdd7e2a788d14a0bcb26b9e845a47aac42/lib/Epub/RubbishHtmlParser/htmlEntities.cpp
#include "htmlEntities.h"
@@ -10,67 +10,105 @@ struct EntityPair {
const char* value;
};
-static const EntityPair ENTITY_LOOKUP[] = {
- {""", "\""}, {"⁄", "⁄"}, {"&", "&"}, {"<", "<"}, {">", ">"},
- {"À", "À"}, {"Á", "Á"}, {"Â", "Â"}, {"Ã", "Ã"}, {"Ä", "Ä"},
- {"Å", "Å"}, {"Æ", "Æ"}, {"Ç", "Ç"}, {"È", "È"}, {"É", "É"},
- {"Ê", "Ê"}, {"Ë", "Ë"}, {"Ì", "Ì"}, {"Í", "Í"}, {"Î", "Î"},
- {"Ï", "Ï"}, {"Ð", "Ð"}, {"Ñ", "Ñ"}, {"Ò", "Ò"}, {"Ó", "Ó"},
- {"Ô", "Ô"}, {"Õ", "Õ"}, {"Ö", "Ö"}, {"Ø", "Ø"}, {"Ù", "Ù"},
- {"Ú", "Ú"}, {"Û", "Û"}, {"Ü", "Ü"}, {"Ý", "Ý"}, {"Þ", "Þ"},
- {"ß", "ß"}, {"à", "à"}, {"á", "á"}, {"â", "â"}, {"ã", "ã"},
- {"ä", "ä"}, {"å", "å"}, {"æ", "æ"}, {"ç", "ç"}, {"è", "è"},
- {"é", "é"}, {"ê", "ê"}, {"ë", "ë"}, {"ì", "ì"}, {"í", "í"},
- {"î", "î"}, {"ï", "ï"}, {"ð", "ð"}, {"ñ", "ñ"}, {"ò", "ò"},
- {"ó", "ó"}, {"ô", "ô"}, {"õ", "õ"}, {"ö", "ö"}, {"ø", "ø"},
- {"ù", "ù"}, {"ú", "ú"}, {"û", "û"}, {"ü", "ü"}, {"ý", "ý"},
- {"þ", "þ"}, {"ÿ", "ÿ"}, {" ", "\xC2\xA0"}, {"¡", "¡"}, {"¢", "¢"},
- {"£", "£"}, {"¤", "¤"}, {"¥", "¥"}, {"¦", "¦"}, {"§", "§"},
- {"¨", "¨"}, {"©", "©"}, {"ª", "ª"}, {"«", "«"}, {"¬", "¬"},
- {"", ""}, {"®", "®"}, {"¯", "¯"}, {"°", "°"}, {"±", "±"},
- {"²", "²"}, {"³", "³"}, {"´", "´"}, {"µ", "µ"}, {"¶", "¶"},
- {"¸", "¸"}, {"¹", "¹"}, {"º", "º"}, {"»", "»"}, {"¼", "¼"},
- {"½", "½"}, {"¾", "¾"}, {"¿", "¿"}, {"×", "×"}, {"÷", "÷"},
- {"∀", "∀"}, {"∂", "∂"}, {"∃", "∃"}, {"∅", "∅"}, {"∇", "∇"},
- {"∈", "∈"}, {"∉", "∉"}, {"∋", "∋"}, {"∏", "∏"}, {"∑", "∑"},
- {"−", "−"}, {"∗", "∗"}, {"√", "√"}, {"∝", "∝"}, {"∞", "∞"},
- {"∠", "∠"}, {"∧", "∧"}, {"∨", "∨"}, {"∩", "∩"}, {"∪", "∪"},
- {"∫", "∫"}, {"∴", "∴"}, {"∼", "∼"}, {"≅", "≅"}, {"≈", "≈"},
- {"≠", "≠"}, {"≡", "≡"}, {"≤", "≤"}, {"≥", "≥"}, {"⊂", "⊂"},
- {"⊃", "⊃"}, {"⊄", "⊄"}, {"⊆", "⊆"}, {"⊇", "⊇"}, {"⊕", "⊕"},
- {"⊗", "⊗"}, {"⊥", "⊥"}, {"⋅", "⋅"}, {"Α", "Α"}, {"Β", "Β"},
- {"Γ", "Γ"}, {"Δ", "Δ"}, {"Ε", "Ε"}, {"Ζ", "Ζ"}, {"Η", "Η"},
- {"Θ", "Θ"}, {"Ι", "Ι"}, {"Κ", "Κ"}, {"Λ", "Λ"}, {"Μ", "Μ"},
- {"Ν", "Ν"}, {"Ξ", "Ξ"}, {"Ο", "Ο"}, {"Π", "Π"}, {"Ρ", "Ρ"},
- {"Σ", "Σ"}, {"Τ", "Τ"}, {"Υ", "Υ"}, {"Φ", "Φ"}, {"Χ", "Χ"},
- {"Ψ", "Ψ"}, {"Ω", "Ω"}, {"α", "α"}, {"β", "β"}, {"γ", "γ"},
- {"δ", "δ"}, {"ε", "ε"}, {"ζ", "ζ"}, {"η", "η"}, {"θ", "θ"},
- {"ι", "ι"}, {"κ", "κ"}, {"λ", "λ"}, {"μ", "μ"}, {"ν", "ν"},
- {"ξ", "ξ"}, {"ο", "ο"}, {"π", "π"}, {"ρ", "ρ"}, {"ς", "ς"},
- {"σ", "σ"}, {"τ", "τ"}, {"υ", "υ"}, {"φ", "φ"}, {"χ", "χ"},
- {"ψ", "ψ"}, {"ω", "ω"}, {"ϑ", "ϑ"}, {"ϒ", "ϒ"}, {"ϖ", "ϖ"},
- {"Œ", "Œ"}, {"œ", "œ"}, {"Š", "Š"}, {"š", "š"}, {"Ÿ", "Ÿ"},
- {"ƒ", "ƒ"}, {"ˆ", "ˆ"}, {"˜", "˜"}, {" ", " "}, {" ", " "},
- {" ", " "}, {"", ""}, {"", ""}, {"", ""}, {"", ""},
- {"–", "–"}, {"—", "—"}, {"‘", "‘"}, {"’", "’"}, {"‚", "‚"},
- {"“", "“"}, {"”", "”"}, {"„", "„"}, {"†", "†"}, {"‡", "‡"},
- {"•", "•"}, {"…", "…"}, {"‰", "‰"}, {"′", "′"}, {"″", "″"},
- {"‹", "‹"}, {"›", "›"}, {"‾", "‾"}, {"€", "€"}, {"™", "™"},
- {"←", "←"}, {"↑", "↑"}, {"→", "→"}, {"↓", "↓"}, {"↔", "↔"},
- {"↵", "↵"}, {"⌈", "⌈"}, {"⌉", "⌉"}, {"⌊", "⌊"}, {"⌋", "⌋"},
- {"◊", "◊"}, {"♠", "♠"}, {"♣", "♣"}, {"♥", "♥"}, {"♦", "♦"}};
+// HTML named character references mapped to their UTF-8 replacement text.
+// Each key is the complete reference as it appears in markup, including the leading
+// '&' and the trailing ';'.
+// Entries are sorted in strictly ascending byte order of the key (uppercase before
+// lowercase, digits before ';' before letters) to allow binary search.
+// Invisible characters are written as escapes so they survive editors and copy/paste.
+static constexpr EntityPair ENTITY_LOOKUP[] = {
+    {"&AElig;", "Æ"}, {"&Aacute;", "Á"}, {"&Acirc;", "Â"}, {"&Agrave;", "À"}, {"&Alpha;", "Α"},
+    {"&Aring;", "Å"}, {"&Atilde;", "Ã"}, {"&Auml;", "Ä"}, {"&Beta;", "Β"}, {"&Ccedil;", "Ç"},
+    {"&Chi;", "Χ"}, {"&Dagger;", "‡"}, {"&Delta;", "Δ"}, {"&ETH;", "Ð"}, {"&Eacute;", "É"},
+    {"&Ecirc;", "Ê"}, {"&Egrave;", "È"}, {"&Epsilon;", "Ε"}, {"&Eta;", "Η"}, {"&Euml;", "Ë"},
+    {"&Gamma;", "Γ"}, {"&Iacute;", "Í"}, {"&Icirc;", "Î"}, {"&Igrave;", "Ì"}, {"&Iota;", "Ι"},
+    {"&Iuml;", "Ï"}, {"&Kappa;", "Κ"}, {"&Lambda;", "Λ"}, {"&Mu;", "Μ"}, {"&Ntilde;", "Ñ"},
+    {"&Nu;", "Ν"}, {"&OElig;", "Œ"}, {"&Oacute;", "Ó"}, {"&Ocirc;", "Ô"}, {"&Ograve;", "Ò"},
+    {"&Omega;", "Ω"}, {"&Omicron;", "Ο"}, {"&Oslash;", "Ø"}, {"&Otilde;", "Õ"}, {"&Ouml;", "Ö"},
+    {"&Phi;", "Φ"}, {"&Pi;", "Π"}, {"&Prime;", "″"}, {"&Psi;", "Ψ"}, {"&Rho;", "Ρ"},
+    {"&Scaron;", "Š"}, {"&Sigma;", "Σ"}, {"&THORN;", "Þ"}, {"&Tau;", "Τ"}, {"&Theta;", "Θ"},
+    {"&Uacute;", "Ú"}, {"&Ucirc;", "Û"}, {"&Ugrave;", "Ù"}, {"&Upsilon;", "Υ"}, {"&Uuml;", "Ü"},
+    {"&Xi;", "Ξ"}, {"&Yacute;", "Ý"}, {"&Yuml;", "Ÿ"}, {"&Zeta;", "Ζ"}, {"&aacute;", "á"},
+    {"&acirc;", "â"}, {"&acute;", "´"}, {"&aelig;", "æ"}, {"&agrave;", "à"}, {"&alpha;", "α"},
+    {"&amp;", "&"}, {"&and;", "∧"}, {"&ang;", "∠"}, {"&aring;", "å"}, {"&asymp;", "≈"},
+    {"&atilde;", "ã"}, {"&auml;", "ä"}, {"&bdquo;", "„"}, {"&beta;", "β"}, {"&brvbar;", "¦"},
+    {"&bull;", "•"}, {"&cap;", "∩"}, {"&ccedil;", "ç"}, {"&cedil;", "¸"}, {"&cent;", "¢"},
+    {"&chi;", "χ"}, {"&circ;", "ˆ"}, {"&clubs;", "♣"}, {"&cong;", "≅"}, {"&copy;", "©"},
+    {"&crarr;", "↵"}, {"&cup;", "∪"}, {"&curren;", "¤"}, {"&dagger;", "†"}, {"&darr;", "↓"},
+    {"&deg;", "°"}, {"&delta;", "δ"}, {"&diams;", "♦"}, {"&divide;", "÷"}, {"&eacute;", "é"},
+    {"&ecirc;", "ê"}, {"&egrave;", "è"}, {"&empty;", "∅"}, {"&emsp;", "\u2003"}, {"&ensp;", "\u2002"},
+    {"&epsilon;", "ε"}, {"&equiv;", "≡"}, {"&eta;", "η"}, {"&eth;", "ð"}, {"&euml;", "ë"},
+    {"&euro;", "€"}, {"&exist;", "∃"}, {"&fnof;", "ƒ"}, {"&forall;", "∀"}, {"&frac12;", "½"},
+    {"&frac14;", "¼"}, {"&frac34;", "¾"}, {"&frasl;", "⁄"}, {"&gamma;", "γ"}, {"&ge;", "≥"},
+    {"&gt;", ">"}, {"&harr;", "↔"}, {"&hearts;", "♥"}, {"&hellip;", "…"}, {"&iacute;", "í"},
+    {"&icirc;", "î"}, {"&iexcl;", "¡"}, {"&igrave;", "ì"}, {"&infin;", "∞"}, {"&int;", "∫"},
+    {"&iota;", "ι"}, {"&iquest;", "¿"}, {"&isin;", "∈"}, {"&iuml;", "ï"}, {"&kappa;", "κ"},
+    {"&lambda;", "λ"}, {"&laquo;", "«"}, {"&larr;", "←"}, {"&lceil;", "⌈"}, {"&ldquo;", "\u201C"},
+    {"&le;", "≤"}, {"&lfloor;", "⌊"}, {"&lowast;", "∗"}, {"&loz;", "◊"}, {"&lrm;", "\u200E"},
+    {"&lsaquo;", "‹"}, {"&lsquo;", "\u2018"}, {"&lt;", "<"}, {"&macr;", "¯"}, {"&mdash;", "—"},
+    {"&micro;", "µ"}, {"&minus;", "−"}, {"&mu;", "μ"}, {"&nabla;", "∇"}, {"&nbsp;", "\xC2\xA0"},
+    {"&ndash;", "–"}, {"&ne;", "≠"}, {"&ni;", "∋"}, {"&not;", "¬"}, {"&notin;", "∉"},
+    {"&nsub;", "⊄"}, {"&ntilde;", "ñ"}, {"&nu;", "ν"}, {"&oacute;", "ó"}, {"&ocirc;", "ô"},
+    {"&oelig;", "œ"}, {"&ograve;", "ò"}, {"&oline;", "‾"}, {"&omega;", "ω"}, {"&omicron;", "ο"},
+    {"&oplus;", "⊕"}, {"&or;", "∨"}, {"&ordf;", "ª"}, {"&ordm;", "º"}, {"&oslash;", "ø"},
+    {"&otilde;", "õ"}, {"&otimes;", "⊗"}, {"&ouml;", "ö"}, {"&para;", "¶"}, {"&part;", "∂"},
+    {"&permil;", "‰"}, {"&perp;", "⊥"}, {"&phi;", "φ"}, {"&pi;", "π"}, {"&piv;", "ϖ"},
+    {"&plusmn;", "±"}, {"&pound;", "£"}, {"&prime;", "′"}, {"&prod;", "∏"}, {"&prop;", "∝"},
+    {"&psi;", "ψ"}, {"&quot;", "\""}, {"&radic;", "√"}, {"&raquo;", "»"}, {"&rarr;", "→"},
+    {"&rceil;", "⌉"}, {"&rdquo;", "\u201D"}, {"&reg;", "®"}, {"&rfloor;", "⌋"}, {"&rho;", "ρ"},
+    {"&rlm;", "\u200F"}, {"&rsaquo;", "›"}, {"&rsquo;", "\u2019"}, {"&sbquo;", "‚"}, {"&scaron;", "š"},
+    {"&sdot;", "⋅"}, {"&sect;", "§"}, {"&shy;", "\xC2\xAD"}, {"&sigma;", "σ"}, {"&sigmaf;", "ς"},
+    {"&sim;", "∼"}, {"&spades;", "♠"}, {"&sub;", "⊂"}, {"&sube;", "⊆"}, {"&sum;", "∑"},
+    {"&sup1;", "¹"}, {"&sup2;", "²"}, {"&sup3;", "³"}, {"&sup;", "⊃"}, {"&supe;", "⊇"},
+    {"&szlig;", "ß"}, {"&tau;", "τ"}, {"&there4;", "∴"}, {"&theta;", "θ"}, {"&thetasym;", "ϑ"},
+    {"&thinsp;", "\u2009"}, {"&thorn;", "þ"}, {"&tilde;", "˜"}, {"&times;", "×"}, {"&trade;", "™"},
+    {"&uacute;", "ú"}, {"&uarr;", "↑"}, {"&ucirc;", "û"}, {"&ugrave;", "ù"}, {"&uml;", "¨"},
+    {"&upsih;", "ϒ"}, {"&upsilon;", "υ"}, {"&uuml;", "ü"}, {"&xi;", "ξ"}, {"&yacute;", "ý"},
+    {"&yen;", "¥"}, {"&yuml;", "ÿ"}, {"&zeta;", "ζ"}, {"&zwj;", "\u200D"}, {"&zwnj;", "\u200C"},
+};
static const size_t ENTITY_LOOKUP_COUNT = sizeof(ENTITY_LOOKUP) / sizeof(ENTITY_LOOKUP[0]);
-// Lookup a single HTML entity and return its UTF-8 value
-const char* lookupHtmlEntity(const char* entity, int len) {
- for (size_t i = 0; i < ENTITY_LOOKUP_COUNT; i++) {
- const char* key = ENTITY_LOOKUP[i].key;
+// Compile-time stand-in for strcmp(); the sortedness static_assert below relies on it.
+// Bytes are compared as unsigned char, the same way memcmp() orders them.
+// Returns -1, 0 or 1.
+static constexpr int constexprStrcmp(const char* a, const char* b) {
+  size_t pos = 0;
+  while (a[pos] == b[pos]) {
+    if (a[pos] == '\0') {
+      return 0;  // both terminators reached without finding a difference
+    }
+    ++pos;
+  }
+  const auto lhs = static_cast<unsigned char>(a[pos]);
+  const auto rhs = static_cast<unsigned char>(b[pos]);
+  return lhs < rhs ? -1 : 1;
+}
+
+// True when every key in ENTITY_LOOKUP is strictly greater than the key before it.
+// An equal neighbour (duplicate key) is rejected as well.
+static constexpr bool isTableSorted() {
+  for (size_t next = 1; next < ENTITY_LOOKUP_COUNT; ++next) {
+    const char* const prevKey = ENTITY_LOOKUP[next - 1].key;
+    const char* const currKey = ENTITY_LOOKUP[next].key;
+    if (constexprStrcmp(prevKey, currKey) >= 0) {
+      return false;
+    }
+  }
+  return true;
+}
+static_assert(isTableSorted(), "ENTITY_LOOKUP must be sorted lexicographically by key");
+
+// Look up a single HTML entity and return its UTF-8 value.
+//
+// entity: start of the entity text; at most `len` bytes of it are read, so it does not
+//         need to be NUL-terminated.
+// len:    number of bytes in `entity`.
+// Returns the matching table value, or nullptr when `entity` is null, `len` is 0, or no
+// key in ENTITY_LOOKUP equals the given bytes.
+//
+// Uses binary search, so ENTITY_LOOKUP must stay sorted by key.
+const char* lookupHtmlEntity(const char* entity, size_t len) {
+ if (entity == nullptr || len == 0) return nullptr;
+
+ // Half-open search window [lo, hi) over the table indices.
+ size_t lo = 0;
+ size_t hi = ENTITY_LOOKUP_COUNT;
+
+ while (lo < hi) {
+ const size_t mid = lo + (hi - lo) / 2;
+ const char* key = ENTITY_LOOKUP[mid].key;
 const size_t keyLen = strlen(key);
- if (static_cast(len) == keyLen && memcmp(entity, key, keyLen) == 0) {
- return ENTITY_LOOKUP[i].value;
+ // Compare only the bytes both strings have, so `entity` is never read past `len`.
+ const size_t cmpLen = (len < keyLen) ? len : keyLen;
+ int cmp = memcmp(entity, key, cmpLen);
+ if (cmp == 0) {
+ // Common prefix is equal: the shorter string orders first, which is the same
+ // result a NUL-terminated string comparison gives.
+ if (len < keyLen)
+ cmp = -1;
+ else if (len > keyLen)
+ cmp = 1;
+ else
+ cmp = 0;
 }
+
+ if (cmp == 0) return ENTITY_LOOKUP[mid].value;
+ // Not a match: discard the half of the window that cannot contain `entity`.
+ if (cmp < 0)
+ hi = mid;
+ else
+ lo = mid + 1;
 }
- return nullptr; // Entity not found
+ return nullptr;
}
diff --git a/lib/Epub/Epub/htmlEntities.h b/lib/Epub/Epub/htmlEntities.h
index 0221195f..c39448b1 100644
--- a/lib/Epub/Epub/htmlEntities.h
+++ b/lib/Epub/Epub/htmlEntities.h
@@ -1,4 +1,4 @@
-// from
+// based on
// https://github.com/atomic14/diy-esp32-epub-reader/blob/2c2f57fdd7e2a788d14a0bcb26b9e845a47aac42/lib/Epub/RubbishHtmlParser/htmlEntities.cpp
#pragma once
@@ -6,4 +6,4 @@
// Lookup a single HTML entity (including & and ;) and return its UTF-8 value
// Returns nullptr if entity is not found
-const char* lookupHtmlEntity(const char* entity, int len);
+const char* lookupHtmlEntity(const char* entity, size_t len);
diff --git a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp
index 8afe3ff9..e732b60a 100644
--- a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp
+++ b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp
@@ -761,7 +761,7 @@ void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char
void XMLCALL ChapterHtmlSlimParser::defaultHandlerExpand(void* userData, const XML_Char* s, const int len) {
// Check if this looks like an entity reference (&...;)
if (len >= 3 && s[0] == '&' && s[len - 1] == ';') {
- const char* utf8Value = lookupHtmlEntity(s, len);
+ const char* utf8Value = lookupHtmlEntity(s, static_cast(len));
if (utf8Value != nullptr) {
// Known entity: expand to its UTF-8 value
characterData(userData, utf8Value, strlen(utf8Value));