From 1abe307f20490815b3d86d57f46febbaf865a88d Mon Sep 17 00:00:00 2001 From: Uri Tauber <142022451+Uri-Tauber@users.noreply.github.com> Date: Thu, 26 Feb 2026 20:55:31 +0200 Subject: [PATCH] perf: Optimize HTML entities lookup to O(log(n)) (#1194) ## Summary **What is the goal of this PR?** Replace the linear scan of `lookupHtmlEntity` with a simple binary search to improve lookup performance. **What changes are included?** `lib/Epub/Epub/htmlEntities.cpp`: - Sorted the `ENTITY_LOOKUP` array. - Added a compile-time assertion to guarantee the array remains sorted. - Rewrote `lookupHtmlEntity` to use a binary search. ## Additional Context Benchmarked on my x64 laptop (results will probably differ on RISC-V) ``` === Benchmark (53 entities x 10000 iterations) === Version Total time Avg per lookup ---------------------------------------------- linear 236.97 ms total 447.11 ns/lookup binary search 22.09 ms total 41.68 ns/lookup === Summary === Binary search is 10.73x faster than linear scan. ``` This is a simplified alternative to #1180, focused on keeping the implementation clean and maintainable. ### AI Usage Did you use AI tools to help write this code? 
_**< NO >**_ --------- Co-authored-by: Zach Nelson --- lib/Epub/Epub/htmlEntities.cpp | 152 +++++++++++------- lib/Epub/Epub/htmlEntities.h | 4 +- .../Epub/parsers/ChapterHtmlSlimParser.cpp | 2 +- 3 files changed, 98 insertions(+), 60 deletions(-) diff --git a/lib/Epub/Epub/htmlEntities.cpp b/lib/Epub/Epub/htmlEntities.cpp index 82d3819a..6fdcb71c 100644 --- a/lib/Epub/Epub/htmlEntities.cpp +++ b/lib/Epub/Epub/htmlEntities.cpp @@ -1,4 +1,4 @@ -// from +// based on // https://github.com/atomic14/diy-esp32-epub-reader/blob/2c2f57fdd7e2a788d14a0bcb26b9e845a47aac42/lib/Epub/RubbishHtmlParser/htmlEntities.cpp #include "htmlEntities.h" @@ -10,67 +10,105 @@ struct EntityPair { const char* value; }; -static const EntityPair ENTITY_LOOKUP[] = { - {""", "\""}, {"⁄", "⁄"}, {"&", "&"}, {"<", "<"}, {">", ">"}, - {"À", "À"}, {"Á", "Á"}, {"Â", "Â"}, {"Ã", "Ã"}, {"Ä", "Ä"}, - {"Å", "Å"}, {"Æ", "Æ"}, {"Ç", "Ç"}, {"È", "È"}, {"É", "É"}, - {"Ê", "Ê"}, {"Ë", "Ë"}, {"Ì", "Ì"}, {"Í", "Í"}, {"Î", "Î"}, - {"Ï", "Ï"}, {"Ð", "Ð"}, {"Ñ", "Ñ"}, {"Ò", "Ò"}, {"Ó", "Ó"}, - {"Ô", "Ô"}, {"Õ", "Õ"}, {"Ö", "Ö"}, {"Ø", "Ø"}, {"Ù", "Ù"}, - {"Ú", "Ú"}, {"Û", "Û"}, {"Ü", "Ü"}, {"Ý", "Ý"}, {"Þ", "Þ"}, - {"ß", "ß"}, {"à", "à"}, {"á", "á"}, {"â", "â"}, {"ã", "ã"}, - {"ä", "ä"}, {"å", "å"}, {"æ", "æ"}, {"ç", "ç"}, {"è", "è"}, - {"é", "é"}, {"ê", "ê"}, {"ë", "ë"}, {"ì", "ì"}, {"í", "í"}, - {"î", "î"}, {"ï", "ï"}, {"ð", "ð"}, {"ñ", "ñ"}, {"ò", "ò"}, - {"ó", "ó"}, {"ô", "ô"}, {"õ", "õ"}, {"ö", "ö"}, {"ø", "ø"}, - {"ù", "ù"}, {"ú", "ú"}, {"û", "û"}, {"ü", "ü"}, {"ý", "ý"}, - {"þ", "þ"}, {"ÿ", "ÿ"}, {" ", "\xC2\xA0"}, {"¡", "¡"}, {"¢", "¢"}, - {"£", "£"}, {"¤", "¤"}, {"¥", "¥"}, {"¦", "¦"}, {"§", "§"}, - {"¨", "¨"}, {"©", "©"}, {"ª", "ª"}, {"«", "«"}, {"¬", "¬"}, - {"­", "­"}, {"®", "®"}, {"¯", "¯"}, {"°", "°"}, {"±", "±"}, - {"²", "²"}, {"³", "³"}, {"´", "´"}, {"µ", "µ"}, {"¶", "¶"}, - {"¸", "¸"}, {"¹", "¹"}, {"º", "º"}, {"»", "»"}, {"¼", "¼"}, - {"½", "½"}, {"¾", "¾"}, {"¿", "¿"}, {"×", "×"}, {"÷", "÷"}, - 
{"∀", "∀"}, {"∂", "∂"}, {"∃", "∃"}, {"∅", "∅"}, {"∇", "∇"}, - {"∈", "∈"}, {"∉", "∉"}, {"∋", "∋"}, {"∏", "∏"}, {"∑", "∑"}, - {"−", "−"}, {"∗", "∗"}, {"√", "√"}, {"∝", "∝"}, {"∞", "∞"}, - {"∠", "∠"}, {"∧", "∧"}, {"∨", "∨"}, {"∩", "∩"}, {"∪", "∪"}, - {"∫", "∫"}, {"∴", "∴"}, {"∼", "∼"}, {"≅", "≅"}, {"≈", "≈"}, - {"≠", "≠"}, {"≡", "≡"}, {"≤", "≤"}, {"≥", "≥"}, {"⊂", "⊂"}, - {"⊃", "⊃"}, {"⊄", "⊄"}, {"⊆", "⊆"}, {"⊇", "⊇"}, {"⊕", "⊕"}, - {"⊗", "⊗"}, {"⊥", "⊥"}, {"⋅", "⋅"}, {"Α", "Α"}, {"Β", "Β"}, - {"Γ", "Γ"}, {"Δ", "Δ"}, {"Ε", "Ε"}, {"Ζ", "Ζ"}, {"Η", "Η"}, - {"Θ", "Θ"}, {"Ι", "Ι"}, {"Κ", "Κ"}, {"Λ", "Λ"}, {"Μ", "Μ"}, - {"Ν", "Ν"}, {"Ξ", "Ξ"}, {"Ο", "Ο"}, {"Π", "Π"}, {"Ρ", "Ρ"}, - {"Σ", "Σ"}, {"Τ", "Τ"}, {"Υ", "Υ"}, {"Φ", "Φ"}, {"Χ", "Χ"}, - {"Ψ", "Ψ"}, {"Ω", "Ω"}, {"α", "α"}, {"β", "β"}, {"γ", "γ"}, - {"δ", "δ"}, {"ε", "ε"}, {"ζ", "ζ"}, {"η", "η"}, {"θ", "θ"}, - {"ι", "ι"}, {"κ", "κ"}, {"λ", "λ"}, {"μ", "μ"}, {"ν", "ν"}, - {"ξ", "ξ"}, {"ο", "ο"}, {"π", "π"}, {"ρ", "ρ"}, {"ς", "ς"}, - {"σ", "σ"}, {"τ", "τ"}, {"υ", "υ"}, {"φ", "φ"}, {"χ", "χ"}, - {"ψ", "ψ"}, {"ω", "ω"}, {"ϑ", "ϑ"}, {"ϒ", "ϒ"}, {"ϖ", "ϖ"}, - {"Œ", "Œ"}, {"œ", "œ"}, {"Š", "Š"}, {"š", "š"}, {"Ÿ", "Ÿ"}, - {"ƒ", "ƒ"}, {"ˆ", "ˆ"}, {"˜", "˜"}, {" ", " "}, {" ", " "}, - {" ", " "}, {"‌", "‌"}, {"‍", "‍"}, {"‎", "‎"}, {"‏", "‏"}, - {"–", "–"}, {"—", "—"}, {"‘", "‘"}, {"’", "’"}, {"‚", "‚"}, - {"“", "“"}, {"”", "”"}, {"„", "„"}, {"†", "†"}, {"‡", "‡"}, - {"•", "•"}, {"…", "…"}, {"‰", "‰"}, {"′", "′"}, {"″", "″"}, - {"‹", "‹"}, {"›", "›"}, {"‾", "‾"}, {"€", "€"}, {"™", "™"}, - {"←", "←"}, {"↑", "↑"}, {"→", "→"}, {"↓", "↓"}, {"↔", "↔"}, - {"↵", "↵"}, {"⌈", "⌈"}, {"⌉", "⌉"}, {"⌊", "⌊"}, {"⌋", "⌋"}, - {"◊", "◊"}, {"♠", "♠"}, {"♣", "♣"}, {"♥", "♥"}, {"♦", "♦"}}; +// Sorted lexicographically by key to allow binary search. 
+static constexpr EntityPair ENTITY_LOOKUP[] = { + {"Æ", "Æ"}, {"Á", "Á"}, {"Â", "Â"}, {"À", "À"}, {"Α", "Α"}, + {"Å", "Å"}, {"Ã", "Ã"}, {"Ä", "Ä"}, {"Β", "Β"}, {"Ç", "Ç"}, + {"Χ", "Χ"}, {"‡", "‡"}, {"Δ", "Δ"}, {"Ð", "Ð"}, {"É", "É"}, + {"Ê", "Ê"}, {"È", "È"}, {"Ε", "Ε"}, {"Η", "Η"}, {"Ë", "Ë"}, + {"Γ", "Γ"}, {"Í", "Í"}, {"Î", "Î"}, {"Ì", "Ì"}, {"Ι", "Ι"}, + {"Ï", "Ï"}, {"Κ", "Κ"}, {"Λ", "Λ"}, {"Μ", "Μ"}, {"Ñ", "Ñ"}, + {"Ν", "Ν"}, {"Œ", "Œ"}, {"Ó", "Ó"}, {"Ô", "Ô"}, {"Ò", "Ò"}, + {"Ω", "Ω"}, {"Ο", "Ο"}, {"Ø", "Ø"}, {"Õ", "Õ"}, {"Ö", "Ö"}, + {"Φ", "Φ"}, {"Π", "Π"}, {"″", "″"}, {"Ψ", "Ψ"}, {"Ρ", "Ρ"}, + {"Š", "Š"}, {"Σ", "Σ"}, {"Þ", "Þ"}, {"Τ", "Τ"}, {"Θ", "Θ"}, + {"Ú", "Ú"}, {"Û", "Û"}, {"Ù", "Ù"}, {"Υ", "Υ"}, {"Ü", "Ü"}, + {"Ξ", "Ξ"}, {"Ý", "Ý"}, {"Ÿ", "Ÿ"}, {"Ζ", "Ζ"}, {"á", "á"}, + {"â", "â"}, {"´", "´"}, {"æ", "æ"}, {"à", "à"}, {"α", "α"}, + {"&", "&"}, {"∧", "∧"}, {"∠", "∠"}, {"å", "å"}, {"≈", "≈"}, + {"ã", "ã"}, {"ä", "ä"}, {"„", "„"}, {"β", "β"}, {"¦", "¦"}, + {"•", "•"}, {"∩", "∩"}, {"ç", "ç"}, {"¸", "¸"}, {"¢", "¢"}, + {"χ", "χ"}, {"ˆ", "ˆ"}, {"♣", "♣"}, {"≅", "≅"}, {"©", "©"}, + {"↵", "↵"}, {"∪", "∪"}, {"¤", "¤"}, {"†", "†"}, {"↓", "↓"}, + {"°", "°"}, {"δ", "δ"}, {"♦", "♦"}, {"÷", "÷"}, {"é", "é"}, + {"ê", "ê"}, {"è", "è"}, {"∅", "∅"}, {" ", " "}, {" ", " "}, + {"ε", "ε"}, {"≡", "≡"}, {"η", "η"}, {"ð", "ð"}, {"ë", "ë"}, + {"€", "€"}, {"∃", "∃"}, {"ƒ", "ƒ"}, {"∀", "∀"}, {"½", "½"}, + {"¼", "¼"}, {"¾", "¾"}, {"⁄", "⁄"}, {"γ", "γ"}, {"≥", "≥"}, + {">", ">"}, {"↔", "↔"}, {"♥", "♥"}, {"…", "…"}, {"í", "í"}, + {"î", "î"}, {"¡", "¡"}, {"ì", "ì"}, {"∞", "∞"}, {"∫", "∫"}, + {"ι", "ι"}, {"¿", "¿"}, {"∈", "∈"}, {"ï", "ï"}, {"κ", "κ"}, + {"λ", "λ"}, {"«", "«"}, {"←", "←"}, {"⌈", "⌈"}, {"“", "\u201C"}, + {"≤", "≤"}, {"⌊", "⌊"}, {"∗", "∗"}, {"◊", "◊"}, {"‎", "\u200E"}, + {"‹", "‹"}, {"‘", "\u2018"}, {"<", "<"}, {"¯", "¯"}, {"—", "—"}, + {"µ", "µ"}, {"−", "−"}, {"μ", "μ"}, {"∇", "∇"}, {" ", "\xC2\xA0"}, + {"–", "–"}, {"≠", "≠"}, {"∋", "∋"}, {"¬", "¬"}, {"∉", "∉"}, + 
{"⊄", "⊄"}, {"ñ", "ñ"}, {"ν", "ν"}, {"ó", "ó"}, {"ô", "ô"}, + {"œ", "œ"}, {"ò", "ò"}, {"‾", "‾"}, {"ω", "ω"}, {"ο", "ο"}, + {"⊕", "⊕"}, {"∨", "∨"}, {"ª", "ª"}, {"º", "º"}, {"ø", "ø"}, + {"õ", "õ"}, {"⊗", "⊗"}, {"ö", "ö"}, {"¶", "¶"}, {"∂", "∂"}, + {"‰", "‰"}, {"⊥", "⊥"}, {"φ", "φ"}, {"π", "π"}, {"ϖ", "ϖ"}, + {"±", "±"}, {"£", "£"}, {"′", "′"}, {"∏", "∏"}, {"∝", "∝"}, + {"ψ", "ψ"}, {""", "\""}, {"√", "√"}, {"»", "»"}, {"→", "→"}, + {"⌉", "⌉"}, {"”", "\u201D"}, {"®", "®"}, {"⌋", "⌋"}, {"ρ", "ρ"}, + {"‏", "\u200F"}, {"›", "›"}, {"’", "\u2019"}, {"‚", "‚"}, {"š", "š"}, + {"⋅", "⋅"}, {"§", "§"}, {"­", "\xC2\xAD"}, {"σ", "σ"}, {"ς", "ς"}, + {"∼", "∼"}, {"♠", "♠"}, {"⊂", "⊂"}, {"⊆", "⊆"}, {"∑", "∑"}, + {"¹", "¹"}, {"²", "²"}, {"³", "³"}, {"⊃", "⊃"}, {"⊇", "⊇"}, + {"ß", "ß"}, {"τ", "τ"}, {"∴", "∴"}, {"θ", "θ"}, {"ϑ", "ϑ"}, + {" ", " "}, {"þ", "þ"}, {"˜", "˜"}, {"×", "×"}, {"™", "™"}, + {"ú", "ú"}, {"↑", "↑"}, {"û", "û"}, {"ù", "ù"}, {"¨", "¨"}, + {"ϒ", "ϒ"}, {"υ", "υ"}, {"ü", "ü"}, {"ξ", "ξ"}, {"ý", "ý"}, + {"¥", "¥"}, {"ÿ", "ÿ"}, {"ζ", "ζ"}, {"‍", "\u200D"}, {"‌", "\u200C"}, +}; static const size_t ENTITY_LOOKUP_COUNT = sizeof(ENTITY_LOOKUP) / sizeof(ENTITY_LOOKUP[0]); -// Lookup a single HTML entity and return its UTF-8 value -const char* lookupHtmlEntity(const char* entity, int len) { - for (size_t i = 0; i < ENTITY_LOOKUP_COUNT; i++) { - const char* key = ENTITY_LOOKUP[i].key; +// Verify the table is sorted at compile time. +static constexpr int constexprStrcmp(const char* a, const char* b) { + for (size_t i = 0;; i++) { + if (a[i] != b[i]) return (unsigned char)a[i] < (unsigned char)b[i] ? 
-1 : 1; + if (a[i] == '\0') return 0; + } +} + +static constexpr bool isTableSorted() { + for (size_t i = 1; i < ENTITY_LOOKUP_COUNT; i++) { + if (constexprStrcmp(ENTITY_LOOKUP[i - 1].key, ENTITY_LOOKUP[i].key) >= 0) return false; + } + return true; +} +static_assert(isTableSorted(), "ENTITY_LOOKUP must be sorted lexicographically by key"); + +// Lookup a single HTML entity and return its UTF-8 value. +const char* lookupHtmlEntity(const char* entity, size_t len) { + if (entity == nullptr || len == 0) return nullptr; + + size_t lo = 0; + size_t hi = ENTITY_LOOKUP_COUNT; + + while (lo < hi) { + const size_t mid = lo + (hi - lo) / 2; + const char* key = ENTITY_LOOKUP[mid].key; const size_t keyLen = strlen(key); - if (static_cast(len) == keyLen && memcmp(entity, key, keyLen) == 0) { - return ENTITY_LOOKUP[i].value; + const size_t cmpLen = (len < keyLen) ? len : keyLen; + int cmp = memcmp(entity, key, cmpLen); + if (cmp == 0) { + // safety net: if prefix equal, shorter string is considered smaller + if (len < keyLen) + cmp = -1; + else if (len > keyLen) + cmp = 1; + else + cmp = 0; } + + if (cmp == 0) return ENTITY_LOOKUP[mid].value; + if (cmp < 0) + hi = mid; + else + lo = mid + 1; } - return nullptr; // Entity not found + return nullptr; } diff --git a/lib/Epub/Epub/htmlEntities.h b/lib/Epub/Epub/htmlEntities.h index 0221195f..c39448b1 100644 --- a/lib/Epub/Epub/htmlEntities.h +++ b/lib/Epub/Epub/htmlEntities.h @@ -1,4 +1,4 @@ -// from +// based on // https://github.com/atomic14/diy-esp32-epub-reader/blob/2c2f57fdd7e2a788d14a0bcb26b9e845a47aac42/lib/Epub/RubbishHtmlParser/htmlEntities.cpp #pragma once @@ -6,4 +6,4 @@ // Lookup a single HTML entity (including & and ;) and return its UTF-8 value // Returns nullptr if entity is not found -const char* lookupHtmlEntity(const char* entity, int len); +const char* lookupHtmlEntity(const char* entity, size_t len); diff --git a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp 
index 8afe3ff9..e732b60a 100644 --- a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp +++ b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp @@ -761,7 +761,7 @@ void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char void XMLCALL ChapterHtmlSlimParser::defaultHandlerExpand(void* userData, const XML_Char* s, const int len) { // Check if this looks like an entity reference (&...;) if (len >= 3 && s[0] == '&' && s[len - 1] == ';') { - const char* utf8Value = lookupHtmlEntity(s, len); + const char* utf8Value = lookupHtmlEntity(s, static_cast(len)); if (utf8Value != nullptr) { // Known entity: expand to its UTF-8 value characterData(userData, utf8Value, strlen(utf8Value));