## Summary

**What is the goal of this PR?**

Replace the linear scan in `lookupHtmlEntity` with a simple binary search to improve lookup performance.

**What changes are included?**

`lib/Epub/Epub/Entities/htmlEntities.cpp`:
- Sorted the `ENTITY_LOOKUP` array.
- Added a compile-time assertion to guarantee the array remains sorted.
- Rewrote `lookupHtmlEntity` to use a binary search.

## Additional Context

Benchmarked on my x64 laptop (results will probably differ on RISC-V):

```
=== Benchmark (53 entities x 10000 iterations) ===
Version          Total time         Avg per lookup
----------------------------------------------
linear           236.97 ms total    447.11 ns/lookup
binary search    22.09 ms total     41.68 ns/lookup

=== Summary ===
Binary search is 10.73x faster than linear scan.
```

This is a simplified alternative to #1180, focused on keeping the implementation clean and maintainable.

### AI Usage

Did you use AI tools to help write this code? _**< NO >**_

---------

Co-authored-by: Zach Nelson <zach@zdnelson.com>
115 lines
7.3 KiB
C++
115 lines
7.3 KiB
C++
// based on
|
||
// https://github.com/atomic14/diy-esp32-epub-reader/blob/2c2f57fdd7e2a788d14a0bcb26b9e845a47aac42/lib/Epub/RubbishHtmlParser/htmlEntities.cpp
|
||
|
||
#include "htmlEntities.h"
|
||
|
||
#include <cstring>
|
||
|
||
// One row of the entity table: a named character reference and its replacement text.
struct EntityPair {
  const char* key;    // entity exactly as written in markup, including the leading '&' and trailing ';'
  const char* value;  // replacement text (UTF-8 encoded)
};

// Sorted lexicographically by key to allow binary search.
// The ordering is byte-wise (unsigned char), so upper-case names sort before lower-case
// ones and ';' sorts before any letter (e.g. "&sup3;" < "&sup;" < "&supe;").
// Invisible replacement characters (spaces, joiners, direction marks, soft hyphen) are
// written as escapes so they stay readable in the source.
static constexpr EntityPair ENTITY_LOOKUP[] = {
    {"&AElig;", "Æ"}, {"&Aacute;", "Á"}, {"&Acirc;", "Â"}, {"&Agrave;", "À"}, {"&Alpha;", "Α"},
    {"&Aring;", "Å"}, {"&Atilde;", "Ã"}, {"&Auml;", "Ä"}, {"&Beta;", "Β"}, {"&Ccedil;", "Ç"},
    {"&Chi;", "Χ"}, {"&Dagger;", "‡"}, {"&Delta;", "Δ"}, {"&ETH;", "Ð"}, {"&Eacute;", "É"},
    {"&Ecirc;", "Ê"}, {"&Egrave;", "È"}, {"&Epsilon;", "Ε"}, {"&Eta;", "Η"}, {"&Euml;", "Ë"},
    {"&Gamma;", "Γ"}, {"&Iacute;", "Í"}, {"&Icirc;", "Î"}, {"&Igrave;", "Ì"}, {"&Iota;", "Ι"},
    {"&Iuml;", "Ï"}, {"&Kappa;", "Κ"}, {"&Lambda;", "Λ"}, {"&Mu;", "Μ"}, {"&Ntilde;", "Ñ"},
    {"&Nu;", "Ν"}, {"&OElig;", "Œ"}, {"&Oacute;", "Ó"}, {"&Ocirc;", "Ô"}, {"&Ograve;", "Ò"},
    {"&Omega;", "Ω"}, {"&Omicron;", "Ο"}, {"&Oslash;", "Ø"}, {"&Otilde;", "Õ"}, {"&Ouml;", "Ö"},
    {"&Phi;", "Φ"}, {"&Pi;", "Π"}, {"&Prime;", "″"}, {"&Psi;", "Ψ"}, {"&Rho;", "Ρ"},
    {"&Scaron;", "Š"}, {"&Sigma;", "Σ"}, {"&THORN;", "Þ"}, {"&Tau;", "Τ"}, {"&Theta;", "Θ"},
    {"&Uacute;", "Ú"}, {"&Ucirc;", "Û"}, {"&Ugrave;", "Ù"}, {"&Upsilon;", "Υ"}, {"&Uuml;", "Ü"},
    {"&Xi;", "Ξ"}, {"&Yacute;", "Ý"}, {"&Yuml;", "Ÿ"}, {"&Zeta;", "Ζ"}, {"&aacute;", "á"},
    {"&acirc;", "â"}, {"&acute;", "´"}, {"&aelig;", "æ"}, {"&agrave;", "à"}, {"&alpha;", "α"},
    {"&amp;", "&"}, {"&and;", "∧"}, {"&ang;", "∠"}, {"&aring;", "å"}, {"&asymp;", "≈"},
    {"&atilde;", "ã"}, {"&auml;", "ä"}, {"&bdquo;", "„"}, {"&beta;", "β"}, {"&brvbar;", "¦"},
    {"&bull;", "•"}, {"&cap;", "∩"}, {"&ccedil;", "ç"}, {"&cedil;", "¸"}, {"&cent;", "¢"},
    {"&chi;", "χ"}, {"&circ;", "ˆ"}, {"&clubs;", "♣"}, {"&cong;", "≅"}, {"&copy;", "©"},
    {"&crarr;", "↵"}, {"&cup;", "∪"}, {"&curren;", "¤"}, {"&dagger;", "†"}, {"&darr;", "↓"},
    {"&deg;", "°"}, {"&delta;", "δ"}, {"&diams;", "♦"}, {"&divide;", "÷"}, {"&eacute;", "é"},
    {"&ecirc;", "ê"}, {"&egrave;", "è"}, {"&empty;", "∅"}, {"&emsp;", "\xE2\x80\x83"}, {"&ensp;", "\xE2\x80\x82"},
    {"&epsilon;", "ε"}, {"&equiv;", "≡"}, {"&eta;", "η"}, {"&eth;", "ð"}, {"&euml;", "ë"},
    {"&euro;", "€"}, {"&exist;", "∃"}, {"&fnof;", "ƒ"}, {"&forall;", "∀"}, {"&frac12;", "½"},
    {"&frac14;", "¼"}, {"&frac34;", "¾"}, {"&frasl;", "⁄"}, {"&gamma;", "γ"}, {"&ge;", "≥"},
    {"&gt;", ">"}, {"&harr;", "↔"}, {"&hearts;", "♥"}, {"&hellip;", "…"}, {"&iacute;", "í"},
    {"&icirc;", "î"}, {"&iexcl;", "¡"}, {"&igrave;", "ì"}, {"&infin;", "∞"}, {"&int;", "∫"},
    {"&iota;", "ι"}, {"&iquest;", "¿"}, {"&isin;", "∈"}, {"&iuml;", "ï"}, {"&kappa;", "κ"},
    {"&lambda;", "λ"}, {"&laquo;", "«"}, {"&larr;", "←"}, {"&lceil;", "⌈"}, {"&ldquo;", "\u201C"},
    {"&le;", "≤"}, {"&lfloor;", "⌊"}, {"&lowast;", "∗"}, {"&loz;", "◊"}, {"&lrm;", "\u200E"},
    {"&lsaquo;", "‹"}, {"&lsquo;", "\u2018"}, {"&lt;", "<"}, {"&macr;", "¯"}, {"&mdash;", "—"},
    {"&micro;", "µ"}, {"&minus;", "−"}, {"&mu;", "μ"}, {"&nabla;", "∇"}, {"&nbsp;", "\xC2\xA0"},
    {"&ndash;", "–"}, {"&ne;", "≠"}, {"&ni;", "∋"}, {"&not;", "¬"}, {"&notin;", "∉"},
    {"&nsub;", "⊄"}, {"&ntilde;", "ñ"}, {"&nu;", "ν"}, {"&oacute;", "ó"}, {"&ocirc;", "ô"},
    {"&oelig;", "œ"}, {"&ograve;", "ò"}, {"&oline;", "‾"}, {"&omega;", "ω"}, {"&omicron;", "ο"},
    {"&oplus;", "⊕"}, {"&or;", "∨"}, {"&ordf;", "ª"}, {"&ordm;", "º"}, {"&oslash;", "ø"},
    {"&otilde;", "õ"}, {"&otimes;", "⊗"}, {"&ouml;", "ö"}, {"&para;", "¶"}, {"&part;", "∂"},
    {"&permil;", "‰"}, {"&perp;", "⊥"}, {"&phi;", "φ"}, {"&pi;", "π"}, {"&piv;", "ϖ"},
    {"&plusmn;", "±"}, {"&pound;", "£"}, {"&prime;", "′"}, {"&prod;", "∏"}, {"&prop;", "∝"},
    {"&psi;", "ψ"}, {"&quot;", "\""}, {"&radic;", "√"}, {"&raquo;", "»"}, {"&rarr;", "→"},
    {"&rceil;", "⌉"}, {"&rdquo;", "\u201D"}, {"&reg;", "®"}, {"&rfloor;", "⌋"}, {"&rho;", "ρ"},
    {"&rlm;", "\u200F"}, {"&rsaquo;", "›"}, {"&rsquo;", "\u2019"}, {"&sbquo;", "‚"}, {"&scaron;", "š"},
    {"&sdot;", "⋅"}, {"&sect;", "§"}, {"&shy;", "\xC2\xAD"}, {"&sigma;", "σ"}, {"&sigmaf;", "ς"},
    {"&sim;", "∼"}, {"&spades;", "♠"}, {"&sub;", "⊂"}, {"&sube;", "⊆"}, {"&sum;", "∑"},
    {"&sup1;", "¹"}, {"&sup2;", "²"}, {"&sup3;", "³"}, {"&sup;", "⊃"}, {"&supe;", "⊇"},
    {"&szlig;", "ß"}, {"&tau;", "τ"}, {"&there4;", "∴"}, {"&theta;", "θ"}, {"&thetasym;", "ϑ"},
    {"&thinsp;", "\xE2\x80\x89"}, {"&thorn;", "þ"}, {"&tilde;", "˜"}, {"&times;", "×"}, {"&trade;", "™"},
    {"&uacute;", "ú"}, {"&uarr;", "↑"}, {"&ucirc;", "û"}, {"&ugrave;", "ù"}, {"&uml;", "¨"},
    {"&upsih;", "ϒ"}, {"&upsilon;", "υ"}, {"&uuml;", "ü"}, {"&xi;", "ξ"}, {"&yacute;", "ý"},
    {"&yen;", "¥"}, {"&yuml;", "ÿ"}, {"&zeta;", "ζ"}, {"&zwj;", "\u200D"}, {"&zwnj;", "\u200C"},
};
|
||
|
||
// Number of rows in ENTITY_LOOKUP, derived from the array so it can never drift from the table.
// constexpr (rather than plain const) because it bounds the loop in the constexpr sortedness check.
static constexpr size_t ENTITY_LOOKUP_COUNT = sizeof(ENTITY_LOOKUP) / sizeof(ENTITY_LOOKUP[0]);
|
||
|
||
// Verify the table is sorted at compile time.

// Three-way comparison of two NUL-terminated strings, usable in constant expressions.
// Bytes are compared as unsigned char, so bytes >= 0x80 sort after ASCII.
// Returns -1 if a sorts before b, 1 if a sorts after b, and 0 if the strings are equal.
// A string that is a proper prefix of the other sorts first, because its terminating
// NUL compares lower than any other byte.
static constexpr int constexprStrcmp(const char* a, const char* b) {
  for (size_t i = 0;; i++) {
    if (a[i] != b[i]) {
      return static_cast<unsigned char>(a[i]) < static_cast<unsigned char>(b[i]) ? -1 : 1;
    }
    // Bytes are equal here, so reaching a NUL means both strings ended together.
    if (a[i] == '\0') return 0;
  }
}
|
||
|
||
static constexpr bool isTableSorted() {
|
||
for (size_t i = 1; i < ENTITY_LOOKUP_COUNT; i++) {
|
||
if (constexprStrcmp(ENTITY_LOOKUP[i - 1].key, ENTITY_LOOKUP[i].key) >= 0) return false;
|
||
}
|
||
return true;
|
||
}
|
||
static_assert(isTableSorted(), "ENTITY_LOOKUP must be sorted lexicographically by key");
|
||
|
||
// Lookup a single HTML entity and return its UTF-8 value.
|
||
const char* lookupHtmlEntity(const char* entity, size_t len) {
|
||
if (entity == nullptr || len == 0) return nullptr;
|
||
|
||
size_t lo = 0;
|
||
size_t hi = ENTITY_LOOKUP_COUNT;
|
||
|
||
while (lo < hi) {
|
||
const size_t mid = lo + (hi - lo) / 2;
|
||
const char* key = ENTITY_LOOKUP[mid].key;
|
||
const size_t keyLen = strlen(key);
|
||
const size_t cmpLen = (len < keyLen) ? len : keyLen;
|
||
int cmp = memcmp(entity, key, cmpLen);
|
||
if (cmp == 0) {
|
||
// safety net: if prefix equal, shorter string is considered smaller
|
||
if (len < keyLen)
|
||
cmp = -1;
|
||
else if (len > keyLen)
|
||
cmp = 1;
|
||
else
|
||
cmp = 0;
|
||
}
|
||
|
||
if (cmp == 0) return ENTITY_LOOKUP[mid].value;
|
||
if (cmp < 0)
|
||
hi = mid;
|
||
else
|
||
lo = mid + 1;
|
||
}
|
||
|
||
return nullptr;
|
||
}
|