crosspoint-reader/lib/StarDict/DictHtmlParser.cpp
cottongin 8fa01bc83a
Some checks failed
CI / build (push) Failing after 2m16s
fix: prevent Serial.printf from blocking when USB disconnected
On ESP32-C3 with USB CDC, Serial.printf() blocks indefinitely when USB
is not connected. This caused device freezes when booted without USB.

Solution: Call Serial.setTxTimeoutMs(0) after Serial.begin() to make
all Serial output non-blocking.

Also added if (Serial) guards to high-traffic logging paths in
EpubReaderActivity as belt-and-suspenders protection.

Includes documentation of the debugging process and Serial call inventory.

Also applies clang-format to fix pre-existing formatting issues.
2026-01-28 16:02:13 -05:00

364 lines
12 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include "DictHtmlParser.h"
#include <Epub/ParsedText.h>
#include <GfxRenderer.h>
#include <algorithm>
#include <cctype>
#include <stack>
/**
 * Decode the HTML character reference that starts at html[i].
 *
 * Supports decimal (&#NNN;) and hexadecimal (&#xHHH;) numeric references plus
 * a fixed table of common named entities.
 *
 * @param html Text being parsed; html[i] is expected to be the '&'.
 * @param i    In/out index. On success it is moved to the terminating ';'
 *             (the caller's loop increment then steps past it). When nothing
 *             is recognised it is left untouched.
 * @return UTF-8 text for the reference. The result is empty for invisible
 *         entities and for numeric references that do not name a valid
 *         Unicode scalar value (zero, a surrogate, or above U+10FFFF).
 *         An unrecognised reference yields a literal "&".
 */
std::string DictHtmlParser::decodeEntity(const std::string& html, size_t& i) {
  const size_t start = i;  // Position of '&'
  const size_t length = html.length();

  // Numeric references: &#NNN; or &#xHHH;
  if (start + 2 < length && html[start + 1] == '#') {
    size_t numStart = start + 2;
    unsigned long base = 10;
    if (start + 3 < length && (html[numStart] == 'x' || html[numStart] == 'X')) {
      base = 16;
      numStart++;
    }

    // Value of c as a digit in the active base, or -1 when it is not a digit.
    const auto digitValue = [base](char c) -> int {
      if (c >= '0' && c <= '9') return c - '0';
      if (base == 16) {
        if (c >= 'a' && c <= 'f') return c - 'a' + 10;
        if (c >= 'A' && c <= 'F') return c - 'A' + 10;
      }
      return -1;
    };

    constexpr unsigned long kMaxCodepoint = 0x10FFFF;
    unsigned long codepoint = 0;
    size_t numEnd = numStart;
    for (; numEnd < length; numEnd++) {
      const int digit = digitValue(html[numEnd]);
      if (digit < 0) break;
      // Stop accumulating once the value is out of range, so an arbitrarily
      // long digit run can never overflow; it simply stays "too large".
      if (codepoint <= kMaxCodepoint) {
        codepoint = codepoint * base + static_cast<unsigned long>(digit);
      }
    }

    // Require at least one digit and a terminating ';'
    if (numEnd > numStart && numEnd < length && html[numEnd] == ';') {
      i = numEnd;  // Will be incremented by caller's loop

      // NUL would embed a terminator in the word and surrogates cannot be
      // represented in UTF-8, so such references are consumed without output,
      // exactly like values above U+10FFFF.
      const bool isSurrogate = codepoint >= 0xD800 && codepoint <= 0xDFFF;
      if (codepoint == 0 || isSurrogate || codepoint > kMaxCodepoint) {
        return std::string();
      }

      // Convert codepoint to UTF-8
      std::string utf8;
      if (codepoint < 0x80) {
        utf8 += static_cast<char>(codepoint);
      } else if (codepoint < 0x800) {
        utf8 += static_cast<char>(0xC0 | (codepoint >> 6));
        utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
      } else if (codepoint < 0x10000) {
        utf8 += static_cast<char>(0xE0 | (codepoint >> 12));
        utf8 += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
        utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
      } else {
        utf8 += static_cast<char>(0xF0 | (codepoint >> 18));
        utf8 += static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F));
        utf8 += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
        utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
      }
      return utf8;
    }
  }

  // Named entities. Only the first ';' within a short window after '&' is
  // considered, so a stray ampersand never triggers a scan of the whole text.
  constexpr size_t kMaxEntitySpan = 12;
  size_t semicolon = start + 1;
  while (semicolon < length && semicolon - start < kMaxEntitySpan && html[semicolon] != ';') {
    semicolon++;
  }

  if (semicolon < length && semicolon - start < kMaxEntitySpan) {
    // Common named entities
    struct EntityMapping {
      const char* entity;
      const char* replacement;
    };
    static const EntityMapping entities[] = {
        {"&nbsp;", " "},
        {"&lt;", "<"},
        {"&gt;", ">"},
        {"&amp;", "&"},
        {"&quot;", "\""},
        {"&apos;", "'"},
        {"&mdash;", "\xe2\x80\x94"},   // U+2014 em dash
        {"&ndash;", "\xe2\x80\x93"},   // U+2013 en dash
        {"&hellip;", "\xe2\x80\xa6"},  // U+2026 horizontal ellipsis
        {"&rsquo;", "\xe2\x80\x99"},   // U+2019 right single quotation mark
        {"&lsquo;", "\xe2\x80\x98"},   // U+2018 left single quotation mark
        {"&rdquo;", "\xe2\x80\x9d"},   // U+201D right double quotation mark
        {"&ldquo;", "\xe2\x80\x9c"},   // U+201C left double quotation mark
        {"&deg;", "\xc2\xb0"},         // U+00B0 degree sign
        {"&times;", "\xc3\x97"},       // U+00D7 multiplication sign
        {"&divide;", "\xc3\xb7"},      // U+00F7 division sign
        {"&plusmn;", "\xc2\xb1"},      // U+00B1 plus-minus sign
        {"&frac12;", "\xc2\xbd"},      // U+00BD one half
        {"&frac14;", "\xc2\xbc"},      // U+00BC one quarter
        {"&frac34;", "\xc2\xbe"},      // U+00BE three quarters
        {"&cent;", "\xc2\xa2"},        // U+00A2 cent sign
        {"&pound;", "\xc2\xa3"},       // U+00A3 pound sign
        {"&euro;", "\xe2\x82\xac"},    // U+20AC euro sign
        {"&yen;", "\xc2\xa5"},         // U+00A5 yen sign
        {"&copy;", "\xc2\xa9"},        // U+00A9 copyright sign
        {"&reg;", "\xc2\xae"},         // U+00AE registered sign
        {"&trade;", "\xe2\x84\xa2"},   // U+2122 trade mark sign
        {"&bull;", "\xe2\x80\xa2"},    // U+2022 bullet
        {"&middot;", "\xc2\xb7"},      // U+00B7 middle dot
        {"&sect;", "\xc2\xa7"},        // U+00A7 section sign
        {"&para;", "\xc2\xb6"},        // U+00B6 pilcrow sign
        {"&dagger;", "\xe2\x80\xa0"},  // U+2020 dagger
        {"&Dagger;", "\xe2\x80\xa1"},  // U+2021 double dagger
        {"&iexcl;", "\xc2\xa1"},       // U+00A1 inverted exclamation mark
        {"&iquest;", "\xc2\xbf"},      // U+00BF inverted question mark
        {"&laquo;", "\xc2\xab"},       // U+00AB left-pointing double angle quotation mark
        {"&raquo;", "\xc2\xbb"},       // U+00BB right-pointing double angle quotation mark
        {"&lrm;", ""},                 // Left-to-right mark (invisible)
        {"&rlm;", ""},                 // Right-to-left mark (invisible)
        {"&shy;", ""},                 // Soft hyphen
        {"&ensp;", " "},
        {"&emsp;", " "},
        {"&thinsp;", " "},
        {"&zwj;", ""},
        {"&zwnj;", ""},
    };

    // Compare in place against "&name;" instead of copying the candidate out.
    const size_t entityLen = semicolon - start + 1;
    for (const auto& mapping : entities) {
      if (html.compare(start, entityLen, mapping.entity) == 0) {
        i = semicolon;  // Will be incremented by caller's loop
        return mapping.replacement;
      }
    }
  }

  // Unknown entity - return just the ampersand
  return "&";
}
/**
 * Read the tag name that follows a '<'.
 *
 * @param html      Text being parsed.
 * @param start     Index of the first character after the '<'.
 * @param isClosing Set to true when the name is preceded by '/', otherwise false.
 * @return The run of alphanumeric or '!' characters found there, lower-cased
 *         (possibly empty).
 */
std::string DictHtmlParser::extractTagName(const std::string& html, size_t start, bool& isClosing) {
  const size_t len = html.length();
  size_t cursor = start;

  // Leading whitespace between '<' and the name is tolerated
  for (; cursor < len; ++cursor) {
    if (!std::isspace(static_cast<unsigned char>(html[cursor]))) break;
  }

  // A '/' in front of the name marks a closing tag
  isClosing = (cursor < len && html[cursor] == '/');
  if (isClosing) {
    ++cursor;
  }

  // The name runs for as long as characters are alphanumeric or '!'
  const size_t nameBegin = cursor;
  for (; cursor < len; ++cursor) {
    const unsigned char ch = static_cast<unsigned char>(html[cursor]);
    if (!std::isalnum(ch) && ch != '!') break;
  }

  // Lower-case the name so callers can compare against literal tag names
  std::string name = html.substr(nameBegin, cursor - nameBegin);
  for (char& ch : name) {
    ch = static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
  }
  return name;
}
/**
 * Report whether a (lower-case) tag name is one of the tags this parser
 * treats as block-level.
 */
bool DictHtmlParser::isBlockTag(const std::string& tagName) {
  static const char* const kBlockTags[] = {"p", "div", "br", "hr", "li", "ol", "ul", "dt", "dd", "html"};
  for (const char* candidate : kBlockTags) {
    if (tagName == candidate) {
      return true;
    }
  }
  return false;
}
/** True for the tag names "b" and "strong". */
bool DictHtmlParser::isBoldTag(const std::string& tagName) {
  if (tagName == "b") {
    return true;
  }
  return tagName == "strong";
}
/** True for the tag names "i" and "em". */
bool DictHtmlParser::isItalicTag(const std::string& tagName) {
  if (tagName == "i") {
    return true;
  }
  return tagName == "em";
}
/** True for the tag names "u" and "ins". */
bool DictHtmlParser::isUnderlineTag(const std::string& tagName) {
  if (tagName == "u") {
    return true;
  }
  return tagName == "ins";
}
/** True only for the tag name "sup". */
bool DictHtmlParser::isSuperscriptTag(const std::string& tagName) {
  const bool matches = (tagName == "sup");
  return matches;
}
/** True only for the tag name "li". */
bool DictHtmlParser::isListItemTag(const std::string& tagName) {
  const bool matches = (tagName == "li");
  return matches;
}
/** True only for the tag name "ol". */
bool DictHtmlParser::isOrderedListTag(const std::string& tagName) {
  const bool matches = (tagName == "ol");
  return matches;
}
void DictHtmlParser::parse(const std::string& html, int fontId, const GfxRenderer& renderer, uint16_t viewportWidth,
const std::function<void(std::shared_ptr<TextBlock>)>& onTextBlock) {
// Current paragraph being built
ParsedText currentParagraph(TextBlock::Style::LEFT_ALIGN, false, false);
// State tracking
int boldDepth = 0;
int italicDepth = 0;
int underlineDepth = 0;
bool inSuperscript = false;
bool inTag = false;
// List tracking
std::stack<int> listCounters; // Stack for nested lists (0 = unordered, >0 = ordered counter)
// Current word being accumulated
std::string currentWord;
bool lastWasSpace = true; // Start true to skip leading spaces
// Helper to flush current word to paragraph
auto flushWord = [&]() {
if (currentWord.empty()) return;
// Determine font style
EpdFontFamily::Style fontStyle = EpdFontFamily::REGULAR;
if (boldDepth > 0 && italicDepth > 0) {
fontStyle = EpdFontFamily::BOLD_ITALIC;
} else if (boldDepth > 0) {
fontStyle = EpdFontFamily::BOLD;
} else if (italicDepth > 0) {
fontStyle = EpdFontFamily::ITALIC;
}
currentParagraph.addWord(currentWord, fontStyle, underlineDepth > 0);
currentWord.clear();
lastWasSpace = false;
};
// Helper to flush current paragraph (create TextBlocks)
auto flushParagraph = [&]() {
flushWord();
if (!currentParagraph.isEmpty()) {
currentParagraph.layoutAndExtractLines(renderer, fontId, viewportWidth, onTextBlock);
currentParagraph = ParsedText(TextBlock::Style::LEFT_ALIGN, false, false);
}
lastWasSpace = true;
};
// Parse the HTML
for (size_t i = 0; i < html.length(); i++) {
const char c = html[i];
if (c == '<') {
// Start of tag - flush current word first
flushWord();
// Find end of tag
size_t tagEnd = html.find('>', i);
if (tagEnd == std::string::npos) {
// Malformed HTML - treat rest as text
currentWord += c;
continue;
}
// Extract tag name
bool isClosing = false;
std::string tagName = extractTagName(html, i + 1, isClosing);
// Handle different tag types
if (isBoldTag(tagName)) {
if (isClosing) {
boldDepth = std::max(0, boldDepth - 1);
} else {
boldDepth++;
}
} else if (isItalicTag(tagName)) {
if (isClosing) {
italicDepth = std::max(0, italicDepth - 1);
} else {
italicDepth++;
}
} else if (isUnderlineTag(tagName)) {
if (isClosing) {
underlineDepth = std::max(0, underlineDepth - 1);
} else {
underlineDepth++;
}
} else if (isSuperscriptTag(tagName)) {
if (isClosing) {
inSuperscript = false;
} else {
inSuperscript = true;
// Add caret prefix for superscript
currentWord += '^';
}
} else if (isOrderedListTag(tagName)) {
if (isClosing) {
if (!listCounters.empty()) {
listCounters.pop();
}
} else {
// Check if it's an unordered list style
std::string tagContent = html.substr(i, tagEnd - i);
if (tagContent.find("list-style-type:lower-alpha") != std::string::npos) {
listCounters.push(-1); // -1 = alphabetic
} else {
listCounters.push(1); // Start at 1 for ordered
}
}
} else if (tagName == "ul") {
if (isClosing) {
if (!listCounters.empty()) {
listCounters.pop();
}
} else {
listCounters.push(0); // 0 = unordered (bullet)
}
} else if (isListItemTag(tagName) && !isClosing) {
// Start of list item - flush paragraph and add bullet/number
flushParagraph();
std::string prefix;
if (!listCounters.empty()) {
int counter = listCounters.top();
if (counter == 0) {
// Unordered - bullet point
prefix = "\xe2\x80\xa2 "; // • bullet
} else if (counter == -1) {
// Alphabetic - not fully supported, just use bullet
prefix = " ";
} else {
// Ordered - number
char numBuf[8];
snprintf(numBuf, sizeof(numBuf), "%d. ", counter);
prefix = numBuf;
listCounters.pop();
listCounters.push(counter + 1); // Increment for next item
}
} else {
// No list context - just indent
prefix = "\xe2\x80\xa2 "; // • bullet
}
// Add prefix as a word (em-space for indent + prefix)
currentParagraph.addWord("\xe2\x80\x83" + prefix, EpdFontFamily::REGULAR, false);
lastWasSpace = true;
} else if (isBlockTag(tagName)) {
// Block element - flush paragraph
flushParagraph();
// Special handling for </html> which separates dictionary entries
if (tagName == "html" && isClosing) {
// Add extra spacing between entries
flushParagraph();
}
}
// Skip to end of tag
i = tagEnd;
} else if (c == '&') {
// HTML entity
std::string decoded = decodeEntity(html, i);
if (!decoded.empty()) {
if (decoded == " ") {
// Space entity - treat as space
if (!lastWasSpace) {
flushWord();
lastWasSpace = true;
}
} else {
currentWord += decoded;
lastWasSpace = false;
}
}
} else if (std::isspace(static_cast<unsigned char>(c))) {
// Whitespace - flush word and collapse
if (!lastWasSpace) {
flushWord();
lastWasSpace = true;
}
} else {
// Regular character
currentWord += c;
lastWasSpace = false;
}
}
// Flush any remaining content
flushParagraph();
}