#include "DictHtmlParser.h"
#include
#include
#include
#include
#include
/**
 * Decodes the HTML character reference that begins at html[i].
 *
 * Two forms are recognised:
 *  - numeric references, decimal ("&#NNN;") or hexadecimal ("&#xHHH;" / "&#XHHH;"),
 *    which are converted to their UTF-8 encoding;
 *  - a fixed table of named references ("&amp;", "&nbsp;", "&mdash;", ...).
 *
 * @param html Text being parsed.
 * @param i    In: index of the '&' that starts the reference.
 *             Out: on success, the index of the terminating ';' (the caller's loop is
 *             expected to step past it). Left unchanged when nothing is recognised.
 * @return The replacement text (possibly empty, for invisible marks), or "&" when the
 *         text at i is not a recognised reference.
 */
std::string DictHtmlParser::decodeEntity(const std::string& html, size_t& i) {
  const size_t start = i;  // Position of '&'
  const size_t length = html.length();

  // Numeric references: &#NNN; or &#xHHH;
  if (start + 2 < length && html[start + 1] == '#') {
    const unsigned long kMaxCodepoint = 0x10FFFF;

    size_t digitsBegin = start + 2;
    unsigned long base = 10;
    if (digitsBegin + 1 < length && (html[digitsBegin] == 'x' || html[digitsBegin] == 'X')) {
      base = 16;
      ++digitsBegin;
    }

    // Accumulate the digits by hand: locale independent, no temporary string, and the
    // value saturates just above kMaxCodepoint instead of wrapping on very long input.
    unsigned long codepoint = 0;
    size_t digitsEnd = digitsBegin;
    for (; digitsEnd < length; ++digitsEnd) {
      const char c = html[digitsEnd];
      unsigned long digit = 0;
      if (c >= '0' && c <= '9') {
        digit = static_cast<unsigned long>(c - '0');
      } else if (base == 16 && c >= 'a' && c <= 'f') {
        digit = static_cast<unsigned long>(c - 'a') + 10;
      } else if (base == 16 && c >= 'A' && c <= 'F') {
        digit = static_cast<unsigned long>(c - 'A') + 10;
      } else {
        break;  // ';' or any other non-digit ends the number
      }
      if (codepoint <= kMaxCodepoint) {
        codepoint = codepoint * base + digit;
      }
    }

    // Only accept the reference when at least one digit is followed directly by ';'
    if (digitsEnd > digitsBegin && digitsEnd < length && html[digitsEnd] == ';') {
      i = digitsEnd;  // Will be incremented by caller's loop

      // NUL, UTF-16 surrogates and values beyond U+10FFFF have no valid UTF-8 form;
      // emit U+FFFD REPLACEMENT CHARACTER rather than a NUL byte or malformed bytes.
      const bool isSurrogate = codepoint >= 0xD800 && codepoint <= 0xDFFF;
      if (codepoint == 0 || isSurrogate || codepoint > kMaxCodepoint) {
        return "\xEF\xBF\xBD";
      }

      // Convert codepoint to UTF-8
      std::string utf8;
      if (codepoint < 0x80) {
        utf8 += static_cast<char>(codepoint);
      } else if (codepoint < 0x800) {
        utf8 += static_cast<char>(0xC0 | (codepoint >> 6));
        utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
      } else if (codepoint < 0x10000) {
        utf8 += static_cast<char>(0xE0 | (codepoint >> 12));
        utf8 += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
        utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
      } else {
        utf8 += static_cast<char>(0xF0 | (codepoint >> 18));
        utf8 += static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F));
        utf8 += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
        utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
      }
      return utf8;
    }
  }

  // Named references - find the semicolon first. The distance limit keeps a stray '&'
  // in running text from being matched against a ';' far away.
  const size_t semicolon = html.find(';', start + 1);
  if (semicolon != std::string::npos && semicolon - start < 12) {
    const size_t entityLength = semicolon - start + 1;  // includes '&' and ';'

    // Common named entities. Keys carry the leading '&' and trailing ';' because the
    // text they are compared against spans exactly from '&' to ';'.
    struct EntityMapping {
      const char* entity;
      const char* replacement;
    };
    static const EntityMapping entities[] = {
        {"&nbsp;", " "},
        {"&lt;", "<"},
        {"&gt;", ">"},
        {"&amp;", "&"},
        {"&quot;", "\""},
        {"&apos;", "'"},
        {"&mdash;", "\xe2\x80\x94"},   // em dash
        {"&ndash;", "\xe2\x80\x93"},   // en dash
        {"&hellip;", "\xe2\x80\xa6"},  // horizontal ellipsis
        {"&rsquo;", "\xe2\x80\x99"},   // right single quotation mark
        {"&lsquo;", "\xe2\x80\x98"},   // left single quotation mark
        {"&rdquo;", "\xe2\x80\x9d"},   // right double quotation mark
        {"&ldquo;", "\xe2\x80\x9c"},   // left double quotation mark
        {"&deg;", "\xc2\xb0"},         // degree sign
        {"&times;", "\xc3\x97"},       // multiplication sign
        {"&divide;", "\xc3\xb7"},      // division sign
        {"&plusmn;", "\xc2\xb1"},      // plus-minus sign
        {"&frac12;", "\xc2\xbd"},      // one half
        {"&frac14;", "\xc2\xbc"},      // one quarter
        {"&frac34;", "\xc2\xbe"},      // three quarters
        {"&cent;", "\xc2\xa2"},        // cent sign
        {"&pound;", "\xc2\xa3"},       // pound sign
        {"&euro;", "\xe2\x82\xac"},    // euro sign
        {"&yen;", "\xc2\xa5"},         // yen sign
        {"&copy;", "\xc2\xa9"},        // copyright sign
        {"&reg;", "\xc2\xae"},         // registered sign
        {"&trade;", "\xe2\x84\xa2"},   // trade mark sign
        {"&bull;", "\xe2\x80\xa2"},    // bullet
        {"&middot;", "\xc2\xb7"},      // middle dot
        {"&sect;", "\xc2\xa7"},        // section sign
        {"&para;", "\xc2\xb6"},        // pilcrow
        {"&dagger;", "\xe2\x80\xa0"},  // dagger
        {"&Dagger;", "\xe2\x80\xa1"},  // double dagger
        {"&iexcl;", "\xc2\xa1"},       // inverted exclamation mark
        {"&iquest;", "\xc2\xbf"},      // inverted question mark
        {"&laquo;", "\xc2\xab"},       // left-pointing double angle quotation mark
        {"&raquo;", "\xc2\xbb"},       // right-pointing double angle quotation mark
        {"&lrm;", ""},                 // Left-to-right mark (invisible)
        {"&rlm;", ""},                 // Right-to-left mark (invisible)
        {"&shy;", ""},                 // Soft hyphen
        // NOTE(review): the five names below are the usual spacing / zero-width
        // references - confirm the set against the dictionary data.
        {"&ensp;", " "},
        {"&emsp;", " "},
        {"&thinsp;", " "},
        {"&zwj;", ""},
        {"&zwnj;", ""},
    };

    for (const auto& mapping : entities) {
      // Compare in place; no temporary copy of the candidate text is needed.
      if (html.compare(start, entityLength, mapping.entity) == 0) {
        i = semicolon;  // Will be incremented by caller's loop
        return mapping.replacement;
      }
    }
  }

  // Unknown entity - return just the ampersand and leave i on it
  return "&";
}
/**
 * Extracts the lower-cased tag name that follows a '<'.
 *
 * Leading whitespace is skipped, an optional '/' marks a closing tag, and the name is
 * the following run of alphanumeric characters and '!' (so "<!DOCTYPE" yields "!doctype").
 *
 * @param html      Text being parsed.
 * @param start     Index of the first character after '<'.
 * @param isClosing Set to true when a '/' precedes the name, false otherwise.
 * @return The tag name in lower case; empty when no name characters are found or when
 *         start lies beyond the end of html.
 */
std::string DictHtmlParser::extractTagName(const std::string& html, size_t start, bool& isClosing) {
  isClosing = false;
  const size_t length = html.length();
  size_t pos = start;

  // Skip whitespace after '<'. The <cctype> classifiers require a value representable
  // as unsigned char, hence the casts.
  while (pos < length && std::isspace(static_cast<unsigned char>(html[pos]))) {
    ++pos;
  }

  // Check for closing tag
  if (pos < length && html[pos] == '/') {
    isClosing = true;
    ++pos;
  }

  // Collect the name: alphanumeric characters plus '!'
  const size_t nameStart = pos;
  while (pos < length && (std::isalnum(static_cast<unsigned char>(html[pos])) || html[pos] == '!')) {
    ++pos;
  }

  // substr() throws std::out_of_range when its position is past the end; a start index
  // beyond the text simply has no tag name.
  if (nameStart >= length) {
    return std::string();
  }

  std::string tagName = html.substr(nameStart, pos - nameStart);

  // Convert to lowercase
  std::transform(tagName.begin(), tagName.end(), tagName.begin(),
                 [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
  return tagName;
}
/// Returns true when tagName exactly equals one of the names in the block-tag list
/// below. The comparison is case-sensitive.
bool DictHtmlParser::isBlockTag(const std::string& tagName) {
  static const char* const kBlockTags[] = {"p", "div", "br", "hr", "li", "ol", "ul", "dt", "dd", "html"};
  for (const char* const candidate : kBlockTags) {
    if (tagName == candidate) {
      return true;
    }
  }
  return false;
}
/// Returns true for the tag names "b" and "strong" (exact, case-sensitive match).
bool DictHtmlParser::isBoldTag(const std::string& tagName) {
  if (tagName == "b") {
    return true;
  }
  return tagName == "strong";
}
/// Returns true for the tag names "i" and "em" (exact, case-sensitive match).
bool DictHtmlParser::isItalicTag(const std::string& tagName) {
  return tagName.compare("i") == 0 || tagName.compare("em") == 0;
}
/// Returns true for the tag names "u" and "ins" (exact, case-sensitive match).
bool DictHtmlParser::isUnderlineTag(const std::string& tagName) {
  const bool isPlainUnderline = (tagName == "u");
  const bool isInserted = (tagName == "ins");
  return isPlainUnderline || isInserted;
}
/// Returns true only for the tag name "sup" (exact, case-sensitive match).
bool DictHtmlParser::isSuperscriptTag(const std::string& tagName) {
  return tagName.compare("sup") == 0;
}
/// Returns true only for the tag name "li" (exact, case-sensitive match).
bool DictHtmlParser::isListItemTag(const std::string& tagName) {
  return tagName.compare("li") == 0;
}
/// Returns true only for the tag name "ol" (exact, case-sensitive match).
bool DictHtmlParser::isOrderedListTag(const std::string& tagName) {
  return tagName.compare("ol") == 0;
}
void DictHtmlParser::parse(const std::string& html, int fontId, const GfxRenderer& renderer, uint16_t viewportWidth,
const std::function)>& onTextBlock) {
// Current paragraph being built
ParsedText currentParagraph(TextBlock::Style::LEFT_ALIGN, false, false);
// State tracking
int boldDepth = 0;
int italicDepth = 0;
int underlineDepth = 0;
bool inSuperscript = false;
bool inTag = false;
// List tracking
std::stack listCounters; // Stack for nested lists (0 = unordered, >0 = ordered counter)
// Current word being accumulated
std::string currentWord;
bool lastWasSpace = true; // Start true to skip leading spaces
// Helper to flush current word to paragraph
auto flushWord = [&]() {
if (currentWord.empty()) return;
// Determine font style
EpdFontFamily::Style fontStyle = EpdFontFamily::REGULAR;
if (boldDepth > 0 && italicDepth > 0) {
fontStyle = EpdFontFamily::BOLD_ITALIC;
} else if (boldDepth > 0) {
fontStyle = EpdFontFamily::BOLD;
} else if (italicDepth > 0) {
fontStyle = EpdFontFamily::ITALIC;
}
currentParagraph.addWord(currentWord, fontStyle, underlineDepth > 0);
currentWord.clear();
lastWasSpace = false;
};
// Helper to flush current paragraph (create TextBlocks)
auto flushParagraph = [&]() {
flushWord();
if (!currentParagraph.isEmpty()) {
currentParagraph.layoutAndExtractLines(renderer, fontId, viewportWidth, onTextBlock);
currentParagraph = ParsedText(TextBlock::Style::LEFT_ALIGN, false, false);
}
lastWasSpace = true;
};
// Parse the HTML
for (size_t i = 0; i < html.length(); i++) {
const char c = html[i];
if (c == '<') {
// Start of tag - flush current word first
flushWord();
// Find end of tag
size_t tagEnd = html.find('>', i);
if (tagEnd == std::string::npos) {
// Malformed HTML - treat rest as text
currentWord += c;
continue;
}
// Extract tag name
bool isClosing = false;
std::string tagName = extractTagName(html, i + 1, isClosing);
// Handle different tag types
if (isBoldTag(tagName)) {
if (isClosing) {
boldDepth = std::max(0, boldDepth - 1);
} else {
boldDepth++;
}
} else if (isItalicTag(tagName)) {
if (isClosing) {
italicDepth = std::max(0, italicDepth - 1);
} else {
italicDepth++;
}
} else if (isUnderlineTag(tagName)) {
if (isClosing) {
underlineDepth = std::max(0, underlineDepth - 1);
} else {
underlineDepth++;
}
} else if (isSuperscriptTag(tagName)) {
if (isClosing) {
inSuperscript = false;
} else {
inSuperscript = true;
// Add caret prefix for superscript
currentWord += '^';
}
} else if (isOrderedListTag(tagName)) {
if (isClosing) {
if (!listCounters.empty()) {
listCounters.pop();
}
} else {
// Check if it's an unordered list style
std::string tagContent = html.substr(i, tagEnd - i);
if (tagContent.find("list-style-type:lower-alpha") != std::string::npos) {
listCounters.push(-1); // -1 = alphabetic
} else {
listCounters.push(1); // Start at 1 for ordered
}
}
} else if (tagName == "ul") {
if (isClosing) {
if (!listCounters.empty()) {
listCounters.pop();
}
} else {
listCounters.push(0); // 0 = unordered (bullet)
}
} else if (isListItemTag(tagName) && !isClosing) {
// Start of list item - flush paragraph and add bullet/number
flushParagraph();
std::string prefix;
if (!listCounters.empty()) {
int counter = listCounters.top();
if (counter == 0) {
// Unordered - bullet point
prefix = "\xe2\x80\xa2 "; // • bullet
} else if (counter == -1) {
// Alphabetic - not fully supported, just use bullet
prefix = " ";
} else {
// Ordered - number
char numBuf[8];
snprintf(numBuf, sizeof(numBuf), "%d. ", counter);
prefix = numBuf;
listCounters.pop();
listCounters.push(counter + 1); // Increment for next item
}
} else {
// No list context - just indent
prefix = "\xe2\x80\xa2 "; // • bullet
}
// Add prefix as a word (em-space for indent + prefix)
currentParagraph.addWord("\xe2\x80\x83" + prefix, EpdFontFamily::REGULAR, false);
lastWasSpace = true;
} else if (isBlockTag(tagName)) {
// Block element - flush paragraph
flushParagraph();
// Special handling for