#include "DictHtmlParser.h"

#include <algorithm>
#include <cctype>
#include <cstdio>
#include <cstdlib>
#include <functional>
#include <memory>
#include <stack>
#include <string>

namespace {

// One named HTML entity (including the leading '&' and trailing ';') and the
// UTF-8 text it expands to. An empty replacement means "drop the entity".
struct EntityMapping {
  const char* entity;
  const char* replacement;
};

// Longest entity text that is considered, counting '&' and ';'.
constexpr size_t kMaxEntityLength = 12;

// Named entities that are decoded; anything else is left as literal text.
// NOTE(review): the three entries that map to a plain space after &shy; are
// taken to be &ensp;, &emsp; and &thinsp; - confirm against the intended list.
constexpr EntityMapping kNamedEntities[] = {
    {"&nbsp;", " "},
    {"&lt;", "<"},
    {"&gt;", ">"},
    {"&amp;", "&"},
    {"&quot;", "\""},
    {"&apos;", "'"},
    {"&mdash;", "\xe2\x80\x94"},   // em dash
    {"&ndash;", "\xe2\x80\x93"},   // en dash
    {"&hellip;", "\xe2\x80\xa6"},  // horizontal ellipsis
    {"&rsquo;", "\xe2\x80\x99"},   // right single quote
    {"&lsquo;", "\xe2\x80\x98"},   // left single quote
    {"&rdquo;", "\xe2\x80\x9d"},   // right double quote
    {"&ldquo;", "\xe2\x80\x9c"},   // left double quote
    {"&deg;", "\xc2\xb0"},         // degree sign
    {"&times;", "\xc3\x97"},       // multiplication sign
    {"&divide;", "\xc3\xb7"},      // division sign
    {"&plusmn;", "\xc2\xb1"},      // plus-minus sign
    {"&frac12;", "\xc2\xbd"},      // one half
    {"&frac14;", "\xc2\xbc"},      // one quarter
    {"&frac34;", "\xc2\xbe"},      // three quarters
    {"&cent;", "\xc2\xa2"},        // cent sign
    {"&pound;", "\xc2\xa3"},       // pound sign
    {"&euro;", "\xe2\x82\xac"},    // euro sign
    {"&yen;", "\xc2\xa5"},         // yen sign
    {"&copy;", "\xc2\xa9"},        // copyright sign
    {"&reg;", "\xc2\xae"},         // registered sign
    {"&trade;", "\xe2\x84\xa2"},   // trade mark sign
    {"&bull;", "\xe2\x80\xa2"},    // bullet
    {"&middot;", "\xc2\xb7"},      // middle dot
    {"&sect;", "\xc2\xa7"},        // section sign
    {"&para;", "\xc2\xb6"},        // pilcrow
    {"&dagger;", "\xe2\x80\xa0"},  // dagger
    {"&Dagger;", "\xe2\x80\xa1"},  // double dagger
    {"&iexcl;", "\xc2\xa1"},       // inverted exclamation mark
    {"&iquest;", "\xc2\xbf"},      // inverted question mark
    {"&laquo;", "\xc2\xab"},       // left guillemet
    {"&raquo;", "\xc2\xbb"},       // right guillemet
    {"&lrm;", ""},                 // left-to-right mark (invisible)
    {"&rlm;", ""},                 // right-to-left mark (invisible)
    {"&shy;", ""},                 // soft hyphen
    {"&ensp;", " "},
    {"&emsp;", " "},
    {"&thinsp;", " "},
    {"&zwj;", ""},   // zero-width joiner
    {"&zwnj;", ""},  // zero-width non-joiner
};

// Encodes a Unicode scalar value as UTF-8.
// Returns an empty string for values that have no valid UTF-8 encoding
// (NUL, UTF-16 surrogates, anything above U+10FFFF) so that the caller never
// emits an embedded NUL or an ill-formed byte sequence.
std::string encodeUtf8(const unsigned long codepoint) {
  std::string utf8;
  const bool isSurrogate = codepoint >= 0xD800 && codepoint <= 0xDFFF;
  if (codepoint == 0 || codepoint > 0x10FFFF || isSurrogate) {
    return utf8;
  }

  if (codepoint < 0x80) {
    utf8 += static_cast<char>(codepoint);
  } else if (codepoint < 0x800) {
    utf8 += static_cast<char>(0xC0 | (codepoint >> 6));
    utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
  } else if (codepoint < 0x10000) {
    utf8 += static_cast<char>(0xE0 | (codepoint >> 12));
    utf8 += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
    utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
  } else {
    utf8 += static_cast<char>(0xF0 | (codepoint >> 18));
    utf8 += static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F));
    utf8 += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
    utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
  }
  return utf8;
}

}  // namespace

// Decodes the HTML entity that starts at html[i] (which must be '&').
//
// On success `i` is moved to the terminating ';' (the caller's loop increment
// then steps past it) and the UTF-8 replacement is returned; the replacement
// may be empty for invisible characters and for invalid numeric references.
// If the text at `i` is not a recognised entity, `i` is left untouched and a
// literal "&" is returned so the remaining characters are parsed as plain text.
std::string DictHtmlParser::decodeEntity(const std::string& html, size_t& i) {
  const size_t start = i;  // Position of '&'
  const size_t length = html.length();

  // Numeric entities: &#NNN; or &#xHHH;
  if (start + 2 < length && html[start + 1] == '#') {
    size_t numStart = start + 2;
    bool isHex = false;
    if (start + 3 < length && (html[numStart] == 'x' || html[numStart] == 'X')) {
      isHex = true;
      numStart++;
    }

    // Consume digits; stop at the first character that is not a digit.
    size_t numEnd = numStart;
    while (numEnd < length) {
      const auto c = static_cast<unsigned char>(html[numEnd]);
      const bool isDigit = isHex ? std::isxdigit(c) != 0 : std::isdigit(c) != 0;
      if (!isDigit) break;
      numEnd++;
    }

    // Only accept the reference when it has digits and is ';'-terminated.
    if (numEnd > numStart && numEnd < length && html[numEnd] == ';') {
      const std::string numStr = html.substr(numStart, numEnd - numStart);
      // On overflow strtoul yields ULONG_MAX, which encodeUtf8 rejects.
      const unsigned long codepoint = std::strtoul(numStr.c_str(), nullptr, isHex ? 16 : 10);
      i = numEnd;  // Will be incremented by caller's loop
      return encodeUtf8(codepoint);
    }
  }

  // Named entities: look for the ';' within the maximum entity length only,
  // so a stray '&' never triggers a scan of the whole remaining document.
  const size_t searchEnd = std::min(length, start + kMaxEntityLength);
  size_t semicolon = start + 1;
  while (semicolon < searchEnd && html[semicolon] != ';') {
    semicolon++;
  }

  if (semicolon < searchEnd) {
    const size_t entityLength = semicolon - start + 1;
    for (const auto& mapping : kNamedEntities) {
      // compare() checks the in-place substring against the whole C string.
      if (html.compare(start, entityLength, mapping.entity) == 0) {
        i = semicolon;  // Will be incremented by caller's loop
        return mapping.replacement;
      }
    }
  }

  // Unknown entity - return just the ampersand
  return "&";
}

// Extracts the lower-cased tag name that begins at html[start], where `start`
// is the index just after '<'. Leading whitespace and an optional '/' are
// skipped; `isClosing` reports whether the '/' was present. The name consists
// of alphanumeric characters and '!' and may be empty.
std::string DictHtmlParser::extractTagName(const std::string& html, size_t start, bool& isClosing) {
  isClosing = false;
  size_t pos = start;

  // Skip whitespace after '<'
  while (pos < html.length() && std::isspace(static_cast<unsigned char>(html[pos]))) {
    pos++;
  }

  // Check for closing tag
  if (pos < html.length() && html[pos] == '/') {
    isClosing = true;
    pos++;
  }

  // Extract tag name (alphanumeric characters, plus '!')
  const size_t nameStart = pos;
  while (pos < html.length() && (std::isalnum(static_cast<unsigned char>(html[pos])) || html[pos] == '!')) {
    pos++;
  }

  std::string tagName = html.substr(nameStart, pos - nameStart);

  // Convert to lowercase
  std::transform(tagName.begin(), tagName.end(), tagName.begin(),
                 [](unsigned char c) { return static_cast<char>(std::tolower(c)); });

  return tagName;
}

// True for tags that end the current paragraph. `tagName` must be lower-case.
bool DictHtmlParser::isBlockTag(const std::string& tagName) {
  return tagName == "p" || tagName == "div" || tagName == "br" || tagName == "hr" || tagName == "li" ||
         tagName == "ol" || tagName == "ul" || tagName == "dt" || tagName == "dd" || tagName == "html";
}

// True for tags that switch on bold text.
bool DictHtmlParser::isBoldTag(const std::string& tagName) { return tagName == "b" || tagName == "strong"; }

// True for tags that switch on italic text.
bool DictHtmlParser::isItalicTag(const std::string& tagName) { return tagName == "i" || tagName == "em"; }

// True for tags that switch on underlined text.
bool DictHtmlParser::isUnderlineTag(const std::string& tagName) { return tagName == "u" || tagName == "ins"; }

// True for the superscript tag.
bool DictHtmlParser::isSuperscriptTag(const std::string& tagName) { return tagName == "sup"; }

// True for the list item tag.
bool DictHtmlParser::isListItemTag(const std::string& tagName) { return tagName == "li"; }

// True for the ordered list tag.
bool DictHtmlParser::isOrderedListTag(const std::string& tagName) { return tagName == "ol"; }

// Converts dictionary HTML into laid-out text blocks.
//
// The input is scanned once. Text is split into whitespace-separated words
// that carry the bold/italic/underline state active when the word is flushed;
// block-level tags end the current paragraph, which is then laid out with
// `renderer`, `fontId` and `viewportWidth` and reported line by line through
// `onTextBlock`. List items get a bullet or number prefix, superscripts are
// marked with a leading '^', and entities are expanded via decodeEntity().
void DictHtmlParser::parse(const std::string& html, int fontId, const GfxRenderer& renderer, uint16_t viewportWidth,
                           const std::function<void(std::shared_ptr<TextBlock>)>& onTextBlock) {
  // Current paragraph being built
  ParsedText currentParagraph(TextBlock::Style::LEFT_ALIGN, false, false);

  // Style nesting depths (depth > 0 means the style is active)
  int boldDepth = 0;
  int italicDepth = 0;
  int underlineDepth = 0;

  // One entry per open list: 0 = unordered, -1 = alphabetic,
  // > 0 = number of the next item in an ordered list.
  std::stack<int> listCounters;

  // Current word being accumulated
  std::string currentWord;
  bool lastWasSpace = true;  // Start true to skip leading spaces

  // Helper to flush current word to paragraph
  auto flushWord = [&]() {
    if (currentWord.empty()) return;

    // Determine font style
    EpdFontFamily::Style fontStyle = EpdFontFamily::REGULAR;
    if (boldDepth > 0 && italicDepth > 0) {
      fontStyle = EpdFontFamily::BOLD_ITALIC;
    } else if (boldDepth > 0) {
      fontStyle = EpdFontFamily::BOLD;
    } else if (italicDepth > 0) {
      fontStyle = EpdFontFamily::ITALIC;
    }

    currentParagraph.addWord(currentWord, fontStyle, underlineDepth > 0);
    currentWord.clear();
    lastWasSpace = false;
  };

  // Helper to flush current paragraph (create TextBlocks)
  auto flushParagraph = [&]() {
    flushWord();
    if (!currentParagraph.isEmpty()) {
      currentParagraph.layoutAndExtractLines(renderer, fontId, viewportWidth, onTextBlock);
      currentParagraph = ParsedText(TextBlock::Style::LEFT_ALIGN, false, false);
    }
    lastWasSpace = true;
  };

  // Parse the HTML
  for (size_t i = 0; i < html.length(); i++) {
    const char c = html[i];

    if (c == '<') {
      // Start of tag - flush current word first
      flushWord();

      // Find end of tag
      const size_t tagEnd = html.find('>', i);
      if (tagEnd == std::string::npos) {
        // Malformed HTML - treat the '<' as text. Clearing lastWasSpace makes
        // the next whitespace flush it, so "a < b" is not glued into "a <b".
        currentWord += c;
        lastWasSpace = false;
        continue;
      }

      // Extract tag name
      bool isClosing = false;
      const std::string tagName = extractTagName(html, i + 1, isClosing);

      // Handle different tag types
      if (isBoldTag(tagName)) {
        boldDepth = isClosing ? std::max(0, boldDepth - 1) : boldDepth + 1;
      } else if (isItalicTag(tagName)) {
        italicDepth = isClosing ? std::max(0, italicDepth - 1) : italicDepth + 1;
      } else if (isUnderlineTag(tagName)) {
        underlineDepth = isClosing ? std::max(0, underlineDepth - 1) : underlineDepth + 1;
      } else if (isSuperscriptTag(tagName)) {
        // Superscript is rendered as a caret prefix; the closing tag needs no action.
        if (!isClosing) {
          currentWord += '^';
        }
      } else if (isOrderedListTag(tagName)) {
        if (isClosing) {
          if (!listCounters.empty()) {
            listCounters.pop();
          }
        } else {
          // Check the tag's attributes for an alphabetic list style
          const std::string tagContent = html.substr(i, tagEnd - i);
          if (tagContent.find("list-style-type:lower-alpha") != std::string::npos) {
            listCounters.push(-1);  // -1 = alphabetic
          } else {
            listCounters.push(1);  // Start at 1 for ordered
          }
        }
      } else if (tagName == "ul") {
        if (isClosing) {
          if (!listCounters.empty()) {
            listCounters.pop();
          }
        } else {
          listCounters.push(0);  // 0 = unordered (bullet)
        }
      } else if (isListItemTag(tagName) && !isClosing) {
        // Start of list item - flush paragraph and add bullet/number
        flushParagraph();

        std::string prefix;
        if (!listCounters.empty()) {
          const int counter = listCounters.top();
          if (counter == 0) {
            // Unordered - bullet point
            prefix = "\xe2\x80\xa2 ";
          } else if (counter == -1) {
            // Alphabetic - letters are not generated; the item is only indented
            prefix = " ";
          } else {
            // Ordered - number. The buffer is large enough for any int.
            char numBuf[16];
            std::snprintf(numBuf, sizeof(numBuf), "%d. ", counter);
            prefix = numBuf;
            listCounters.top() = counter + 1;  // Increment for next item
          }
        } else {
          // No list context - fall back to a bullet
          prefix = "\xe2\x80\xa2 ";
        }

        // Add prefix as a word (em-space for indent + prefix)
        currentParagraph.addWord("\xe2\x80\x83" + prefix, EpdFontFamily::REGULAR, false);
        lastWasSpace = true;
      } else if (isBlockTag(tagName)) {
        // Block element - flush paragraph
        flushParagraph();

        // </html> separates dictionary entries.
        // NOTE(review): the paragraph was flushed just above, so this second
        // flush appears to have nothing left to emit and adds no extra spacing
        // - confirm the intended behaviour.
        if (tagName == "html" && isClosing) {
          flushParagraph();
        }
      }

      // Skip to end of tag
      i = tagEnd;
    } else if (c == '&') {
      // HTML entity
      const std::string decoded = decodeEntity(html, i);
      if (!decoded.empty()) {
        if (decoded == " ") {
          // Space entity - treat as space
          if (!lastWasSpace) {
            flushWord();
            lastWasSpace = true;
          }
        } else {
          currentWord += decoded;
          lastWasSpace = false;
        }
      }
    } else if (std::isspace(static_cast<unsigned char>(c))) {
      // Whitespace - flush word and collapse runs of whitespace
      if (!lastWasSpace) {
        flushWord();
        lastWasSpace = true;
      }
    } else {
      // Regular character
      currentWord += c;
      lastWasSpace = false;
    }
  }

  // Flush any remaining content
  flushParagraph();
}