crosspoint-reader/lib/StarDict/DictHtmlParser.cpp

364 lines
12 KiB
C++
Raw Normal View History

2026-01-22 12:42:01 -05:00
#include "DictHtmlParser.h"

#include <Epub/ParsedText.h>
#include <GfxRenderer.h>

#include <algorithm>
#include <cctype>
#include <stack>
#include <string>
/**
 * Decodes the HTML character reference that starts at html[i].
 *
 * Supports decimal (&#NNN;) and hexadecimal (&#xHHH;) numeric references as
 * well as a fixed table of common named entities.
 *
 * @param html Text being parsed; html[i] is expected to be '&'.
 * @param i    In: index of the '&'. Out: when a reference is decoded, the index
 *             of its terminating ';' (the caller's loop increment then steps
 *             past it). Left unchanged when nothing is decoded.
 * @return The UTF-8 replacement text (empty for the invisible entities in the
 *         table), or "&" when the reference is not recognised.
 */
std::string DictHtmlParser::decodeEntity(const std::string& html, size_t& i) {
  const size_t start = i;  // Position of '&'
  const size_t length = html.length();

  // Numeric references: &#NNN; or &#xHHH;
  if (start + 2 < length && html[start + 1] == '#') {
    size_t numStart = start + 2;
    const bool isHex = start + 3 < length && (html[numStart] == 'x' || html[numStart] == 'X');
    if (isHex) {
      numStart++;
    }

    constexpr unsigned long kMaxCodepoint = 0x10FFFF;
    const unsigned long radix = isHex ? 16 : 10;

    // Scan the digits and accumulate the value in one pass. Once the value is
    // past the Unicode range it is frozen, so arbitrarily long digit runs
    // cannot overflow.
    unsigned long codepoint = 0;
    size_t numEnd = numStart;
    for (; numEnd < length; ++numEnd) {
      const unsigned char c = static_cast<unsigned char>(html[numEnd]);
      unsigned long digit = 0;
      if (std::isdigit(c)) {
        digit = static_cast<unsigned long>(c - '0');
      } else if (isHex && std::isxdigit(c)) {
        digit = static_cast<unsigned long>(std::tolower(c) - 'a') + 10;
      } else {
        break;  // ';' or any other non-digit ends the number
      }
      if (codepoint <= kMaxCodepoint) {
        codepoint = codepoint * radix + digit;
      }
    }

    // Only accept the reference when it has at least one digit and a ';'.
    if (numEnd > numStart && numEnd < length && html[numEnd] == ';') {
      i = numEnd;  // Will be incremented by caller's loop

      // NUL, surrogates and out-of-range values cannot be represented as
      // well-formed UTF-8 text; substitute U+FFFD as the HTML standard does.
      const bool isSurrogate = codepoint >= 0xD800 && codepoint <= 0xDFFF;
      if (codepoint == 0 || isSurrogate || codepoint > kMaxCodepoint) {
        codepoint = 0xFFFD;
      }

      // Convert codepoint to UTF-8
      std::string utf8;
      if (codepoint < 0x80) {
        utf8 += static_cast<char>(codepoint);
      } else if (codepoint < 0x800) {
        utf8 += static_cast<char>(0xC0 | (codepoint >> 6));
        utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
      } else if (codepoint < 0x10000) {
        utf8 += static_cast<char>(0xE0 | (codepoint >> 12));
        utf8 += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
        utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
      } else {
        utf8 += static_cast<char>(0xF0 | (codepoint >> 18));
        utf8 += static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F));
        utf8 += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
        utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
      }
      return utf8;
    }
  }

  // Named entities - find the semicolon first. Anything of 13+ characters
  // (including '&' and ';') cannot match the table and is rejected up front.
  const size_t semicolon = html.find(';', start + 1);
  if (semicolon != std::string::npos && semicolon - start < 12) {
    const size_t entityLength = semicolon - start + 1;

    // Common named entities
    struct EntityMapping {
      const char* entity;
      const char* replacement;
    };
    static constexpr EntityMapping entities[] = {
        {"&nbsp;", " "},
        {"&lt;", "<"},
        {"&gt;", ">"},
        {"&amp;", "&"},
        {"&quot;", "\""},
        {"&apos;", "'"},
        {"&mdash;", "\xe2\x80\x94"},   // — em dash
        {"&ndash;", "\xe2\x80\x93"},   // – en dash
        {"&hellip;", "\xe2\x80\xa6"},  // … ellipsis
        {"&rsquo;", "\xe2\x80\x99"},   // right single quotation mark
        {"&lsquo;", "\xe2\x80\x98"},   // left single quotation mark
        {"&rdquo;", "\xe2\x80\x9d"},   // right double quotation mark
        {"&ldquo;", "\xe2\x80\x9c"},   // left double quotation mark
        {"&deg;", "\xc2\xb0"},         // °
        {"&times;", "\xc3\x97"},       // ×
        {"&divide;", "\xc3\xb7"},      // ÷
        {"&plusmn;", "\xc2\xb1"},      // ±
        {"&frac12;", "\xc2\xbd"},      // ½
        {"&frac14;", "\xc2\xbc"},      // ¼
        {"&frac34;", "\xc2\xbe"},      // ¾
        {"&cent;", "\xc2\xa2"},        // ¢
        {"&pound;", "\xc2\xa3"},       // £
        {"&euro;", "\xe2\x82\xac"},    // €
        {"&yen;", "\xc2\xa5"},         // ¥
        {"&copy;", "\xc2\xa9"},        // ©
        {"&reg;", "\xc2\xae"},         // ®
        {"&trade;", "\xe2\x84\xa2"},   // ™
        {"&bull;", "\xe2\x80\xa2"},    // •
        {"&middot;", "\xc2\xb7"},      // ·
        {"&sect;", "\xc2\xa7"},        // §
        {"&para;", "\xc2\xb6"},        // ¶
        {"&dagger;", "\xe2\x80\xa0"},  // †
        {"&Dagger;", "\xe2\x80\xa1"},  // ‡
        {"&iexcl;", "\xc2\xa1"},       // ¡
        {"&iquest;", "\xc2\xbf"},      // ¿
        {"&laquo;", "\xc2\xab"},       // «
        {"&raquo;", "\xc2\xbb"},       // »
        {"&lrm;", ""},                 // Left-to-right mark (invisible)
        {"&rlm;", ""},                 // Right-to-left mark (invisible)
        {"&shy;", ""},                 // Soft hyphen
        {"&ensp;", " "},
        {"&emsp;", " "},
        {"&thinsp;", " "},
        {"&zwj;", ""},
        {"&zwnj;", ""},
    };

    for (const auto& mapping : entities) {
      // Compares html[start, semicolon] against the whole table key without
      // allocating a temporary substring.
      if (html.compare(start, entityLength, mapping.entity) == 0) {
        i = semicolon;  // Will be incremented by caller's loop
        return mapping.replacement;
      }
    }
  }

  // Unknown entity - return just the ampersand and leave `i` on it so the
  // following characters are processed as ordinary text.
  return "&";
}
/**
 * Extracts the lower-cased element name from the text that follows a '<'.
 *
 * @param html      Text being parsed.
 * @param start     Index of the first character after the '<'.
 * @param isClosing Out: true when the name is preceded by '/', false otherwise.
 * @return The run of alphanumeric and '!' characters found at that position,
 *         converted to lowercase; empty when there is none.
 */
std::string DictHtmlParser::extractTagName(const std::string& html, size_t start, bool& isClosing) {
  const size_t length = html.length();
  size_t pos = start;

  // Skip whitespace after '<'
  while (pos < length && std::isspace(static_cast<unsigned char>(html[pos]))) {
    pos++;
  }

  // A leading '/' marks a closing tag
  isClosing = pos < length && html[pos] == '/';
  if (isClosing) {
    pos++;
  }

  // Collect the name, lower-casing as we go. '!' is accepted so that
  // declarations/comments ("<!...") yield a non-empty name.
  std::string tagName;
  for (; pos < length; pos++) {
    const unsigned char c = static_cast<unsigned char>(html[pos]);
    if (!std::isalnum(c) && c != '!') {
      break;
    }
    tagName += static_cast<char>(std::tolower(c));
  }
  return tagName;
}
/// Reports whether tagName (expected in lowercase) is one of the tags this
/// parser treats as a block-level element.
bool DictHtmlParser::isBlockTag(const std::string& tagName) {
  static const char* const kBlockTags[] = {"p", "div", "br", "hr", "li", "ol", "ul", "dt", "dd", "html"};
  for (const char* candidate : kBlockTags) {
    if (tagName == candidate) {
      return true;
    }
  }
  return false;
}
/// Reports whether tagName is one of the bold tags: "b" or "strong".
bool DictHtmlParser::isBoldTag(const std::string& tagName) {
  if (tagName == "b") {
    return true;
  }
  return tagName == "strong";
}
2026-01-22 12:42:01 -05:00
/// Reports whether tagName is one of the italic tags: "i" or "em".
bool DictHtmlParser::isItalicTag(const std::string& tagName) {
  if (tagName == "i") {
    return true;
  }
  return tagName == "em";
}
2026-01-22 12:42:01 -05:00
/// Reports whether tagName is one of the underline tags: "u" or "ins".
bool DictHtmlParser::isUnderlineTag(const std::string& tagName) {
  if (tagName == "u") {
    return true;
  }
  return tagName == "ins";
}
2026-01-22 12:42:01 -05:00
/// Reports whether tagName is the superscript tag "sup".
bool DictHtmlParser::isSuperscriptTag(const std::string& tagName) {
  return tagName.compare("sup") == 0;
}
/// Reports whether tagName is the list-item tag "li".
bool DictHtmlParser::isListItemTag(const std::string& tagName) {
  return tagName.compare("li") == 0;
}
/// Reports whether tagName is the ordered-list tag "ol".
bool DictHtmlParser::isOrderedListTag(const std::string& tagName) {
  return tagName.compare("ol") == 0;
}
void DictHtmlParser::parse(const std::string& html, int fontId, const GfxRenderer& renderer, uint16_t viewportWidth,
const std::function<void(std::shared_ptr<TextBlock>)>& onTextBlock) {
// Current paragraph being built
ParsedText currentParagraph(TextBlock::Style::LEFT_ALIGN, false, false);
// State tracking
int boldDepth = 0;
int italicDepth = 0;
int underlineDepth = 0;
bool inSuperscript = false;
bool inTag = false;
// List tracking
std::stack<int> listCounters; // Stack for nested lists (0 = unordered, >0 = ordered counter)
// Current word being accumulated
std::string currentWord;
bool lastWasSpace = true; // Start true to skip leading spaces
// Helper to flush current word to paragraph
auto flushWord = [&]() {
if (currentWord.empty()) return;
// Determine font style
EpdFontFamily::Style fontStyle = EpdFontFamily::REGULAR;
if (boldDepth > 0 && italicDepth > 0) {
fontStyle = EpdFontFamily::BOLD_ITALIC;
} else if (boldDepth > 0) {
fontStyle = EpdFontFamily::BOLD;
} else if (italicDepth > 0) {
fontStyle = EpdFontFamily::ITALIC;
}
currentParagraph.addWord(currentWord, fontStyle, underlineDepth > 0);
currentWord.clear();
lastWasSpace = false;
};
// Helper to flush current paragraph (create TextBlocks)
auto flushParagraph = [&]() {
flushWord();
if (!currentParagraph.isEmpty()) {
currentParagraph.layoutAndExtractLines(renderer, fontId, viewportWidth, onTextBlock);
currentParagraph = ParsedText(TextBlock::Style::LEFT_ALIGN, false, false);
}
lastWasSpace = true;
};
// Parse the HTML
for (size_t i = 0; i < html.length(); i++) {
const char c = html[i];
if (c == '<') {
// Start of tag - flush current word first
flushWord();
// Find end of tag
size_t tagEnd = html.find('>', i);
if (tagEnd == std::string::npos) {
// Malformed HTML - treat rest as text
currentWord += c;
continue;
}
// Extract tag name
bool isClosing = false;
std::string tagName = extractTagName(html, i + 1, isClosing);
// Handle different tag types
if (isBoldTag(tagName)) {
if (isClosing) {
boldDepth = std::max(0, boldDepth - 1);
} else {
boldDepth++;
}
} else if (isItalicTag(tagName)) {
if (isClosing) {
italicDepth = std::max(0, italicDepth - 1);
} else {
italicDepth++;
}
} else if (isUnderlineTag(tagName)) {
if (isClosing) {
underlineDepth = std::max(0, underlineDepth - 1);
} else {
underlineDepth++;
}
} else if (isSuperscriptTag(tagName)) {
if (isClosing) {
inSuperscript = false;
} else {
inSuperscript = true;
// Add caret prefix for superscript
currentWord += '^';
}
} else if (isOrderedListTag(tagName)) {
if (isClosing) {
if (!listCounters.empty()) {
listCounters.pop();
}
} else {
// Check if it's an unordered list style
std::string tagContent = html.substr(i, tagEnd - i);
if (tagContent.find("list-style-type:lower-alpha") != std::string::npos) {
listCounters.push(-1); // -1 = alphabetic
} else {
listCounters.push(1); // Start at 1 for ordered
}
}
} else if (tagName == "ul") {
if (isClosing) {
if (!listCounters.empty()) {
listCounters.pop();
}
} else {
listCounters.push(0); // 0 = unordered (bullet)
}
} else if (isListItemTag(tagName) && !isClosing) {
// Start of list item - flush paragraph and add bullet/number
flushParagraph();
std::string prefix;
if (!listCounters.empty()) {
int counter = listCounters.top();
if (counter == 0) {
// Unordered - bullet point
prefix = "\xe2\x80\xa2 "; // • bullet
} else if (counter == -1) {
// Alphabetic - not fully supported, just use bullet
prefix = " ";
} else {
// Ordered - number
char numBuf[8];
snprintf(numBuf, sizeof(numBuf), "%d. ", counter);
prefix = numBuf;
listCounters.pop();
listCounters.push(counter + 1); // Increment for next item
}
} else {
// No list context - just indent
prefix = "\xe2\x80\xa2 "; // • bullet
}
// Add prefix as a word (em-space for indent + prefix)
currentParagraph.addWord("\xe2\x80\x83" + prefix, EpdFontFamily::REGULAR, false);
lastWasSpace = true;
} else if (isBlockTag(tagName)) {
// Block element - flush paragraph
flushParagraph();
// Special handling for </html> which separates dictionary entries
if (tagName == "html" && isClosing) {
// Add extra spacing between entries
flushParagraph();
}
}
// Skip to end of tag
i = tagEnd;
} else if (c == '&') {
// HTML entity
std::string decoded = decodeEntity(html, i);
if (!decoded.empty()) {
if (decoded == " ") {
// Space entity - treat as space
if (!lastWasSpace) {
flushWord();
lastWasSpace = true;
}
} else {
currentWord += decoded;
lastWasSpace = false;
}
}
} else if (std::isspace(static_cast<unsigned char>(c))) {
// Whitespace - flush word and collapse
if (!lastWasSpace) {
flushWord();
lastWasSpace = true;
}
} else {
// Regular character
currentWord += c;
lastWasSpace = false;
}
}
// Flush any remaining content
flushParagraph();
}