371 lines
12 KiB
C++
371 lines
12 KiB
C++
#include "DictHtmlParser.h"

#include <Epub/ParsedText.h>
#include <GfxRenderer.h>

#include <algorithm>
#include <cctype>
#include <cstdlib>
#include <stack>
#include <string>
|
||
|
||
/**
 * Decodes the HTML character reference that starts at html[i].
 *
 * Handles decimal (&#NNN;) and hexadecimal (&#xHHH;) numeric references and a
 * fixed table of common named references.
 *
 * @param html Markup being parsed.
 * @param i    In: index of the '&'. Out: on a successful decode, the index of
 *             the terminating ';' (the caller's loop increment then steps past
 *             it). Left unchanged when the text is not a recognised reference.
 * @return The UTF-8 replacement text. This is an empty string for references
 *         that are recognised but produce no output (invisible marks, and
 *         numeric references that do not name a valid Unicode scalar value),
 *         and a literal "&" when nothing was recognised.
 */
std::string DictHtmlParser::decodeEntity(const std::string& html, size_t& i) {
  const size_t start = i;  // Position of '&'
  const size_t length = html.length();
  if (start >= length) {
    return "&";
  }

  // Numeric references: &#NNN; or &#xHHH;
  if (length - start > 2 && html[start + 1] == '#') {
    size_t numStart = start + 2;
    const bool isHex = html[numStart] == 'x' || html[numStart] == 'X';
    if (isHex) {
      numStart++;
    }

    // Consume the run of digits valid for the chosen base.
    size_t numEnd = numStart;
    while (numEnd < length) {
      const unsigned char digit = static_cast<unsigned char>(html[numEnd]);
      const bool accepted = isHex ? std::isxdigit(digit) != 0 : std::isdigit(digit) != 0;
      if (!accepted) break;
      numEnd++;
    }

    // Only a non-empty digit run immediately followed by ';' is a reference.
    if (numEnd > numStart && numEnd < length && html[numEnd] == ';') {
      const std::string numStr = html.substr(numStart, numEnd - numStart);
      // strtoul saturates at ULONG_MAX on overflow, which the range check below rejects.
      const unsigned long codepoint = std::strtoul(numStr.c_str(), nullptr, isHex ? 16 : 10);
      i = numEnd;  // Will be incremented by caller's loop

      // NUL would embed a terminator in the word, surrogates are not encodable
      // in UTF-8, and anything above U+10FFFF is outside Unicode: emit nothing.
      const bool isSurrogate = codepoint >= 0xD800 && codepoint <= 0xDFFF;
      if (codepoint == 0 || isSurrogate || codepoint > 0x10FFFF) {
        return std::string();
      }

      // Convert codepoint to UTF-8
      std::string utf8;
      if (codepoint < 0x80) {
        utf8 += static_cast<char>(codepoint);
      } else if (codepoint < 0x800) {
        utf8 += static_cast<char>(0xC0 | (codepoint >> 6));
        utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
      } else if (codepoint < 0x10000) {
        utf8 += static_cast<char>(0xE0 | (codepoint >> 12));
        utf8 += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
        utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
      } else {
        utf8 += static_cast<char>(0xF0 | (codepoint >> 18));
        utf8 += static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F));
        utf8 += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
        utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
      }
      return utf8;
    }
  }

  // Named references. The ';' must lie fewer than 12 characters after the '&',
  // so only that window is searched instead of the rest of the document.
  const size_t kMaxSemicolonOffset = 12;
  const size_t searchEnd = std::min(length, start + kMaxSemicolonOffset);
  size_t semicolon = start + 1;
  while (semicolon < searchEnd && html[semicolon] != ';') {
    semicolon++;
  }

  if (semicolon < searchEnd) {
    const size_t entityLength = semicolon - start + 1;  // includes '&' and ';'

    // Common named entities
    struct EntityMapping {
      const char* entity;
      const char* replacement;
    };
    static const EntityMapping entities[] = {
        {"&nbsp;", " "},
        {"&lt;", "<"},
        {"&gt;", ">"},
        {"&amp;", "&"},
        {"&quot;", "\""},
        {"&apos;", "'"},
        {"&mdash;", "\xe2\x80\x94"},   // em dash
        {"&ndash;", "\xe2\x80\x93"},   // en dash
        {"&hellip;", "\xe2\x80\xa6"},  // horizontal ellipsis
        {"&rsquo;", "\xe2\x80\x99"},   // right single quotation mark
        {"&lsquo;", "\xe2\x80\x98"},   // left single quotation mark
        {"&rdquo;", "\xe2\x80\x9d"},   // right double quotation mark
        {"&ldquo;", "\xe2\x80\x9c"},   // left double quotation mark
        {"&deg;", "\xc2\xb0"},         // degree sign
        {"&times;", "\xc3\x97"},       // multiplication sign
        {"&divide;", "\xc3\xb7"},      // division sign
        {"&plusmn;", "\xc2\xb1"},      // plus-minus sign
        {"&frac12;", "\xc2\xbd"},      // one half
        {"&frac14;", "\xc2\xbc"},      // one quarter
        {"&frac34;", "\xc2\xbe"},      // three quarters
        {"&cent;", "\xc2\xa2"},        // cent sign
        {"&pound;", "\xc2\xa3"},       // pound sign
        {"&euro;", "\xe2\x82\xac"},    // euro sign
        {"&yen;", "\xc2\xa5"},         // yen sign
        {"&copy;", "\xc2\xa9"},        // copyright sign
        {"&reg;", "\xc2\xae"},         // registered sign
        {"&trade;", "\xe2\x84\xa2"},   // trade mark sign
        {"&bull;", "\xe2\x80\xa2"},    // bullet
        {"&middot;", "\xc2\xb7"},      // middle dot
        {"&sect;", "\xc2\xa7"},        // section sign
        {"&para;", "\xc2\xb6"},        // pilcrow
        {"&dagger;", "\xe2\x80\xa0"},  // dagger
        {"&Dagger;", "\xe2\x80\xa1"},  // double dagger
        {"&iexcl;", "\xc2\xa1"},       // inverted exclamation mark
        {"&iquest;", "\xc2\xbf"},      // inverted question mark
        {"&laquo;", "\xc2\xab"},       // left-pointing double angle quotation mark
        {"&raquo;", "\xc2\xbb"},       // right-pointing double angle quotation mark
        {"&lrm;", ""},                 // Left-to-right mark (invisible)
        {"&rlm;", ""},                 // Right-to-left mark (invisible)
        {"&shy;", ""},                 // Soft hyphen
        {"&ensp;", " "},
        {"&emsp;", " "},
        {"&thinsp;", " "},
        {"&zwj;", ""},
        {"&zwnj;", ""},
    };

    for (const auto& mapping : entities) {
      // Compare in place against the "&name;" slice; names are case-sensitive.
      if (html.compare(start, entityLength, mapping.entity) == 0) {
        i = semicolon;  // Will be incremented by caller's loop
        return mapping.replacement;
      }
    }
  }

  // Unknown entity - return just the ampersand
  return "&";
}
|
||
|
||
/**
 * Reads the element name of a tag.
 *
 * @param html      Markup being parsed.
 * @param start     Index of the first character after the '<'.
 * @param isClosing Out: set to true when a '/' precedes the name, false otherwise.
 * @return The tag name in lower case. It is empty when no alphanumeric or '!'
 *         character follows the optional whitespace and '/'.
 */
std::string DictHtmlParser::extractTagName(const std::string& html, size_t start, bool& isClosing) {
  const size_t length = html.length();
  size_t cursor = start;

  // Tolerate whitespace between '<' and the rest of the tag.
  for (; cursor < length; ++cursor) {
    if (!std::isspace(static_cast<unsigned char>(html[cursor]))) break;
  }

  // A '/' at this point marks an end tag.
  isClosing = cursor < length && html[cursor] == '/';
  if (isClosing) {
    ++cursor;
  }

  // The name is the following run of alphanumerics; '!' is accepted as well.
  const size_t nameBegin = cursor;
  for (; cursor < length; ++cursor) {
    const char ch = html[cursor];
    if (ch != '!' && !std::isalnum(static_cast<unsigned char>(ch))) break;
  }

  // Lower-case the name so callers can compare against lower-case literals.
  std::string name = html.substr(nameBegin, cursor - nameBegin);
  for (char& ch : name) {
    ch = static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
  }
  return name;
}
|
||
|
||
/**
 * Reports whether tagName (expected in lower case) is one of the tags in the
 * block-tag list below.
 */
bool DictHtmlParser::isBlockTag(const std::string& tagName) {
  static const char* const kBlockTags[] = {"p", "div", "br", "hr", "li", "ol", "ul", "dt", "dd", "html"};

  for (const char* candidate : kBlockTags) {
    if (tagName == candidate) {
      return true;
    }
  }
  return false;
}
|
||
|
||
/** Reports whether tagName is "b" or "strong" (exact, case-sensitive match). */
bool DictHtmlParser::isBoldTag(const std::string& tagName) {
  if (tagName == "strong") {
    return true;
  }
  return tagName == "b";
}
|
||
|
||
/** Reports whether tagName is "i" or "em" (exact, case-sensitive match). */
bool DictHtmlParser::isItalicTag(const std::string& tagName) {
  if (tagName == "em") {
    return true;
  }
  return tagName == "i";
}
|
||
|
||
/** Reports whether tagName is "u" or "ins" (exact, case-sensitive match). */
bool DictHtmlParser::isUnderlineTag(const std::string& tagName) {
  if (tagName == "ins") {
    return true;
  }
  return tagName == "u";
}
|
||
|
||
/** Reports whether tagName is exactly "sup". */
bool DictHtmlParser::isSuperscriptTag(const std::string& tagName) {
  return tagName.compare("sup") == 0;
}
|
||
|
||
/** Reports whether tagName is exactly "li". */
bool DictHtmlParser::isListItemTag(const std::string& tagName) {
  return tagName.compare("li") == 0;
}
|
||
|
||
/** Reports whether tagName is exactly "ol". */
bool DictHtmlParser::isOrderedListTag(const std::string& tagName) {
  return tagName.compare("ol") == 0;
}
|
||
|
||
void DictHtmlParser::parse(const std::string& html, int fontId, const GfxRenderer& renderer, uint16_t viewportWidth,
|
||
const std::function<void(std::shared_ptr<TextBlock>)>& onTextBlock) {
|
||
// Current paragraph being built
|
||
ParsedText currentParagraph(TextBlock::Style::LEFT_ALIGN, false, false);
|
||
|
||
// State tracking
|
||
int boldDepth = 0;
|
||
int italicDepth = 0;
|
||
int underlineDepth = 0;
|
||
bool inSuperscript = false;
|
||
bool inTag = false;
|
||
|
||
// List tracking
|
||
std::stack<int> listCounters; // Stack for nested lists (0 = unordered, >0 = ordered counter)
|
||
|
||
// Current word being accumulated
|
||
std::string currentWord;
|
||
bool lastWasSpace = true; // Start true to skip leading spaces
|
||
|
||
// Helper to flush current word to paragraph
|
||
auto flushWord = [&]() {
|
||
if (currentWord.empty()) return;
|
||
|
||
// Determine font style
|
||
EpdFontFamily::Style fontStyle = EpdFontFamily::REGULAR;
|
||
if (boldDepth > 0 && italicDepth > 0) {
|
||
fontStyle = EpdFontFamily::BOLD_ITALIC;
|
||
} else if (boldDepth > 0) {
|
||
fontStyle = EpdFontFamily::BOLD;
|
||
} else if (italicDepth > 0) {
|
||
fontStyle = EpdFontFamily::ITALIC;
|
||
}
|
||
|
||
currentParagraph.addWord(currentWord, fontStyle, underlineDepth > 0);
|
||
currentWord.clear();
|
||
lastWasSpace = false;
|
||
};
|
||
|
||
// Helper to flush current paragraph (create TextBlocks)
|
||
auto flushParagraph = [&]() {
|
||
flushWord();
|
||
if (!currentParagraph.isEmpty()) {
|
||
currentParagraph.layoutAndExtractLines(renderer, fontId, viewportWidth, onTextBlock);
|
||
currentParagraph = ParsedText(TextBlock::Style::LEFT_ALIGN, false, false);
|
||
}
|
||
lastWasSpace = true;
|
||
};
|
||
|
||
// Parse the HTML
|
||
for (size_t i = 0; i < html.length(); i++) {
|
||
const char c = html[i];
|
||
|
||
if (c == '<') {
|
||
// Start of tag - flush current word first
|
||
flushWord();
|
||
|
||
// Find end of tag
|
||
size_t tagEnd = html.find('>', i);
|
||
if (tagEnd == std::string::npos) {
|
||
// Malformed HTML - treat rest as text
|
||
currentWord += c;
|
||
continue;
|
||
}
|
||
|
||
// Extract tag name
|
||
bool isClosing = false;
|
||
std::string tagName = extractTagName(html, i + 1, isClosing);
|
||
|
||
// Handle different tag types
|
||
if (isBoldTag(tagName)) {
|
||
if (isClosing) {
|
||
boldDepth = std::max(0, boldDepth - 1);
|
||
} else {
|
||
boldDepth++;
|
||
}
|
||
} else if (isItalicTag(tagName)) {
|
||
if (isClosing) {
|
||
italicDepth = std::max(0, italicDepth - 1);
|
||
} else {
|
||
italicDepth++;
|
||
}
|
||
} else if (isUnderlineTag(tagName)) {
|
||
if (isClosing) {
|
||
underlineDepth = std::max(0, underlineDepth - 1);
|
||
} else {
|
||
underlineDepth++;
|
||
}
|
||
} else if (isSuperscriptTag(tagName)) {
|
||
if (isClosing) {
|
||
inSuperscript = false;
|
||
} else {
|
||
inSuperscript = true;
|
||
// Add caret prefix for superscript
|
||
currentWord += '^';
|
||
}
|
||
} else if (isOrderedListTag(tagName)) {
|
||
if (isClosing) {
|
||
if (!listCounters.empty()) {
|
||
listCounters.pop();
|
||
}
|
||
} else {
|
||
// Check if it's an unordered list style
|
||
std::string tagContent = html.substr(i, tagEnd - i);
|
||
if (tagContent.find("list-style-type:lower-alpha") != std::string::npos) {
|
||
listCounters.push(-1); // -1 = alphabetic
|
||
} else {
|
||
listCounters.push(1); // Start at 1 for ordered
|
||
}
|
||
}
|
||
} else if (tagName == "ul") {
|
||
if (isClosing) {
|
||
if (!listCounters.empty()) {
|
||
listCounters.pop();
|
||
}
|
||
} else {
|
||
listCounters.push(0); // 0 = unordered (bullet)
|
||
}
|
||
} else if (isListItemTag(tagName) && !isClosing) {
|
||
// Start of list item - flush paragraph and add bullet/number
|
||
flushParagraph();
|
||
|
||
std::string prefix;
|
||
if (!listCounters.empty()) {
|
||
int counter = listCounters.top();
|
||
if (counter == 0) {
|
||
// Unordered - bullet point
|
||
prefix = "\xe2\x80\xa2 "; // • bullet
|
||
} else if (counter == -1) {
|
||
// Alphabetic - not fully supported, just use bullet
|
||
prefix = " ";
|
||
} else {
|
||
// Ordered - number
|
||
char numBuf[8];
|
||
snprintf(numBuf, sizeof(numBuf), "%d. ", counter);
|
||
prefix = numBuf;
|
||
listCounters.pop();
|
||
listCounters.push(counter + 1); // Increment for next item
|
||
}
|
||
} else {
|
||
// No list context - just indent
|
||
prefix = "\xe2\x80\xa2 "; // • bullet
|
||
}
|
||
|
||
// Add prefix as a word (em-space for indent + prefix)
|
||
currentParagraph.addWord("\xe2\x80\x83" + prefix, EpdFontFamily::REGULAR, false);
|
||
lastWasSpace = true;
|
||
} else if (isBlockTag(tagName)) {
|
||
// Block element - flush paragraph
|
||
flushParagraph();
|
||
|
||
// Special handling for </html> which separates dictionary entries
|
||
if (tagName == "html" && isClosing) {
|
||
// Add extra spacing between entries
|
||
flushParagraph();
|
||
}
|
||
}
|
||
|
||
// Skip to end of tag
|
||
i = tagEnd;
|
||
} else if (c == '&') {
|
||
// HTML entity
|
||
std::string decoded = decodeEntity(html, i);
|
||
if (!decoded.empty()) {
|
||
if (decoded == " ") {
|
||
// Space entity - treat as space
|
||
if (!lastWasSpace) {
|
||
flushWord();
|
||
lastWasSpace = true;
|
||
}
|
||
} else {
|
||
currentWord += decoded;
|
||
lastWasSpace = false;
|
||
}
|
||
}
|
||
} else if (std::isspace(static_cast<unsigned char>(c))) {
|
||
// Whitespace - flush word and collapse
|
||
if (!lastWasSpace) {
|
||
flushWord();
|
||
lastWasSpace = true;
|
||
}
|
||
} else {
|
||
// Regular character
|
||
currentWord += c;
|
||
lastWasSpace = false;
|
||
}
|
||
}
|
||
|
||
// Flush any remaining content
|
||
flushParagraph();
|
||
}
|