cottongin fbe7d2feb4
All checks were successful
CI / build (push) Successful in 4m51s
Compile Release / build-release (push) Successful in 1m18s
release: ef-1.0.5 - stability, memory, and upstream merges
Webserver: JSON batching, removed MD5 blocking, simplified flow control
Memory: QR code caching, WiFi scan optimization, cover buffer leak fix
EPUB: Fixed errant underlining before styled inline elements
Flash screen: Version string overflow fix, half refresh for cleaner display

Upstream merges:
- PR #522: HAL abstraction layer (HalDisplay, HalGPIO)
- PR #603: Sunlight fading fix toggle in Display settings
2026-01-30 23:20:23 -05:00

850 lines
26 KiB
C++
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include "StarDict.h"
#include <HardwareSerial.h>
#include <SDCardManager.h>
#include <miniz.h>
#include <algorithm>
#include <cctype>
#include "DictPrefixIndex.generated.h"
// Remembers the dictionary's base path. The other methods in this file append
// ".ifo", ".idx", ".syn", ".dict" or ".dict.dz" to it; no file is opened here.
StarDict::StarDict(const std::string& basePath)
    : basePath(basePath) {
}
// Releases the dictzip chunk-size table allocated by loadDictzipHeader().
StarDict::~StarDict() {
  // free() is a no-op for nullptr, so an explicit null check is unnecessary.
  free(dzInfo.chunkSizes);
  dzInfo.chunkSizes = nullptr;
}
// Decodes a 32-bit big-endian unsigned integer from the four bytes at `data`.
// `data[0]` is the most significant byte.
uint32_t StarDict::readBE32(const uint8_t* data) {
  uint32_t value = 0;
  for (int byteIdx = 0; byteIdx < 4; ++byteIdx) {
    value = (value << 8) | static_cast<uint32_t>(data[byteIdx]);
  }
  return value;
}
bool StarDict::loadInfo() {
const std::string ifoPath = basePath + ".ifo";
FsFile file;
if (!SdMan.openFileForRead("DICT", ifoPath, file)) {
Serial.printf("[%lu] [DICT] Failed to open .ifo file: %s\n", millis(), ifoPath.c_str());
return false;
}
char buffer[256];
while (file.available()) {
const int len = file.fgets(buffer, sizeof(buffer));
if (len <= 0) break;
// Remove newline
char* newline = strchr(buffer, '\n');
if (newline) *newline = '\0';
newline = strchr(buffer, '\r');
if (newline) *newline = '\0';
// Parse key=value
char* eq = strchr(buffer, '=');
if (!eq) continue;
*eq = '\0';
const char* key = buffer;
const char* value = eq + 1;
if (strcmp(key, "bookname") == 0) {
info.bookname = value;
} else if (strcmp(key, "wordcount") == 0) {
info.wordcount = strtoul(value, nullptr, 10);
} else if (strcmp(key, "idxfilesize") == 0) {
info.idxfilesize = strtoul(value, nullptr, 10);
} else if (strcmp(key, "sametypesequence") == 0) {
info.sametypesequence = value[0];
} else if (strcmp(key, "synwordcount") == 0) {
info.synwordcount = strtoul(value, nullptr, 10);
}
}
file.close();
info.loaded = true;
Serial.printf("[%lu] [DICT] Loaded dictionary: %s (%u words)\n", millis(), info.bookname.c_str(), info.wordcount);
return true;
}
// Parses the gzip header of <basePath>.dict.dz and caches the dictzip
// random-access table in dzInfo (chunk length, chunk count, per-chunk compressed
// sizes, and the total header size that precedes the first compressed chunk).
//
// Layout read here: 10 fixed gzip header bytes, a 2-byte little-endian XLEN
// (the FEXTRA flag must be set), then XLEN bytes of subfields. The dictzip
// subfield is tagged SI1='R', SI2='A' and holds
//   version(2) + chunk length(2) + chunk count(2) + chunk count * size(2),
// all little-endian. Optional FNAME / FCOMMENT / FHCRC fields follow.
//
// Returns true when the table is loaded (or already was). Returns false on I/O
// errors, a non-gzip or non-dictzip file, allocation failure, or a malformed
// 'RA' subfield. dzInfo is only modified once the subfield has been validated.
bool StarDict::loadDictzipHeader() {
  if (dzInfo.loaded) return true;

  const std::string dzPath = basePath + ".dict.dz";
  FsFile file;
  if (!SdMan.openFileForRead("DICT", dzPath, file)) {
    Serial.printf("[%lu] [DICT] Failed to open .dict.dz file\n", millis());
    return false;
  }

  // Read gzip header
  uint8_t header[10];
  if (file.read(header, 10) != 10) {
    file.close();
    return false;
  }

  // Verify gzip magic number
  if (header[0] != 0x1f || header[1] != 0x8b) {
    Serial.printf("[%lu] [DICT] Not a valid gzip file\n", millis());
    file.close();
    return false;
  }

  // Check for extra field flag (bit 2)
  const uint8_t flags = header[3];
  if (!(flags & 0x04)) {
    Serial.printf("[%lu] [DICT] No extra field - not a dictzip file\n", millis());
    file.close();
    return false;
  }

  // Read extra field length
  uint8_t xlenBuf[2];
  if (file.read(xlenBuf, 2) != 2) {
    file.close();
    return false;
  }
  const uint16_t xlen = xlenBuf[0] | (xlenBuf[1] << 8);

  // Read extra field
  auto* extraField = static_cast<uint8_t*>(malloc(xlen));
  if (!extraField) {
    file.close();
    return false;
  }
  if (file.read(extraField, xlen) != xlen) {
    free(extraField);
    file.close();
    return false;
  }

  // Walk the subfields looking for the dictzip one (SI1='R', SI2='A').
  // `pos` and `slen` are 32-bit so that pos + 4 + slen cannot wrap around; a
  // 16-bit cursor could wrap back to an earlier offset and loop forever.
  bool foundDictzip = false;
  bool malformed = false;
  uint32_t pos = 0;
  while (pos + 4 <= xlen) {
    const uint8_t si1 = extraField[pos];
    const uint8_t si2 = extraField[pos + 1];
    const uint32_t slen = extraField[pos + 2] | (extraField[pos + 3] << 8);

    // A subfield that claims more bytes than the extra field holds ends the scan.
    if (pos + 4 + slen > xlen) break;

    if (si1 == 'R' && si2 == 'A') {
      const uint8_t* data = &extraField[pos + 4];

      // The fixed part (version, chunk length, chunk count) needs 6 bytes.
      if (slen < 6) {
        malformed = true;
        break;
      }
      // uint16_t version = data[0] | (data[1] << 8);  // Usually 1
      const uint16_t chunkLength = data[2] | (data[3] << 8);
      const uint16_t chunkCount = data[4] | (data[5] << 8);

      // Reject tables that would be read past the end of the subfield, and a
      // zero chunk length, which decompressDefinition() divides by.
      if (chunkLength == 0 || chunkCount == 0 || slen < 6 + 2u * chunkCount) {
        malformed = true;
        break;
      }

      auto* sizes = static_cast<uint16_t*>(malloc(chunkCount * sizeof(uint16_t)));
      if (!sizes) {
        free(extraField);
        file.close();
        return false;
      }
      for (uint16_t i = 0; i < chunkCount; i++) {
        sizes[i] = data[6 + i * 2] | (data[7 + i * 2] << 8);
      }

      dzInfo.chunkLength = chunkLength;
      dzInfo.chunkCount = chunkCount;
      dzInfo.chunkSizes = sizes;
      foundDictzip = true;
      break;
    }
    pos += 4 + slen;
  }
  free(extraField);

  if (malformed) {
    Serial.printf("[%lu] [DICT] Malformed dictzip subfield\n", millis());
    file.close();
    return false;
  }
  if (!foundDictzip) {
    Serial.printf("[%lu] [DICT] Dictzip subfield not found\n", millis());
    file.close();
    return false;
  }

  // Calculate header size (10 + 2 + xlen + optional fields)
  dzInfo.headerSize = 10 + 2 + xlen;

  // Skip FNAME if present (bit 3): a zero-terminated string
  if (flags & 0x08) {
    file.seek(dzInfo.headerSize);
    while (file.available()) {
      uint8_t c;
      if (file.read(&c, 1) != 1) break;  // do not count or inspect a byte that was not read
      dzInfo.headerSize++;
      if (c == 0) break;
    }
  }

  // Skip FCOMMENT if present (bit 4): a zero-terminated string
  if (flags & 0x10) {
    file.seek(dzInfo.headerSize);
    while (file.available()) {
      uint8_t c;
      if (file.read(&c, 1) != 1) break;
      dzInfo.headerSize++;
      if (c == 0) break;
    }
  }

  // Skip FHCRC if present (bit 1)
  if (flags & 0x02) {
    dzInfo.headerSize += 2;
  }

  file.close();
  dzInfo.loaded = true;
  Serial.printf("[%lu] [DICT] Dictzip: %u chunks of %u bytes, header size %u\n", millis(), dzInfo.chunkCount,
                dzInfo.chunkLength, dzInfo.headerSize);
  return true;
}
bool StarDict::begin() {
if (!loadInfo()) return false;
// Try uncompressed .dict file first (preferred - no memory overhead)
const std::string dictPath = basePath + ".dict";
FsFile testFile;
if (SdMan.openFileForRead("DICT", dictPath, testFile)) {
testFile.close();
useUncompressed = true;
Serial.printf("[%lu] [DICT] Using uncompressed .dict file (no decompression needed)\n", millis());
return true;
}
// Fall back to compressed .dict.dz
useUncompressed = false;
if (!loadDictzipHeader()) return false;
return true;
}
// Reads one index record starting at byte `position` of `idxFile`.
// A record is a null-terminated word followed by 8 bytes: a big-endian 32-bit
// offset and a big-endian 32-bit size.
// On success `word`, `dictOffset` and `dictSize` are filled and `position` is
// advanced to the file position just after the record.
// Returns false (leaving `position` untouched) when the word is empty, grows
// beyond 256 bytes, or the 8 trailing bytes cannot be read.
bool StarDict::readWordAtPosition(FsFile& idxFile, uint32_t& position, std::string& word, uint32_t& dictOffset,
                                  uint32_t& dictSize) {
  idxFile.seek(position);

  // Collect bytes up to the terminating null (or until a read fails).
  word.clear();
  for (;;) {
    char ch;
    if (idxFile.read(&ch, 1) != 1 || ch == '\0') break;
    word.push_back(ch);
    // Safety limit against unterminated or corrupt entries.
    if (word.size() > 256) return false;
  }
  if (word.empty()) return false;

  // Offset and size, 4 bytes each, both big-endian.
  uint8_t trailer[8];
  if (idxFile.read(trailer, 8) != 8) return false;
  dictOffset = readBE32(&trailer[0]);
  dictSize = readBE32(&trailer[4]);

  position = idxFile.position();
  return true;
}
bool StarDict::readDefinitionDirect(uint32_t offset, uint32_t size, std::string& definition) {
// Read directly from uncompressed .dict file - no decompression needed!
const std::string dictPath = basePath + ".dict";
FsFile file;
if (!SdMan.openFileForRead("DICT", dictPath, file)) {
Serial.printf("[DICT-DBG] Failed to open .dict file\n");
return false;
}
// Seek to the definition offset
if (!file.seek(offset)) {
Serial.printf("[DICT-DBG] Failed to seek to offset %lu\n", offset);
file.close();
return false;
}
// Read the definition directly into the string
definition.resize(size);
const int bytesRead = file.read(&definition[0], size);
file.close();
if (bytesRead != static_cast<int>(size)) {
Serial.printf("[DICT-DBG] Read %d bytes, expected %lu\n", bytesRead, size);
definition.clear();
return false;
}
return true;
}
bool StarDict::decompressDefinition(uint32_t offset, uint32_t size, std::string& definition) {
if (!dzInfo.loaded) {
Serial.printf("[DICT-DBG] dzInfo not loaded!\n");
return false;
}
const std::string dzPath = basePath + ".dict.dz";
FsFile file;
if (!SdMan.openFileForRead("DICT", dzPath, file)) {
Serial.printf("[DICT-DBG] Failed to open dict.dz file\n");
return false;
}
// Calculate which chunk(s) we need
const uint32_t startChunk = offset / dzInfo.chunkLength;
const uint32_t endChunk = (offset + size - 1) / dzInfo.chunkLength;
const uint32_t startOffsetInChunk = offset % dzInfo.chunkLength;
Serial.printf("[DICT-DBG] Chunks: start=%lu, end=%lu, total=%u\n", startChunk, endChunk, dzInfo.chunkCount);
if (endChunk >= dzInfo.chunkCount) {
Serial.printf("[DICT-DBG] endChunk %lu >= chunkCount %u\n", endChunk, dzInfo.chunkCount);
file.close();
return false;
}
// Calculate file offset for start chunk
uint32_t fileOffset = dzInfo.headerSize;
for (uint32_t i = 0; i < startChunk; i++) {
fileOffset += dzInfo.chunkSizes[i];
}
// Calculate actual max compressed size needed for the chunks we'll process
uint32_t maxCompressedSize = 0;
for (uint32_t i = startChunk; i <= endChunk; i++) {
if (dzInfo.chunkSizes[i] > maxCompressedSize) {
maxCompressedSize = dzInfo.chunkSizes[i];
}
}
// Allocate buffers - allocate inflator FIRST (smallest) to reduce fragmentation impact
// tinfl_decompressor is ~11KB, so total allocations are ~85KB
Serial.printf("[DICT-DBG] Allocating inflator=%u, comp=%lu, decomp=%u bytes\n", sizeof(tinfl_decompressor),
maxCompressedSize, dzInfo.chunkLength);
auto* inflator = static_cast<tinfl_decompressor*>(malloc(sizeof(tinfl_decompressor)));
if (!inflator) {
Serial.printf("[DICT-DBG] inflator alloc failed! (need %u bytes)\n", sizeof(tinfl_decompressor));
file.close();
return false;
}
auto* compressedBuf = static_cast<uint8_t*>(malloc(maxCompressedSize));
if (!compressedBuf) {
Serial.printf("[DICT-DBG] compressedBuf alloc failed!\n");
free(inflator);
file.close();
return false;
}
auto* decompressedBuf = static_cast<uint8_t*>(malloc(dzInfo.chunkLength));
if (!decompressedBuf) {
Serial.printf("[DICT-DBG] decompressedBuf alloc failed!\n");
free(inflator);
free(compressedBuf);
file.close();
return false;
}
definition.clear();
definition.reserve(size);
// Process each needed chunk (reusing inflator allocation)
for (uint32_t chunk = startChunk; chunk <= endChunk; chunk++) {
const uint16_t compressedSize = dzInfo.chunkSizes[chunk];
// Seek and read compressed data
file.seek(fileOffset);
if (file.read(compressedBuf, compressedSize) != compressedSize) {
Serial.printf("[DICT-DBG] File read failed at offset %lu, size %u\n", fileOffset, compressedSize);
free(inflator);
free(compressedBuf);
free(decompressedBuf);
file.close();
return false;
}
// Decompress using raw inflate (no zlib header)
tinfl_init(inflator);
size_t inBytes = compressedSize;
size_t outBytes = dzInfo.chunkLength;
const tinfl_status status =
tinfl_decompress(inflator, compressedBuf, &inBytes, decompressedBuf, decompressedBuf, &outBytes,
TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF | TINFL_FLAG_PARSE_ZLIB_HEADER);
if (status != TINFL_STATUS_DONE && status != TINFL_STATUS_HAS_MORE_OUTPUT) {
// Try without zlib header flag
tinfl_init(inflator);
inBytes = compressedSize;
outBytes = dzInfo.chunkLength;
tinfl_decompress(inflator, compressedBuf, &inBytes, decompressedBuf, decompressedBuf, &outBytes,
TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF);
}
// Extract the portion we need from this chunk
uint32_t copyStart = 0;
uint32_t copyEnd = outBytes;
if (chunk == startChunk) {
copyStart = startOffsetInChunk;
}
if (chunk == endChunk) {
const uint32_t endOffsetInChunk = (offset + size) - (endChunk * dzInfo.chunkLength);
if (endOffsetInChunk < copyEnd) {
copyEnd = endOffsetInChunk;
}
}
if (copyEnd > copyStart) {
definition.append(reinterpret_cast<char*>(decompressedBuf + copyStart), copyEnd - copyStart);
}
fileOffset += compressedSize;
}
free(inflator);
free(compressedBuf);
free(decompressedBuf);
file.close();
return true;
}
// Case-insensitive three-way comparison of two words (like g_ascii_strcasecmp).
// Bytes are compared after std::tolower; the first differing pair decides the
// result. If one word is a prefix of the other, the length difference is
// returned, so the shorter word orders first. Returns 0 when the words are
// equal ignoring case.
int StarDict::stardictStrcmp(const std::string& a, const std::string& b) {
  const size_t shared = std::min(a.length(), b.length());
  for (size_t idx = 0; idx < shared; ++idx) {
    const int lhs = std::tolower(static_cast<unsigned char>(a[idx]));
    const int rhs = std::tolower(static_cast<unsigned char>(b[idx]));
    if (lhs != rhs) return lhs - rhs;
  }
  // Common prefix is identical: order by length (0 when the lengths match too).
  return static_cast<int>(a.length()) - static_cast<int>(b.length());
}
// Returns `word` with leading and trailing whitespace removed and every
// remaining byte lower-cased via std::tolower. A word consisting only of
// whitespace yields an empty string.
std::string StarDict::normalizeWord(const std::string& word) {
  const auto isBlank = [](char ch) { return std::isspace(static_cast<unsigned char>(ch)) != 0; };

  // Narrow [first, last) until neither end is whitespace.
  size_t first = 0;
  size_t last = word.length();
  while (first < last && isBlank(word[first])) ++first;
  while (last > first && isBlank(word[last - 1])) --last;

  std::string lowered;
  lowered.reserve(word.length());
  for (size_t pos = first; pos < last; ++pos) {
    lowered.push_back(static_cast<char>(std::tolower(static_cast<unsigned char>(word[pos]))));
  }
  return lowered;
}
// Looks up `word` and returns its definition(s).
//
// The search term is normalized with normalizeWord() (trimmed, lower-cased) and
// compared against index entries with stardictStrcmp() (case-insensitive).
// 1. The main index (<basePath>.idx) is scanned sequentially. When the first two
//    characters of the term pass DictPrefixIndex::isAlpha, the scan starts at the
//    offset stored in DictPrefixIndex::dictPrefixOffsets; otherwise at offset 0.
//    Every matching entry's definition is loaded; the first one fills the
//    result, later ones are appended to result.definition after a "</html>"
//    separator.
// 2. Only if nothing was found and info.synwordcount > 0, the synonym file
//    (<basePath>.syn) is scanned the same way. On the first matching synonym the
//    referenced main entry is located by counting records from the start of the
//    .idx file, and its definition is loaded.
//
// The returned result always has `word` set: to the input word on failure, to
// the matched index word (or synonym) on success. `found` and `definition` are
// only assigned here on success.
// NOTE(review): `found` presumably defaults to false in LookupResult - confirm in StarDict.h.
StarDict::LookupResult StarDict::lookup(const std::string& word) {
LookupResult result;
result.word = word;
// Metadata must have been loaded by begin()/loadInfo() before any lookup.
if (!info.loaded) {
return result;
}
const std::string normalizedSearch = normalizeWord(word);
if (normalizedSearch.empty()) {
return result;
}
Serial.printf("[DICT-DBG] Searching for: '%s' (normalized: '%s')\n", word.c_str(), normalizedSearch.c_str());
// First try .idx (main entries) - use prefix jump table for fast lookup
const std::string idxPath = basePath + ".idx";
FsFile idxFile;
if (!SdMan.openFileForRead("DICT", idxPath, idxFile)) {
Serial.printf("[%lu] [DICT] Failed to open index file\n", millis());
return result;
}
// Jump to the relevant section using prefix index (if word has 2+ alpha chars)
uint32_t position = 0;
if (normalizedSearch.length() >= 2 && DictPrefixIndex::isAlpha(normalizedSearch[0]) &&
DictPrefixIndex::isAlpha(normalizedSearch[1])) {
const uint16_t prefixIdx = DictPrefixIndex::prefixToIndex(normalizedSearch[0], normalizedSearch[1]);
position = DictPrefixIndex::dictPrefixOffsets[prefixIdx];
}
// For a one-character term, normalizedSearch[1] is the string's terminating '\0'.
Serial.printf("[DICT-DBG] Starting at position %lu (prefix: %c%c)\n", position, normalizedSearch[0],
normalizedSearch[1]);
bool found = false;
uint32_t wordCount = 0;
// Sequential scan; readWordAtPosition() advances `position` past each record.
while (position < info.idxfilesize) {
std::string currentWord;
uint32_t dictOffset, dictSize;
if (!readWordAtPosition(idxFile, position, currentWord, dictOffset, dictSize)) {
break;
}
wordCount++;
if (wordCount % 50000 == 0) {
Serial.printf("[DICT-DBG] Progress: %lu words scanned, pos=%lu, current='%s'\n", wordCount, position,
currentWord.c_str());
}
// Use stardictStrcmp for case-insensitive matching
const int cmp = stardictStrcmp(normalizedSearch, currentWord);
if (cmp == 0) {
Serial.printf("[DICT-DBG] MATCH: '%s' == '%s' (offset=%lu, size=%lu)\n", normalizedSearch.c_str(),
currentWord.c_str(), dictOffset, dictSize);
std::string definition;
// Definition source was chosen in begin(): plain .dict or dictzip .dict.dz.
const bool loaded = useUncompressed ? readDefinitionDirect(dictOffset, dictSize, definition)
: decompressDefinition(dictOffset, dictSize, definition);
if (loaded) {
Serial.printf("[DICT-DBG] Definition loaded, %u bytes\n", definition.length());
if (!found) {
// First match: it decides the reported headword.
result.word = currentWord;
result.definition = definition;
result.found = true;
found = true;
} else {
// Further matches are concatenated, separated by a literal "</html>".
result.definition += "</html>" + definition;
}
} else {
Serial.printf("[DICT-DBG] Definition load FAILED!\n");
}
// Continue scanning for additional matches (same word, different case)
} else if (found) {
// We had matches but now moved past them - safe to stop
break;
}
// Note: Cannot use early-break before first match because prefix index
// may not land exactly at target position
}
Serial.printf("[DICT-DBG] Search complete: %lu words scanned, found=%s\n", wordCount, found ? "YES" : "NO");
idxFile.close();
// If not found in main index, try synonym file with prefix jump
if (!found && info.synwordcount > 0) {
const std::string synPath = basePath + ".syn";
FsFile synFile;
if (SdMan.openFileForRead("DICT", synPath, synFile)) {
const uint32_t synFileSize = synFile.size();
// Jump to the relevant section using prefix index (if word has 2+ alpha chars)
uint32_t synPosition = 0;
if (normalizedSearch.length() >= 2 && DictPrefixIndex::isAlpha(normalizedSearch[0]) &&
DictPrefixIndex::isAlpha(normalizedSearch[1])) {
const uint16_t prefixIdx = DictPrefixIndex::prefixToIndex(normalizedSearch[0], normalizedSearch[1]);
synPosition = DictPrefixIndex::synPrefixOffsets[prefixIdx];
synFile.seek(synPosition);
}
// Each synonym record: null-terminated word + 4-byte big-endian entry number.
while (synFile.position() < synFileSize) {
// Read synonym word (null-terminated)
std::string synWord;
char c;
while (synFile.read(&c, 1) == 1 && c != '\0') {
synWord += c;
}
// Read 4-byte big-endian index
uint8_t idxBytes[4];
if (synFile.read(idxBytes, 4) != 4) break;
const uint32_t mainIdx = readBE32(idxBytes);
// Use stardictStrcmp for case-insensitive comparison
const int cmp = stardictStrcmp(normalizedSearch, synWord);
if (cmp == 0) {
// Found synonym - look up the main entry by index
FsFile idxFile2;
if (SdMan.openFileForRead("DICT", idxPath, idxFile2)) {
// Skip `mainIdx` records from the start of the .idx file to reach the target.
uint32_t pos = 0;
uint32_t entryNum = 0;
while (entryNum < mainIdx && pos < info.idxfilesize) {
std::string w;
uint32_t off, sz;
if (!readWordAtPosition(idxFile2, pos, w, off, sz)) break;
entryNum++;
}
// Now read the target entry
if (entryNum == mainIdx) {
std::string mainWord;
uint32_t dictOffset, dictSize;
if (readWordAtPosition(idxFile2, pos, mainWord, dictOffset, dictSize)) {
std::string definition;
const bool loaded = useUncompressed ? readDefinitionDirect(dictOffset, dictSize, definition)
: decompressDefinition(dictOffset, dictSize, definition);
if (loaded) {
// The synonym itself (not the main entry's word) is reported as the headword.
result.word = synWord;
result.definition = definition;
result.found = true;
found = true;
}
}
}
idxFile2.close();
}
break; // Found a match, stop searching
}
// Note: Cannot use early-break optimization here because prefix index
// may not land exactly at target position
}
synFile.close();
}
}
return result;
}
// Decodes the HTML entity that starts at html[i]; on entry `i` must be the
// index of the '&' itself.
//
// Numeric references (&#NNN; and &#xHHH;) are converted to UTF-8. References to
// code points that cannot be represented as valid UTF-8 text - U+0000, the
// surrogate range U+D800..U+DFFF and anything above U+10FFFF - are consumed and
// decoded to an empty string, so that no NUL byte or malformed sequence ends up
// in the output. Named entities are resolved through a fixed table.
//
// When an entity is recognised, `i` is moved to its terminating ';' (the
// caller's loop increment then steps past it) and the replacement is returned.
// Otherwise `i` is left unchanged and "&" is returned, so the caller emits the
// ampersand and keeps processing the following characters normally.
static std::string decodeHtmlEntity(const std::string& html, size_t& i) {
  const size_t start = i;  // Position of '&'
  const size_t remaining = html.length() - start;

  // Numeric entities: &#NNN; or &#xHHH;
  if (remaining > 2 && html[start + 1] == '#') {
    size_t numStart = start + 2;
    bool isHex = false;
    if (remaining > 3 && (html[numStart] == 'x' || html[numStart] == 'X')) {
      isHex = true;
      numStart++;
    }

    // Consume digits of the selected base; stop at ';' or the first other character.
    size_t numEnd = numStart;
    while (numEnd < html.length() && html[numEnd] != ';') {
      const unsigned char digit = static_cast<unsigned char>(html[numEnd]);
      if (isHex ? !std::isxdigit(digit) : !std::isdigit(digit)) break;
      numEnd++;
    }

    if (numEnd > numStart && numEnd < html.length() && html[numEnd] == ';') {
      const std::string numStr = html.substr(numStart, numEnd - numStart);
      // strtoul saturates at ULONG_MAX on overflow, which the range check below rejects.
      const unsigned long codepoint = std::strtoul(numStr.c_str(), nullptr, isHex ? 16 : 10);
      i = numEnd;  // Will be incremented by caller's loop

      const bool isSurrogate = codepoint >= 0xD800 && codepoint <= 0xDFFF;
      if (codepoint == 0 || isSurrogate || codepoint > 0x10FFFF) {
        return std::string();
      }

      // Convert codepoint to UTF-8
      std::string utf8;
      if (codepoint < 0x80) {
        utf8 += static_cast<char>(codepoint);
      } else if (codepoint < 0x800) {
        utf8 += static_cast<char>(0xC0 | (codepoint >> 6));
        utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
      } else if (codepoint < 0x10000) {
        utf8 += static_cast<char>(0xE0 | (codepoint >> 12));
        utf8 += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
        utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
      } else {
        utf8 += static_cast<char>(0xF0 | (codepoint >> 18));
        utf8 += static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F));
        utf8 += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
        utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
      }
      return utf8;
    }
  }

  // Named entities - find the terminating semicolon. Only the window in which
  // an entity is accepted (';' fewer than 12 characters after '&') is searched,
  // so a stray '&' does not trigger a scan of the rest of the text.
  size_t semicolon = std::string::npos;
  for (size_t p = start + 1; p < html.length() && p - start < 12; ++p) {
    if (html[p] == ';') {
      semicolon = p;
      break;
    }
  }

  if (semicolon != std::string::npos) {
    const std::string entity = html.substr(start, semicolon - start + 1);

    // Common named entities
    struct EntityMapping {
      const char* entity;
      const char* replacement;
    };
    static const EntityMapping entities[] = {
        {"&nbsp;", " "},
        {"&lt;", "<"},
        {"&gt;", ">"},
        {"&amp;", "&"},
        {"&quot;", "\""},
        {"&apos;", "'"},
        {"&mdash;", "\xe2\x80\x94"},   // em dash
        {"&ndash;", "\xe2\x80\x93"},   // en dash
        {"&hellip;", "\xe2\x80\xa6"},  // horizontal ellipsis
        {"&rsquo;", "\xe2\x80\x99"},   // right single quotation mark
        {"&lsquo;", "\xe2\x80\x98"},   // left single quotation mark
        {"&rdquo;", "\xe2\x80\x9d"},   // right double quotation mark
        {"&ldquo;", "\xe2\x80\x9c"},   // left double quotation mark
        {"&deg;", "\xc2\xb0"},         // degree sign
        {"&times;", "\xc3\x97"},       // multiplication sign
        {"&divide;", "\xc3\xb7"},      // division sign
        {"&plusmn;", "\xc2\xb1"},      // plus-minus sign
        {"&frac12;", "\xc2\xbd"},      // one half
        {"&frac14;", "\xc2\xbc"},      // one quarter
        {"&frac34;", "\xc2\xbe"},      // three quarters
        {"&cent;", "\xc2\xa2"},        // cent sign
        {"&pound;", "\xc2\xa3"},       // pound sign
        {"&euro;", "\xe2\x82\xac"},    // euro sign
        {"&yen;", "\xc2\xa5"},         // yen sign
        {"&copy;", "\xc2\xa9"},        // copyright sign
        {"&reg;", "\xc2\xae"},         // registered sign
        {"&trade;", "\xe2\x84\xa2"},   // trade mark sign
        {"&bull;", "\xe2\x80\xa2"},    // bullet
        {"&middot;", "\xc2\xb7"},      // middle dot
        {"&sect;", "\xc2\xa7"},        // section sign
        {"&para;", "\xc2\xb6"},        // pilcrow
        {"&dagger;", "\xe2\x80\xa0"},  // dagger
        {"&Dagger;", "\xe2\x80\xa1"},  // double dagger
        {"&iexcl;", "\xc2\xa1"},       // inverted exclamation mark
        {"&iquest;", "\xc2\xbf"},      // inverted question mark
        {"&laquo;", "\xc2\xab"},       // left-pointing double angle quotation mark
        {"&raquo;", "\xc2\xbb"},       // right-pointing double angle quotation mark
        {"&shy;", ""},
        {"&ensp;", " "},
        {"&emsp;", " "},
        {"&thinsp;", " "},
        {"&zwj;", ""},
        {"&zwnj;", ""},
    };

    for (const auto& mapping : entities) {
      if (entity == mapping.entity) {
        i = semicolon;  // Will be incremented by caller's loop
        return mapping.replacement;
      }
    }
  }

  // Unknown entity - return just the ampersand and let the rest be processed normally
  return "&";
}
// Reports whether `tag` (a bare tag name without '<', '/' or '>') names a
// block-level element that should produce a line break in plain-text output.
// The comparison ignores ASCII case. `isClosing` is accepted for the caller's
// convenience but does not influence the result.
static bool isBlockTag(const std::string& tag, bool isClosing) {
  (void)isClosing;

  static const char* const kBlockTags[] = {"p",  "div", "br", "hr", "li", "dt",         "dd",  "tr", "h1",
                                           "h2", "h3",  "h4", "h5", "h6", "blockquote", "pre", "ol", "ul"};

  // Fold the name to lowercase before matching against the table.
  std::string folded;
  folded.reserve(tag.size());
  for (const char ch : tag) {
    folded.push_back(static_cast<char>(std::tolower(static_cast<unsigned char>(ch))));
  }

  for (const char* candidate : kBlockTags) {
    if (folded == candidate) return true;
  }
  return false;
}
// Converts an HTML fragment to plain text.
//  - Everything between '<' and '>' is dropped; a '>' itself never reaches the output.
//  - A block-level tag (see isBlockTag) inserts a single '\n', unless the output
//    is still empty or already ends a line.
//  - Entities outside tags are decoded with decodeHtmlEntity.
//  - Runs of whitespace (including whitespace-only entities) collapse to one
//    space, and no space is emitted directly after an inserted line break.
//  - Trailing whitespace is removed from the result.
std::string StarDict::stripHtml(const std::string& html) {
  const size_t len = html.length();
  std::string text;
  text.reserve(len);

  bool insideTag = false;
  bool prevSpace = false;    // last emitted character was a space or an inserted newline
  bool prevNewline = false;  // a line break was inserted and no visible text followed yet

  const auto isWs = [](char ch) { return std::isspace(static_cast<unsigned char>(ch)) != 0; };
  const auto emitCollapsedSpace = [&]() {
    if (!prevSpace) {
      text += ' ';
      prevSpace = true;
    }
  };
  const auto markVisible = [&]() {
    prevSpace = false;
    prevNewline = false;
  };

  for (size_t i = 0; i < len; i++) {
    const char c = html[i];

    if (c == '>') {
      insideTag = false;
      continue;
    }

    if (c == '<') {
      // Locate the tag name: skip blanks after '<', then an optional '/'.
      size_t nameBegin = i + 1;
      while (nameBegin < len && isWs(html[nameBegin])) nameBegin++;
      const bool closing = nameBegin < len && html[nameBegin] == '/';
      if (closing) nameBegin++;

      // The name runs until whitespace, '>' or '/'.
      size_t nameEnd = nameBegin;
      while (nameEnd < len) {
        const char t = html[nameEnd];
        if (isWs(t) || t == '>' || t == '/') break;
        nameEnd++;
      }

      // Block elements start a new line, but never at the very beginning and
      // never twice in a row.
      if (isBlockTag(html.substr(nameBegin, nameEnd - nameBegin), closing) && !text.empty() && !prevNewline) {
        text += '\n';
        prevNewline = true;
        prevSpace = true;
      }
      insideTag = true;
      continue;
    }

    if (insideTag) continue;

    if (c == '&') {
      // decodeHtmlEntity moves i onto the entity's ';' when it recognises one.
      const std::string decoded = decodeHtmlEntity(html, i);
      if (decoded.empty()) continue;
      if (std::all_of(decoded.begin(), decoded.end(), isWs)) {
        emitCollapsedSpace();
      } else {
        text += decoded;
        markVisible();
      }
      continue;
    }

    if (isWs(c)) {
      emitCollapsedSpace();
    } else {
      text += c;
      markVisible();
    }
  }

  // Trim trailing whitespace
  while (!text.empty() && isWs(text.back())) {
    text.pop_back();
  }
  return text;
}