850 lines
26 KiB
C++
Raw Normal View History

2026-01-22 12:42:01 -05:00
#include "StarDict.h"
#include <HardwareSerial.h>
#include <SDCardManager.h>
#include <miniz.h>
#include <algorithm>
#include <cctype>
#include "DictPrefixIndex.generated.h"
// Stores the dictionary base path (the file path without extension; ".ifo", ".idx",
// ".dict", ".dict.dz" and ".syn" are appended to it elsewhere). No files are opened here.
StarDict::StarDict(const std::string& basePath) : basePath(basePath) {}
// Releases the dictzip chunk-size table, if one was allocated.
StarDict::~StarDict() {
  // free(nullptr) is a no-op, so no explicit null check is required.
  free(dzInfo.chunkSizes);
  dzInfo.chunkSizes = nullptr;
}
// Assembles a 32-bit unsigned value from four bytes stored most-significant first.
// `data` must point to at least four readable bytes.
uint32_t StarDict::readBE32(const uint8_t* data) {
  uint32_t value = 0;
  for (int byteIdx = 0; byteIdx < 4; byteIdx++) {
    value = (value << 8) | static_cast<uint32_t>(data[byteIdx]);
  }
  return value;
}
bool StarDict::loadInfo() {
const std::string ifoPath = basePath + ".ifo";
FsFile file;
if (!SdMan.openFileForRead("DICT", ifoPath, file)) {
Serial.printf("[%lu] [DICT] Failed to open .ifo file: %s\n", millis(), ifoPath.c_str());
return false;
}
char buffer[256];
while (file.available()) {
const int len = file.fgets(buffer, sizeof(buffer));
if (len <= 0) break;
// Remove newline
char* newline = strchr(buffer, '\n');
if (newline) *newline = '\0';
newline = strchr(buffer, '\r');
if (newline) *newline = '\0';
// Parse key=value
char* eq = strchr(buffer, '=');
if (!eq) continue;
*eq = '\0';
const char* key = buffer;
const char* value = eq + 1;
if (strcmp(key, "bookname") == 0) {
info.bookname = value;
} else if (strcmp(key, "wordcount") == 0) {
info.wordcount = strtoul(value, nullptr, 10);
} else if (strcmp(key, "idxfilesize") == 0) {
info.idxfilesize = strtoul(value, nullptr, 10);
} else if (strcmp(key, "sametypesequence") == 0) {
info.sametypesequence = value[0];
} else if (strcmp(key, "synwordcount") == 0) {
info.synwordcount = strtoul(value, nullptr, 10);
}
}
file.close();
info.loaded = true;
Serial.printf("[%lu] [DICT] Loaded dictionary: %s (%u words)\n", millis(), info.bookname.c_str(), info.wordcount);
return true;
}
// Parses the gzip header of "<basePath>.dict.dz" and extracts the dictzip random-access
// table (extra-field subfield 'R','A'): chunk length, chunk count and the compressed size
// of every chunk. Also computes dzInfo.headerSize, the file offset of the first chunk.
//
// Returns true if the table was loaded (or had already been loaded), false on any I/O
// error, allocation failure, or if the file is not a well-formed dictzip file.
bool StarDict::loadDictzipHeader() {
  if (dzInfo.loaded) return true;

  const std::string dzPath = basePath + ".dict.dz";
  FsFile file;
  if (!SdMan.openFileForRead("DICT", dzPath, file)) {
    Serial.printf("[%lu] [DICT] Failed to open .dict.dz file\n", millis());
    return false;
  }

  // Fixed 10-byte gzip header
  uint8_t header[10];
  if (file.read(header, 10) != 10) {
    file.close();
    return false;
  }

  // Verify gzip magic number
  if (header[0] != 0x1f || header[1] != 0x8b) {
    Serial.printf("[%lu] [DICT] Not a valid gzip file\n", millis());
    file.close();
    return false;
  }

  // Check for extra field flag (bit 2); dictzip stores its table there
  const uint8_t flags = header[3];
  if (!(flags & 0x04)) {
    Serial.printf("[%lu] [DICT] No extra field - not a dictzip file\n", millis());
    file.close();
    return false;
  }

  // Extra field length (little-endian)
  uint8_t xlenBuf[2];
  if (file.read(xlenBuf, 2) != 2) {
    file.close();
    return false;
  }
  const uint16_t xlen = xlenBuf[0] | (xlenBuf[1] << 8);

  auto* extraField = static_cast<uint8_t*>(malloc(xlen));
  if (!extraField) {
    file.close();
    return false;
  }
  if (file.read(extraField, xlen) != xlen) {
    free(extraField);
    file.close();
    return false;
  }

  // Walk the subfields looking for SI1='R', SI2='A'.
  // `pos` is 32-bit on purpose: with a 16-bit cursor, pos + 4 + slen could wrap around
  // and make this loop spin forever on a crafted extra field.
  bool foundDictzip = false;
  uint32_t pos = 0;
  while (pos + 4 <= xlen) {
    const uint8_t si1 = extraField[pos];
    const uint8_t si2 = extraField[pos + 1];
    const uint16_t slen = extraField[pos + 2] | (extraField[pos + 3] << 8);

    if (si1 == 'R' && si2 == 'A' && pos + 4 + slen <= xlen) {
      // Subfield layout: ver(2) + chlen(2) + count(2) + sizes[count](2 each)
      const uint8_t* data = &extraField[pos + 4];
      if (slen < 6) {
        Serial.printf("[%lu] [DICT] Dictzip subfield too short (%u bytes)\n", millis(), slen);
        break;
      }
      const uint16_t chunkLength = data[2] | (data[3] << 8);
      const uint16_t chunkCount = data[4] | (data[5] << 8);

      // Reject tables that would be read past the subfield, and a zero chunk length,
      // which would later be used as a divisor when locating a definition.
      if (chunkLength == 0 || chunkCount == 0 || 6u + 2u * chunkCount > slen) {
        Serial.printf("[%lu] [DICT] Malformed dictzip subfield (chlen=%u, count=%u, slen=%u)\n", millis(),
                      chunkLength, chunkCount, slen);
        break;
      }

      auto* sizes = static_cast<uint16_t*>(malloc(chunkCount * sizeof(uint16_t)));
      if (!sizes) {
        free(extraField);
        file.close();
        return false;
      }
      for (uint16_t i = 0; i < chunkCount; i++) {
        sizes[i] = data[6 + i * 2] | (data[7 + i * 2] << 8);
      }

      // Drop any table left over from an earlier attempt before installing the new one
      free(dzInfo.chunkSizes);
      dzInfo.chunkSizes = sizes;
      dzInfo.chunkLength = chunkLength;
      dzInfo.chunkCount = chunkCount;
      foundDictzip = true;
      break;
    }
    pos += 4u + slen;
  }
  free(extraField);

  if (!foundDictzip) {
    Serial.printf("[%lu] [DICT] Dictzip subfield not found\n", millis());
    file.close();
    return false;
  }

  // Header so far: 10 fixed bytes + 2 length bytes + the extra field itself
  dzInfo.headerSize = 10 + 2 + xlen;

  // Advances dzInfo.headerSize past one NUL-terminated string located at the current
  // header end. Stops at end of file or on a failed read instead of inspecting an
  // uninitialised byte.
  const auto skipZeroTerminated = [&]() {
    file.seek(dzInfo.headerSize);
    uint8_t c = 0;
    while (file.available() && file.read(&c, 1) == 1) {
      dzInfo.headerSize++;
      if (c == 0) break;
    }
  };

  // Skip FNAME if present (bit 3)
  if (flags & 0x08) skipZeroTerminated();
  // Skip FCOMMENT if present (bit 4)
  if (flags & 0x10) skipZeroTerminated();
  // Skip FHCRC if present (bit 1)
  if (flags & 0x02) {
    dzInfo.headerSize += 2;
  }

  file.close();
  dzInfo.loaded = true;
  Serial.printf("[%lu] [DICT] Dictzip: %u chunks of %u bytes, header size %u\n", millis(), dzInfo.chunkCount,
                dzInfo.chunkLength, dzInfo.headerSize);
  return true;
}
bool StarDict::begin() {
if (!loadInfo()) return false;
// Try uncompressed .dict file first (preferred - no memory overhead)
const std::string dictPath = basePath + ".dict";
FsFile testFile;
if (SdMan.openFileForRead("DICT", dictPath, testFile)) {
testFile.close();
useUncompressed = true;
Serial.printf("[%lu] [DICT] Using uncompressed .dict file (no decompression needed)\n", millis());
return true;
}
// Fall back to compressed .dict.dz
useUncompressed = false;
2026-01-22 12:42:01 -05:00
if (!loadDictzipHeader()) return false;
return true;
}
// Reads one index entry starting at byte `position` of `idxFile`:
// a NUL-terminated word followed by two 32-bit big-endian values (offset, size).
// On success fills `word`, `dictOffset`, `dictSize`, advances `position` to the byte after
// the entry and returns true. Returns false for an empty word, a word longer than
// 256 bytes, or a truncated entry; `position` is left unchanged in those cases.
bool StarDict::readWordAtPosition(FsFile& idxFile, uint32_t& position, std::string& word, uint32_t& dictOffset,
                                  uint32_t& dictSize) {
  idxFile.seek(position);

  word.clear();
  for (;;) {
    char ch;
    if (idxFile.read(&ch, 1) != 1 || ch == '\0') break;
    word.push_back(ch);
    // Safety limit against runaway reads on a corrupt index
    if (word.length() > 256) return false;
  }
  if (word.empty()) return false;

  // 8 trailing bytes: offset (4, big-endian) then size (4, big-endian)
  uint8_t tail[8];
  if (idxFile.read(tail, 8) != 8) return false;
  dictOffset = readBE32(tail);
  dictSize = readBE32(tail + 4);

  position = idxFile.position();
  return true;
}
bool StarDict::readDefinitionDirect(uint32_t offset, uint32_t size, std::string& definition) {
// Read directly from uncompressed .dict file - no decompression needed!
const std::string dictPath = basePath + ".dict";
FsFile file;
if (!SdMan.openFileForRead("DICT", dictPath, file)) {
Serial.printf("[DICT-DBG] Failed to open .dict file\n");
return false;
}
// Seek to the definition offset
if (!file.seek(offset)) {
Serial.printf("[DICT-DBG] Failed to seek to offset %lu\n", offset);
file.close();
return false;
}
// Read the definition directly into the string
definition.resize(size);
const int bytesRead = file.read(&definition[0], size);
file.close();
if (bytesRead != static_cast<int>(size)) {
Serial.printf("[DICT-DBG] Read %d bytes, expected %lu\n", bytesRead, size);
definition.clear();
return false;
}
return true;
}
2026-01-22 12:42:01 -05:00
bool StarDict::decompressDefinition(uint32_t offset, uint32_t size, std::string& definition) {
if (!dzInfo.loaded) {
Serial.printf("[DICT-DBG] dzInfo not loaded!\n");
return false;
}
2026-01-22 12:42:01 -05:00
const std::string dzPath = basePath + ".dict.dz";
FsFile file;
if (!SdMan.openFileForRead("DICT", dzPath, file)) {
Serial.printf("[DICT-DBG] Failed to open dict.dz file\n");
2026-01-22 12:42:01 -05:00
return false;
}
// Calculate which chunk(s) we need
const uint32_t startChunk = offset / dzInfo.chunkLength;
const uint32_t endChunk = (offset + size - 1) / dzInfo.chunkLength;
const uint32_t startOffsetInChunk = offset % dzInfo.chunkLength;
Serial.printf("[DICT-DBG] Chunks: start=%lu, end=%lu, total=%u\n", startChunk, endChunk, dzInfo.chunkCount);
2026-01-22 12:42:01 -05:00
if (endChunk >= dzInfo.chunkCount) {
Serial.printf("[DICT-DBG] endChunk %lu >= chunkCount %u\n", endChunk, dzInfo.chunkCount);
2026-01-22 12:42:01 -05:00
file.close();
return false;
}
// Calculate file offset for start chunk
uint32_t fileOffset = dzInfo.headerSize;
for (uint32_t i = 0; i < startChunk; i++) {
fileOffset += dzInfo.chunkSizes[i];
}
// Calculate actual max compressed size needed for the chunks we'll process
uint32_t maxCompressedSize = 0;
for (uint32_t i = startChunk; i <= endChunk; i++) {
if (dzInfo.chunkSizes[i] > maxCompressedSize) {
maxCompressedSize = dzInfo.chunkSizes[i];
}
}
// Allocate buffers - allocate inflator FIRST (smallest) to reduce fragmentation impact
// tinfl_decompressor is ~11KB, so total allocations are ~85KB
Serial.printf("[DICT-DBG] Allocating inflator=%u, comp=%lu, decomp=%u bytes\n", sizeof(tinfl_decompressor),
maxCompressedSize, dzInfo.chunkLength);
auto* inflator = static_cast<tinfl_decompressor*>(malloc(sizeof(tinfl_decompressor)));
if (!inflator) {
Serial.printf("[DICT-DBG] inflator alloc failed! (need %u bytes)\n", sizeof(tinfl_decompressor));
file.close();
return false;
}
2026-01-22 12:42:01 -05:00
auto* compressedBuf = static_cast<uint8_t*>(malloc(maxCompressedSize));
if (!compressedBuf) {
Serial.printf("[DICT-DBG] compressedBuf alloc failed!\n");
free(inflator);
file.close();
return false;
}
2026-01-22 12:42:01 -05:00
auto* decompressedBuf = static_cast<uint8_t*>(malloc(dzInfo.chunkLength));
if (!decompressedBuf) {
Serial.printf("[DICT-DBG] decompressedBuf alloc failed!\n");
free(inflator);
2026-01-22 12:42:01 -05:00
free(compressedBuf);
file.close();
return false;
}
definition.clear();
definition.reserve(size);
// Process each needed chunk (reusing inflator allocation)
2026-01-22 12:42:01 -05:00
for (uint32_t chunk = startChunk; chunk <= endChunk; chunk++) {
const uint16_t compressedSize = dzInfo.chunkSizes[chunk];
// Seek and read compressed data
file.seek(fileOffset);
if (file.read(compressedBuf, compressedSize) != compressedSize) {
Serial.printf("[DICT-DBG] File read failed at offset %lu, size %u\n", fileOffset, compressedSize);
free(inflator);
2026-01-22 12:42:01 -05:00
free(compressedBuf);
free(decompressedBuf);
file.close();
return false;
}
// Decompress using raw inflate (no zlib header)
tinfl_init(inflator);
size_t inBytes = compressedSize;
size_t outBytes = dzInfo.chunkLength;
const tinfl_status status =
tinfl_decompress(inflator, compressedBuf, &inBytes, decompressedBuf, decompressedBuf, &outBytes,
TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF | TINFL_FLAG_PARSE_ZLIB_HEADER);
if (status != TINFL_STATUS_DONE && status != TINFL_STATUS_HAS_MORE_OUTPUT) {
// Try without zlib header flag
tinfl_init(inflator);
inBytes = compressedSize;
outBytes = dzInfo.chunkLength;
tinfl_decompress(inflator, compressedBuf, &inBytes, decompressedBuf, decompressedBuf, &outBytes,
TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF);
2026-01-22 12:42:01 -05:00
}
// Extract the portion we need from this chunk
uint32_t copyStart = 0;
uint32_t copyEnd = outBytes;
if (chunk == startChunk) {
copyStart = startOffsetInChunk;
}
if (chunk == endChunk) {
const uint32_t endOffsetInChunk = (offset + size) - (endChunk * dzInfo.chunkLength);
if (endOffsetInChunk < copyEnd) {
copyEnd = endOffsetInChunk;
}
}
if (copyEnd > copyStart) {
definition.append(reinterpret_cast<char*>(decompressedBuf + copyStart), copyEnd - copyStart);
}
fileOffset += compressedSize;
}
free(inflator);
2026-01-22 12:42:01 -05:00
free(compressedBuf);
free(decompressedBuf);
file.close();
return true;
}
// StarDict comparison function: case-insensitive matching
2026-01-22 12:42:01 -05:00
int StarDict::stardictStrcmp(const std::string& a, const std::string& b) {
// Case-insensitive comparison (like g_ascii_strcasecmp)
2026-01-22 12:42:01 -05:00
size_t i = 0;
while (i < a.length() && i < b.length()) {
const int ca = std::tolower(static_cast<unsigned char>(a[i]));
const int cb = std::tolower(static_cast<unsigned char>(b[i]));
if (ca != cb) return ca - cb;
i++;
}
if (a.length() != b.length()) {
return static_cast<int>(a.length()) - static_cast<int>(b.length());
}
// Case-insensitive match found
return 0;
2026-01-22 12:42:01 -05:00
}
// Returns `word` with leading and trailing whitespace removed and every remaining
// character passed through std::tolower.
std::string StarDict::normalizeWord(const std::string& word) {
  const auto isBlank = [](char ch) { return std::isspace(static_cast<unsigned char>(ch)) != 0; };

  // Shrink [first, last) from both ends until it no longer starts or ends with whitespace
  size_t first = 0;
  size_t last = word.length();
  while (first < last && isBlank(word[first])) ++first;
  while (last > first && isBlank(word[last - 1])) --last;

  std::string lowered;
  lowered.reserve(word.length());
  for (size_t k = first; k < last; ++k) {
    lowered.push_back(static_cast<char>(std::tolower(static_cast<unsigned char>(word[k]))));
  }
  return lowered;
}
// Looks up `word` in the dictionary.
//
// The word is normalised (trimmed, lower-cased) and compared case-insensitively against
// the entries of "<basePath>.idx", starting at the offset given by the generated
// two-letter prefix table when the first two characters are alphabetic. All consecutive
// matching entries are collected; definitions after the first are appended to the result
// separated by "</html>". If nothing was found and the dictionary declares synonyms,
// "<basePath>.syn" is scanned the same way and the referenced main entry is loaded.
//
// The returned result has `found` set only when a definition was loaded; otherwise
// `word` is the caller's input and the remaining fields keep their defaults.
StarDict::LookupResult StarDict::lookup(const std::string& word) {
  LookupResult result;
  result.word = word;
  if (!info.loaded) return result;

  const std::string normalizedSearch = normalizeWord(word);
  if (normalizedSearch.empty()) return result;
  Serial.printf("[DICT-DBG] Searching for: '%s' (normalized: '%s')\n", word.c_str(), normalizedSearch.c_str());

  const std::string idxPath = basePath + ".idx";
  FsFile idxFile;
  if (!SdMan.openFileForRead("DICT", idxPath, idxFile)) {
    Serial.printf("[%lu] [DICT] Failed to open index file\n", millis());
    return result;
  }

  // Fetches a definition from whichever data file begin() selected
  const auto fetchDefinition = [this](uint32_t off, uint32_t len, std::string& out) {
    return useUncompressed ? readDefinitionDirect(off, len, out) : decompressDefinition(off, len, out);
  };

  // The prefix jump tables are only usable when the first two characters are alphabetic
  const bool hasAlphaPrefix = normalizedSearch.length() >= 2 && DictPrefixIndex::isAlpha(normalizedSearch[0]) &&
                              DictPrefixIndex::isAlpha(normalizedSearch[1]);

  // ---- Pass 1: main index ----
  uint32_t position = 0;
  if (hasAlphaPrefix) {
    const uint16_t prefixIdx = DictPrefixIndex::prefixToIndex(normalizedSearch[0], normalizedSearch[1]);
    position = DictPrefixIndex::dictPrefixOffsets[prefixIdx];
  }
  Serial.printf("[DICT-DBG] Starting at position %lu (prefix: %c%c)\n", position, normalizedSearch[0],
                normalizedSearch[1]);

  bool found = false;
  uint32_t wordCount = 0;
  while (position < info.idxfilesize) {
    std::string currentWord;
    uint32_t dictOffset, dictSize;
    if (!readWordAtPosition(idxFile, position, currentWord, dictOffset, dictSize)) break;

    wordCount++;
    if (wordCount % 50000 == 0) {
      Serial.printf("[DICT-DBG] Progress: %lu words scanned, pos=%lu, current='%s'\n", wordCount, position,
                    currentWord.c_str());
    }

    if (stardictStrcmp(normalizedSearch, currentWord) != 0) {
      // Once at least one match was collected, the first non-match ends the run.
      // Before the first match we must keep scanning: the prefix table may not land
      // exactly on the target entry.
      if (found) break;
      continue;
    }

    Serial.printf("[DICT-DBG] MATCH: '%s' == '%s' (offset=%lu, size=%lu)\n", normalizedSearch.c_str(),
                  currentWord.c_str(), dictOffset, dictSize);

    std::string definition;
    if (!fetchDefinition(dictOffset, dictSize, definition)) {
      Serial.printf("[DICT-DBG] Definition load FAILED!\n");
      continue;
    }
    Serial.printf("[DICT-DBG] Definition loaded, %u bytes\n", definition.length());

    if (found) {
      // Additional entry for the same word (e.g. different case): append it
      result.definition += "</html>" + definition;
      continue;
    }
    result.word = currentWord;
    result.definition = definition;
    result.found = true;
    found = true;
  }
  Serial.printf("[DICT-DBG] Search complete: %lu words scanned, found=%s\n", wordCount, found ? "YES" : "NO");
  idxFile.close();

  // ---- Pass 2: synonym file, only when pass 1 found nothing ----
  if (found || !(info.synwordcount > 0)) return result;

  const std::string synPath = basePath + ".syn";
  FsFile synFile;
  if (!SdMan.openFileForRead("DICT", synPath, synFile)) return result;

  const uint32_t synFileSize = synFile.size();
  if (hasAlphaPrefix) {
    const uint16_t prefixIdx = DictPrefixIndex::prefixToIndex(normalizedSearch[0], normalizedSearch[1]);
    const uint32_t synPosition = DictPrefixIndex::synPrefixOffsets[prefixIdx];
    synFile.seek(synPosition);
  }

  while (synFile.position() < synFileSize) {
    // Synonym record: NUL-terminated word + 4-byte big-endian index into the main index
    std::string synWord;
    char c;
    while (synFile.read(&c, 1) == 1 && c != '\0') {
      synWord += c;
    }
    uint8_t idxBytes[4];
    if (synFile.read(idxBytes, 4) != 4) break;
    const uint32_t mainIdx = readBE32(idxBytes);

    // No early exit on non-matches here either, for the same reason as in pass 1
    if (stardictStrcmp(normalizedSearch, synWord) != 0) continue;

    // Synonym matched: walk the main index from the start to entry number mainIdx
    FsFile idxFile2;
    if (SdMan.openFileForRead("DICT", idxPath, idxFile2)) {
      uint32_t pos = 0;
      uint32_t entryNum = 0;
      while (entryNum < mainIdx && pos < info.idxfilesize) {
        std::string skippedWord;
        uint32_t skippedOffset, skippedSize;
        if (!readWordAtPosition(idxFile2, pos, skippedWord, skippedOffset, skippedSize)) break;
        entryNum++;
      }

      // Only read the target when the walk actually reached it
      if (entryNum == mainIdx) {
        std::string mainWord;
        uint32_t dictOffset, dictSize;
        if (readWordAtPosition(idxFile2, pos, mainWord, dictOffset, dictSize)) {
          std::string definition;
          if (fetchDefinition(dictOffset, dictSize, definition)) {
            result.word = synWord;
            result.definition = definition;
            result.found = true;
          }
        }
      }
      idxFile2.close();
    }
    break;  // First matching synonym ends the search, whether or not it could be loaded
  }
  synFile.close();
  return result;
}
// Decodes a single HTML character reference.
//
// On entry, `i` is the index of the '&' inside `html`.
// - Numeric references (&#NNN; / &#xHHH;) are converted to UTF-8. Code points that are
//   not valid Unicode scalar values (0, the surrogate range U+D800..U+DFFF, anything
//   above U+10FFFF) are replaced by U+FFFD, so the result never contains an embedded NUL
//   or an invalid UTF-8 sequence.
// - Named references are resolved through a fixed table of common entities.
// On success `i` is set to the index of the terminating ';' (the caller's loop increment
// then steps past it) and the replacement text is returned; the replacement may be empty
// for invisible characters such as &shy;.
// If the text is not a recognised reference, "&" is returned and `i` is left unchanged so
// the caller continues with the character after the ampersand.
static std::string decodeHtmlEntity(const std::string& html, size_t& i) {
  const size_t start = i;  // Position of '&'
  const size_t remaining = html.length() - start;

  // Numeric entities: &#NNN; or &#xHHH;
  if (remaining > 2 && html[start + 1] == '#') {
    size_t numStart = start + 2;
    bool isHex = false;
    if (remaining > 3 && (html[numStart] == 'x' || html[numStart] == 'X')) {
      isHex = true;
      numStart++;
    }

    // Consume digits of the selected radix
    size_t numEnd = numStart;
    while (numEnd < html.length() && html[numEnd] != ';') {
      const unsigned char c = static_cast<unsigned char>(html[numEnd]);
      const bool isDigit = isHex ? (std::isxdigit(c) != 0) : (std::isdigit(c) != 0);
      if (!isDigit) break;
      numEnd++;
    }

    // Accept only "at least one digit, immediately followed by ';'"
    if (numEnd > numStart && numEnd < html.length() && html[numEnd] == ';') {
      const std::string numStr = html.substr(numStart, numEnd - numStart);
      unsigned long codepoint = std::strtoul(numStr.c_str(), nullptr, isHex ? 16 : 10);
      i = numEnd;  // Will be incremented by caller's loop

      // NUL, surrogates and out-of-range values have no valid UTF-8 encoding
      const bool isSurrogate = codepoint >= 0xD800 && codepoint <= 0xDFFF;
      if (codepoint == 0 || isSurrogate || codepoint > 0x10FFFF) {
        codepoint = 0xFFFD;  // REPLACEMENT CHARACTER
      }

      // Convert codepoint to UTF-8
      std::string utf8;
      if (codepoint < 0x80) {
        utf8 += static_cast<char>(codepoint);
      } else if (codepoint < 0x800) {
        utf8 += static_cast<char>(0xC0 | (codepoint >> 6));
        utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
      } else if (codepoint < 0x10000) {
        utf8 += static_cast<char>(0xE0 | (codepoint >> 12));
        utf8 += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
        utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
      } else {
        utf8 += static_cast<char>(0xF0 | (codepoint >> 18));
        utf8 += static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F));
        utf8 += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
        utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
      }
      return utf8;
    }
  }

  // Named entities - find the semicolon first; anything 12+ characters away is not ours
  const size_t semicolon = html.find(';', start + 1);
  if (semicolon != std::string::npos && semicolon - start < 12) {
    const std::string entity = html.substr(start, semicolon - start + 1);

    // Common named entities
    struct EntityMapping {
      const char* entity;
      const char* replacement;
    };
    static const EntityMapping entities[] = {
        {"&nbsp;", " "},
        {"&lt;", "<"},
        {"&gt;", ">"},
        {"&amp;", "&"},
        {"&quot;", "\""},
        {"&apos;", "'"},
        {"&mdash;", "\xe2\x80\x94"},   // em dash
        {"&ndash;", "\xe2\x80\x93"},   // en dash
        {"&hellip;", "\xe2\x80\xa6"},  // horizontal ellipsis
        {"&rsquo;", "\xe2\x80\x99"},   // right single quotation mark
        {"&lsquo;", "\xe2\x80\x98"},   // left single quotation mark
        {"&rdquo;", "\xe2\x80\x9d"},   // right double quotation mark
        {"&ldquo;", "\xe2\x80\x9c"},   // left double quotation mark
        {"&deg;", "\xc2\xb0"},         // degree sign
        {"&times;", "\xc3\x97"},       // multiplication sign
        {"&divide;", "\xc3\xb7"},      // division sign
        {"&plusmn;", "\xc2\xb1"},      // plus-minus sign
        {"&frac12;", "\xc2\xbd"},      // one half
        {"&frac14;", "\xc2\xbc"},      // one quarter
        {"&frac34;", "\xc2\xbe"},      // three quarters
        {"&cent;", "\xc2\xa2"},        // cent sign
        {"&pound;", "\xc2\xa3"},       // pound sign
        {"&euro;", "\xe2\x82\xac"},    // euro sign
        {"&yen;", "\xc2\xa5"},         // yen sign
        {"&copy;", "\xc2\xa9"},        // copyright sign
        {"&reg;", "\xc2\xae"},         // registered sign
        {"&trade;", "\xe2\x84\xa2"},   // trade mark sign
        {"&bull;", "\xe2\x80\xa2"},    // bullet
        {"&middot;", "\xc2\xb7"},      // middle dot
        {"&sect;", "\xc2\xa7"},        // section sign
        {"&para;", "\xc2\xb6"},        // pilcrow
        {"&dagger;", "\xe2\x80\xa0"},  // dagger
        {"&Dagger;", "\xe2\x80\xa1"},  // double dagger
        {"&iexcl;", "\xc2\xa1"},       // inverted exclamation mark
        {"&iquest;", "\xc2\xbf"},      // inverted question mark
        {"&laquo;", "\xc2\xab"},       // left-pointing double angle quotation mark
        {"&raquo;", "\xc2\xbb"},       // right-pointing double angle quotation mark
        {"&shy;", ""},
        {"&ensp;", " "},
        {"&emsp;", " "},
        {"&thinsp;", " "},
        {"&zwj;", ""},
        {"&zwnj;", ""},
    };
    for (const auto& mapping : entities) {
      if (entity == mapping.entity) {
        i = semicolon;  // Will be incremented by caller's loop
        return mapping.replacement;
      }
    }
  }

  // Unknown entity - return just the ampersand and let the rest be processed normally
  return "&";
}
// Reports whether `tag` names an element that should be rendered with a line break.
// The comparison ignores case. `isClosing` is accepted for the caller's convenience but
// does not influence the result.
static bool isBlockTag(const std::string& tag, bool isClosing) {
  (void)isClosing;

  static const char* const kBlockTags[] = {"p",  "div", "br", "hr", "li", "dt", "dd",         "tr",  "h1",
                                           "h2", "h3",  "h4", "h5", "h6", "blockquote", "pre", "ol", "ul"};

  // Lower-case copy of the tag name for the table lookup
  std::string lowered(tag.size(), '\0');
  std::transform(tag.begin(), tag.end(), lowered.begin(),
                 [](unsigned char ch) { return static_cast<char>(std::tolower(ch)); });

  return std::any_of(std::begin(kBlockTags), std::end(kBlockTags),
                     [&lowered](const char* name) { return lowered == name; });
}
// Converts an HTML fragment to plain text:
// - tags are removed; block-level tags (see isBlockTag) produce a single '\n'
// - character references are decoded via decodeHtmlEntity
// - runs of whitespace collapse to one space
// - trailing whitespace is trimmed
// A '>' that appears outside any tag is kept as literal text rather than discarded.
std::string StarDict::stripHtml(const std::string& html) {
  std::string result;
  result.reserve(html.length());

  bool inTag = false;
  bool lastWasSpace = false;
  bool lastWasNewline = false;

  for (size_t i = 0; i < html.length(); i++) {
    const char c = html[i];

    if (c == '<') {
      // Parse the tag name
      size_t tagStart = i + 1;
      bool isClosing = false;

      // Skip whitespace after <
      while (tagStart < html.length() && std::isspace(static_cast<unsigned char>(html[tagStart]))) {
        tagStart++;
      }
      // Check for closing tag
      if (tagStart < html.length() && html[tagStart] == '/') {
        isClosing = true;
        tagStart++;
      }
      // Extract tag name: everything up to whitespace, '>' or '/'
      size_t tagEnd = tagStart;
      while (tagEnd < html.length() && !std::isspace(static_cast<unsigned char>(html[tagEnd])) &&
             html[tagEnd] != '>' && html[tagEnd] != '/') {
        tagEnd++;
      }
      const std::string tagName = html.substr(tagStart, tagEnd - tagStart);

      // Block-level elements start a new line, but never two in a row and never at the very start
      if (isBlockTag(tagName, isClosing)) {
        if (!result.empty() && !lastWasNewline) {
          result += '\n';
          lastWasNewline = true;
          lastWasSpace = true;  // Suppress a space directly after the line break
        }
      }
      inTag = true;
    } else if (c == '>' && inTag) {
      // Only a '>' that closes an open tag is markup; a stray '>' falls through to text
      inTag = false;
    } else if (!inTag) {
      // Handle HTML entities
      if (c == '&') {
        const std::string decoded = decodeHtmlEntity(html, i);
        if (!decoded.empty()) {
          // Check if decoded content is whitespace
          bool allSpace = true;
          for (const char dc : decoded) {
            if (!std::isspace(static_cast<unsigned char>(dc))) {
              allSpace = false;
              break;
            }
          }
          if (allSpace) {
            // Whitespace entities take part in whitespace collapsing
            if (!lastWasSpace) {
              result += ' ';
              lastWasSpace = true;
            }
          } else {
            result += decoded;
            lastWasSpace = false;
            lastWasNewline = false;
          }
        }
        continue;
      }

      // Collapse whitespace
      if (std::isspace(static_cast<unsigned char>(c))) {
        if (!lastWasSpace) {
          result += ' ';
          lastWasSpace = true;
        }
      } else {
        result += c;
        lastWasSpace = false;
        lastWasNewline = false;
      }
    }
  }

  // Trim trailing whitespace
  while (!result.empty() && std::isspace(static_cast<unsigned char>(result.back()))) {
    result.pop_back();
  }
  return result;
}