- Add uncompressed dictionary (.dict) file support to avoid decompression memory issues
- Implement chunked on-demand parsing for large definitions
- Add backward navigation with re-parse capability
- Limit cached pages to MAX_CACHED_PAGES (4) to prevent memory exhaustion
- Add helper script for extracting/recompressing dictzip files
#include "StarDict.h"

#include <HardwareSerial.h>
#include <SDCardManager.h>
#include <miniz.h>

#include <algorithm>
#include <cctype>
#include <cstdlib>  // malloc, free, strtoul
#include <cstring>  // strchr, strcmp

#include "DictPrefixIndex.generated.h"
StarDict::StarDict(const std::string& basePath) : basePath(basePath) {}

StarDict::~StarDict() {
  if (dzInfo.chunkSizes) {
    free(dzInfo.chunkSizes);
    dzInfo.chunkSizes = nullptr;
  }
}
uint32_t StarDict::readBE32(const uint8_t* data) {
  return (static_cast<uint32_t>(data[0]) << 24) | (static_cast<uint32_t>(data[1]) << 16) |
         (static_cast<uint32_t>(data[2]) << 8) | static_cast<uint32_t>(data[3]);
}
bool StarDict::loadInfo() {
  const std::string ifoPath = basePath + ".ifo";
  FsFile file;
  if (!SdMan.openFileForRead("DICT", ifoPath, file)) {
    Serial.printf("[%lu] [DICT] Failed to open .ifo file: %s\n", millis(), ifoPath.c_str());
    return false;
  }

  char buffer[256];
  while (file.available()) {
    const int len = file.fgets(buffer, sizeof(buffer));
    if (len <= 0) break;

    // Remove newline
    char* newline = strchr(buffer, '\n');
    if (newline) *newline = '\0';
    newline = strchr(buffer, '\r');
    if (newline) *newline = '\0';

    // Parse key=value
    char* eq = strchr(buffer, '=');
    if (!eq) continue;

    *eq = '\0';
    const char* key = buffer;
    const char* value = eq + 1;

    if (strcmp(key, "bookname") == 0) {
      info.bookname = value;
    } else if (strcmp(key, "wordcount") == 0) {
      info.wordcount = strtoul(value, nullptr, 10);
    } else if (strcmp(key, "idxfilesize") == 0) {
      info.idxfilesize = strtoul(value, nullptr, 10);
    } else if (strcmp(key, "sametypesequence") == 0) {
      info.sametypesequence = value[0];
    } else if (strcmp(key, "synwordcount") == 0) {
      info.synwordcount = strtoul(value, nullptr, 10);
    }
  }

  file.close();
  info.loaded = true;

  Serial.printf("[%lu] [DICT] Loaded dictionary: %s (%u words)\n", millis(), info.bookname.c_str(), info.wordcount);
  return true;
}
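// For reference, the .ifo parsed above is a small key=value text file. A typical
// file looks roughly like this (illustrative values only):
//
//   StarDict's dict ifo file
//   version=3.0.0
//   bookname=Example Dictionary
//   wordcount=123456
//   synwordcount=4567
//   idxfilesize=2345678
//   sametypesequence=h
//
// Only the keys handled in loadInfo() above are used; everything else is ignored.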
bool StarDict::loadDictzipHeader() {
  if (dzInfo.loaded) return true;

  const std::string dzPath = basePath + ".dict.dz";
  FsFile file;
  if (!SdMan.openFileForRead("DICT", dzPath, file)) {
    Serial.printf("[%lu] [DICT] Failed to open .dict.dz file\n", millis());
    return false;
  }

  // Read gzip header
  uint8_t header[10];
  if (file.read(header, 10) != 10) {
    file.close();
    return false;
  }

  // Verify gzip magic number
  if (header[0] != 0x1f || header[1] != 0x8b) {
    Serial.printf("[%lu] [DICT] Not a valid gzip file\n", millis());
    file.close();
    return false;
  }

  // Check for extra field flag (bit 2)
  const uint8_t flags = header[3];
  if (!(flags & 0x04)) {
    Serial.printf("[%lu] [DICT] No extra field - not a dictzip file\n", millis());
    file.close();
    return false;
  }

  // Read extra field length
  uint8_t xlenBuf[2];
  if (file.read(xlenBuf, 2) != 2) {
    file.close();
    return false;
  }
  const uint16_t xlen = xlenBuf[0] | (xlenBuf[1] << 8);

  // Read extra field
  auto* extraField = static_cast<uint8_t*>(malloc(xlen));
  if (!extraField) {
    file.close();
    return false;
  }

  if (file.read(extraField, xlen) != xlen) {
    free(extraField);
    file.close();
    return false;
  }

  // Parse dictzip subfield (SI1='R', SI2='A')
  bool foundDictzip = false;
  uint16_t pos = 0;
  while (pos + 4 <= xlen) {
    const uint8_t si1 = extraField[pos];
    const uint8_t si2 = extraField[pos + 1];
    const uint16_t slen = extraField[pos + 2] | (extraField[pos + 3] << 8);

    if (si1 == 'R' && si2 == 'A' && pos + 4 + slen <= xlen) {
      // Dictzip subfield found
      // Format: ver(2) + chlen(2) + count(2) + sizes[count](2 each)
      const uint8_t* data = &extraField[pos + 4];
      // uint16_t version = data[0] | (data[1] << 8); // Usually 1
      dzInfo.chunkLength = data[2] | (data[3] << 8);
      dzInfo.chunkCount = data[4] | (data[5] << 8);

      dzInfo.chunkSizes = static_cast<uint16_t*>(malloc(dzInfo.chunkCount * sizeof(uint16_t)));
      if (!dzInfo.chunkSizes) {
        free(extraField);
        file.close();
        return false;
      }

      for (uint16_t i = 0; i < dzInfo.chunkCount; i++) {
        dzInfo.chunkSizes[i] = data[6 + i * 2] | (data[7 + i * 2] << 8);
      }

      foundDictzip = true;
      break;
    }

    pos += 4 + slen;
  }

  free(extraField);

  if (!foundDictzip) {
    Serial.printf("[%lu] [DICT] Dictzip subfield not found\n", millis());
    file.close();
    return false;
  }

  // Calculate header size (10 + 2 + xlen + optional fields)
  dzInfo.headerSize = 10 + 2 + xlen;

  // Skip FNAME if present (bit 3)
  if (flags & 0x08) {
    file.seek(dzInfo.headerSize);
    while (file.available()) {
      uint8_t c;
      file.read(&c, 1);
      dzInfo.headerSize++;
      if (c == 0) break;
    }
  }

  // Skip FCOMMENT if present (bit 4)
  if (flags & 0x10) {
    file.seek(dzInfo.headerSize);
    while (file.available()) {
      uint8_t c;
      file.read(&c, 1);
      dzInfo.headerSize++;
      if (c == 0) break;
    }
  }

  // Skip FHCRC if present (bit 1)
  if (flags & 0x02) {
    dzInfo.headerSize += 2;
  }

  file.close();
  dzInfo.loaded = true;

  Serial.printf("[%lu] [DICT] Dictzip: %u chunks of %u bytes, header size %u\n", millis(), dzInfo.chunkCount,
                dzInfo.chunkLength, dzInfo.headerSize);
  return true;
}
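// For reference, the gzip FEXTRA payload parsed above has this layout in a
// dictzip file (multi-byte fields little-endian):
//
//   SI1  SI2  LEN   VER  CHLEN  CHCNT  CHSIZE[0] ... CHSIZE[CHCNT-1]
//   'R'  'A'  2 B   2 B  2 B    2 B    2 B each (compressed size of chunk i)
//
// Every chunk decompresses to CHLEN bytes (except possibly the last), which is
// what makes random access into the .dict.dz possible: the byte at uncompressed
// offset N lives in chunk N / CHLEN at offset N % CHLEN.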
bool StarDict::begin() {
  if (!loadInfo()) return false;

  // Try uncompressed .dict file first (preferred - no memory overhead)
  const std::string dictPath = basePath + ".dict";
  FsFile testFile;
  if (SdMan.openFileForRead("DICT", dictPath, testFile)) {
    testFile.close();
    useUncompressed = true;
    Serial.printf("[%lu] [DICT] Using uncompressed .dict file (no decompression needed)\n", millis());
    return true;
  }

  // Fall back to compressed .dict.dz
  useUncompressed = false;
  if (!loadDictzipHeader()) return false;
  return true;
}
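// File set assumed by begin(): all paths share the same base, so a given
// basePath expands to
//   <basePath>.ifo      - metadata (required)
//   <basePath>.idx      - sorted word index (required)
//   <basePath>.dict     - uncompressed definitions (preferred when present)
//   <basePath>.dict.dz  - dictzip-compressed definitions (fallback)
//   <basePath>.syn      - synonyms (optional, used only when synwordcount > 0)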
bool StarDict::readWordAtPosition(FsFile& idxFile, uint32_t& position, std::string& word, uint32_t& dictOffset,
                                  uint32_t& dictSize) {
  idxFile.seek(position);

  // Read null-terminated word
  word.clear();
  char c;
  while (idxFile.read(&c, 1) == 1) {
    if (c == '\0') break;
    word += c;
    if (word.length() > 256) {
      // Safety limit
      return false;
    }
  }

  if (word.empty()) return false;

  // Read 4-byte big-endian offset followed by 4-byte big-endian size
  uint8_t buf[8];
  if (idxFile.read(buf, 8) != 8) return false;

  dictOffset = readBE32(buf);
  dictSize = readBE32(buf + 4);

  position = idxFile.position();
  return true;
}
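// Each .idx record read above is laid out as:
//
//   word bytes ... 0x00 | offset (4 bytes, big-endian) | size (4 bytes, big-endian)
//
// where offset/size address the definition inside the .dict (or the uncompressed
// stream of the .dict.dz). Records are stored sorted, which is what allows the
// prefix jump table in lookup() to start scanning partway through the file.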
bool StarDict::readDefinitionDirect(uint32_t offset, uint32_t size, std::string& definition) {
  // Read directly from uncompressed .dict file - no decompression needed!
  const std::string dictPath = basePath + ".dict";
  FsFile file;
  if (!SdMan.openFileForRead("DICT", dictPath, file)) {
    Serial.printf("[DICT-DBG] Failed to open .dict file\n");
    return false;
  }

  // Seek to the definition offset
  if (!file.seek(offset)) {
    Serial.printf("[DICT-DBG] Failed to seek to offset %lu\n", offset);
    file.close();
    return false;
  }

  // Read the definition directly into the string
  definition.resize(size);
  const int bytesRead = file.read(&definition[0], size);
  file.close();

  if (bytesRead != static_cast<int>(size)) {
    Serial.printf("[DICT-DBG] Read %d bytes, expected %lu\n", bytesRead, size);
    definition.clear();
    return false;
  }

  return true;
}
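// Note: at least for dictionaries that set sametypesequence (the case this reader
// handles), the .dict payload is simply every definition concatenated back to back
// with no framing or length prefixes, so the (offset, size) pair from the .idx is
// all that is needed to slice one definition out of it.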
bool StarDict::decompressDefinition(uint32_t offset, uint32_t size, std::string& definition) {
  if (!dzInfo.loaded) {
    Serial.printf("[DICT-DBG] dzInfo not loaded!\n");
    return false;
  }

  const std::string dzPath = basePath + ".dict.dz";
  FsFile file;
  if (!SdMan.openFileForRead("DICT", dzPath, file)) {
    Serial.printf("[DICT-DBG] Failed to open dict.dz file\n");
    return false;
  }

  // Calculate which chunk(s) we need
  const uint32_t startChunk = offset / dzInfo.chunkLength;
  const uint32_t endChunk = (offset + size - 1) / dzInfo.chunkLength;
  const uint32_t startOffsetInChunk = offset % dzInfo.chunkLength;

  Serial.printf("[DICT-DBG] Chunks: start=%lu, end=%lu, total=%u\n",
                startChunk, endChunk, dzInfo.chunkCount);

  if (endChunk >= dzInfo.chunkCount) {
    Serial.printf("[DICT-DBG] endChunk %lu >= chunkCount %u\n", endChunk, dzInfo.chunkCount);
    file.close();
    return false;
  }

  // Calculate file offset for start chunk
  uint32_t fileOffset = dzInfo.headerSize;
  for (uint32_t i = 0; i < startChunk; i++) {
    fileOffset += dzInfo.chunkSizes[i];
  }

  // Calculate actual max compressed size needed for the chunks we'll process
  uint32_t maxCompressedSize = 0;
  for (uint32_t i = startChunk; i <= endChunk; i++) {
    if (dzInfo.chunkSizes[i] > maxCompressedSize) {
      maxCompressedSize = dzInfo.chunkSizes[i];
    }
  }

  // Allocate buffers - allocate inflator FIRST (smallest) to reduce fragmentation impact
  // tinfl_decompressor is ~11KB, so total allocations are ~85KB
  Serial.printf("[DICT-DBG] Allocating inflator=%u, comp=%lu, decomp=%u bytes\n",
                sizeof(tinfl_decompressor), maxCompressedSize, dzInfo.chunkLength);

  auto* inflator = static_cast<tinfl_decompressor*>(malloc(sizeof(tinfl_decompressor)));
  if (!inflator) {
    Serial.printf("[DICT-DBG] inflator alloc failed! (need %u bytes)\n", sizeof(tinfl_decompressor));
    file.close();
    return false;
  }

  auto* compressedBuf = static_cast<uint8_t*>(malloc(maxCompressedSize));
  if (!compressedBuf) {
    Serial.printf("[DICT-DBG] compressedBuf alloc failed!\n");
    free(inflator);
    file.close();
    return false;
  }
  auto* decompressedBuf = static_cast<uint8_t*>(malloc(dzInfo.chunkLength));
  if (!decompressedBuf) {
    Serial.printf("[DICT-DBG] decompressedBuf alloc failed!\n");
    free(inflator);
    free(compressedBuf);
    file.close();
    return false;
  }

  definition.clear();
  definition.reserve(size);

  // Process each needed chunk (reusing inflator allocation)
  for (uint32_t chunk = startChunk; chunk <= endChunk; chunk++) {
    const uint16_t compressedSize = dzInfo.chunkSizes[chunk];

    // Seek and read compressed data
    file.seek(fileOffset);
    if (file.read(compressedBuf, compressedSize) != compressedSize) {
      Serial.printf("[DICT-DBG] File read failed at offset %lu, size %u\n", fileOffset, compressedSize);
      free(inflator);
      free(compressedBuf);
      free(decompressedBuf);
      file.close();
      return false;
    }

    // Decompress: try zlib-wrapped data first, fall back to raw deflate below
    tinfl_init(inflator);

    size_t inBytes = compressedSize;
    size_t outBytes = dzInfo.chunkLength;
    const tinfl_status status =
        tinfl_decompress(inflator, compressedBuf, &inBytes, decompressedBuf, decompressedBuf, &outBytes,
                         TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF | TINFL_FLAG_PARSE_ZLIB_HEADER);

    if (status != TINFL_STATUS_DONE && status != TINFL_STATUS_HAS_MORE_OUTPUT) {
      // Try again as raw deflate (no zlib header)
      tinfl_init(inflator);
      inBytes = compressedSize;
      outBytes = dzInfo.chunkLength;
      tinfl_decompress(inflator, compressedBuf, &inBytes, decompressedBuf, decompressedBuf, &outBytes,
                       TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF);
    }

    // Extract the portion we need from this chunk
    uint32_t copyStart = 0;
    uint32_t copyEnd = outBytes;

    if (chunk == startChunk) {
      copyStart = startOffsetInChunk;
    }
    if (chunk == endChunk) {
      const uint32_t endOffsetInChunk = (offset + size) - (endChunk * dzInfo.chunkLength);
      if (endOffsetInChunk < copyEnd) {
        copyEnd = endOffsetInChunk;
      }
    }

    if (copyEnd > copyStart) {
      definition.append(reinterpret_cast<char*>(decompressedBuf + copyStart), copyEnd - copyStart);
    }

    fileOffset += compressedSize;
  }

  free(inflator);
  free(compressedBuf);
  free(decompressedBuf);
  file.close();

  return true;
}
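// Worked example of the chunk arithmetic above (made-up numbers): with
// chunkLength = 50000, a definition at offset = 145000 with size = 10000 gives
//   startChunk         = 145000 / 50000 = 2
//   endChunk           = (145000 + 10000 - 1) / 50000 = 3
//   startOffsetInChunk = 145000 % 50000 = 45000
// so chunk 2 contributes its last 5000 decompressed bytes and chunk 3 its first
// 5000, and only those two chunks are read and inflated for this lookup.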
// StarDict comparison function: case-insensitive matching
int StarDict::stardictStrcmp(const std::string& a, const std::string& b) {
  // Case-insensitive comparison (like g_ascii_strcasecmp)
  size_t i = 0;
  while (i < a.length() && i < b.length()) {
    const int ca = std::tolower(static_cast<unsigned char>(a[i]));
    const int cb = std::tolower(static_cast<unsigned char>(b[i]));
    if (ca != cb) return ca - cb;
    i++;
  }
  if (a.length() != b.length()) {
    return static_cast<int>(a.length()) - static_cast<int>(b.length());
  }
  // Case-insensitive match found
  return 0;
}
std::string StarDict::normalizeWord(const std::string& word) {
  std::string result;
  result.reserve(word.length());

  // Trim leading whitespace
  size_t start = 0;
  while (start < word.length() && std::isspace(static_cast<unsigned char>(word[start]))) {
    start++;
  }

  // Trim trailing whitespace
  size_t end = word.length();
  while (end > start && std::isspace(static_cast<unsigned char>(word[end - 1]))) {
    end--;
  }

  // Convert to lowercase
  for (size_t i = start; i < end; i++) {
    result += static_cast<char>(std::tolower(static_cast<unsigned char>(word[i])));
  }

  return result;
}
StarDict::LookupResult StarDict::lookup(const std::string& word) {
  LookupResult result;
  result.word = word;

  if (!info.loaded) {
    return result;
  }

  const std::string normalizedSearch = normalizeWord(word);
  if (normalizedSearch.empty()) {
    return result;
  }

  Serial.printf("[DICT-DBG] Searching for: '%s' (normalized: '%s')\n", word.c_str(), normalizedSearch.c_str());

  // First try .idx (main entries) - use prefix jump table for fast lookup
  const std::string idxPath = basePath + ".idx";
  FsFile idxFile;
  if (!SdMan.openFileForRead("DICT", idxPath, idxFile)) {
    Serial.printf("[%lu] [DICT] Failed to open index file\n", millis());
    return result;
  }

  // Jump to the relevant section using prefix index (if word has 2+ alpha chars)
  uint32_t position = 0;
  if (normalizedSearch.length() >= 2 && DictPrefixIndex::isAlpha(normalizedSearch[0]) &&
      DictPrefixIndex::isAlpha(normalizedSearch[1])) {
    const uint16_t prefixIdx = DictPrefixIndex::prefixToIndex(normalizedSearch[0], normalizedSearch[1]);
    position = DictPrefixIndex::dictPrefixOffsets[prefixIdx];
  }
  Serial.printf("[DICT-DBG] Starting at position %lu (prefix: %c%c)\n",
                position, normalizedSearch[0], normalizedSearch[1]);
  bool found = false;
  uint32_t wordCount = 0;

  while (position < info.idxfilesize) {
    std::string currentWord;
    uint32_t dictOffset, dictSize;

    if (!readWordAtPosition(idxFile, position, currentWord, dictOffset, dictSize)) {
      break;
    }
    wordCount++;
    if (wordCount % 50000 == 0) {
      Serial.printf("[DICT-DBG] Progress: %lu words scanned, pos=%lu, current='%s'\n",
                    wordCount, position, currentWord.c_str());
    }

    // Use stardictStrcmp for case-insensitive matching
    const int cmp = stardictStrcmp(normalizedSearch, currentWord);

    if (cmp == 0) {
      Serial.printf("[DICT-DBG] MATCH: '%s' == '%s' (offset=%lu, size=%lu)\n",
                    normalizedSearch.c_str(), currentWord.c_str(), dictOffset, dictSize);
      std::string definition;
      const bool loaded = useUncompressed
                              ? readDefinitionDirect(dictOffset, dictSize, definition)
                              : decompressDefinition(dictOffset, dictSize, definition);
      if (loaded) {
        Serial.printf("[DICT-DBG] Definition loaded, %u bytes\n", definition.length());
        if (!found) {
          result.word = currentWord;
          result.definition = definition;
          result.found = true;
          found = true;
        } else {
          result.definition += "</html>" + definition;
        }
      } else {
        Serial.printf("[DICT-DBG] Definition load FAILED!\n");
      }
      // Continue scanning for additional matches (same word, different case)
    } else if (found) {
      // We had matches but now moved past them - safe to stop
      break;
    }
    // Note: Cannot use early-break before first match because prefix index
    // may not land exactly at target position
  }

  Serial.printf("[DICT-DBG] Search complete: %lu words scanned, found=%s\n", wordCount, found ? "YES" : "NO");
  idxFile.close();

  // If not found in main index, try synonym file with prefix jump
  if (!found && info.synwordcount > 0) {
    const std::string synPath = basePath + ".syn";
    FsFile synFile;
    if (SdMan.openFileForRead("DICT", synPath, synFile)) {
      const uint32_t synFileSize = synFile.size();

      // Jump to the relevant section using prefix index (if word has 2+ alpha chars)
      uint32_t synPosition = 0;
      if (normalizedSearch.length() >= 2 && DictPrefixIndex::isAlpha(normalizedSearch[0]) &&
          DictPrefixIndex::isAlpha(normalizedSearch[1])) {
        const uint16_t prefixIdx = DictPrefixIndex::prefixToIndex(normalizedSearch[0], normalizedSearch[1]);
        synPosition = DictPrefixIndex::synPrefixOffsets[prefixIdx];
        synFile.seek(synPosition);
      }

      while (synFile.position() < synFileSize) {
        // Read synonym word (null-terminated)
        std::string synWord;
        char c;
        while (synFile.read(&c, 1) == 1 && c != '\0') {
          synWord += c;
        }

        // Read 4-byte big-endian index
        uint8_t idxBytes[4];
        if (synFile.read(idxBytes, 4) != 4) break;
        const uint32_t mainIdx = readBE32(idxBytes);

        // Use stardictStrcmp for case-insensitive comparison
        const int cmp = stardictStrcmp(normalizedSearch, synWord);

        if (cmp == 0) {
          // Found synonym - look up the main entry by index
          FsFile idxFile2;
          if (SdMan.openFileForRead("DICT", idxPath, idxFile2)) {
            uint32_t pos = 0;
            uint32_t entryNum = 0;
            while (entryNum < mainIdx && pos < info.idxfilesize) {
              std::string w;
              uint32_t off, sz;
              if (!readWordAtPosition(idxFile2, pos, w, off, sz)) break;
              entryNum++;
            }
            // Now read the target entry
            if (entryNum == mainIdx) {
              std::string mainWord;
              uint32_t dictOffset, dictSize;
              if (readWordAtPosition(idxFile2, pos, mainWord, dictOffset, dictSize)) {
                std::string definition;
                const bool loaded = useUncompressed
                                        ? readDefinitionDirect(dictOffset, dictSize, definition)
                                        : decompressDefinition(dictOffset, dictSize, definition);
                if (loaded) {
                  result.word = synWord;
                  result.definition = definition;
                  result.found = true;
                  found = true;
                }
              }
            }
            idxFile2.close();
          }
          break;  // Found a match, stop searching
        }
        // Note: Cannot use early-break optimization here because prefix index
        // may not land exactly at target position
      }
      synFile.close();
    }
  }

  return result;
}
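// Typical call sequence (sketch only; the base path is hypothetical and depends
// on where dictionaries live on the SD card):
//
//   StarDict dict("/dictionaries/webster");
//   if (dict.begin()) {
//     StarDict::LookupResult r = dict.lookup("serendipity");
//     if (r.found) {
//       const std::string plain = dict.stripHtml(r.definition);
//       // render `plain` ...
//     }
//   }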
// Helper to decode a single HTML entity; on entry i is the position of the '&'.
// Returns the decoded string and leaves i on the final ';' so the caller's loop
// increment moves past the entity.
static std::string decodeHtmlEntity(const std::string& html, size_t& i) {
  const size_t start = i;  // Position of '&'
  const size_t remaining = html.length() - start;

  // Numeric entities: &#NNN; or &#xHHH;
  if (remaining > 2 && html[start + 1] == '#') {
    size_t numStart = start + 2;
    bool isHex = false;
    if (remaining > 3 && (html[numStart] == 'x' || html[numStart] == 'X')) {
      isHex = true;
      numStart++;
    }

    size_t numEnd = numStart;
    while (numEnd < html.length() && html[numEnd] != ';') {
      const char c = html[numEnd];
      if (isHex) {
        if (!std::isxdigit(static_cast<unsigned char>(c))) break;
      } else {
        if (!std::isdigit(static_cast<unsigned char>(c))) break;
      }
      numEnd++;
    }

    if (numEnd > numStart && numEnd < html.length() && html[numEnd] == ';') {
      const std::string numStr = html.substr(numStart, numEnd - numStart);
      unsigned long codepoint = std::strtoul(numStr.c_str(), nullptr, isHex ? 16 : 10);
      i = numEnd;  // Will be incremented by caller's loop

      // Convert codepoint to UTF-8
      std::string utf8;
      if (codepoint < 0x80) {
        utf8 += static_cast<char>(codepoint);
      } else if (codepoint < 0x800) {
        utf8 += static_cast<char>(0xC0 | (codepoint >> 6));
        utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
      } else if (codepoint < 0x10000) {
        utf8 += static_cast<char>(0xE0 | (codepoint >> 12));
        utf8 += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
        utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
      } else if (codepoint < 0x110000) {
        utf8 += static_cast<char>(0xF0 | (codepoint >> 18));
        utf8 += static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F));
        utf8 += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
        utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
      }
      return utf8;
    }
  }

  // Named entities - find the semicolon first
  size_t semicolon = html.find(';', start + 1);
  if (semicolon != std::string::npos && semicolon - start < 12) {
    const std::string entity = html.substr(start, semicolon - start + 1);

    // Common named entities (entity name -> UTF-8 replacement)
    struct EntityMapping {
      const char* entity;
      const char* replacement;
    };
    static const EntityMapping entities[] = {
        {"&nbsp;", " "},
        {"&lt;", "<"},
        {"&gt;", ">"},
        {"&amp;", "&"},
        {"&quot;", "\""},
        {"&apos;", "'"},
        {"&mdash;", "\xe2\x80\x94"},   // em dash
        {"&ndash;", "\xe2\x80\x93"},   // en dash
        {"&hellip;", "\xe2\x80\xa6"},  // ellipsis
        {"&rsquo;", "\xe2\x80\x99"},   // right single quote
        {"&lsquo;", "\xe2\x80\x98"},   // left single quote
        {"&rdquo;", "\xe2\x80\x9d"},   // right double quote
        {"&ldquo;", "\xe2\x80\x9c"},   // left double quote
        {"&deg;", "\xc2\xb0"},         // degree sign
        {"&times;", "\xc3\x97"},       // multiplication sign
        {"&divide;", "\xc3\xb7"},      // division sign
        {"&plusmn;", "\xc2\xb1"},      // plus-minus
        {"&frac12;", "\xc2\xbd"},      // one half
        {"&frac14;", "\xc2\xbc"},      // one quarter
        {"&frac34;", "\xc2\xbe"},      // three quarters
        {"&cent;", "\xc2\xa2"},        // cent
        {"&pound;", "\xc2\xa3"},       // pound
        {"&euro;", "\xe2\x82\xac"},    // euro
        {"&yen;", "\xc2\xa5"},         // yen
        {"&copy;", "\xc2\xa9"},        // copyright
        {"&reg;", "\xc2\xae"},         // registered
        {"&trade;", "\xe2\x84\xa2"},   // trademark
        {"&bull;", "\xe2\x80\xa2"},    // bullet
        {"&middot;", "\xc2\xb7"},      // middle dot
        {"&sect;", "\xc2\xa7"},        // section sign
        {"&para;", "\xc2\xb6"},        // pilcrow
        {"&dagger;", "\xe2\x80\xa0"},  // dagger
        {"&Dagger;", "\xe2\x80\xa1"},  // double dagger
        {"&iexcl;", "\xc2\xa1"},       // inverted exclamation
        {"&iquest;", "\xc2\xbf"},      // inverted question mark
        {"&laquo;", "\xc2\xab"},       // left guillemet
        {"&raquo;", "\xc2\xbb"},       // right guillemet
        {"&shy;", ""},                 // soft hyphen (dropped)
        {"&ensp;", " "},
        {"&emsp;", " "},
        {"&thinsp;", " "},
        {"&zwj;", ""},
        {"&zwnj;", ""},
    };

    for (const auto& mapping : entities) {
      if (entity == mapping.entity) {
        i = semicolon;  // Will be incremented by caller's loop
        return mapping.replacement;
      }
    }
  }

  // Unknown entity - return just the ampersand and let the rest be processed normally
  return "&";
}
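// Example of the numeric path above: "&#233;" parses to codepoint 233 (0xE9),
// which is >= 0x80 and < 0x800, so it is emitted as the two UTF-8 bytes
// 0xC3 0xA9 ("é"). Named entities fall through to the lookup table instead.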
// Helper to check if a tag is a block-level element that needs line breaks
static bool isBlockTag(const std::string& tag, bool isClosing) {
  // Normalize to lowercase for comparison
  std::string lowerTag = tag;
  for (char& c : lowerTag) {
    c = std::tolower(static_cast<unsigned char>(c));
  }

  // Block-level tags that should have line breaks
  if (lowerTag == "p" || lowerTag == "div" || lowerTag == "br" || lowerTag == "hr" || lowerTag == "li" ||
      lowerTag == "dt" || lowerTag == "dd" || lowerTag == "tr" || lowerTag == "h1" || lowerTag == "h2" ||
      lowerTag == "h3" || lowerTag == "h4" || lowerTag == "h5" || lowerTag == "h6" || lowerTag == "blockquote" ||
      lowerTag == "pre" || lowerTag == "ol" || lowerTag == "ul") {
    return true;
  }
  return false;
}
std::string StarDict::stripHtml(const std::string& html) {
  std::string result;
  result.reserve(html.length());

  bool inTag = false;
  bool lastWasSpace = false;
  bool lastWasNewline = false;

  for (size_t i = 0; i < html.length(); i++) {
    const char c = html[i];

    if (c == '<') {
      // Parse the tag name
      size_t tagStart = i + 1;
      bool isClosing = false;

      // Skip whitespace after <
      while (tagStart < html.length() && std::isspace(static_cast<unsigned char>(html[tagStart]))) {
        tagStart++;
      }

      // Check for closing tag
      if (tagStart < html.length() && html[tagStart] == '/') {
        isClosing = true;
        tagStart++;
      }

      // Extract tag name
      size_t tagEnd = tagStart;
      while (tagEnd < html.length() && !std::isspace(static_cast<unsigned char>(html[tagEnd])) && html[tagEnd] != '>' &&
             html[tagEnd] != '/') {
        tagEnd++;
      }

      const std::string tagName = html.substr(tagStart, tagEnd - tagStart);

      // Check if this is a block-level element
      if (isBlockTag(tagName, isClosing)) {
        // Add line break for block elements
        if (!result.empty() && !lastWasNewline) {
          result += '\n';
          lastWasNewline = true;
          lastWasSpace = true;
        }
      }

      inTag = true;
    } else if (c == '>') {
      inTag = false;
    } else if (!inTag) {
      // Handle HTML entities
      if (c == '&') {
        const std::string decoded = decodeHtmlEntity(html, i);
        if (!decoded.empty()) {
          // Check if decoded content is whitespace
          bool allSpace = true;
          for (const char dc : decoded) {
            if (!std::isspace(static_cast<unsigned char>(dc))) {
              allSpace = false;
              break;
            }
          }

          if (allSpace) {
            if (!lastWasSpace) {
              result += ' ';
              lastWasSpace = true;
            }
          } else {
            result += decoded;
            lastWasSpace = false;
            lastWasNewline = false;
          }
        }
        continue;
      }

      // Collapse whitespace
      if (std::isspace(static_cast<unsigned char>(c))) {
        if (!lastWasSpace) {
          result += ' ';
          lastWasSpace = true;
        }
      } else {
        result += c;
        lastWasSpace = false;
        lastWasNewline = false;
      }
    }
  }

  // Trim trailing whitespace
  while (!result.empty() && std::isspace(static_cast<unsigned char>(result.back()))) {
    result.pop_back();
  }

  return result;
}
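// Example: stripHtml("<b>cat</b> <i>n.</i><br>a small animal")
// returns "cat n.\na small animal": inline tags are dropped, the block-level
// <br> becomes a newline, and runs of whitespace collapse to a single space.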