760 lines
23 KiB
C++
760 lines
23 KiB
C++
|
|
#include "StarDict.h"
|
|||
|
|
|
|||
|
|
#include <HardwareSerial.h>
|
|||
|
|
#include <SDCardManager.h>
|
|||
|
|
#include <miniz.h>
|
|||
|
|
|
|||
|
|
#include <algorithm>
|
|||
|
|
#include <cctype>
|
|||
|
|
|
|||
|
|
#include "DictPrefixIndex.generated.h"
|
|||
|
|
|
|||
|
|
// Stores the path prefix from which the dictionary's file names are derived
// (".ifo", ".idx", ".dict.dz" and ".syn" are appended elsewhere in this file).
// No file is opened here.
StarDict::StarDict(const std::string& basePath)
    : basePath(basePath) {
}
|
|||
|
|
|
|||
|
|
// Releases the dictzip chunk-size table allocated by loadDictzipHeader().
StarDict::~StarDict() {
  // free(nullptr) is defined to do nothing, so no null check is needed.
  free(dzInfo.chunkSizes);
  dzInfo.chunkSizes = nullptr;
}
|
|||
|
|
|
|||
|
|
// Decodes an unsigned 32-bit integer stored most-significant byte first.
// `data` must point to at least four readable bytes.
uint32_t StarDict::readBE32(const uint8_t* data) {
  uint32_t value = 0;
  for (int byteIdx = 0; byteIdx < 4; ++byteIdx) {
    value = (value << 8) | data[byteIdx];
  }
  return value;
}
|
|||
|
|
|
|||
|
|
bool StarDict::loadInfo() {
|
|||
|
|
const std::string ifoPath = basePath + ".ifo";
|
|||
|
|
FsFile file;
|
|||
|
|
if (!SdMan.openFileForRead("DICT", ifoPath, file)) {
|
|||
|
|
Serial.printf("[%lu] [DICT] Failed to open .ifo file: %s\n", millis(), ifoPath.c_str());
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
char buffer[256];
|
|||
|
|
while (file.available()) {
|
|||
|
|
const int len = file.fgets(buffer, sizeof(buffer));
|
|||
|
|
if (len <= 0) break;
|
|||
|
|
|
|||
|
|
// Remove newline
|
|||
|
|
char* newline = strchr(buffer, '\n');
|
|||
|
|
if (newline) *newline = '\0';
|
|||
|
|
newline = strchr(buffer, '\r');
|
|||
|
|
if (newline) *newline = '\0';
|
|||
|
|
|
|||
|
|
// Parse key=value
|
|||
|
|
char* eq = strchr(buffer, '=');
|
|||
|
|
if (!eq) continue;
|
|||
|
|
|
|||
|
|
*eq = '\0';
|
|||
|
|
const char* key = buffer;
|
|||
|
|
const char* value = eq + 1;
|
|||
|
|
|
|||
|
|
if (strcmp(key, "bookname") == 0) {
|
|||
|
|
info.bookname = value;
|
|||
|
|
} else if (strcmp(key, "wordcount") == 0) {
|
|||
|
|
info.wordcount = strtoul(value, nullptr, 10);
|
|||
|
|
} else if (strcmp(key, "idxfilesize") == 0) {
|
|||
|
|
info.idxfilesize = strtoul(value, nullptr, 10);
|
|||
|
|
} else if (strcmp(key, "sametypesequence") == 0) {
|
|||
|
|
info.sametypesequence = value[0];
|
|||
|
|
} else if (strcmp(key, "synwordcount") == 0) {
|
|||
|
|
info.synwordcount = strtoul(value, nullptr, 10);
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
file.close();
|
|||
|
|
info.loaded = true;
|
|||
|
|
|
|||
|
|
Serial.printf("[%lu] [DICT] Loaded dictionary: %s (%u words)\n", millis(), info.bookname.c_str(), info.wordcount);
|
|||
|
|
return true;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
bool StarDict::loadDictzipHeader() {
|
|||
|
|
if (dzInfo.loaded) return true;
|
|||
|
|
|
|||
|
|
const std::string dzPath = basePath + ".dict.dz";
|
|||
|
|
FsFile file;
|
|||
|
|
if (!SdMan.openFileForRead("DICT", dzPath, file)) {
|
|||
|
|
Serial.printf("[%lu] [DICT] Failed to open .dict.dz file\n", millis());
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Read gzip header
|
|||
|
|
uint8_t header[10];
|
|||
|
|
if (file.read(header, 10) != 10) {
|
|||
|
|
file.close();
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Verify gzip magic number
|
|||
|
|
if (header[0] != 0x1f || header[1] != 0x8b) {
|
|||
|
|
Serial.printf("[%lu] [DICT] Not a valid gzip file\n", millis());
|
|||
|
|
file.close();
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Check for extra field flag (bit 2)
|
|||
|
|
const uint8_t flags = header[3];
|
|||
|
|
if (!(flags & 0x04)) {
|
|||
|
|
Serial.printf("[%lu] [DICT] No extra field - not a dictzip file\n", millis());
|
|||
|
|
file.close();
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Read extra field length
|
|||
|
|
uint8_t xlenBuf[2];
|
|||
|
|
if (file.read(xlenBuf, 2) != 2) {
|
|||
|
|
file.close();
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
const uint16_t xlen = xlenBuf[0] | (xlenBuf[1] << 8);
|
|||
|
|
|
|||
|
|
// Read extra field
|
|||
|
|
auto* extraField = static_cast<uint8_t*>(malloc(xlen));
|
|||
|
|
if (!extraField) {
|
|||
|
|
file.close();
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
if (file.read(extraField, xlen) != xlen) {
|
|||
|
|
free(extraField);
|
|||
|
|
file.close();
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Parse dictzip subfield (SI1='R', SI2='A')
|
|||
|
|
bool foundDictzip = false;
|
|||
|
|
uint16_t pos = 0;
|
|||
|
|
while (pos + 4 <= xlen) {
|
|||
|
|
const uint8_t si1 = extraField[pos];
|
|||
|
|
const uint8_t si2 = extraField[pos + 1];
|
|||
|
|
const uint16_t slen = extraField[pos + 2] | (extraField[pos + 3] << 8);
|
|||
|
|
|
|||
|
|
if (si1 == 'R' && si2 == 'A' && pos + 4 + slen <= xlen) {
|
|||
|
|
// Dictzip subfield found
|
|||
|
|
// Format: ver(2) + chlen(2) + count(2) + sizes[count](2 each)
|
|||
|
|
const uint8_t* data = &extraField[pos + 4];
|
|||
|
|
// uint16_t version = data[0] | (data[1] << 8); // Usually 1
|
|||
|
|
dzInfo.chunkLength = data[2] | (data[3] << 8);
|
|||
|
|
dzInfo.chunkCount = data[4] | (data[5] << 8);
|
|||
|
|
|
|||
|
|
dzInfo.chunkSizes = static_cast<uint16_t*>(malloc(dzInfo.chunkCount * sizeof(uint16_t)));
|
|||
|
|
if (!dzInfo.chunkSizes) {
|
|||
|
|
free(extraField);
|
|||
|
|
file.close();
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
for (uint16_t i = 0; i < dzInfo.chunkCount; i++) {
|
|||
|
|
dzInfo.chunkSizes[i] = data[6 + i * 2] | (data[7 + i * 2] << 8);
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
foundDictzip = true;
|
|||
|
|
break;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
pos += 4 + slen;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
free(extraField);
|
|||
|
|
|
|||
|
|
if (!foundDictzip) {
|
|||
|
|
Serial.printf("[%lu] [DICT] Dictzip subfield not found\n", millis());
|
|||
|
|
file.close();
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Calculate header size (10 + 2 + xlen + optional fields)
|
|||
|
|
dzInfo.headerSize = 10 + 2 + xlen;
|
|||
|
|
|
|||
|
|
// Skip FNAME if present (bit 3)
|
|||
|
|
if (flags & 0x08) {
|
|||
|
|
file.seek(dzInfo.headerSize);
|
|||
|
|
while (file.available()) {
|
|||
|
|
uint8_t c;
|
|||
|
|
file.read(&c, 1);
|
|||
|
|
dzInfo.headerSize++;
|
|||
|
|
if (c == 0) break;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Skip FCOMMENT if present (bit 4)
|
|||
|
|
if (flags & 0x10) {
|
|||
|
|
file.seek(dzInfo.headerSize);
|
|||
|
|
while (file.available()) {
|
|||
|
|
uint8_t c;
|
|||
|
|
file.read(&c, 1);
|
|||
|
|
dzInfo.headerSize++;
|
|||
|
|
if (c == 0) break;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Skip FHCRC if present (bit 1)
|
|||
|
|
if (flags & 0x02) {
|
|||
|
|
dzInfo.headerSize += 2;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
file.close();
|
|||
|
|
dzInfo.loaded = true;
|
|||
|
|
|
|||
|
|
Serial.printf("[%lu] [DICT] Dictzip: %u chunks of %u bytes, header size %u\n", millis(), dzInfo.chunkCount,
|
|||
|
|
dzInfo.chunkLength, dzInfo.headerSize);
|
|||
|
|
return true;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
bool StarDict::begin() {
|
|||
|
|
if (!loadInfo()) return false;
|
|||
|
|
if (!loadDictzipHeader()) return false;
|
|||
|
|
return true;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Reads one index record starting at byte `position` of `idxFile`:
// a zero-terminated word followed by two big-endian 32-bit values
// (offset and size of the entry inside the uncompressed .dict data).
//
// On success fills `word`, `dictOffset` and `dictSize`, advances `position`
// to the byte after the record and returns true. Returns false when the word
// is empty, longer than 256 bytes, or the record is cut short; `position` is
// left untouched in that case (`word` may hold partial data).
bool StarDict::readWordAtPosition(FsFile& idxFile, uint32_t& position, std::string& word, uint32_t& dictOffset,
                                  uint32_t& dictSize) {
  idxFile.seek(position);

  // Collect bytes up to (not including) the terminating NUL.
  word.clear();
  for (char ch; idxFile.read(&ch, 1) == 1 && ch != '\0';) {
    word += ch;
    if (word.length() > 256) {
      return false;  // safety limit against corrupt data
    }
  }
  if (word.empty()) {
    return false;
  }

  // 8-byte trailer: offset (4 bytes) then size (4 bytes), both big-endian.
  uint8_t trailer[8];
  if (idxFile.read(trailer, 8) != 8) {
    return false;
  }
  dictOffset = readBE32(trailer);
  dictSize = readBE32(trailer + 4);

  position = idxFile.position();
  return true;
}
|
|||
|
|
|
|||
|
|
bool StarDict::decompressDefinition(uint32_t offset, uint32_t size, std::string& definition) {
|
|||
|
|
if (!dzInfo.loaded) return false;
|
|||
|
|
|
|||
|
|
const std::string dzPath = basePath + ".dict.dz";
|
|||
|
|
FsFile file;
|
|||
|
|
if (!SdMan.openFileForRead("DICT", dzPath, file)) {
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Calculate which chunk(s) we need
|
|||
|
|
const uint32_t startChunk = offset / dzInfo.chunkLength;
|
|||
|
|
const uint32_t endChunk = (offset + size - 1) / dzInfo.chunkLength;
|
|||
|
|
const uint32_t startOffsetInChunk = offset % dzInfo.chunkLength;
|
|||
|
|
|
|||
|
|
if (endChunk >= dzInfo.chunkCount) {
|
|||
|
|
file.close();
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Calculate file offset for start chunk
|
|||
|
|
uint32_t fileOffset = dzInfo.headerSize;
|
|||
|
|
for (uint32_t i = 0; i < startChunk; i++) {
|
|||
|
|
fileOffset += dzInfo.chunkSizes[i];
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Allocate buffers
|
|||
|
|
const uint32_t maxCompressedSize = 65536; // Max compressed chunk size
|
|||
|
|
auto* compressedBuf = static_cast<uint8_t*>(malloc(maxCompressedSize));
|
|||
|
|
auto* decompressedBuf = static_cast<uint8_t*>(malloc(dzInfo.chunkLength));
|
|||
|
|
if (!compressedBuf || !decompressedBuf) {
|
|||
|
|
free(compressedBuf);
|
|||
|
|
free(decompressedBuf);
|
|||
|
|
file.close();
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
definition.clear();
|
|||
|
|
definition.reserve(size);
|
|||
|
|
|
|||
|
|
// Process each needed chunk
|
|||
|
|
for (uint32_t chunk = startChunk; chunk <= endChunk; chunk++) {
|
|||
|
|
const uint16_t compressedSize = dzInfo.chunkSizes[chunk];
|
|||
|
|
|
|||
|
|
// Seek and read compressed data
|
|||
|
|
file.seek(fileOffset);
|
|||
|
|
if (file.read(compressedBuf, compressedSize) != compressedSize) {
|
|||
|
|
free(compressedBuf);
|
|||
|
|
free(decompressedBuf);
|
|||
|
|
file.close();
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Decompress using raw inflate (no zlib header)
|
|||
|
|
auto* inflator = static_cast<tinfl_decompressor*>(malloc(sizeof(tinfl_decompressor)));
|
|||
|
|
if (!inflator) {
|
|||
|
|
free(compressedBuf);
|
|||
|
|
free(decompressedBuf);
|
|||
|
|
file.close();
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
tinfl_init(inflator);
|
|||
|
|
|
|||
|
|
size_t inBytes = compressedSize;
|
|||
|
|
size_t outBytes = dzInfo.chunkLength;
|
|||
|
|
const tinfl_status status =
|
|||
|
|
tinfl_decompress(inflator, compressedBuf, &inBytes, decompressedBuf, decompressedBuf, &outBytes,
|
|||
|
|
TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF | TINFL_FLAG_PARSE_ZLIB_HEADER);
|
|||
|
|
|
|||
|
|
free(inflator);
|
|||
|
|
|
|||
|
|
if (status != TINFL_STATUS_DONE && status != TINFL_STATUS_HAS_MORE_OUTPUT) {
|
|||
|
|
// Try without zlib header flag
|
|||
|
|
inflator = static_cast<tinfl_decompressor*>(malloc(sizeof(tinfl_decompressor)));
|
|||
|
|
if (inflator) {
|
|||
|
|
tinfl_init(inflator);
|
|||
|
|
inBytes = compressedSize;
|
|||
|
|
outBytes = dzInfo.chunkLength;
|
|||
|
|
tinfl_decompress(inflator, compressedBuf, &inBytes, decompressedBuf, decompressedBuf, &outBytes,
|
|||
|
|
TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF);
|
|||
|
|
free(inflator);
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Extract the portion we need from this chunk
|
|||
|
|
uint32_t copyStart = 0;
|
|||
|
|
uint32_t copyEnd = outBytes;
|
|||
|
|
|
|||
|
|
if (chunk == startChunk) {
|
|||
|
|
copyStart = startOffsetInChunk;
|
|||
|
|
}
|
|||
|
|
if (chunk == endChunk) {
|
|||
|
|
const uint32_t endOffsetInChunk = (offset + size) - (endChunk * dzInfo.chunkLength);
|
|||
|
|
if (endOffsetInChunk < copyEnd) {
|
|||
|
|
copyEnd = endOffsetInChunk;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
if (copyEnd > copyStart) {
|
|||
|
|
definition.append(reinterpret_cast<char*>(decompressedBuf + copyStart), copyEnd - copyStart);
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
fileOffset += compressedSize;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
free(compressedBuf);
|
|||
|
|
free(decompressedBuf);
|
|||
|
|
file.close();
|
|||
|
|
|
|||
|
|
return true;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// StarDict comparison function: case-insensitive first, then case-sensitive as tiebreaker
|
|||
|
|
int StarDict::stardictStrcmp(const std::string& a, const std::string& b) {
|
|||
|
|
// First: case-insensitive comparison (like g_ascii_strcasecmp)
|
|||
|
|
size_t i = 0;
|
|||
|
|
while (i < a.length() && i < b.length()) {
|
|||
|
|
const int ca = std::tolower(static_cast<unsigned char>(a[i]));
|
|||
|
|
const int cb = std::tolower(static_cast<unsigned char>(b[i]));
|
|||
|
|
if (ca != cb) return ca - cb;
|
|||
|
|
i++;
|
|||
|
|
}
|
|||
|
|
if (a.length() != b.length()) {
|
|||
|
|
return static_cast<int>(a.length()) - static_cast<int>(b.length());
|
|||
|
|
}
|
|||
|
|
// If case-insensitive equal, use case-sensitive as tiebreaker
|
|||
|
|
return a.compare(b);
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Returns `word` with leading and trailing whitespace removed and every
// remaining byte passed through std::tolower. Whitespace inside the word is
// kept. An all-whitespace or empty input yields an empty string.
std::string StarDict::normalizeWord(const std::string& word) {
  const auto isBlank = [](char ch) { return std::isspace(static_cast<unsigned char>(ch)) != 0; };

  // Locate the non-blank core [first, last).
  size_t first = 0;
  while (first < word.length() && isBlank(word[first])) {
    ++first;
  }
  size_t last = word.length();
  while (last > first && isBlank(word[last - 1])) {
    --last;
  }

  // Copy the core and lower-case it in place.
  std::string result(word, first, last - first);
  std::transform(result.begin(), result.end(), result.begin(),
                 [](char ch) { return static_cast<char>(std::tolower(static_cast<unsigned char>(ch))); });
  return result;
}
|
|||
|
|
|
|||
|
|
// Looks up `word` and returns its definition.
//
// The search term is normalised (trimmed, lower-cased) and compared against
// headwords ignoring ASCII case, so "monday" finds an entry stored as
// "Monday". Steps:
//   1. Scan "<basePath>.idx", starting at the offset from the generated
//      two-letter prefix table when the term begins with two letters.
//      Every headword that equals the term ignoring case contributes its
//      definition; additional matches are appended, separated by "</html>".
//   2. If nothing was found and the dictionary declares synonyms, scan
//      "<basePath>.syn" the same way; the first matching synonym is resolved
//      to its main entry by index.
//
// The returned result has `found == false` and `word` set to the input when
// nothing matched or a file could not be opened.
StarDict::LookupResult StarDict::lookup(const std::string& word) {
  LookupResult result;
  result.word = word;

  if (!info.loaded) {
    return result;
  }

  const std::string normalizedSearch = normalizeWord(word);
  if (normalizedSearch.empty()) {
    return result;
  }

  // ASCII case-insensitive equality. stardictStrcmp() returns 0 only for
  // byte-identical strings, so it cannot be used to detect a match: the
  // search term is always lower-case and would never equal a headword that
  // contains a capital letter.
  const auto equalsIgnoringCase = [](const std::string& lhs, const std::string& rhs) {
    if (lhs.length() != rhs.length()) return false;
    for (size_t k = 0; k < lhs.length(); k++) {
      if (std::tolower(static_cast<unsigned char>(lhs[k])) != std::tolower(static_cast<unsigned char>(rhs[k]))) {
        return false;
      }
    }
    return true;
  };

  // The prefix jump table is only usable when the first two characters are letters.
  // NOTE(review): assumes the generated offsets point at or before the first
  // headword with that prefix regardless of case - confirm in the generator.
  const bool usePrefixJump = normalizedSearch.length() >= 2 && DictPrefixIndex::isAlpha(normalizedSearch[0]) &&
                             DictPrefixIndex::isAlpha(normalizedSearch[1]);
  uint16_t prefixIdx = 0;
  if (usePrefixJump) {
    prefixIdx = DictPrefixIndex::prefixToIndex(normalizedSearch[0], normalizedSearch[1]);
  }

  // --- 1. Main index (.idx) ---
  const std::string idxPath = basePath + ".idx";
  FsFile idxFile;
  if (!SdMan.openFileForRead("DICT", idxPath, idxFile)) {
    Serial.printf("[%lu] [DICT] Failed to open index file\n", millis());
    return result;
  }

  uint32_t position = 0;
  if (usePrefixJump) {
    position = DictPrefixIndex::dictPrefixOffsets[prefixIdx];
  }

  bool found = false;
  while (position < info.idxfilesize) {
    std::string currentWord;
    uint32_t dictOffset = 0;
    uint32_t dictSize = 0;
    if (!readWordAtPosition(idxFile, position, currentWord, dictOffset, dictSize)) {
      break;
    }

    if (equalsIgnoringCase(normalizedSearch, currentWord)) {
      std::string definition;
      if (decompressDefinition(dictOffset, dictSize, definition)) {
        if (!found) {
          result.word = currentWord;
          result.definition = definition;
          result.found = true;
          found = true;
        } else {
          result.definition += "</html>" + definition;
        }
      }
      // Keep scanning: the same word may appear again (other case, duplicate).
    } else if (stardictStrcmp(normalizedSearch, currentWord) < 0) {
      // The file is sorted, so the target cannot appear further on.
      break;
    }
  }

  idxFile.close();

  // --- 2. Synonyms (.syn) ---
  if (!found && info.synwordcount > 0) {
    const std::string synPath = basePath + ".syn";
    FsFile synFile;
    if (SdMan.openFileForRead("DICT", synPath, synFile)) {
      const uint32_t synFileSize = synFile.size();

      uint32_t synPosition = 0;
      if (usePrefixJump) {
        synPosition = DictPrefixIndex::synPrefixOffsets[prefixIdx];
        synFile.seek(synPosition);
      }

      while (synFile.position() < synFileSize) {
        // Record: zero-terminated synonym + big-endian index of the main entry.
        std::string synWord;
        bool oversized = false;
        char c;
        while (synFile.read(&c, 1) == 1 && c != '\0') {
          synWord += c;
          if (synWord.length() > 256) {
            // Same safety limit as readWordAtPosition(): treat as corrupt data.
            oversized = true;
            break;
          }
        }
        if (oversized) break;

        uint8_t idxBytes[4];
        if (synFile.read(idxBytes, 4) != 4) break;
        const uint32_t mainIdx = readBE32(idxBytes);

        if (equalsIgnoringCase(normalizedSearch, synWord)) {
          // Resolve the synonym: walk the main index up to entry number `mainIdx`.
          FsFile idxFile2;
          if (SdMan.openFileForRead("DICT", idxPath, idxFile2)) {
            uint32_t pos = 0;
            uint32_t entryNum = 0;
            while (entryNum < mainIdx && pos < info.idxfilesize) {
              std::string skippedWord;
              uint32_t skippedOffset = 0;
              uint32_t skippedSize = 0;
              if (!readWordAtPosition(idxFile2, pos, skippedWord, skippedOffset, skippedSize)) break;
              entryNum++;
            }

            if (entryNum == mainIdx) {
              std::string mainWord;
              uint32_t dictOffset = 0;
              uint32_t dictSize = 0;
              if (readWordAtPosition(idxFile2, pos, mainWord, dictOffset, dictSize)) {
                std::string definition;
                if (decompressDefinition(dictOffset, dictSize, definition)) {
                  result.word = synWord;
                  result.definition = definition;
                  result.found = true;
                  found = true;
                }
              }
            }
            idxFile2.close();
          }
          break;  // first matching synonym decides, whether or not it resolved
        }

        if (stardictStrcmp(normalizedSearch, synWord) < 0) {
          // The file is sorted, so the target cannot appear further on.
          break;
        }
      }
      synFile.close();
    }
  }

  return result;
}
|
|||
|
|
|
|||
|
|
// Decodes the HTML entity that starts at `html[i]` (which must be '&').
//
// Supported forms:
//   - numeric:  &#NNN;  and  &#xHHH;  -> the code point encoded as UTF-8
//               (an empty string for values of 0x110000 and above)
//   - named:    the common entities listed in the table below
//
// On success returns the replacement text (possibly empty, e.g. for &shy;)
// and sets `i` to the index of the terminating ';' so that the caller's loop
// increment steps past the entity. For anything unrecognised returns "&" and
// leaves `i` unchanged, so the following characters are processed as text.
static std::string decodeHtmlEntity(const std::string& html, size_t& i) {
  const size_t start = i;  // position of '&'
  const size_t remaining = html.length() - start;

  // Numeric entities: &#NNN; or &#xHHH;
  if (remaining > 2 && html[start + 1] == '#') {
    size_t numStart = start + 2;
    bool isHex = false;
    if (remaining > 3 && (html[numStart] == 'x' || html[numStart] == 'X')) {
      isHex = true;
      numStart++;
    }

    // Consume digits of the selected base.
    size_t numEnd = numStart;
    while (numEnd < html.length() && html[numEnd] != ';') {
      const unsigned char digit = static_cast<unsigned char>(html[numEnd]);
      const bool valid = isHex ? std::isxdigit(digit) != 0 : std::isdigit(digit) != 0;
      if (!valid) break;
      numEnd++;
    }

    // Accept only "at least one digit, immediately followed by ';'".
    if (numEnd > numStart && numEnd < html.length() && html[numEnd] == ';') {
      const std::string numStr = html.substr(numStart, numEnd - numStart);
      const unsigned long codepoint = std::strtoul(numStr.c_str(), nullptr, isHex ? 16 : 10);
      i = numEnd;  // caller's loop increment moves past ';'

      // Encode the code point as UTF-8.
      std::string utf8;
      if (codepoint < 0x80) {
        utf8 += static_cast<char>(codepoint);
      } else if (codepoint < 0x800) {
        utf8 += static_cast<char>(0xC0 | (codepoint >> 6));
        utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
      } else if (codepoint < 0x10000) {
        utf8 += static_cast<char>(0xE0 | (codepoint >> 12));
        utf8 += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
        utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
      } else if (codepoint < 0x110000) {
        utf8 += static_cast<char>(0xF0 | (codepoint >> 18));
        utf8 += static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F));
        utf8 += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
        utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
      }
      return utf8;
    }
  }

  // Named entities. Only look for ';' within the next 11 characters: longer
  // candidates are never in the table, and this avoids scanning the rest of
  // the document for every stray '&'.
  const size_t searchEnd = std::min(html.length(), start + 12);
  size_t semicolon = std::string::npos;
  for (size_t p = start + 1; p < searchEnd; ++p) {
    if (html[p] == ';') {
      semicolon = p;
      break;
    }
  }

  if (semicolon != std::string::npos) {
    // Includes the leading '&' and the trailing ';'.
    const std::string entity = html.substr(start, semicolon - start + 1);

    struct EntityMapping {
      const char* entity;
      const char* replacement;
    };
    static const EntityMapping entities[] = {
        {"&nbsp;", " "},
        {"&lt;", "<"},
        {"&gt;", ">"},
        {"&amp;", "&"},
        {"&quot;", "\""},
        {"&apos;", "'"},
        {"&mdash;", "\xe2\x80\x94"},   // em dash
        {"&ndash;", "\xe2\x80\x93"},   // en dash
        {"&hellip;", "\xe2\x80\xa6"},  // horizontal ellipsis
        {"&rsquo;", "\xe2\x80\x99"},   // right single quote
        {"&lsquo;", "\xe2\x80\x98"},   // left single quote
        {"&rdquo;", "\xe2\x80\x9d"},   // right double quote
        {"&ldquo;", "\xe2\x80\x9c"},   // left double quote
        {"&deg;", "\xc2\xb0"},         // degree sign
        {"&times;", "\xc3\x97"},       // multiplication sign
        {"&divide;", "\xc3\xb7"},      // division sign
        {"&plusmn;", "\xc2\xb1"},      // plus-minus sign
        {"&frac12;", "\xc2\xbd"},      // one half
        {"&frac14;", "\xc2\xbc"},      // one quarter
        {"&frac34;", "\xc2\xbe"},      // three quarters
        {"&cent;", "\xc2\xa2"},        // cent sign
        {"&pound;", "\xc2\xa3"},       // pound sign
        {"&euro;", "\xe2\x82\xac"},    // euro sign
        {"&yen;", "\xc2\xa5"},         // yen sign
        {"&copy;", "\xc2\xa9"},        // copyright sign
        {"&reg;", "\xc2\xae"},         // registered sign
        {"&trade;", "\xe2\x84\xa2"},   // trade mark sign
        {"&bull;", "\xe2\x80\xa2"},    // bullet
        {"&middot;", "\xc2\xb7"},      // middle dot
        {"&sect;", "\xc2\xa7"},        // section sign
        {"&para;", "\xc2\xb6"},        // pilcrow
        {"&dagger;", "\xe2\x80\xa0"},  // dagger
        {"&Dagger;", "\xe2\x80\xa1"},  // double dagger
        {"&iexcl;", "\xc2\xa1"},       // inverted exclamation mark
        {"&iquest;", "\xc2\xbf"},      // inverted question mark
        {"&laquo;", "\xc2\xab"},       // left guillemet
        {"&raquo;", "\xc2\xbb"},       // right guillemet
        {"&shy;", ""},                 // soft hyphen: dropped
        {"&ensp;", " "},
        {"&emsp;", " "},
        {"&thinsp;", " "},
        {"&zwj;", ""},   // zero-width joiner: dropped
        {"&zwnj;", ""},  // zero-width non-joiner: dropped
    };

    for (const auto& mapping : entities) {
      if (entity == mapping.entity) {
        i = semicolon;  // caller's loop increment moves past ';'
        return mapping.replacement;
      }
    }
  }

  // Unknown entity: emit the ampersand itself and let the rest be handled as text.
  return "&";
}
|
|||
|
|
|
|||
|
|
// Reports whether `tag` (compared case-insensitively) names an HTML element
// after which stripHtml() should start a new line. `isClosing` is accepted
// for the caller's convenience but does not influence the result.
static bool isBlockTag(const std::string& tag, bool isClosing) {
  (void)isClosing;

  static const char* const kBlockTags[] = {"p",  "div", "br", "hr", "li", "dt",         "dd",  "tr", "h1",
                                           "h2", "h3",  "h4", "h5", "h6", "blockquote", "pre", "ol", "ul"};

  // Lower-case copy of the tag name for comparison.
  std::string lowered(tag.size(), '\0');
  std::transform(tag.begin(), tag.end(), lowered.begin(),
                 [](unsigned char ch) { return static_cast<char>(std::tolower(ch)); });

  for (const char* candidate : kBlockTags) {
    if (lowered == candidate) {
      return true;
    }
  }
  return false;
}
|
|||
|
|
|
|||
|
|
std::string StarDict::stripHtml(const std::string& html) {
|
|||
|
|
std::string result;
|
|||
|
|
result.reserve(html.length());
|
|||
|
|
|
|||
|
|
bool inTag = false;
|
|||
|
|
bool lastWasSpace = false;
|
|||
|
|
bool lastWasNewline = false;
|
|||
|
|
|
|||
|
|
for (size_t i = 0; i < html.length(); i++) {
|
|||
|
|
const char c = html[i];
|
|||
|
|
|
|||
|
|
if (c == '<') {
|
|||
|
|
// Parse the tag name
|
|||
|
|
size_t tagStart = i + 1;
|
|||
|
|
bool isClosing = false;
|
|||
|
|
|
|||
|
|
// Skip whitespace after <
|
|||
|
|
while (tagStart < html.length() && std::isspace(static_cast<unsigned char>(html[tagStart]))) {
|
|||
|
|
tagStart++;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Check for closing tag
|
|||
|
|
if (tagStart < html.length() && html[tagStart] == '/') {
|
|||
|
|
isClosing = true;
|
|||
|
|
tagStart++;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Extract tag name
|
|||
|
|
size_t tagEnd = tagStart;
|
|||
|
|
while (tagEnd < html.length() && !std::isspace(static_cast<unsigned char>(html[tagEnd])) &&
|
|||
|
|
html[tagEnd] != '>' && html[tagEnd] != '/') {
|
|||
|
|
tagEnd++;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
const std::string tagName = html.substr(tagStart, tagEnd - tagStart);
|
|||
|
|
|
|||
|
|
// Check if this is a block-level element
|
|||
|
|
if (isBlockTag(tagName, isClosing)) {
|
|||
|
|
// Add line break for block elements
|
|||
|
|
if (!result.empty() && !lastWasNewline) {
|
|||
|
|
result += '\n';
|
|||
|
|
lastWasNewline = true;
|
|||
|
|
lastWasSpace = true;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
inTag = true;
|
|||
|
|
} else if (c == '>') {
|
|||
|
|
inTag = false;
|
|||
|
|
} else if (!inTag) {
|
|||
|
|
// Handle HTML entities
|
|||
|
|
if (c == '&') {
|
|||
|
|
const std::string decoded = decodeHtmlEntity(html, i);
|
|||
|
|
if (!decoded.empty()) {
|
|||
|
|
// Check if decoded content is whitespace
|
|||
|
|
bool allSpace = true;
|
|||
|
|
for (const char dc : decoded) {
|
|||
|
|
if (!std::isspace(static_cast<unsigned char>(dc))) {
|
|||
|
|
allSpace = false;
|
|||
|
|
break;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
if (allSpace) {
|
|||
|
|
if (!lastWasSpace) {
|
|||
|
|
result += ' ';
|
|||
|
|
lastWasSpace = true;
|
|||
|
|
}
|
|||
|
|
} else {
|
|||
|
|
result += decoded;
|
|||
|
|
lastWasSpace = false;
|
|||
|
|
lastWasNewline = false;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
continue;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Collapse whitespace
|
|||
|
|
if (std::isspace(static_cast<unsigned char>(c))) {
|
|||
|
|
if (!lastWasSpace) {
|
|||
|
|
result += ' ';
|
|||
|
|
lastWasSpace = true;
|
|||
|
|
}
|
|||
|
|
} else {
|
|||
|
|
result += c;
|
|||
|
|
lastWasSpace = false;
|
|||
|
|
lastWasNewline = false;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Trim trailing whitespace
|
|||
|
|
while (!result.empty() && std::isspace(static_cast<unsigned char>(result.back()))) {
|
|||
|
|
result.pop_back();
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
return result;
|
|||
|
|
}
|