#include "StarDict.h"

// NOTE(review): the names of the five angle-bracket includes were lost when
// this file was extracted. The three library headers below are a best-effort
// reconstruction - confirm them against the project. This translation unit
// needs declarations for Serial / millis(), SdMan / FsFile and the tinfl_*
// inflate API.
#include <HardwareSerial.h>
#include <SDCardManager.h>
#include <miniz.h>

#include <cctype>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <memory>
#include <string>

#include "DictPrefixIndex.generated.h"

namespace {

// Deleter that lets std::unique_ptr own memory obtained from malloc().
struct FreeDeleter {
  void operator()(void* ptr) const { free(ptr); }
};

using ByteBuffer = std::unique_ptr<uint8_t, FreeDeleter>;
using InflatorPtr = std::unique_ptr<tinfl_decompressor, FreeDeleter>;

// Size of the buffer holding one compressed chunk. Compressed chunk sizes are
// stored as 16-bit values, so a chunk always fits.
constexpr uint32_t kMaxCompressedChunkSize = 65536;

// Safety limit on the length of a headword read from the .idx file.
constexpr size_t kMaxWordLength = 256;

// Case-insensitive comparison only (the first stage of stardictStrcmp).
// Returns <0, 0 or >0. Two words that differ only in letter case compare equal.
int compareIgnoreCase(const std::string& a, const std::string& b) {
  const size_t common = a.length() < b.length() ? a.length() : b.length();
  for (size_t i = 0; i < common; i++) {
    const int ca = std::tolower(static_cast<unsigned char>(a[i]));
    const int cb = std::tolower(static_cast<unsigned char>(b[i]));
    if (ca != cb) return ca - cb;
  }
  if (a.length() != b.length()) {
    return static_cast<int>(a.length()) - static_cast<int>(b.length());
  }
  return 0;
}

// Inflates one compressed chunk into `out` and returns the number of bytes
// produced. The first attempt parses a zlib header; if that does not end in
// DONE / HAS_MORE_OUTPUT the chunk is inflated again as a raw deflate stream.
// The status of the second attempt is intentionally not checked (this matches
// the original behaviour): whatever it produced is returned.
// NOTE(review): presumably dictzip chunks do not end with a final deflate
// block, so a strict status check here would reject valid data - confirm
// before tightening.
size_t inflateChunk(tinfl_decompressor* inflator, const uint8_t* in, size_t inSize, uint8_t* out,
                    size_t outCapacity) {
  tinfl_init(inflator);
  size_t inBytes = inSize;
  size_t outBytes = outCapacity;
  const tinfl_status status =
      tinfl_decompress(inflator, in, &inBytes, out, out, &outBytes,
                       TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF | TINFL_FLAG_PARSE_ZLIB_HEADER);
  if (status == TINFL_STATUS_DONE || status == TINFL_STATUS_HAS_MORE_OUTPUT) {
    return outBytes;
  }

  // Try again without the zlib header flag
  tinfl_init(inflator);
  inBytes = inSize;
  outBytes = outCapacity;
  tinfl_decompress(inflator, in, &inBytes, out, out, &outBytes, TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF);
  return outBytes;
}

}  // namespace

// Stores the base path; ".ifo", ".idx", ".syn" and ".dict.dz" are appended to
// it when the individual files are opened.
StarDict::StarDict(const std::string& basePath) : basePath(basePath) {}

// Releases the chunk size table allocated by loadDictzipHeader().
StarDict::~StarDict() {
  if (dzInfo.chunkSizes) {
    free(dzInfo.chunkSizes);
    dzInfo.chunkSizes = nullptr;
  }
}

// Decodes a 32-bit big-endian value from the four bytes at `data`.
uint32_t StarDict::readBE32(const uint8_t* data) {
  return (static_cast<uint32_t>(data[0]) << 24) | (static_cast<uint32_t>(data[1]) << 16) |
         (static_cast<uint32_t>(data[2]) << 8) | static_cast<uint32_t>(data[3]);
}

// Parses the key=value lines of the .ifo file into `info`.
// Returns false only if the file cannot be opened; unknown keys and lines
// without '=' are ignored.
bool StarDict::loadInfo() {
  const std::string ifoPath = basePath + ".ifo";

  FsFile file;
  if (!SdMan.openFileForRead("DICT", ifoPath, file)) {
    Serial.printf("[%lu] [DICT] Failed to open .ifo file: %s\n", millis(), ifoPath.c_str());
    return false;
  }

  char buffer[256];
  while (file.available()) {
    const int len = file.fgets(buffer, sizeof(buffer));
    if (len <= 0) break;

    // Remove newline
    char* newline = strchr(buffer, '\n');
    if (newline) *newline = '\0';
    newline = strchr(buffer, '\r');
    if (newline) *newline = '\0';

    // Parse key=value
    char* eq = strchr(buffer, '=');
    if (!eq) continue;

    *eq = '\0';
    const char* key = buffer;
    const char* value = eq + 1;

    if (strcmp(key, "bookname") == 0) {
      info.bookname = value;
    } else if (strcmp(key, "wordcount") == 0) {
      info.wordcount = strtoul(value, nullptr, 10);
    } else if (strcmp(key, "idxfilesize") == 0) {
      info.idxfilesize = strtoul(value, nullptr, 10);
    } else if (strcmp(key, "sametypesequence") == 0) {
      info.sametypesequence = value[0];
    } else if (strcmp(key, "synwordcount") == 0) {
      info.synwordcount = strtoul(value, nullptr, 10);
    }
  }

  file.close();
  info.loaded = true;

  Serial.printf("[%lu] [DICT] Loaded dictionary: %s (%u words)\n", millis(), info.bookname.c_str(), info.wordcount);
  return true;
}

// Reads the gzip header of the .dict.dz file and extracts the dictzip chunk
// table (the 'R','A' extra subfield) into `dzInfo`.
// Returns true if the table was loaded (or had been loaded before); false on
// I/O errors or when the header is not a well-formed dictzip header.
bool StarDict::loadDictzipHeader() {
  if (dzInfo.loaded) return true;

  const std::string dzPath = basePath + ".dict.dz";

  FsFile file;
  if (!SdMan.openFileForRead("DICT", dzPath, file)) {
    Serial.printf("[%lu] [DICT] Failed to open .dict.dz file\n", millis());
    return false;
  }

  // Read gzip header
  uint8_t header[10];
  if (file.read(header, 10) != 10) {
    file.close();
    return false;
  }

  // Verify gzip magic number
  if (header[0] != 0x1f || header[1] != 0x8b) {
    Serial.printf("[%lu] [DICT] Not a valid gzip file\n", millis());
    file.close();
    return false;
  }

  // Check for extra field flag (bit 2)
  const uint8_t flags = header[3];
  if (!(flags & 0x04)) {
    Serial.printf("[%lu] [DICT] No extra field - not a dictzip file\n", millis());
    file.close();
    return false;
  }

  // Read extra field length (little-endian)
  uint8_t xlenBuf[2];
  if (file.read(xlenBuf, 2) != 2) {
    file.close();
    return false;
  }
  const uint16_t xlen = static_cast<uint16_t>(xlenBuf[0] | (xlenBuf[1] << 8));

  // Read extra field
  ByteBuffer extraField(static_cast<uint8_t*>(malloc(xlen)));
  if (!extraField) {
    file.close();
    return false;
  }
  if (file.read(extraField.get(), xlen) != xlen) {
    file.close();
    return false;
  }

  // Parse dictzip subfield (SI1='R', SI2='A').
  // `pos` is 32-bit on purpose: with a 16-bit cursor, `pos + 4 + slen` could
  // wrap around and the loop would never terminate on a malformed header.
  const uint8_t* extra = extraField.get();
  bool foundDictzip = false;
  uint32_t pos = 0;
  while (pos + 4 <= xlen) {
    const uint8_t si1 = extra[pos];
    const uint8_t si2 = extra[pos + 1];
    const uint32_t slen = static_cast<uint32_t>(extra[pos + 2] | (extra[pos + 3] << 8));

    if (si1 == 'R' && si2 == 'A' && pos + 4 + slen <= xlen) {
      // Dictzip subfield found
      // Format: ver(2) + chlen(2) + count(2) + sizes[count](2 each)
      const uint8_t* data = extra + pos + 4;

      uint32_t chunkLength = 0;
      uint32_t chunkCount = 0;
      if (slen >= 6) {
        // data[0..1] is the version (usually 1) and is not needed
        chunkLength = static_cast<uint32_t>(data[2] | (data[3] << 8));
        chunkCount = static_cast<uint32_t>(data[4] | (data[5] << 8));
      }

      // Reject tables that do not fit in the subfield (would read past the
      // extra field) and a zero chunk length (would divide by zero later).
      if (slen < 6 || chunkLength == 0 || chunkCount == 0 || 6 + chunkCount * 2 > slen) {
        Serial.printf("[%lu] [DICT] Malformed dictzip subfield\n", millis());
        file.close();
        return false;
      }

      dzInfo.chunkLength = chunkLength;
      dzInfo.chunkCount = chunkCount;
      dzInfo.chunkSizes = static_cast<uint16_t*>(malloc(chunkCount * sizeof(uint16_t)));
      if (!dzInfo.chunkSizes) {
        file.close();
        return false;
      }

      for (uint32_t i = 0; i < chunkCount; i++) {
        dzInfo.chunkSizes[i] = static_cast<uint16_t>(data[6 + i * 2] | (data[7 + i * 2] << 8));
      }

      foundDictzip = true;
      break;
    }

    pos += 4 + slen;
  }

  if (!foundDictzip) {
    Serial.printf("[%lu] [DICT] Dictzip subfield not found\n", millis());
    file.close();
    return false;
  }

  // Calculate header size (10 + 2 + xlen + optional fields)
  dzInfo.headerSize = 10 + 2 + xlen;

  // Advances headerSize past one zero-terminated header string.
  const auto skipZeroTerminated = [&]() {
    file.seek(dzInfo.headerSize);
    while (file.available()) {
      uint8_t c = 0;
      if (file.read(&c, 1) != 1) break;
      dzInfo.headerSize++;
      if (c == 0) break;
    }
  };

  // Skip FNAME if present (bit 3)
  if (flags & 0x08) skipZeroTerminated();

  // Skip FCOMMENT if present (bit 4)
  if (flags & 0x10) skipZeroTerminated();

  // Skip FHCRC if present (bit 1)
  if (flags & 0x02) {
    dzInfo.headerSize += 2;
  }

  file.close();
  dzInfo.loaded = true;

  Serial.printf("[%lu] [DICT] Dictzip: %u chunks of %u bytes, header size %u\n", millis(), dzInfo.chunkCount,
                dzInfo.chunkLength, dzInfo.headerSize);
  return true;
}

// Loads the .ifo metadata and the dictzip header. Returns true if both succeed.
bool StarDict::begin() {
  return loadInfo() && loadDictzipHeader();
}

// Reads one index entry starting at `position` in `idxFile`: a
// null-terminated word followed by two 32-bit big-endian values (offset and
// size of the definition in the uncompressed dictionary data).
// On success `position` is advanced to the start of the next entry.
// Returns false on an empty word, a word longer than the safety limit, or a
// short read.
bool StarDict::readWordAtPosition(FsFile& idxFile, uint32_t& position, std::string& word, uint32_t& dictOffset,
                                  uint32_t& dictSize) {
  idxFile.seek(position);

  // Read null-terminated word
  word.clear();
  char c;
  while (idxFile.read(&c, 1) == 1) {
    if (c == '\0') break;
    word += c;
    if (word.length() > kMaxWordLength) {
      return false;
    }
  }

  if (word.empty()) return false;

  // Read the two 4-byte big-endian fields: offset, then size
  uint8_t buf[8];
  if (idxFile.read(buf, 8) != 8) return false;

  dictOffset = readBE32(buf);
  dictSize = readBE32(buf + 4);

  position = idxFile.position();
  return true;
}

// Extracts `size` bytes starting at uncompressed offset `offset` from the
// .dict.dz file into `definition`, inflating only the chunks that overlap
// that range.
// Returns false if the dictzip header is not loaded, the range lies outside
// the chunk table, the file cannot be opened/read, or memory is exhausted.
// A zero-length request succeeds with an empty definition.
bool StarDict::decompressDefinition(uint32_t offset, uint32_t size, std::string& definition) {
  if (!dzInfo.loaded || dzInfo.chunkLength == 0) return false;

  if (size == 0) {
    // Nothing to read; also avoids the underflow in "offset + size - 1".
    definition.clear();
    return true;
  }

  // Calculate which chunk(s) we need. 64-bit arithmetic so that
  // offset + size cannot wrap around.
  const uint32_t chunkLength = dzInfo.chunkLength;
  const uint64_t endByte = static_cast<uint64_t>(offset) + size;  // one past the last byte
  const uint32_t startChunk = offset / chunkLength;
  const uint64_t lastChunk = (endByte - 1) / chunkLength;
  const uint32_t startOffsetInChunk = offset % chunkLength;

  if (lastChunk >= dzInfo.chunkCount) return false;
  const uint32_t endChunk = static_cast<uint32_t>(lastChunk);
  const uint32_t endOffsetInChunk = static_cast<uint32_t>(endByte - static_cast<uint64_t>(endChunk) * chunkLength);

  const std::string dzPath = basePath + ".dict.dz";
  FsFile file;
  if (!SdMan.openFileForRead("DICT", dzPath, file)) {
    return false;
  }

  // Calculate file offset for start chunk
  uint32_t fileOffset = dzInfo.headerSize;
  for (uint32_t i = 0; i < startChunk; i++) {
    fileOffset += dzInfo.chunkSizes[i];
  }

  // Allocate buffers; they are released automatically on every return path.
  ByteBuffer compressedBuf(static_cast<uint8_t*>(malloc(kMaxCompressedChunkSize)));
  ByteBuffer decompressedBuf(static_cast<uint8_t*>(malloc(chunkLength)));
  InflatorPtr inflator(static_cast<tinfl_decompressor*>(malloc(sizeof(tinfl_decompressor))));
  if (!compressedBuf || !decompressedBuf || !inflator) {
    file.close();
    return false;
  }

  definition.clear();
  definition.reserve(size);

  // Process each needed chunk
  for (uint32_t chunk = startChunk; chunk <= endChunk; chunk++) {
    const uint16_t compressedSize = dzInfo.chunkSizes[chunk];

    // Seek and read compressed data
    file.seek(fileOffset);
    if (file.read(compressedBuf.get(), compressedSize) != compressedSize) {
      file.close();
      return false;
    }

    const size_t outBytes =
        inflateChunk(inflator.get(), compressedBuf.get(), compressedSize, decompressedBuf.get(), chunkLength);

    // Extract the portion we need from this chunk
    uint32_t copyStart = 0;
    uint32_t copyEnd = static_cast<uint32_t>(outBytes);

    if (chunk == startChunk) {
      copyStart = startOffsetInChunk;
    }
    if (chunk == endChunk && endOffsetInChunk < copyEnd) {
      copyEnd = endOffsetInChunk;
    }

    if (copyEnd > copyStart) {
      definition.append(reinterpret_cast<const char*>(decompressedBuf.get() + copyStart), copyEnd - copyStart);
    }

    fileOffset += compressedSize;
  }

  file.close();
  return true;
}

// StarDict comparison function: case-insensitive first, then case-sensitive as tiebreaker.
// Returns <0, 0 or >0; 0 only when the two strings are byte-for-byte equal.
int StarDict::stardictStrcmp(const std::string& a, const std::string& b) {
  // First: case-insensitive comparison (like g_ascii_strcasecmp)
  const int insensitive = compareIgnoreCase(a, b);
  if (insensitive != 0) return insensitive;

  // If case-insensitive equal, use case-sensitive as tiebreaker
  return a.compare(b);
}

// Returns `word` with leading/trailing whitespace removed and every byte
// passed through std::tolower.
std::string StarDict::normalizeWord(const std::string& word) {
  // Trim leading whitespace
  size_t start = 0;
  while (start < word.length() && std::isspace(static_cast<unsigned char>(word[start]))) {
    start++;
  }

  // Trim trailing whitespace
  size_t end = word.length();
  while (end > start && std::isspace(static_cast<unsigned char>(word[end - 1]))) {
    end--;
  }

  // Convert to lowercase
  std::string result;
  result.reserve(end - start);
  for (size_t i = start; i < end; i++) {
    result += static_cast<char>(std::tolower(static_cast<unsigned char>(word[i])));
  }
  return result;
}

// Looks `word` up in the main index (.idx) and, if nothing is found there and
// the dictionary declares synonyms, in the synonym file (.syn).
// Matching ignores letter case. All main-index entries that match are
// concatenated into one definition; result.word is the headword of the first
// match (or the matching synonym). If nothing matches, result.word is the
// input word and result.found keeps its default value.
StarDict::LookupResult StarDict::lookup(const std::string& word) {
  LookupResult result;
  result.word = word;

  if (!info.loaded) {
    return result;
  }

  const std::string normalizedSearch = normalizeWord(word);
  if (normalizedSearch.empty()) {
    return result;
  }

  // The prefix jump tables are only usable when the word starts with two
  // alphabetic characters.
  const bool hasPrefix = normalizedSearch.length() >= 2 && DictPrefixIndex::isAlpha(normalizedSearch[0]) &&
                         DictPrefixIndex::isAlpha(normalizedSearch[1]);
  const uint16_t prefixIdx = hasPrefix ? DictPrefixIndex::prefixToIndex(normalizedSearch[0], normalizedSearch[1]) : 0;

  // First try .idx (main entries) - use prefix jump table for fast lookup
  const std::string idxPath = basePath + ".idx";
  FsFile idxFile;
  if (!SdMan.openFileForRead("DICT", idxPath, idxFile)) {
    Serial.printf("[%lu] [DICT] Failed to open index file\n", millis());
    return result;
  }

  uint32_t position = hasPrefix ? DictPrefixIndex::dictPrefixOffsets[prefixIdx] : 0;

  bool found = false;
  while (position < info.idxfilesize) {
    std::string currentWord;
    uint32_t dictOffset = 0;
    uint32_t dictSize = 0;
    if (!readWordAtPosition(idxFile, position, currentWord, dictOffset, dictSize)) {
      break;
    }

    // The search key is lower-cased, so matching must ignore case.
    // stardictStrcmp() is not suitable here: its case-sensitive tiebreaker
    // returns non-zero for "london" vs "London", which made capitalised
    // headwords impossible to find.
    const int cmp = compareIgnoreCase(normalizedSearch, currentWord);
    if (cmp == 0) {
      std::string definition;
      if (decompressDefinition(dictOffset, dictSize, definition)) {
        if (!found) {
          result.word = currentWord;
          result.definition = definition;
          result.found = true;
          found = true;
        } else {
          // NOTE(review): the separator literal is empty in the source as
          // received; it may have been lost in extraction - confirm.
          result.definition += "" + definition;
        }
      }
      // Continue scanning for additional matches (same word, different case)
    } else if (cmp < 0) {
      // Passed where target would be (file is sorted)
      break;
    }
  }
  idxFile.close();

  // If not found in main index, try synonym file with prefix jump
  if (!found && info.synwordcount > 0) {
    const std::string synPath = basePath + ".syn";
    FsFile synFile;
    if (SdMan.openFileForRead("DICT", synPath, synFile)) {
      const uint32_t synFileSize = synFile.size();

      if (hasPrefix) {
        const uint32_t synPosition = DictPrefixIndex::synPrefixOffsets[prefixIdx];
        synFile.seek(synPosition);
      }

      while (synFile.position() < synFileSize) {
        // Read synonym word (null-terminated)
        std::string synWord;
        char c;
        while (synFile.read(&c, 1) == 1 && c != '\0') {
          synWord += c;
        }

        // Read 4-byte big-endian index of the main entry
        uint8_t idxBytes[4];
        if (synFile.read(idxBytes, 4) != 4) break;
        const uint32_t mainIdx = readBE32(idxBytes);

        const int cmp = compareIgnoreCase(normalizedSearch, synWord);
        if (cmp == 0) {
          // Found synonym - look up the main entry by index
          FsFile idxFile2;
          if (SdMan.openFileForRead("DICT", idxPath, idxFile2)) {
            // Walk the index from the start, skipping mainIdx entries
            uint32_t pos = 0;
            uint32_t entryNum = 0;
            while (entryNum < mainIdx && pos < info.idxfilesize) {
              std::string skippedWord;
              uint32_t skippedOffset = 0;
              uint32_t skippedSize = 0;
              if (!readWordAtPosition(idxFile2, pos, skippedWord, skippedOffset, skippedSize)) break;
              entryNum++;
            }

            // Now read the target entry
            if (entryNum == mainIdx) {
              std::string mainWord;
              uint32_t dictOffset = 0;
              uint32_t dictSize = 0;
              if (readWordAtPosition(idxFile2, pos, mainWord, dictOffset, dictSize)) {
                std::string definition;
                if (decompressDefinition(dictOffset, dictSize, definition)) {
                  result.word = synWord;
                  result.definition = definition;
                  result.found = true;
                  found = true;
                }
              }
            }
            idxFile2.close();
          }
          break;  // Found a match, stop searching
        } else if (cmp < 0) {
          // Passed where it would be (file is sorted)
          break;
        }
      }
      synFile.close();
    }
  }

  return result;
}

// Helper to decode a single HTML entity. On entry `i` is the position of the
// '&' in `html`. If an entity is recognised, the decoded text is returned and
// `i` is set to the position of the terminating ';' (the caller's loop then
// steps past it). Otherwise "&" is returned and `i` is left unchanged so the
// rest of the text is processed normally.
static std::string decodeHtmlEntity(const std::string& html, size_t& i) {
  const size_t start = i;  // Position of '&'
  const size_t remaining = html.length() - start;

  // Numeric entities: &#NNN; or &#xHHH;
  if (remaining > 2 && html[start + 1] == '#') {
    size_t numStart = start + 2;
    bool isHex = false;

    if (remaining > 3 && (html[numStart] == 'x' || html[numStart] == 'X')) {
      isHex = true;
      numStart++;
    }

    size_t numEnd = numStart;
    while (numEnd < html.length() && html[numEnd] != ';') {
      const unsigned char digit = static_cast<unsigned char>(html[numEnd]);
      const bool valid = isHex ? std::isxdigit(digit) != 0 : std::isdigit(digit) != 0;
      if (!valid) break;
      numEnd++;
    }

    if (numEnd > numStart && numEnd < html.length() && html[numEnd] == ';') {
      const std::string numStr = html.substr(numStart, numEnd - numStart);
      const unsigned long codepoint = std::strtoul(numStr.c_str(), nullptr, isHex ? 16 : 10);
      i = numEnd;  // Will be incremented by caller's loop

      // Convert codepoint to UTF-8. Values of 0x110000 and above produce an
      // empty string, i.e. the entity is dropped.
      std::string utf8;
      if (codepoint < 0x80) {
        utf8 += static_cast<char>(codepoint);
      } else if (codepoint < 0x800) {
        utf8 += static_cast<char>(0xC0 | (codepoint >> 6));
        utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
      } else if (codepoint < 0x10000) {
        utf8 += static_cast<char>(0xE0 | (codepoint >> 12));
        utf8 += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
        utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
      } else if (codepoint < 0x110000) {
        utf8 += static_cast<char>(0xF0 | (codepoint >> 18));
        utf8 += static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F));
        utf8 += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
        utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
      }
      return utf8;
    }
  }

  // Named entities - find the semicolon first
  const size_t semicolon = html.find(';', start + 1);
  if (semicolon != std::string::npos && semicolon - start < 12) {
    const std::string entity = html.substr(start, semicolon - start + 1);

    // Common named entities
    struct EntityMapping {
      const char* entity;
      const char* replacement;
    };
    // NOTE(review): the entity names in this table were decoded away when the
    // file was extracted; they are reconstructed here from the replacement
    // byte sequences. The last six (&shy; &ensp; &emsp; &thinsp; &zwj;
    // &zwnj;) cannot be determined uniquely from their replacements -
    // confirm against the original source.
    static const EntityMapping entities[] = {
        {"&nbsp;", " "},
        {"&lt;", "<"},
        {"&gt;", ">"},
        {"&amp;", "&"},
        {"&quot;", "\""},
        {"&apos;", "'"},
        {"&mdash;", "\xe2\x80\x94"},   // em dash
        {"&ndash;", "\xe2\x80\x93"},   // en dash
        {"&hellip;", "\xe2\x80\xa6"},  // ellipsis
        {"&rsquo;", "\xe2\x80\x99"},   // right single quote
        {"&lsquo;", "\xe2\x80\x98"},   // left single quote
        {"&rdquo;", "\xe2\x80\x9d"},   // right double quote
        {"&ldquo;", "\xe2\x80\x9c"},   // left double quote
        {"&deg;", "\xc2\xb0"},         // degree sign
        {"&times;", "\xc3\x97"},       // multiplication sign
        {"&divide;", "\xc3\xb7"},      // division sign
        {"&plusmn;", "\xc2\xb1"},      // plus-minus
        {"&frac12;", "\xc2\xbd"},      // one half
        {"&frac14;", "\xc2\xbc"},      // one quarter
        {"&frac34;", "\xc2\xbe"},      // three quarters
        {"&cent;", "\xc2\xa2"},        // cent
        {"&pound;", "\xc2\xa3"},       // pound
        {"&euro;", "\xe2\x82\xac"},    // euro
        {"&yen;", "\xc2\xa5"},         // yen
        {"&copy;", "\xc2\xa9"},        // copyright
        {"&reg;", "\xc2\xae"},         // registered
        {"&trade;", "\xe2\x84\xa2"},   // trademark
        {"&bull;", "\xe2\x80\xa2"},    // bullet
        {"&middot;", "\xc2\xb7"},      // middle dot
        {"&sect;", "\xc2\xa7"},        // section sign
        {"&para;", "\xc2\xb6"},        // pilcrow
        {"&dagger;", "\xe2\x80\xa0"},  // dagger
        {"&Dagger;", "\xe2\x80\xa1"},  // double dagger
        {"&iexcl;", "\xc2\xa1"},       // inverted exclamation mark
        {"&iquest;", "\xc2\xbf"},      // inverted question mark
        {"&laquo;", "\xc2\xab"},       // left guillemet
        {"&raquo;", "\xc2\xbb"},       // right guillemet
        {"&shy;", ""},
        {"&ensp;", " "},
        {"&emsp;", " "},
        {"&thinsp;", " "},
        {"&zwj;", ""},
        {"&zwnj;", ""},
    };

    for (const auto& mapping : entities) {
      if (entity == mapping.entity) {
        i = semicolon;  // Will be incremented by caller's loop
        return mapping.replacement;
      }
    }
  }

  // Unknown entity - return just the ampersand and let the rest be processed normally
  return "&";
}

// Helper to check if a tag is a block-level element that needs line breaks.
// The comparison ignores case; the second parameter (closing-tag flag) is
// accepted but not used.
static bool isBlockTag(const std::string& tag, bool /*isClosing*/) {
  // Normalize to lowercase for comparison
  std::string lowerTag = tag;
  for (char& c : lowerTag) {
    c = static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
  }

  // Block-level tags that should have line breaks
  static const char* const kBlockTags[] = {"p",  "div", "br", "hr", "li", "dt",         "dd",  "tr", "h1",
                                           "h2", "h3",  "h4", "h5", "h6", "blockquote", "pre", "ol", "ul"};
  for (const char* blockTag : kBlockTags) {
    if (lowerTag == blockTag) return true;
  }
  return false;
}

// Converts an HTML fragment to plain text: tags are removed, block-level tags
// become a single '\n', entities are decoded, runs of whitespace collapse to
// one space and trailing whitespace is trimmed.
std::string StarDict::stripHtml(const std::string& html) {
  std::string result;
  result.reserve(html.length());

  bool inTag = false;
  bool lastWasSpace = false;
  bool lastWasNewline = false;

  for (size_t i = 0; i < html.length(); i++) {
    const char c = html[i];

    if (c == '<') {
      // Parse the tag name
      size_t tagStart = i + 1;
      bool isClosing = false;

      // Skip whitespace after <
      while (tagStart < html.length() && std::isspace(static_cast<unsigned char>(html[tagStart]))) {
        tagStart++;
      }

      // Check for closing tag
      if (tagStart < html.length() && html[tagStart] == '/') {
        isClosing = true;
        tagStart++;
      }

      // Extract tag name
      size_t tagEnd = tagStart;
      while (tagEnd < html.length() && !std::isspace(static_cast<unsigned char>(html[tagEnd])) &&
             html[tagEnd] != '>' && html[tagEnd] != '/') {
        tagEnd++;
      }
      const std::string tagName = html.substr(tagStart, tagEnd - tagStart);

      // Add line break for block elements (never at the very start, never twice in a row)
      if (isBlockTag(tagName, isClosing)) {
        if (!result.empty() && !lastWasNewline) {
          result += '\n';
          lastWasNewline = true;
          lastWasSpace = true;
        }
      }

      inTag = true;
    } else if (c == '>') {
      inTag = false;
    } else if (!inTag) {
      // Handle HTML entities
      if (c == '&') {
        const std::string decoded = decodeHtmlEntity(html, i);
        if (!decoded.empty()) {
          // Check if decoded content is whitespace
          bool allSpace = true;
          for (const char dc : decoded) {
            if (!std::isspace(static_cast<unsigned char>(dc))) {
              allSpace = false;
              break;
            }
          }

          if (allSpace) {
            if (!lastWasSpace) {
              result += ' ';
              lastWasSpace = true;
            }
          } else {
            result += decoded;
            lastWasSpace = false;
            lastWasNewline = false;
          }
        }
        continue;
      }

      // Collapse whitespace
      if (std::isspace(static_cast<unsigned char>(c))) {
        if (!lastWasSpace) {
          result += ' ';
          lastWasSpace = true;
        }
      } else {
        result += c;
        lastWasSpace = false;
        lastWasNewline = false;
      }
    }
  }

  // Trim trailing whitespace
  while (!result.empty() && std::isspace(static_cast<unsigned char>(result.back()))) {
    result.pop_back();
  }

  return result;
}