cottongin 4db384edb6
All checks were successful
CI / build (push) Successful in 2m23s
fix: prevent Serial.printf from blocking when USB disconnected
On ESP32-C3 with USB CDC, Serial.printf() blocks indefinitely when USB
is not connected. This caused device freezes when booted without USB.

Solution: Call Serial.setTxTimeoutMs(0) after Serial.begin() to make
all Serial output non-blocking.

Also added if (Serial) guards to high-traffic logging paths in
EpubReaderActivity as belt-and-suspenders protection.

Includes documentation of the debugging process and Serial call inventory.

Also applies clang-format to fix pre-existing formatting issues.
2026-01-28 16:16:11 -05:00

764 lines
23 KiB
C++
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include "StarDict.h"
#include <HardwareSerial.h>
#include <SDCardManager.h>
#include <miniz.h>
#include <algorithm>
#include <cctype>
#include "DictPrefixIndex.generated.h"
// Stores the dictionary base path. Nothing is opened or read here; the loaders
// append the file extension they need to this path.
StarDict::StarDict(const std::string& basePath)
    : basePath(basePath) {
}
// Releases the malloc'd dictzip chunk-size table, if one was allocated.
StarDict::~StarDict() {
  // free() is a no-op on a null pointer, so no guard is needed.
  free(dzInfo.chunkSizes);
  dzInfo.chunkSizes = nullptr;
}
// Decodes the four bytes starting at `data` as an unsigned 32-bit big-endian value
// (data[0] is the most significant byte).
uint32_t StarDict::readBE32(const uint8_t* data) {
  uint32_t value = 0;
  for (int byteIdx = 0; byteIdx < 4; ++byteIdx) {
    value = (value << 8) | static_cast<uint32_t>(data[byteIdx]);
  }
  return value;
}
bool StarDict::loadInfo() {
const std::string ifoPath = basePath + ".ifo";
FsFile file;
if (!SdMan.openFileForRead("DICT", ifoPath, file)) {
Serial.printf("[%lu] [DICT] Failed to open .ifo file: %s\n", millis(), ifoPath.c_str());
return false;
}
char buffer[256];
while (file.available()) {
const int len = file.fgets(buffer, sizeof(buffer));
if (len <= 0) break;
// Remove newline
char* newline = strchr(buffer, '\n');
if (newline) *newline = '\0';
newline = strchr(buffer, '\r');
if (newline) *newline = '\0';
// Parse key=value
char* eq = strchr(buffer, '=');
if (!eq) continue;
*eq = '\0';
const char* key = buffer;
const char* value = eq + 1;
if (strcmp(key, "bookname") == 0) {
info.bookname = value;
} else if (strcmp(key, "wordcount") == 0) {
info.wordcount = strtoul(value, nullptr, 10);
} else if (strcmp(key, "idxfilesize") == 0) {
info.idxfilesize = strtoul(value, nullptr, 10);
} else if (strcmp(key, "sametypesequence") == 0) {
info.sametypesequence = value[0];
} else if (strcmp(key, "synwordcount") == 0) {
info.synwordcount = strtoul(value, nullptr, 10);
}
}
file.close();
info.loaded = true;
Serial.printf("[%lu] [DICT] Loaded dictionary: %s (%u words)\n", millis(), info.bookname.c_str(), info.wordcount);
return true;
}
bool StarDict::loadDictzipHeader() {
if (dzInfo.loaded) return true;
const std::string dzPath = basePath + ".dict.dz";
FsFile file;
if (!SdMan.openFileForRead("DICT", dzPath, file)) {
Serial.printf("[%lu] [DICT] Failed to open .dict.dz file\n", millis());
return false;
}
// Read gzip header
uint8_t header[10];
if (file.read(header, 10) != 10) {
file.close();
return false;
}
// Verify gzip magic number
if (header[0] != 0x1f || header[1] != 0x8b) {
Serial.printf("[%lu] [DICT] Not a valid gzip file\n", millis());
file.close();
return false;
}
// Check for extra field flag (bit 2)
const uint8_t flags = header[3];
if (!(flags & 0x04)) {
Serial.printf("[%lu] [DICT] No extra field - not a dictzip file\n", millis());
file.close();
return false;
}
// Read extra field length
uint8_t xlenBuf[2];
if (file.read(xlenBuf, 2) != 2) {
file.close();
return false;
}
const uint16_t xlen = xlenBuf[0] | (xlenBuf[1] << 8);
// Read extra field
auto* extraField = static_cast<uint8_t*>(malloc(xlen));
if (!extraField) {
file.close();
return false;
}
if (file.read(extraField, xlen) != xlen) {
free(extraField);
file.close();
return false;
}
// Parse dictzip subfield (SI1='R', SI2='A')
bool foundDictzip = false;
uint16_t pos = 0;
while (pos + 4 <= xlen) {
const uint8_t si1 = extraField[pos];
const uint8_t si2 = extraField[pos + 1];
const uint16_t slen = extraField[pos + 2] | (extraField[pos + 3] << 8);
if (si1 == 'R' && si2 == 'A' && pos + 4 + slen <= xlen) {
// Dictzip subfield found
// Format: ver(2) + chlen(2) + count(2) + sizes[count](2 each)
const uint8_t* data = &extraField[pos + 4];
// uint16_t version = data[0] | (data[1] << 8); // Usually 1
dzInfo.chunkLength = data[2] | (data[3] << 8);
dzInfo.chunkCount = data[4] | (data[5] << 8);
dzInfo.chunkSizes = static_cast<uint16_t*>(malloc(dzInfo.chunkCount * sizeof(uint16_t)));
if (!dzInfo.chunkSizes) {
free(extraField);
file.close();
return false;
}
for (uint16_t i = 0; i < dzInfo.chunkCount; i++) {
dzInfo.chunkSizes[i] = data[6 + i * 2] | (data[7 + i * 2] << 8);
}
foundDictzip = true;
break;
}
pos += 4 + slen;
}
free(extraField);
if (!foundDictzip) {
Serial.printf("[%lu] [DICT] Dictzip subfield not found\n", millis());
file.close();
return false;
}
// Calculate header size (10 + 2 + xlen + optional fields)
dzInfo.headerSize = 10 + 2 + xlen;
// Skip FNAME if present (bit 3)
if (flags & 0x08) {
file.seek(dzInfo.headerSize);
while (file.available()) {
uint8_t c;
file.read(&c, 1);
dzInfo.headerSize++;
if (c == 0) break;
}
}
// Skip FCOMMENT if present (bit 4)
if (flags & 0x10) {
file.seek(dzInfo.headerSize);
while (file.available()) {
uint8_t c;
file.read(&c, 1);
dzInfo.headerSize++;
if (c == 0) break;
}
}
// Skip FHCRC if present (bit 1)
if (flags & 0x02) {
dzInfo.headerSize += 2;
}
file.close();
dzInfo.loaded = true;
Serial.printf("[%lu] [DICT] Dictzip: %u chunks of %u bytes, header size %u\n", millis(), dzInfo.chunkCount,
dzInfo.chunkLength, dzInfo.headerSize);
return true;
}
// Loads the .ifo metadata and then the dictzip header. The second step is not
// attempted if the first fails; returns true only when both succeed.
bool StarDict::begin() {
  return loadInfo() && loadDictzipHeader();
}
// Reads one index record starting at byte `position` of `idxFile`.
//
// A record is a zero-terminated word followed by two big-endian 32-bit values
// (offset, then size). On success `word`, `dictOffset` and `dictSize` are filled and
// `position` is advanced to the byte after the record. Returns false - leaving
// `position`, `dictOffset` and `dictSize` untouched - when the word is empty, is longer
// than 256 characters, or the 8 trailing bytes cannot be read.
bool StarDict::readWordAtPosition(FsFile& idxFile, uint32_t& position, std::string& word, uint32_t& dictOffset,
                                  uint32_t& dictSize) {
  idxFile.seek(position);

  // Collect characters up to the terminating zero (or until the read fails).
  word.clear();
  for (;;) {
    char ch;
    if (idxFile.read(&ch, 1) != 1 || ch == '\0') {
      break;
    }
    word += ch;
    if (word.length() > 256) {
      return false;  // safety limit against runaway/corrupt entries
    }
  }
  if (word.empty()) {
    return false;
  }

  // Trailing fields: offset (4 bytes) + size (4 bytes), both big-endian.
  uint8_t fields[8];
  if (idxFile.read(fields, 8) != 8) {
    return false;
  }
  dictOffset = readBE32(fields);
  dictSize = readBE32(fields + 4);
  position = idxFile.position();
  return true;
}
// Extracts `size` bytes starting at uncompressed `offset` from "<basePath>.dict.dz"
// into `definition`.
//
// The chunk table in `dzInfo` is used to locate the compressed chunk(s) covering
// [offset, offset + size); each one is inflated into a scratch buffer and the
// relevant slice is appended to `definition`.
//
// Returns false if the dictzip header is not loaded (or reports a zero chunk length),
// the file cannot be opened, the range ends beyond the last chunk, a buffer cannot be
// allocated, or a chunk cannot be read in full. Returns true otherwise.
bool StarDict::decompressDefinition(uint32_t offset, uint32_t size, std::string& definition) {
  // chunkLength is used as a divisor below, so it must be non-zero.
  if (!dzInfo.loaded || dzInfo.chunkLength == 0) return false;

  const std::string dzPath = basePath + ".dict.dz";
  FsFile file;
  if (!SdMan.openFileForRead("DICT", dzPath, file)) {
    return false;
  }

  // Calculate which chunk(s) we need
  const uint32_t startChunk = offset / dzInfo.chunkLength;
  const uint32_t endChunk = (offset + size - 1) / dzInfo.chunkLength;
  const uint32_t startOffsetInChunk = offset % dzInfo.chunkLength;
  if (endChunk >= dzInfo.chunkCount) {
    file.close();
    return false;
  }

  // File offset of the first needed chunk = header + all preceding compressed chunks.
  uint32_t fileOffset = dzInfo.headerSize;
  for (uint32_t i = 0; i < startChunk; i++) {
    fileOffset += dzInfo.chunkSizes[i];
  }

  // Scratch buffers plus ONE inflator reused for every pass. Allocating it up front
  // means an out-of-memory condition is reported as a failure instead of silently
  // skipping the fallback pass, and avoids repeated malloc/free of the decompressor.
  const uint32_t maxCompressedSize = 65536;  // chunk sizes are 16-bit, so this always suffices
  auto* compressedBuf = static_cast<uint8_t*>(malloc(maxCompressedSize));
  auto* decompressedBuf = static_cast<uint8_t*>(malloc(dzInfo.chunkLength));
  auto* inflator = static_cast<tinfl_decompressor*>(malloc(sizeof(tinfl_decompressor)));

  // Single cleanup path for every exit below (free(nullptr) is a no-op).
  const auto releaseAll = [&]() {
    free(inflator);
    free(compressedBuf);
    free(decompressedBuf);
    file.close();
  };

  if (!compressedBuf || !decompressedBuf || !inflator) {
    releaseAll();
    return false;
  }

  definition.clear();
  definition.reserve(size);

  // Process each needed chunk
  for (uint32_t chunk = startChunk; chunk <= endChunk; chunk++) {
    const uint16_t compressedSize = dzInfo.chunkSizes[chunk];

    // Seek and read compressed data
    file.seek(fileOffset);
    if (file.read(compressedBuf, compressedSize) != compressedSize) {
      releaseAll();
      return false;
    }

    // First pass: try the data as a zlib-wrapped stream.
    tinfl_init(inflator);
    size_t inBytes = compressedSize;
    size_t outBytes = dzInfo.chunkLength;
    const tinfl_status status =
        tinfl_decompress(inflator, compressedBuf, &inBytes, decompressedBuf, decompressedBuf, &outBytes,
                         TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF | TINFL_FLAG_PARSE_ZLIB_HEADER);

    if (status != TINFL_STATUS_DONE && status != TINFL_STATUS_HAS_MORE_OUTPUT) {
      // Second pass: raw deflate (no zlib header).
      // NOTE(review): the status of this pass is intentionally not checked, matching the
      // previous behaviour - whatever `outBytes` reports is used. Presumably dictzip
      // chunks end on a flush boundary rather than a final deflate block, so a
      // non-DONE status can still carry a complete chunk; confirm before tightening.
      tinfl_init(inflator);
      inBytes = compressedSize;
      outBytes = dzInfo.chunkLength;
      tinfl_decompress(inflator, compressedBuf, &inBytes, decompressedBuf, decompressedBuf, &outBytes,
                       TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF);
    }

    // Extract the portion we need from this chunk
    uint32_t copyStart = 0;
    uint32_t copyEnd = outBytes;
    if (chunk == startChunk) {
      copyStart = startOffsetInChunk;
    }
    if (chunk == endChunk) {
      const uint32_t endOffsetInChunk = (offset + size) - (endChunk * dzInfo.chunkLength);
      if (endOffsetInChunk < copyEnd) {
        copyEnd = endOffsetInChunk;
      }
    }
    if (copyEnd > copyStart) {
      definition.append(reinterpret_cast<char*>(decompressedBuf + copyStart), copyEnd - copyStart);
    }

    fileOffset += compressedSize;
  }

  releaseAll();
  return true;
}
// StarDict comparison function.
// Compares case-insensitively first (like g_ascii_strcasecmp); a shorter string that
// is a prefix of the other sorts first. Only when the two are equal ignoring case is
// the case-sensitive std::string::compare used as the tiebreaker.
// Returns <0, 0 or >0 in the usual strcmp sense.
int StarDict::stardictStrcmp(const std::string& a, const std::string& b) {
  const auto fold = [](char ch) -> int { return std::tolower(static_cast<unsigned char>(ch)); };

  const size_t shared = std::min(a.length(), b.length());
  for (size_t idx = 0; idx < shared; ++idx) {
    const int delta = fold(a[idx]) - fold(b[idx]);
    if (delta != 0) {
      return delta;
    }
  }

  if (a.length() != b.length()) {
    return static_cast<int>(a.length()) - static_cast<int>(b.length());
  }

  // Equal ignoring case: fall back to the exact comparison.
  return a.compare(b);
}
// Returns `word` with leading and trailing whitespace (per std::isspace) removed and
// every remaining byte passed through std::tolower. An all-whitespace or empty input
// yields an empty string.
std::string StarDict::normalizeWord(const std::string& word) {
  const auto isBlank = [](char ch) -> bool { return std::isspace(static_cast<unsigned char>(ch)) != 0; };

  // First non-blank from the front, then one past the last non-blank from the back
  // (the backward search never crosses `first`).
  const auto first = std::find_if_not(word.begin(), word.end(), isBlank);
  const auto last = std::find_if_not(word.rbegin(), std::string::const_reverse_iterator(first), isBlank).base();

  std::string result(first, last);
  std::transform(result.begin(), result.end(), result.begin(),
                 [](char ch) -> char { return static_cast<char>(std::tolower(static_cast<unsigned char>(ch))); });
  return result;
}
// Looks up `word` and returns its definition, if any.
//
// The search key is `word` normalised with normalizeWord() (trimmed, lower-cased).
// Headwords are matched case-insensitively, so a query for "london" finds an entry
// stored as "London". (Previously a match required stardictStrcmp() == 0, whose
// case-sensitive tiebreaker made any headword containing capitals unreachable from the
// lower-cased key.)
//
// 1. Main index (.idx): scanning starts at the prefix jump-table offset when the key
//    begins with two alphabetic characters, otherwise at 0. Every matching entry is
//    decompressed; additional matches are appended to the first one, separated by
//    "</html>". The scan stops once the sorted index has moved past the key.
// 2. Synonyms (.syn): consulted only when step 1 found nothing and the dictionary
//    declares synonyms. The first matching synonym's 32-bit entry number is resolved
//    by walking the .idx from the beginning.
//
// The returned result has `word` set to the caller's input unless a match was found,
// in which case it holds the matched headword (or synonym) as stored in the dictionary.
StarDict::LookupResult StarDict::lookup(const std::string& word) {
  LookupResult result;
  result.word = word;
  if (!info.loaded) {
    return result;
  }

  const std::string normalizedSearch = normalizeWord(word);
  if (normalizedSearch.empty()) {
    return result;
  }

  // True when `candidate` equals the search key ignoring case.
  const auto matchesSearch = [&normalizedSearch](const std::string& candidate) -> bool {
    if (candidate.length() != normalizedSearch.length()) return false;
    for (size_t i = 0; i < candidate.length(); i++) {
      if (std::tolower(static_cast<unsigned char>(candidate[i])) !=
          std::tolower(static_cast<unsigned char>(normalizedSearch[i]))) {
        return false;
      }
    }
    return true;
  };

  // The prefix jump tables only cover keys starting with two alphabetic characters.
  const bool usePrefixIndex = normalizedSearch.length() >= 2 && DictPrefixIndex::isAlpha(normalizedSearch[0]) &&
                              DictPrefixIndex::isAlpha(normalizedSearch[1]);

  // First try .idx (main entries) - use prefix jump table for fast lookup
  const std::string idxPath = basePath + ".idx";
  FsFile idxFile;
  if (!SdMan.openFileForRead("DICT", idxPath, idxFile)) {
    Serial.printf("[%lu] [DICT] Failed to open index file\n", millis());
    return result;
  }

  uint32_t position = 0;
  if (usePrefixIndex) {
    const uint16_t prefixIdx = DictPrefixIndex::prefixToIndex(normalizedSearch[0], normalizedSearch[1]);
    position = DictPrefixIndex::dictPrefixOffsets[prefixIdx];
  }

  bool found = false;
  while (position < info.idxfilesize) {
    std::string currentWord;
    uint32_t dictOffset, dictSize;
    if (!readWordAtPosition(idxFile, position, currentWord, dictOffset, dictSize)) {
      break;
    }

    if (matchesSearch(currentWord)) {
      std::string definition;
      if (decompressDefinition(dictOffset, dictSize, definition)) {
        if (!found) {
          result.word = currentWord;
          result.definition = definition;
          result.found = true;
          found = true;
        } else {
          result.definition += "</html>" + definition;
        }
      }
      // Continue scanning for additional matches (same word, different case)
    } else if (stardictStrcmp(normalizedSearch, currentWord) < 0) {
      // Passed where target would be (file is sorted). The all-lower-case key orders
      // at or after every case variant of itself, so no variant is skipped by this exit.
      break;
    }
  }
  idxFile.close();

  // If not found in main index, try synonym file with prefix jump
  if (!found && info.synwordcount > 0) {
    const std::string synPath = basePath + ".syn";
    FsFile synFile;
    if (SdMan.openFileForRead("DICT", synPath, synFile)) {
      const uint32_t synFileSize = synFile.size();

      if (usePrefixIndex) {
        const uint16_t prefixIdx = DictPrefixIndex::prefixToIndex(normalizedSearch[0], normalizedSearch[1]);
        synFile.seek(DictPrefixIndex::synPrefixOffsets[prefixIdx]);
      }

      while (synFile.position() < synFileSize) {
        // Read synonym word (null-terminated)
        std::string synWord;
        char c;
        while (synFile.read(&c, 1) == 1 && c != '\0') {
          synWord += c;
        }

        // Read 4-byte big-endian entry number into the main index
        uint8_t idxBytes[4];
        if (synFile.read(idxBytes, 4) != 4) break;
        const uint32_t mainIdx = readBE32(idxBytes);

        if (matchesSearch(synWord)) {
          // Found synonym - walk the main index to the entry with that number
          FsFile idxFile2;
          if (SdMan.openFileForRead("DICT", idxPath, idxFile2)) {
            uint32_t pos = 0;
            uint32_t entryNum = 0;
            while (entryNum < mainIdx && pos < info.idxfilesize) {
              std::string w;
              uint32_t off, sz;
              if (!readWordAtPosition(idxFile2, pos, w, off, sz)) break;
              entryNum++;
            }

            // Now read the target entry
            if (entryNum == mainIdx) {
              std::string mainWord;
              uint32_t dictOffset, dictSize;
              if (readWordAtPosition(idxFile2, pos, mainWord, dictOffset, dictSize)) {
                std::string definition;
                if (decompressDefinition(dictOffset, dictSize, definition)) {
                  result.word = synWord;
                  result.definition = definition;
                  result.found = true;
                  found = true;
                }
              }
            }
            idxFile2.close();
          }
          break;  // Found a match, stop searching
        } else if (stardictStrcmp(normalizedSearch, synWord) < 0) {
          // Passed where it would be (file is sorted)
          break;
        }
      }
      synFile.close();
    }
  }

  return result;
}
// Decodes a single HTML entity.
//
// On entry `i` is the index of the '&' that starts the entity in `html`.
//  - Numeric references (&#NNN; / &#xHHH;) are converted to UTF-8. References that do
//    not name a Unicode scalar value (0, the surrogate range U+D800..U+DFFF, or anything
//    above U+10FFFF) are consumed and decode to an empty string, so no NUL byte or
//    ill-formed UTF-8 is ever produced.
//  - Named references are looked up in a fixed table; the whole entity, ';' included,
//    must be shorter than 13 characters.
// On success `i` is moved to the index of the terminating ';' (the caller's loop
// increment then steps past it) and the replacement text is returned.
// If nothing matches, `i` is left unchanged and "&" is returned so that the rest of the
// text is processed as ordinary characters.
static std::string decodeHtmlEntity(const std::string& html, size_t& i) {
  const size_t start = i;  // Position of '&'
  const size_t remaining = html.length() - start;

  // Numeric entities: &#NNN; or &#xHHH;
  if (remaining > 2 && html[start + 1] == '#') {
    size_t numStart = start + 2;
    bool isHex = false;
    if (remaining > 3 && (html[numStart] == 'x' || html[numStart] == 'X')) {
      isHex = true;
      numStart++;
    }

    // Consume digits of the selected radix up to (not including) the ';'.
    size_t numEnd = numStart;
    while (numEnd < html.length() && html[numEnd] != ';') {
      const unsigned char c = static_cast<unsigned char>(html[numEnd]);
      if (isHex ? !std::isxdigit(c) : !std::isdigit(c)) break;
      numEnd++;
    }

    // Only accept a non-empty digit run that is immediately followed by ';'.
    if (numEnd > numStart && numEnd < html.length() && html[numEnd] == ';') {
      const std::string numStr = html.substr(numStart, numEnd - numStart);
      const unsigned long codepoint = std::strtoul(numStr.c_str(), nullptr, isHex ? 16 : 10);
      i = numEnd;  // Will be incremented by caller's loop

      std::string utf8;
      const bool isSurrogate = codepoint >= 0xD800 && codepoint <= 0xDFFF;
      if (codepoint == 0 || isSurrogate || codepoint > 0x10FFFF) {
        return utf8;  // not a Unicode scalar value: drop the reference
      }

      // Convert codepoint to UTF-8
      if (codepoint < 0x80) {
        utf8 += static_cast<char>(codepoint);
      } else if (codepoint < 0x800) {
        utf8 += static_cast<char>(0xC0 | (codepoint >> 6));
        utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
      } else if (codepoint < 0x10000) {
        utf8 += static_cast<char>(0xE0 | (codepoint >> 12));
        utf8 += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
        utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
      } else {
        utf8 += static_cast<char>(0xF0 | (codepoint >> 18));
        utf8 += static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F));
        utf8 += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
        utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
      }
      return utf8;
    }
  }

  // Named entities - find the semicolon first
  const size_t semicolon = html.find(';', start + 1);
  if (semicolon != std::string::npos && semicolon - start < 12) {
    const std::string entity = html.substr(start, semicolon - start + 1);

    // Common named entities and their UTF-8 replacements
    struct EntityMapping {
      const char* entity;
      const char* replacement;
    };
    static const EntityMapping entities[] = {
        {"&nbsp;", " "},
        {"&lt;", "<"},
        {"&gt;", ">"},
        {"&amp;", "&"},
        {"&quot;", "\""},
        {"&apos;", "'"},
        {"&mdash;", "\xe2\x80\x94"},   // U+2014 em dash
        {"&ndash;", "\xe2\x80\x93"},   // U+2013 en dash
        {"&hellip;", "\xe2\x80\xa6"},  // U+2026 horizontal ellipsis
        {"&rsquo;", "\xe2\x80\x99"},   // U+2019 right single quotation mark
        {"&lsquo;", "\xe2\x80\x98"},   // U+2018 left single quotation mark
        {"&rdquo;", "\xe2\x80\x9d"},   // U+201D right double quotation mark
        {"&ldquo;", "\xe2\x80\x9c"},   // U+201C left double quotation mark
        {"&deg;", "\xc2\xb0"},         // U+00B0 degree sign
        {"&times;", "\xc3\x97"},       // U+00D7 multiplication sign
        {"&divide;", "\xc3\xb7"},      // U+00F7 division sign
        {"&plusmn;", "\xc2\xb1"},      // U+00B1 plus-minus sign
        {"&frac12;", "\xc2\xbd"},      // U+00BD one half
        {"&frac14;", "\xc2\xbc"},      // U+00BC one quarter
        {"&frac34;", "\xc2\xbe"},      // U+00BE three quarters
        {"&cent;", "\xc2\xa2"},        // U+00A2 cent sign
        {"&pound;", "\xc2\xa3"},       // U+00A3 pound sign
        {"&euro;", "\xe2\x82\xac"},    // U+20AC euro sign
        {"&yen;", "\xc2\xa5"},         // U+00A5 yen sign
        {"&copy;", "\xc2\xa9"},        // U+00A9 copyright sign
        {"&reg;", "\xc2\xae"},         // U+00AE registered sign
        {"&trade;", "\xe2\x84\xa2"},   // U+2122 trade mark sign
        {"&bull;", "\xe2\x80\xa2"},    // U+2022 bullet
        {"&middot;", "\xc2\xb7"},      // U+00B7 middle dot
        {"&sect;", "\xc2\xa7"},        // U+00A7 section sign
        {"&para;", "\xc2\xb6"},        // U+00B6 pilcrow sign
        {"&dagger;", "\xe2\x80\xa0"},  // U+2020 dagger
        {"&Dagger;", "\xe2\x80\xa1"},  // U+2021 double dagger
        {"&iexcl;", "\xc2\xa1"},       // U+00A1 inverted exclamation mark
        {"&iquest;", "\xc2\xbf"},      // U+00BF inverted question mark
        {"&laquo;", "\xc2\xab"},       // U+00AB left-pointing double angle quotation mark
        {"&raquo;", "\xc2\xbb"},       // U+00BB right-pointing double angle quotation mark
        {"&shy;", ""},                 // soft hyphen: dropped
        {"&ensp;", " "},               // width-specific spaces collapse to a plain space
        {"&emsp;", " "},
        {"&thinsp;", " "},
        {"&zwj;", ""},   // zero-width joiner: dropped
        {"&zwnj;", ""},  // zero-width non-joiner: dropped
    };

    for (const auto& mapping : entities) {
      if (entity == mapping.entity) {
        i = semicolon;  // Will be incremented by caller's loop
        return mapping.replacement;
      }
    }
  }

  // Unknown entity - return just the ampersand and let the rest be processed normally
  return "&";
}
// Helper to check if a tag is a block-level element that needs line breaks.
// `tag` is the bare element name (no angle brackets or slash); the comparison ignores
// case. `isClosing` is accepted for the caller's convenience but does not affect the
// result.
static bool isBlockTag(const std::string& tag, bool isClosing) {
  (void)isClosing;  // opening and closing tags are treated alike

  // Element names that should produce a line break.
  static const char* const kBlockTags[] = {"p",  "div", "br", "hr", "li", "dt",         "dd",  "tr", "h1",
                                           "h2", "h3",  "h4", "h5", "h6", "blockquote", "pre", "ol", "ul"};

  // Lower-case copy of the tag name for the comparison.
  std::string lowered(tag.size(), '\0');
  std::transform(tag.begin(), tag.end(), lowered.begin(),
                 [](char ch) -> char { return static_cast<char>(std::tolower(static_cast<unsigned char>(ch))); });

  for (const char* name : kBlockTags) {
    if (lowered == name) {
      return true;
    }
  }
  return false;
}
// Converts an HTML fragment to plain text.
//
//  - Everything between '<' and the next '>' is dropped; a '>' is never emitted.
//  - A block-level tag (see isBlockTag) inserts a single '\n', unless the output is
//    still empty or the previous emitted character already was such a newline.
//  - Entities are decoded with decodeHtmlEntity(); a replacement consisting only of
//    whitespace is treated like ordinary whitespace.
//  - Runs of whitespace collapse to one ' ' (none is added right after a newline).
//  - Trailing whitespace is removed from the result.
std::string StarDict::stripHtml(const std::string& html) {
  const size_t total = html.length();

  std::string text;
  text.reserve(total);

  bool insideTag = false;
  bool prevWasSpace = false;
  bool prevWasNewline = false;

  const auto isBlank = [](char ch) -> bool { return std::isspace(static_cast<unsigned char>(ch)) != 0; };

  // Emits one collapsed space unless the previous output already counts as whitespace.
  const auto emitSpace = [&]() {
    if (!prevWasSpace) {
      text += ' ';
      prevWasSpace = true;
    }
  };

  for (size_t i = 0; i < total; i++) {
    const char ch = html[i];

    if (ch == '<') {
      // Locate the tag name: skip blanks after '<', then an optional '/'.
      size_t nameBegin = i + 1;
      while (nameBegin < total && isBlank(html[nameBegin])) {
        nameBegin++;
      }
      const bool closing = nameBegin < total && html[nameBegin] == '/';
      if (closing) {
        nameBegin++;
      }

      // The name runs until whitespace, '>' or '/'.
      size_t nameEnd = nameBegin;
      while (nameEnd < total && !isBlank(html[nameEnd]) && html[nameEnd] != '>' && html[nameEnd] != '/') {
        nameEnd++;
      }

      // Block elements start a new line (at most one in a row, never at the very start).
      if (isBlockTag(html.substr(nameBegin, nameEnd - nameBegin), closing) && !text.empty() && !prevWasNewline) {
        text += '\n';
        prevWasNewline = true;
        prevWasSpace = true;
      }

      insideTag = true;
      continue;
    }

    if (ch == '>') {
      insideTag = false;
      continue;
    }

    if (insideTag) {
      continue;  // attribute text etc. is discarded
    }

    if (ch == '&') {
      // decodeHtmlEntity moves `i` onto the closing ';' when it recognises the entity.
      const std::string decoded = decodeHtmlEntity(html, i);
      if (decoded.empty()) {
        continue;
      }
      if (std::all_of(decoded.begin(), decoded.end(), isBlank)) {
        emitSpace();
      } else {
        text += decoded;
        prevWasSpace = false;
        prevWasNewline = false;
      }
      continue;
    }

    if (isBlank(ch)) {
      emitSpace();
    } else {
      text += ch;
      prevWasSpace = false;
      prevWasNewline = false;
    }
  }

  // Trim trailing whitespace
  while (!text.empty() && isBlank(text.back())) {
    text.pop_back();
  }
  return text;
}