sort of working dictionary
@@ -31,6 +31,9 @@ class PageLine final : public PageElement {
|
||||
void render(GfxRenderer& renderer, int fontId, int xOffset, int yOffset) override;
|
||||
bool serialize(FsFile& file) override;
|
||||
static std::unique_ptr<PageLine> deserialize(FsFile& file);
|
||||
|
||||
// Getter for word selection support
|
||||
const std::shared_ptr<TextBlock>& getTextBlock() const { return block; }
|
||||
};
|
||||
|
||||
class Page {
|
||||
|
||||
@@ -48,6 +48,12 @@ class TextBlock final : public Block {
|
||||
Style getStyle() const { return style; }
|
||||
const BlockStyle& getBlockStyle() const { return blockStyle; }
|
||||
bool isEmpty() override { return words.empty(); }
|
||||
|
||||
// Getters for word selection support
|
||||
const std::list<std::string>& getWords() const { return words; }
|
||||
const std::list<uint16_t>& getWordXPositions() const { return wordXpos; }
|
||||
const std::list<EpdFontFamily::Style>& getWordStyles() const { return wordStyles; }
|
||||
size_t getWordCount() const { return words.size(); }
|
||||
void layout(GfxRenderer& renderer) override {};
|
||||
// Given a renderer, works out where to break the words into lines
|
||||
void render(const GfxRenderer& renderer, int fontId, int x, int y) const;
|
||||
|
||||
@@ -510,7 +510,10 @@ void GfxRenderer::drawButtonHints(const int fontId, const char* btn1, const char
|
||||
setOrientation(orig_orientation);
|
||||
}
|
||||
|
||||
void GfxRenderer::drawSideButtonHints(const int fontId, const char* topBtn, const char* bottomBtn) const {
|
||||
void GfxRenderer::drawSideButtonHints(const int fontId, const char* topBtn, const char* bottomBtn) {
|
||||
const Orientation orig_orientation = getOrientation();
|
||||
setOrientation(Orientation::Portrait);
|
||||
|
||||
const int screenWidth = getScreenWidth();
|
||||
constexpr int buttonWidth = 40; // Width on screen (height when rotated)
|
||||
constexpr int buttonHeight = 80; // Height on screen (width when rotated)
|
||||
@@ -559,6 +562,8 @@ void GfxRenderer::drawSideButtonHints(const int fontId, const char* topBtn, cons
|
||||
drawTextRotated90CW(fontId, textX, textY, labels[i]);
|
||||
}
|
||||
}
|
||||
|
||||
setOrientation(orig_orientation);
|
||||
}
|
||||
|
||||
int GfxRenderer::getTextHeight(const int fontId) const {
|
||||
@@ -862,3 +867,4 @@ void GfxRenderer::getOrientedViewableTRBL(int* outTop, int* outRight, int* outBo
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -86,7 +86,7 @@ class GfxRenderer {
|
||||
|
||||
// UI Components
|
||||
void drawButtonHints(int fontId, const char* btn1, const char* btn2, const char* btn3, const char* btn4);
|
||||
void drawSideButtonHints(int fontId, const char* topBtn, const char* bottomBtn) const;
|
||||
void drawSideButtonHints(int fontId, const char* topBtn, const char* bottomBtn);
|
||||
|
||||
private:
|
||||
// Helper for drawing rotated text (90 degrees clockwise, for side buttons)
|
||||
|
||||
370
lib/StarDict/DictHtmlParser.cpp
Normal file
@@ -0,0 +1,370 @@
|
||||
#include "DictHtmlParser.h"
|
||||
|
||||
#include <Epub/ParsedText.h>
|
||||
#include <GfxRenderer.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cctype>
|
||||
#include <stack>
|
||||
|
||||
std::string DictHtmlParser::decodeEntity(const std::string& html, size_t& i) {
|
||||
const size_t start = i; // Position of '&'
|
||||
const size_t remaining = html.length() - start;
|
||||
|
||||
// Numeric entities: &#NNN; or &#xHHH;
|
||||
if (remaining > 2 && html[start + 1] == '#') {
|
||||
size_t numStart = start + 2;
|
||||
bool isHex = false;
|
||||
if (remaining > 3 && (html[numStart] == 'x' || html[numStart] == 'X')) {
|
||||
isHex = true;
|
||||
numStart++;
|
||||
}
|
||||
|
||||
size_t numEnd = numStart;
|
||||
while (numEnd < html.length() && html[numEnd] != ';') {
|
||||
const char c = html[numEnd];
|
||||
if (isHex) {
|
||||
if (!std::isxdigit(static_cast<unsigned char>(c))) break;
|
||||
} else {
|
||||
if (!std::isdigit(static_cast<unsigned char>(c))) break;
|
||||
}
|
||||
numEnd++;
|
||||
}
|
||||
|
||||
if (numEnd > numStart && numEnd < html.length() && html[numEnd] == ';') {
|
||||
const std::string numStr = html.substr(numStart, numEnd - numStart);
|
||||
unsigned long codepoint = std::strtoul(numStr.c_str(), nullptr, isHex ? 16 : 10);
|
||||
i = numEnd; // Will be incremented by caller's loop
|
||||
|
||||
// Convert codepoint to UTF-8
|
||||
std::string utf8;
|
||||
if (codepoint < 0x80) {
|
||||
utf8 += static_cast<char>(codepoint);
|
||||
} else if (codepoint < 0x800) {
|
||||
utf8 += static_cast<char>(0xC0 | (codepoint >> 6));
|
||||
utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
|
||||
} else if (codepoint < 0x10000) {
|
||||
utf8 += static_cast<char>(0xE0 | (codepoint >> 12));
|
||||
utf8 += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
|
||||
utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
|
||||
} else if (codepoint < 0x110000) {
|
||||
utf8 += static_cast<char>(0xF0 | (codepoint >> 18));
|
||||
utf8 += static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F));
|
||||
utf8 += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
|
||||
utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
|
||||
}
|
||||
return utf8;
|
||||
}
|
||||
}
|
||||
|
||||
// Named entities - find the semicolon first
|
||||
size_t semicolon = html.find(';', start + 1);
|
||||
if (semicolon != std::string::npos && semicolon - start < 12) {
|
||||
const std::string entity = html.substr(start, semicolon - start + 1);
|
||||
|
||||
// Common named entities
|
||||
struct EntityMapping {
|
||||
const char* entity;
|
||||
const char* replacement;
|
||||
};
|
||||
static const EntityMapping entities[] = {
|
||||
{" ", " "},
|
||||
{"<", "<"},
|
||||
{">", ">"},
|
||||
{"&", "&"},
|
||||
{""", "\""},
|
||||
{"'", "'"},
|
||||
{"—", "\xe2\x80\x94"}, // —
|
||||
{"–", "\xe2\x80\x93"}, // –
|
||||
{"…", "\xe2\x80\xa6"}, // …
|
||||
{"’", "\xe2\x80\x99"}, // '
|
||||
{"‘", "\xe2\x80\x98"}, // '
|
||||
{"”", "\xe2\x80\x9d"}, // "
|
||||
{"“", "\xe2\x80\x9c"}, // "
|
||||
{"°", "\xc2\xb0"}, // °
|
||||
{"×", "\xc3\x97"}, // ×
|
||||
{"÷", "\xc3\xb7"}, // ÷
|
||||
{"±", "\xc2\xb1"}, // ±
|
||||
{"½", "\xc2\xbd"}, // ½
|
||||
{"¼", "\xc2\xbc"}, // ¼
|
||||
{"¾", "\xc2\xbe"}, // ¾
|
||||
{"¢", "\xc2\xa2"}, // ¢
|
||||
{"£", "\xc2\xa3"}, // £
|
||||
{"€", "\xe2\x82\xac"}, // €
|
||||
{"¥", "\xc2\xa5"}, // ¥
|
||||
{"©", "\xc2\xa9"}, // ©
|
||||
{"®", "\xc2\xae"}, // ®
|
||||
{"™", "\xe2\x84\xa2"}, // ™
|
||||
{"•", "\xe2\x80\xa2"}, // •
|
||||
{"·", "\xc2\xb7"}, // ·
|
||||
{"§", "\xc2\xa7"}, // §
|
||||
{"¶", "\xc2\xb6"}, // ¶
|
||||
{"†", "\xe2\x80\xa0"}, // †
|
||||
{"‡", "\xe2\x80\xa1"}, // ‡
|
||||
{"¡", "\xc2\xa1"}, // ¡
|
||||
{"¿", "\xc2\xbf"}, // ¿
|
||||
{"«", "\xc2\xab"}, // «
|
||||
{"»", "\xc2\xbb"}, // »
|
||||
{"‎", ""}, // Left-to-right mark (invisible)
|
||||
{"‏", ""}, // Right-to-left mark (invisible)
|
||||
{"­", ""}, // Soft hyphen
|
||||
{" ", " "},
|
||||
{" ", " "},
|
||||
{" ", " "},
|
||||
{"‍", ""},
|
||||
{"‌", ""},
|
||||
};
|
||||
|
||||
for (const auto& mapping : entities) {
|
||||
if (entity == mapping.entity) {
|
||||
i = semicolon; // Will be incremented by caller's loop
|
||||
return mapping.replacement;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Unknown entity - return just the ampersand
|
||||
return "&";
|
||||
}
|
||||
|
||||
std::string DictHtmlParser::extractTagName(const std::string& html, size_t start, bool& isClosing) {
|
||||
isClosing = false;
|
||||
size_t pos = start;
|
||||
|
||||
// Skip whitespace after '<'
|
||||
while (pos < html.length() && std::isspace(static_cast<unsigned char>(html[pos]))) {
|
||||
pos++;
|
||||
}
|
||||
|
||||
// Check for closing tag
|
||||
if (pos < html.length() && html[pos] == '/') {
|
||||
isClosing = true;
|
||||
pos++;
|
||||
}
|
||||
|
||||
// Extract tag name (alphanumeric characters)
|
||||
size_t nameStart = pos;
|
||||
while (pos < html.length() && (std::isalnum(static_cast<unsigned char>(html[pos])) || html[pos] == '!')) {
|
||||
pos++;
|
||||
}
|
||||
|
||||
std::string tagName = html.substr(nameStart, pos - nameStart);
|
||||
// Convert to lowercase
|
||||
std::transform(tagName.begin(), tagName.end(), tagName.begin(),
|
||||
[](unsigned char c) { return std::tolower(c); });
|
||||
return tagName;
|
||||
}
|
||||
|
||||
bool DictHtmlParser::isBlockTag(const std::string& tagName) {
|
||||
return tagName == "p" || tagName == "div" || tagName == "br" || tagName == "hr" || tagName == "li" ||
|
||||
tagName == "ol" || tagName == "ul" || tagName == "dt" || tagName == "dd" || tagName == "html";
|
||||
}
|
||||
|
||||
bool DictHtmlParser::isBoldTag(const std::string& tagName) {
|
||||
return tagName == "b" || tagName == "strong";
|
||||
}
|
||||
|
||||
bool DictHtmlParser::isItalicTag(const std::string& tagName) {
|
||||
return tagName == "i" || tagName == "em";
|
||||
}
|
||||
|
||||
bool DictHtmlParser::isUnderlineTag(const std::string& tagName) {
|
||||
return tagName == "u" || tagName == "ins";
|
||||
}
|
||||
|
||||
bool DictHtmlParser::isSuperscriptTag(const std::string& tagName) { return tagName == "sup"; }
|
||||
|
||||
bool DictHtmlParser::isListItemTag(const std::string& tagName) { return tagName == "li"; }
|
||||
|
||||
bool DictHtmlParser::isOrderedListTag(const std::string& tagName) { return tagName == "ol"; }
|
||||
|
||||
void DictHtmlParser::parse(const std::string& html, int fontId, const GfxRenderer& renderer, uint16_t viewportWidth,
|
||||
const std::function<void(std::shared_ptr<TextBlock>)>& onTextBlock) {
|
||||
// Current paragraph being built
|
||||
ParsedText currentParagraph(TextBlock::Style::LEFT_ALIGN, false, false);
|
||||
|
||||
// State tracking
|
||||
int boldDepth = 0;
|
||||
int italicDepth = 0;
|
||||
int underlineDepth = 0;
|
||||
bool inSuperscript = false;
|
||||
bool inTag = false;
|
||||
|
||||
// List tracking
|
||||
std::stack<int> listCounters; // Stack for nested lists (0 = unordered, >0 = ordered counter)
|
||||
|
||||
// Current word being accumulated
|
||||
std::string currentWord;
|
||||
bool lastWasSpace = true; // Start true to skip leading spaces
|
||||
|
||||
// Helper to flush current word to paragraph
|
||||
auto flushWord = [&]() {
|
||||
if (currentWord.empty()) return;
|
||||
|
||||
// Determine font style
|
||||
EpdFontFamily::Style fontStyle = EpdFontFamily::REGULAR;
|
||||
if (boldDepth > 0 && italicDepth > 0) {
|
||||
fontStyle = EpdFontFamily::BOLD_ITALIC;
|
||||
} else if (boldDepth > 0) {
|
||||
fontStyle = EpdFontFamily::BOLD;
|
||||
} else if (italicDepth > 0) {
|
||||
fontStyle = EpdFontFamily::ITALIC;
|
||||
}
|
||||
|
||||
currentParagraph.addWord(currentWord, fontStyle, underlineDepth > 0);
|
||||
currentWord.clear();
|
||||
lastWasSpace = false;
|
||||
};
|
||||
|
||||
// Helper to flush current paragraph (create TextBlocks)
|
||||
auto flushParagraph = [&]() {
|
||||
flushWord();
|
||||
if (!currentParagraph.isEmpty()) {
|
||||
currentParagraph.layoutAndExtractLines(renderer, fontId, viewportWidth, onTextBlock);
|
||||
currentParagraph = ParsedText(TextBlock::Style::LEFT_ALIGN, false, false);
|
||||
}
|
||||
lastWasSpace = true;
|
||||
};
|
||||
|
||||
// Parse the HTML
|
||||
for (size_t i = 0; i < html.length(); i++) {
|
||||
const char c = html[i];
|
||||
|
||||
if (c == '<') {
|
||||
// Start of tag - flush current word first
|
||||
flushWord();
|
||||
|
||||
// Find end of tag
|
||||
size_t tagEnd = html.find('>', i);
|
||||
if (tagEnd == std::string::npos) {
|
||||
// Malformed HTML - treat rest as text
|
||||
currentWord += c;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Extract tag name
|
||||
bool isClosing = false;
|
||||
std::string tagName = extractTagName(html, i + 1, isClosing);
|
||||
|
||||
// Handle different tag types
|
||||
if (isBoldTag(tagName)) {
|
||||
if (isClosing) {
|
||||
boldDepth = std::max(0, boldDepth - 1);
|
||||
} else {
|
||||
boldDepth++;
|
||||
}
|
||||
} else if (isItalicTag(tagName)) {
|
||||
if (isClosing) {
|
||||
italicDepth = std::max(0, italicDepth - 1);
|
||||
} else {
|
||||
italicDepth++;
|
||||
}
|
||||
} else if (isUnderlineTag(tagName)) {
|
||||
if (isClosing) {
|
||||
underlineDepth = std::max(0, underlineDepth - 1);
|
||||
} else {
|
||||
underlineDepth++;
|
||||
}
|
||||
} else if (isSuperscriptTag(tagName)) {
|
||||
if (isClosing) {
|
||||
inSuperscript = false;
|
||||
} else {
|
||||
inSuperscript = true;
|
||||
// Add caret prefix for superscript
|
||||
currentWord += '^';
|
||||
}
|
||||
} else if (isOrderedListTag(tagName)) {
|
||||
if (isClosing) {
|
||||
if (!listCounters.empty()) {
|
||||
listCounters.pop();
|
||||
}
|
||||
} else {
|
||||
// Check if it's an unordered list style
|
||||
std::string tagContent = html.substr(i, tagEnd - i);
|
||||
if (tagContent.find("list-style-type:lower-alpha") != std::string::npos) {
|
||||
listCounters.push(-1); // -1 = alphabetic
|
||||
} else {
|
||||
listCounters.push(1); // Start at 1 for ordered
|
||||
}
|
||||
}
|
||||
} else if (tagName == "ul") {
|
||||
if (isClosing) {
|
||||
if (!listCounters.empty()) {
|
||||
listCounters.pop();
|
||||
}
|
||||
} else {
|
||||
listCounters.push(0); // 0 = unordered (bullet)
|
||||
}
|
||||
} else if (isListItemTag(tagName) && !isClosing) {
|
||||
// Start of list item - flush paragraph and add bullet/number
|
||||
flushParagraph();
|
||||
|
||||
std::string prefix;
|
||||
if (!listCounters.empty()) {
|
||||
int counter = listCounters.top();
|
||||
if (counter == 0) {
|
||||
// Unordered - bullet point
|
||||
prefix = "\xe2\x80\xa2 "; // • bullet
|
||||
} else if (counter == -1) {
|
||||
// Alphabetic numbering - not fully supported, just indent
|
||||
prefix = " ";
|
||||
} else {
|
||||
// Ordered - number
|
||||
char numBuf[8];
|
||||
snprintf(numBuf, sizeof(numBuf), "%d. ", counter);
|
||||
prefix = numBuf;
|
||||
listCounters.pop();
|
||||
listCounters.push(counter + 1); // Increment for next item
|
||||
}
|
||||
} else {
|
||||
// No list context - just indent
|
||||
prefix = "\xe2\x80\xa2 "; // • bullet
|
||||
}
|
||||
|
||||
// Add prefix as a word (em-space for indent + prefix)
|
||||
currentParagraph.addWord("\xe2\x80\x83" + prefix, EpdFontFamily::REGULAR, false);
|
||||
lastWasSpace = true;
|
||||
} else if (isBlockTag(tagName)) {
|
||||
// Block element - flush paragraph
|
||||
flushParagraph();
|
||||
|
||||
// Special handling for </html> which separates dictionary entries
|
||||
if (tagName == "html" && isClosing) {
|
||||
// Add extra spacing between entries
|
||||
flushParagraph();
|
||||
}
|
||||
}
|
||||
|
||||
// Skip to end of tag
|
||||
i = tagEnd;
|
||||
} else if (c == '&') {
|
||||
// HTML entity
|
||||
std::string decoded = decodeEntity(html, i);
|
||||
if (!decoded.empty()) {
|
||||
if (decoded == " ") {
|
||||
// Space entity - treat as space
|
||||
if (!lastWasSpace) {
|
||||
flushWord();
|
||||
lastWasSpace = true;
|
||||
}
|
||||
} else {
|
||||
currentWord += decoded;
|
||||
lastWasSpace = false;
|
||||
}
|
||||
}
|
||||
} else if (std::isspace(static_cast<unsigned char>(c))) {
|
||||
// Whitespace - flush word and collapse
|
||||
if (!lastWasSpace) {
|
||||
flushWord();
|
||||
lastWasSpace = true;
|
||||
}
|
||||
} else {
|
||||
// Regular character
|
||||
currentWord += c;
|
||||
lastWasSpace = false;
|
||||
}
|
||||
}
|
||||
|
||||
// Flush any remaining content
|
||||
flushParagraph();
|
||||
}
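
Aside: the numeric-entity branch of decodeEntity above converts the parsed codepoint to UTF-8 by hand. The standalone sketch below (not part of this commit) mirrors that branch structure in a hypothetical helper, codepointToUtf8, as a quick sanity check; for &#233; (U+00E9, 'é') it prints "c3 a9".

#include <cstdio>
#include <string>

// Hypothetical helper that mirrors the UTF-8 encoding branches in decodeEntity.
static std::string codepointToUtf8(const unsigned long codepoint) {
  std::string utf8;
  if (codepoint < 0x80) {
    utf8 += static_cast<char>(codepoint);
  } else if (codepoint < 0x800) {
    utf8 += static_cast<char>(0xC0 | (codepoint >> 6));
    utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
  } else if (codepoint < 0x10000) {
    utf8 += static_cast<char>(0xE0 | (codepoint >> 12));
    utf8 += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
    utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
  } else if (codepoint < 0x110000) {
    utf8 += static_cast<char>(0xF0 | (codepoint >> 18));
    utf8 += static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F));
    utf8 += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
    utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
  }
  return utf8;
}

int main() {
  for (const char b : codepointToUtf8(0xE9)) {
    std::printf("%02x ", static_cast<unsigned char>(b)); // prints: c3 a9
  }
  std::printf("\n");
  return 0;
}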
|
||||
64
lib/StarDict/DictHtmlParser.h
Normal file
@@ -0,0 +1,64 @@
|
||||
#pragma once
|
||||
|
||||
#include <Epub/blocks/TextBlock.h>
|
||||
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
class GfxRenderer;
|
||||
|
||||
/**
|
||||
* DictHtmlParser parses HTML dictionary definitions into ParsedText.
|
||||
*
|
||||
* Supports:
|
||||
* - Bold: <b>, <strong>
|
||||
* - Italic: <i>, <em>
|
||||
* - Underline: <u>, <ins>
|
||||
* - Lists: <ol>, <li> with numbering/bullets
|
||||
* - Block elements: <p>, <br>, <hr>, </html> (entry separator)
|
||||
* - HTML entities: numeric (&#NNN;, &#xHHH;) and named (&amp;, etc.)
|
||||
* - Superscript: <sup> rendered as ^text
|
||||
*/
|
||||
class DictHtmlParser {
|
||||
public:
|
||||
/**
|
||||
* Parse HTML definition and populate ParsedText with styled words.
|
||||
* Each paragraph/block is laid out and emitted as one or more TextBlocks via the callback.
|
||||
*
|
||||
* @param html The HTML definition text
|
||||
* @param fontId Font ID for text width calculations
|
||||
* @param renderer Reference to renderer for layout
|
||||
* @param viewportWidth Width in pixels available for laying out each line
* @param onTextBlock Callback invoked for each laid-out block/line of text
|
||||
*/
|
||||
static void parse(const std::string& html, int fontId, const GfxRenderer& renderer, uint16_t viewportWidth,
|
||||
const std::function<void(std::shared_ptr<TextBlock>)>& onTextBlock);
|
||||
|
||||
private:
|
||||
// Decode HTML entity at position i (starting with '&')
|
||||
static std::string decodeEntity(const std::string& html, size_t& i);
|
||||
|
||||
// Extract tag name from position (after '<')
|
||||
static std::string extractTagName(const std::string& html, size_t start, bool& isClosing);
|
||||
|
||||
// Check if tag is a block-level element
|
||||
static bool isBlockTag(const std::string& tagName);
|
||||
|
||||
// Check if tag starts/ends bold
|
||||
static bool isBoldTag(const std::string& tagName);
|
||||
|
||||
// Check if tag starts/ends italic
|
||||
static bool isItalicTag(const std::string& tagName);
|
||||
|
||||
// Check if tag starts/ends underline
|
||||
static bool isUnderlineTag(const std::string& tagName);
|
||||
|
||||
// Check if tag is superscript
|
||||
static bool isSuperscriptTag(const std::string& tagName);
|
||||
|
||||
// Check if tag is list item
|
||||
static bool isListItemTag(const std::string& tagName);
|
||||
|
||||
// Check if tag starts ordered list
|
||||
static bool isOrderedListTag(const std::string& tagName);
|
||||
};
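
For orientation, a minimal caller sketch (not part of this commit): it assumes a GfxRenderer, font id and viewport width are already available from the surrounding UI code, and simply collects the TextBlocks the parser emits.

#include "DictHtmlParser.h"

#include <cstdint>
#include <memory>
#include <utility>
#include <vector>

// Hypothetical caller: lay out one dictionary definition and collect the
// resulting lines so they can later be wrapped in PageLine elements.
std::vector<std::shared_ptr<TextBlock>> layoutDefinition(const std::string& definitionHtml, int fontId,
                                                         const GfxRenderer& renderer, uint16_t viewportWidth) {
  std::vector<std::shared_ptr<TextBlock>> lines;
  DictHtmlParser::parse(definitionHtml, fontId, renderer, viewportWidth,
                        [&lines](std::shared_ptr<TextBlock> block) { lines.push_back(std::move(block)); });
  return lines;
}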
|
||||
759
lib/StarDict/StarDict.cpp
Normal file
@@ -0,0 +1,759 @@
|
||||
#include "StarDict.h"
|
||||
|
||||
#include <HardwareSerial.h>
|
||||
#include <SDCardManager.h>
|
||||
#include <miniz.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cctype>
|
||||
|
||||
#include "DictPrefixIndex.generated.h"
|
||||
|
||||
StarDict::StarDict(const std::string& basePath) : basePath(basePath) {}
|
||||
|
||||
StarDict::~StarDict() {
|
||||
if (dzInfo.chunkSizes) {
|
||||
free(dzInfo.chunkSizes);
|
||||
dzInfo.chunkSizes = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t StarDict::readBE32(const uint8_t* data) {
|
||||
return (static_cast<uint32_t>(data[0]) << 24) | (static_cast<uint32_t>(data[1]) << 16) |
|
||||
(static_cast<uint32_t>(data[2]) << 8) | static_cast<uint32_t>(data[3]);
|
||||
}
|
||||
|
||||
bool StarDict::loadInfo() {
|
||||
const std::string ifoPath = basePath + ".ifo";
|
||||
FsFile file;
|
||||
if (!SdMan.openFileForRead("DICT", ifoPath, file)) {
|
||||
Serial.printf("[%lu] [DICT] Failed to open .ifo file: %s\n", millis(), ifoPath.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
char buffer[256];
|
||||
while (file.available()) {
|
||||
const int len = file.fgets(buffer, sizeof(buffer));
|
||||
if (len <= 0) break;
|
||||
|
||||
// Remove newline
|
||||
char* newline = strchr(buffer, '\n');
|
||||
if (newline) *newline = '\0';
|
||||
newline = strchr(buffer, '\r');
|
||||
if (newline) *newline = '\0';
|
||||
|
||||
// Parse key=value
|
||||
char* eq = strchr(buffer, '=');
|
||||
if (!eq) continue;
|
||||
|
||||
*eq = '\0';
|
||||
const char* key = buffer;
|
||||
const char* value = eq + 1;
|
||||
|
||||
if (strcmp(key, "bookname") == 0) {
|
||||
info.bookname = value;
|
||||
} else if (strcmp(key, "wordcount") == 0) {
|
||||
info.wordcount = strtoul(value, nullptr, 10);
|
||||
} else if (strcmp(key, "idxfilesize") == 0) {
|
||||
info.idxfilesize = strtoul(value, nullptr, 10);
|
||||
} else if (strcmp(key, "sametypesequence") == 0) {
|
||||
info.sametypesequence = value[0];
|
||||
} else if (strcmp(key, "synwordcount") == 0) {
|
||||
info.synwordcount = strtoul(value, nullptr, 10);
|
||||
}
|
||||
}
|
||||
|
||||
file.close();
|
||||
info.loaded = true;
|
||||
|
||||
Serial.printf("[%lu] [DICT] Loaded dictionary: %s (%u words)\n", millis(), info.bookname.c_str(), info.wordcount);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool StarDict::loadDictzipHeader() {
|
||||
if (dzInfo.loaded) return true;
|
||||
|
||||
const std::string dzPath = basePath + ".dict.dz";
|
||||
FsFile file;
|
||||
if (!SdMan.openFileForRead("DICT", dzPath, file)) {
|
||||
Serial.printf("[%lu] [DICT] Failed to open .dict.dz file\n", millis());
|
||||
return false;
|
||||
}
|
||||
|
||||
// Read gzip header
|
||||
uint8_t header[10];
|
||||
if (file.read(header, 10) != 10) {
|
||||
file.close();
|
||||
return false;
|
||||
}
|
||||
|
||||
// Verify gzip magic number
|
||||
if (header[0] != 0x1f || header[1] != 0x8b) {
|
||||
Serial.printf("[%lu] [DICT] Not a valid gzip file\n", millis());
|
||||
file.close();
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check for extra field flag (bit 2)
|
||||
const uint8_t flags = header[3];
|
||||
if (!(flags & 0x04)) {
|
||||
Serial.printf("[%lu] [DICT] No extra field - not a dictzip file\n", millis());
|
||||
file.close();
|
||||
return false;
|
||||
}
|
||||
|
||||
// Read extra field length
|
||||
uint8_t xlenBuf[2];
|
||||
if (file.read(xlenBuf, 2) != 2) {
|
||||
file.close();
|
||||
return false;
|
||||
}
|
||||
const uint16_t xlen = xlenBuf[0] | (xlenBuf[1] << 8);
|
||||
|
||||
// Read extra field
|
||||
auto* extraField = static_cast<uint8_t*>(malloc(xlen));
|
||||
if (!extraField) {
|
||||
file.close();
|
||||
return false;
|
||||
}
|
||||
|
||||
if (file.read(extraField, xlen) != xlen) {
|
||||
free(extraField);
|
||||
file.close();
|
||||
return false;
|
||||
}
|
||||
|
||||
// Parse dictzip subfield (SI1='R', SI2='A')
|
||||
bool foundDictzip = false;
|
||||
uint16_t pos = 0;
|
||||
while (pos + 4 <= xlen) {
|
||||
const uint8_t si1 = extraField[pos];
|
||||
const uint8_t si2 = extraField[pos + 1];
|
||||
const uint16_t slen = extraField[pos + 2] | (extraField[pos + 3] << 8);
|
||||
|
||||
if (si1 == 'R' && si2 == 'A' && pos + 4 + slen <= xlen) {
|
||||
// Dictzip subfield found
|
||||
// Format: ver(2) + chlen(2) + count(2) + sizes[count](2 each)
|
||||
const uint8_t* data = &extraField[pos + 4];
|
||||
// uint16_t version = data[0] | (data[1] << 8); // Usually 1
|
||||
dzInfo.chunkLength = data[2] | (data[3] << 8);
|
||||
dzInfo.chunkCount = data[4] | (data[5] << 8);
|
||||
|
||||
dzInfo.chunkSizes = static_cast<uint16_t*>(malloc(dzInfo.chunkCount * sizeof(uint16_t)));
|
||||
if (!dzInfo.chunkSizes) {
|
||||
free(extraField);
|
||||
file.close();
|
||||
return false;
|
||||
}
|
||||
|
||||
for (uint16_t i = 0; i < dzInfo.chunkCount; i++) {
|
||||
dzInfo.chunkSizes[i] = data[6 + i * 2] | (data[7 + i * 2] << 8);
|
||||
}
|
||||
|
||||
foundDictzip = true;
|
||||
break;
|
||||
}
|
||||
|
||||
pos += 4 + slen;
|
||||
}
|
||||
|
||||
free(extraField);
|
||||
|
||||
if (!foundDictzip) {
|
||||
Serial.printf("[%lu] [DICT] Dictzip subfield not found\n", millis());
|
||||
file.close();
|
||||
return false;
|
||||
}
|
||||
|
||||
// Calculate header size (10 + 2 + xlen + optional fields)
|
||||
dzInfo.headerSize = 10 + 2 + xlen;
|
||||
|
||||
// Skip FNAME if present (bit 3)
|
||||
if (flags & 0x08) {
|
||||
file.seek(dzInfo.headerSize);
|
||||
while (file.available()) {
|
||||
uint8_t c;
|
||||
file.read(&c, 1);
|
||||
dzInfo.headerSize++;
|
||||
if (c == 0) break;
|
||||
}
|
||||
}
|
||||
|
||||
// Skip FCOMMENT if present (bit 4)
|
||||
if (flags & 0x10) {
|
||||
file.seek(dzInfo.headerSize);
|
||||
while (file.available()) {
|
||||
uint8_t c;
|
||||
file.read(&c, 1);
|
||||
dzInfo.headerSize++;
|
||||
if (c == 0) break;
|
||||
}
|
||||
}
|
||||
|
||||
// Skip FHCRC if present (bit 1)
|
||||
if (flags & 0x02) {
|
||||
dzInfo.headerSize += 2;
|
||||
}
|
||||
|
||||
file.close();
|
||||
dzInfo.loaded = true;
|
||||
|
||||
Serial.printf("[%lu] [DICT] Dictzip: %u chunks of %u bytes, header size %u\n", millis(), dzInfo.chunkCount,
|
||||
dzInfo.chunkLength, dzInfo.headerSize);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool StarDict::begin() {
|
||||
if (!loadInfo()) return false;
|
||||
if (!loadDictzipHeader()) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool StarDict::readWordAtPosition(FsFile& idxFile, uint32_t& position, std::string& word, uint32_t& dictOffset,
|
||||
uint32_t& dictSize) {
|
||||
idxFile.seek(position);
|
||||
|
||||
// Read null-terminated word
|
||||
word.clear();
|
||||
char c;
|
||||
while (idxFile.read(&c, 1) == 1) {
|
||||
if (c == '\0') break;
|
||||
word += c;
|
||||
if (word.length() > 256) {
|
||||
// Safety limit
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (word.empty()) return false;
|
||||
|
||||
// Read 4-byte big-endian offset followed by 4-byte big-endian size
|
||||
uint8_t buf[8];
|
||||
if (idxFile.read(buf, 8) != 8) return false;
|
||||
|
||||
dictOffset = readBE32(buf);
|
||||
dictSize = readBE32(buf + 4);
|
||||
|
||||
position = idxFile.position();
|
||||
return true;
|
||||
}
|
||||
|
||||
bool StarDict::decompressDefinition(uint32_t offset, uint32_t size, std::string& definition) {
|
||||
if (!dzInfo.loaded) return false;
|
||||
|
||||
const std::string dzPath = basePath + ".dict.dz";
|
||||
FsFile file;
|
||||
if (!SdMan.openFileForRead("DICT", dzPath, file)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Calculate which chunk(s) we need
|
||||
const uint32_t startChunk = offset / dzInfo.chunkLength;
|
||||
const uint32_t endChunk = (offset + size - 1) / dzInfo.chunkLength;
|
||||
const uint32_t startOffsetInChunk = offset % dzInfo.chunkLength;
|
||||
|
||||
if (endChunk >= dzInfo.chunkCount) {
|
||||
file.close();
|
||||
return false;
|
||||
}
|
||||
|
||||
// Calculate file offset for start chunk
|
||||
uint32_t fileOffset = dzInfo.headerSize;
|
||||
for (uint32_t i = 0; i < startChunk; i++) {
|
||||
fileOffset += dzInfo.chunkSizes[i];
|
||||
}
|
||||
|
||||
// Allocate buffers
|
||||
const uint32_t maxCompressedSize = 65536; // Max compressed chunk size
|
||||
auto* compressedBuf = static_cast<uint8_t*>(malloc(maxCompressedSize));
|
||||
auto* decompressedBuf = static_cast<uint8_t*>(malloc(dzInfo.chunkLength));
|
||||
if (!compressedBuf || !decompressedBuf) {
|
||||
free(compressedBuf);
|
||||
free(decompressedBuf);
|
||||
file.close();
|
||||
return false;
|
||||
}
|
||||
|
||||
definition.clear();
|
||||
definition.reserve(size);
|
||||
|
||||
// Process each needed chunk
|
||||
for (uint32_t chunk = startChunk; chunk <= endChunk; chunk++) {
|
||||
const uint16_t compressedSize = dzInfo.chunkSizes[chunk];
|
||||
|
||||
// Seek and read compressed data
|
||||
file.seek(fileOffset);
|
||||
if (file.read(compressedBuf, compressedSize) != compressedSize) {
|
||||
free(compressedBuf);
|
||||
free(decompressedBuf);
|
||||
file.close();
|
||||
return false;
|
||||
}
|
||||
|
||||
// Decompress the chunk (first try treating it as zlib-wrapped, then fall back to raw inflate below)
|
||||
auto* inflator = static_cast<tinfl_decompressor*>(malloc(sizeof(tinfl_decompressor)));
|
||||
if (!inflator) {
|
||||
free(compressedBuf);
|
||||
free(decompressedBuf);
|
||||
file.close();
|
||||
return false;
|
||||
}
|
||||
tinfl_init(inflator);
|
||||
|
||||
size_t inBytes = compressedSize;
|
||||
size_t outBytes = dzInfo.chunkLength;
|
||||
const tinfl_status status =
|
||||
tinfl_decompress(inflator, compressedBuf, &inBytes, decompressedBuf, decompressedBuf, &outBytes,
|
||||
TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF | TINFL_FLAG_PARSE_ZLIB_HEADER);
|
||||
|
||||
free(inflator);
|
||||
|
||||
if (status != TINFL_STATUS_DONE && status != TINFL_STATUS_HAS_MORE_OUTPUT) {
|
||||
// Try without zlib header flag
|
||||
inflator = static_cast<tinfl_decompressor*>(malloc(sizeof(tinfl_decompressor)));
|
||||
if (inflator) {
|
||||
tinfl_init(inflator);
|
||||
inBytes = compressedSize;
|
||||
outBytes = dzInfo.chunkLength;
|
||||
tinfl_decompress(inflator, compressedBuf, &inBytes, decompressedBuf, decompressedBuf, &outBytes,
|
||||
TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF);
|
||||
free(inflator);
|
||||
}
|
||||
}
|
||||
|
||||
// Extract the portion we need from this chunk
|
||||
uint32_t copyStart = 0;
|
||||
uint32_t copyEnd = outBytes;
|
||||
|
||||
if (chunk == startChunk) {
|
||||
copyStart = startOffsetInChunk;
|
||||
}
|
||||
if (chunk == endChunk) {
|
||||
const uint32_t endOffsetInChunk = (offset + size) - (endChunk * dzInfo.chunkLength);
|
||||
if (endOffsetInChunk < copyEnd) {
|
||||
copyEnd = endOffsetInChunk;
|
||||
}
|
||||
}
|
||||
|
||||
if (copyEnd > copyStart) {
|
||||
definition.append(reinterpret_cast<char*>(decompressedBuf + copyStart), copyEnd - copyStart);
|
||||
}
|
||||
|
||||
fileOffset += compressedSize;
|
||||
}
|
||||
|
||||
free(compressedBuf);
|
||||
free(decompressedBuf);
|
||||
file.close();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// StarDict comparison function: case-insensitive first, then case-sensitive as tiebreaker
|
||||
int StarDict::stardictStrcmp(const std::string& a, const std::string& b) {
|
||||
// First: case-insensitive comparison (like g_ascii_strcasecmp)
|
||||
size_t i = 0;
|
||||
while (i < a.length() && i < b.length()) {
|
||||
const int ca = std::tolower(static_cast<unsigned char>(a[i]));
|
||||
const int cb = std::tolower(static_cast<unsigned char>(b[i]));
|
||||
if (ca != cb) return ca - cb;
|
||||
i++;
|
||||
}
|
||||
if (a.length() != b.length()) {
|
||||
return static_cast<int>(a.length()) - static_cast<int>(b.length());
|
||||
}
|
||||
// If case-insensitive equal, use case-sensitive as tiebreaker
|
||||
return a.compare(b);
|
||||
}
|
||||
|
||||
std::string StarDict::normalizeWord(const std::string& word) {
|
||||
std::string result;
|
||||
result.reserve(word.length());
|
||||
|
||||
// Trim leading whitespace
|
||||
size_t start = 0;
|
||||
while (start < word.length() && std::isspace(static_cast<unsigned char>(word[start]))) {
|
||||
start++;
|
||||
}
|
||||
|
||||
// Trim trailing whitespace
|
||||
size_t end = word.length();
|
||||
while (end > start && std::isspace(static_cast<unsigned char>(word[end - 1]))) {
|
||||
end--;
|
||||
}
|
||||
|
||||
// Convert to lowercase
|
||||
for (size_t i = start; i < end; i++) {
|
||||
result += static_cast<char>(std::tolower(static_cast<unsigned char>(word[i])));
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
StarDict::LookupResult StarDict::lookup(const std::string& word) {
|
||||
LookupResult result;
|
||||
result.word = word;
|
||||
|
||||
if (!info.loaded) {
|
||||
return result;
|
||||
}
|
||||
|
||||
const std::string normalizedSearch = normalizeWord(word);
|
||||
if (normalizedSearch.empty()) {
|
||||
return result;
|
||||
}
|
||||
|
||||
// First try .idx (main entries) - use prefix jump table for fast lookup
|
||||
const std::string idxPath = basePath + ".idx";
|
||||
FsFile idxFile;
|
||||
if (!SdMan.openFileForRead("DICT", idxPath, idxFile)) {
|
||||
Serial.printf("[%lu] [DICT] Failed to open index file\n", millis());
|
||||
return result;
|
||||
}
|
||||
|
||||
// Jump to the relevant section using prefix index (if word has 2+ alpha chars)
|
||||
uint32_t position = 0;
|
||||
if (normalizedSearch.length() >= 2 && DictPrefixIndex::isAlpha(normalizedSearch[0]) &&
|
||||
DictPrefixIndex::isAlpha(normalizedSearch[1])) {
|
||||
const uint16_t prefixIdx = DictPrefixIndex::prefixToIndex(normalizedSearch[0], normalizedSearch[1]);
|
||||
position = DictPrefixIndex::dictPrefixOffsets[prefixIdx];
|
||||
}
|
||||
bool found = false;
|
||||
|
||||
while (position < info.idxfilesize) {
|
||||
std::string currentWord;
|
||||
uint32_t dictOffset, dictSize;
|
||||
|
||||
if (!readWordAtPosition(idxFile, position, currentWord, dictOffset, dictSize)) {
|
||||
break;
|
||||
}
|
||||
|
||||
// Use stardictStrcmp for case-insensitive matching
|
||||
const int cmp = stardictStrcmp(normalizedSearch, currentWord);
|
||||
|
||||
if (cmp == 0) {
|
||||
std::string definition;
|
||||
if (decompressDefinition(dictOffset, dictSize, definition)) {
|
||||
if (!found) {
|
||||
result.word = currentWord;
|
||||
result.definition = definition;
|
||||
result.found = true;
|
||||
found = true;
|
||||
} else {
|
||||
result.definition += "</html>" + definition;
|
||||
}
|
||||
}
|
||||
// Continue scanning for additional matches (same word, different case)
|
||||
} else if (cmp < 0) {
|
||||
// Passed where target would be (file is sorted)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
idxFile.close();
|
||||
|
||||
// If not found in main index, try synonym file with prefix jump
|
||||
if (!found && info.synwordcount > 0) {
|
||||
const std::string synPath = basePath + ".syn";
|
||||
FsFile synFile;
|
||||
if (SdMan.openFileForRead("DICT", synPath, synFile)) {
|
||||
const uint32_t synFileSize = synFile.size();
|
||||
|
||||
// Jump to the relevant section using prefix index (if word has 2+ alpha chars)
|
||||
uint32_t synPosition = 0;
|
||||
if (normalizedSearch.length() >= 2 && DictPrefixIndex::isAlpha(normalizedSearch[0]) &&
|
||||
DictPrefixIndex::isAlpha(normalizedSearch[1])) {
|
||||
const uint16_t prefixIdx = DictPrefixIndex::prefixToIndex(normalizedSearch[0], normalizedSearch[1]);
|
||||
synPosition = DictPrefixIndex::synPrefixOffsets[prefixIdx];
|
||||
synFile.seek(synPosition);
|
||||
}
|
||||
|
||||
while (synFile.position() < synFileSize) {
|
||||
// Read synonym word (null-terminated)
|
||||
std::string synWord;
|
||||
char c;
|
||||
while (synFile.read(&c, 1) == 1 && c != '\0') {
|
||||
synWord += c;
|
||||
}
|
||||
|
||||
// Read 4-byte big-endian index
|
||||
uint8_t idxBytes[4];
|
||||
if (synFile.read(idxBytes, 4) != 4) break;
|
||||
const uint32_t mainIdx = readBE32(idxBytes);
|
||||
|
||||
// Use stardictStrcmp for case-insensitive comparison
|
||||
const int cmp = stardictStrcmp(normalizedSearch, synWord);
|
||||
|
||||
if (cmp == 0) {
|
||||
// Found synonym - look up the main entry by index
|
||||
FsFile idxFile2;
|
||||
if (SdMan.openFileForRead("DICT", idxPath, idxFile2)) {
|
||||
uint32_t pos = 0;
|
||||
uint32_t entryNum = 0;
|
||||
while (entryNum < mainIdx && pos < info.idxfilesize) {
|
||||
std::string w;
|
||||
uint32_t off, sz;
|
||||
if (!readWordAtPosition(idxFile2, pos, w, off, sz)) break;
|
||||
entryNum++;
|
||||
}
|
||||
// Now read the target entry
|
||||
if (entryNum == mainIdx) {
|
||||
std::string mainWord;
|
||||
uint32_t dictOffset, dictSize;
|
||||
if (readWordAtPosition(idxFile2, pos, mainWord, dictOffset, dictSize)) {
|
||||
std::string definition;
|
||||
if (decompressDefinition(dictOffset, dictSize, definition)) {
|
||||
result.word = synWord;
|
||||
result.definition = definition;
|
||||
result.found = true;
|
||||
found = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
idxFile2.close();
|
||||
}
|
||||
break; // Found a match, stop searching
|
||||
} else if (cmp < 0) {
|
||||
// Passed where it would be (file is sorted)
|
||||
break;
|
||||
}
|
||||
}
|
||||
synFile.close();
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// Helper to decode a single HTML entity starting at position i (after the '&')
|
||||
// Returns the decoded string and advances i past the entity (including ';')
|
||||
static std::string decodeHtmlEntity(const std::string& html, size_t& i) {
|
||||
const size_t start = i; // Position of '&'
|
||||
const size_t remaining = html.length() - start;
|
||||
|
||||
// Numeric entities: &#NNN; or &#xHHH;
|
||||
if (remaining > 2 && html[start + 1] == '#') {
|
||||
size_t numStart = start + 2;
|
||||
bool isHex = false;
|
||||
if (remaining > 3 && (html[numStart] == 'x' || html[numStart] == 'X')) {
|
||||
isHex = true;
|
||||
numStart++;
|
||||
}
|
||||
|
||||
size_t numEnd = numStart;
|
||||
while (numEnd < html.length() && html[numEnd] != ';') {
|
||||
const char c = html[numEnd];
|
||||
if (isHex) {
|
||||
if (!std::isxdigit(static_cast<unsigned char>(c))) break;
|
||||
} else {
|
||||
if (!std::isdigit(static_cast<unsigned char>(c))) break;
|
||||
}
|
||||
numEnd++;
|
||||
}
|
||||
|
||||
if (numEnd > numStart && numEnd < html.length() && html[numEnd] == ';') {
|
||||
const std::string numStr = html.substr(numStart, numEnd - numStart);
|
||||
unsigned long codepoint = std::strtoul(numStr.c_str(), nullptr, isHex ? 16 : 10);
|
||||
i = numEnd; // Will be incremented by caller's loop
|
||||
|
||||
// Convert codepoint to UTF-8
|
||||
std::string utf8;
|
||||
if (codepoint < 0x80) {
|
||||
utf8 += static_cast<char>(codepoint);
|
||||
} else if (codepoint < 0x800) {
|
||||
utf8 += static_cast<char>(0xC0 | (codepoint >> 6));
|
||||
utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
|
||||
} else if (codepoint < 0x10000) {
|
||||
utf8 += static_cast<char>(0xE0 | (codepoint >> 12));
|
||||
utf8 += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
|
||||
utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
|
||||
} else if (codepoint < 0x110000) {
|
||||
utf8 += static_cast<char>(0xF0 | (codepoint >> 18));
|
||||
utf8 += static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F));
|
||||
utf8 += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
|
||||
utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
|
||||
}
|
||||
return utf8;
|
||||
}
|
||||
}
|
||||
|
||||
// Named entities - find the semicolon first
|
||||
size_t semicolon = html.find(';', start + 1);
|
||||
if (semicolon != std::string::npos && semicolon - start < 12) {
|
||||
const std::string entity = html.substr(start, semicolon - start + 1);
|
||||
|
||||
// Common named entities
|
||||
struct EntityMapping {
|
||||
const char* entity;
|
||||
const char* replacement;
|
||||
};
|
||||
static const EntityMapping entities[] = {
|
||||
{" ", " "}, {"<", "<"}, {">", ">"},
|
||||
{"&", "&"}, {""", "\""}, {"'", "'"},
|
||||
{"—", "\xe2\x80\x94"}, // —
|
||||
{"–", "\xe2\x80\x93"}, // –
|
||||
{"…", "\xe2\x80\xa6"}, // …
|
||||
{"’", "\xe2\x80\x99"}, // '
|
||||
{"‘", "\xe2\x80\x98"}, // '
|
||||
{"”", "\xe2\x80\x9d"}, // "
|
||||
{"“", "\xe2\x80\x9c"}, // "
|
||||
{"°", "\xc2\xb0"}, // °
|
||||
{"×", "\xc3\x97"}, // ×
|
||||
{"÷", "\xc3\xb7"}, // ÷
|
||||
{"±", "\xc2\xb1"}, // ±
|
||||
{"½", "\xc2\xbd"}, // ½
|
||||
{"¼", "\xc2\xbc"}, // ¼
|
||||
{"¾", "\xc2\xbe"}, // ¾
|
||||
{"¢", "\xc2\xa2"}, // ¢
|
||||
{"£", "\xc2\xa3"}, // £
|
||||
{"€", "\xe2\x82\xac"}, // €
|
||||
{"¥", "\xc2\xa5"}, // ¥
|
||||
{"©", "\xc2\xa9"}, // ©
|
||||
{"®", "\xc2\xae"}, // ®
|
||||
{"™", "\xe2\x84\xa2"}, // ™
|
||||
{"•", "\xe2\x80\xa2"}, // •
|
||||
{"·", "\xc2\xb7"}, // ·
|
||||
{"§", "\xc2\xa7"}, // §
|
||||
{"¶", "\xc2\xb6"}, // ¶
|
||||
{"†", "\xe2\x80\xa0"}, // †
|
||||
{"‡", "\xe2\x80\xa1"}, // ‡
|
||||
{"¡", "\xc2\xa1"}, // ¡
|
||||
{"¿", "\xc2\xbf"}, // ¿
|
||||
{"«", "\xc2\xab"}, // «
|
||||
{"»", "\xc2\xbb"}, // »
|
||||
{"­", ""},
|
||||
{" ", " "},
|
||||
{" ", " "},
|
||||
{" ", " "},
|
||||
{"‍", ""},
|
||||
{"‌", ""},
|
||||
};
|
||||
|
||||
for (const auto& mapping : entities) {
|
||||
if (entity == mapping.entity) {
|
||||
i = semicolon; // Will be incremented by caller's loop
|
||||
return mapping.replacement;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Unknown entity - return just the ampersand and let the rest be processed normally
|
||||
return "&";
|
||||
}
|
||||
|
||||
// Helper to check if a tag is a block-level element that needs line breaks
|
||||
static bool isBlockTag(const std::string& tag, bool isClosing) {
|
||||
// Normalize to lowercase for comparison
|
||||
std::string lowerTag = tag;
|
||||
for (char& c : lowerTag) {
|
||||
c = std::tolower(static_cast<unsigned char>(c));
|
||||
}
|
||||
|
||||
// Block-level tags that should have line breaks
|
||||
if (lowerTag == "p" || lowerTag == "div" || lowerTag == "br" || lowerTag == "hr" || lowerTag == "li" ||
|
||||
lowerTag == "dt" || lowerTag == "dd" || lowerTag == "tr" || lowerTag == "h1" || lowerTag == "h2" ||
|
||||
lowerTag == "h3" || lowerTag == "h4" || lowerTag == "h5" || lowerTag == "h6" || lowerTag == "blockquote" ||
|
||||
lowerTag == "pre" || lowerTag == "ol" || lowerTag == "ul") {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
std::string StarDict::stripHtml(const std::string& html) {
|
||||
std::string result;
|
||||
result.reserve(html.length());
|
||||
|
||||
bool inTag = false;
|
||||
bool lastWasSpace = false;
|
||||
bool lastWasNewline = false;
|
||||
|
||||
for (size_t i = 0; i < html.length(); i++) {
|
||||
const char c = html[i];
|
||||
|
||||
if (c == '<') {
|
||||
// Parse the tag name
|
||||
size_t tagStart = i + 1;
|
||||
bool isClosing = false;
|
||||
|
||||
// Skip whitespace after <
|
||||
while (tagStart < html.length() && std::isspace(static_cast<unsigned char>(html[tagStart]))) {
|
||||
tagStart++;
|
||||
}
|
||||
|
||||
// Check for closing tag
|
||||
if (tagStart < html.length() && html[tagStart] == '/') {
|
||||
isClosing = true;
|
||||
tagStart++;
|
||||
}
|
||||
|
||||
// Extract tag name
|
||||
size_t tagEnd = tagStart;
|
||||
while (tagEnd < html.length() && !std::isspace(static_cast<unsigned char>(html[tagEnd])) &&
|
||||
html[tagEnd] != '>' && html[tagEnd] != '/') {
|
||||
tagEnd++;
|
||||
}
|
||||
|
||||
const std::string tagName = html.substr(tagStart, tagEnd - tagStart);
|
||||
|
||||
// Check if this is a block-level element
|
||||
if (isBlockTag(tagName, isClosing)) {
|
||||
// Add line break for block elements
|
||||
if (!result.empty() && !lastWasNewline) {
|
||||
result += '\n';
|
||||
lastWasNewline = true;
|
||||
lastWasSpace = true;
|
||||
}
|
||||
}
|
||||
|
||||
inTag = true;
|
||||
} else if (c == '>') {
|
||||
inTag = false;
|
||||
} else if (!inTag) {
|
||||
// Handle HTML entities
|
||||
if (c == '&') {
|
||||
const std::string decoded = decodeHtmlEntity(html, i);
|
||||
if (!decoded.empty()) {
|
||||
// Check if decoded content is whitespace
|
||||
bool allSpace = true;
|
||||
for (const char dc : decoded) {
|
||||
if (!std::isspace(static_cast<unsigned char>(dc))) {
|
||||
allSpace = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (allSpace) {
|
||||
if (!lastWasSpace) {
|
||||
result += ' ';
|
||||
lastWasSpace = true;
|
||||
}
|
||||
} else {
|
||||
result += decoded;
|
||||
lastWasSpace = false;
|
||||
lastWasNewline = false;
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// Collapse whitespace
|
||||
if (std::isspace(static_cast<unsigned char>(c))) {
|
||||
if (!lastWasSpace) {
|
||||
result += ' ';
|
||||
lastWasSpace = true;
|
||||
}
|
||||
} else {
|
||||
result += c;
|
||||
lastWasSpace = false;
|
||||
lastWasNewline = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Trim trailing whitespace
|
||||
while (!result.empty() && std::isspace(static_cast<unsigned char>(result.back()))) {
|
||||
result.pop_back();
|
||||
}
|
||||
|
||||
return result;
|
||||
}
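
As a standalone illustration (not part of this commit) of the random-access scheme decompressDefinition relies on: a definition's uncompressed (offset, size) pair from the .idx file is mapped to a range of dictzip chunks, and only those chunks are read and inflated. The chunk length below matches the "usually 58315" note in StarDict.h; the offset and size are made-up values.

#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t chunkLength = 58315;          // uncompressed bytes per dictzip chunk
  const uint32_t offset = 120000, size = 500;  // hypothetical .idx entry
  const uint32_t startChunk = offset / chunkLength;             // 2
  const uint32_t endChunk = (offset + size - 1) / chunkLength;  // 2
  const uint32_t startOffsetInChunk = offset % chunkLength;     // 3370
  std::printf("chunks %u..%u, starting %u bytes into the first chunk\n",
              startChunk, endChunk, startOffsetInChunk);
  // The file position of startChunk is headerSize plus the sum of the
  // compressed sizes of all earlier chunks (dzInfo.chunkSizes in the code).
  return 0;
}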
|
||||
81
lib/StarDict/StarDict.h
Normal file
@@ -0,0 +1,81 @@
|
||||
#pragma once
|
||||
|
||||
#include <SdFat.h>
|
||||
|
||||
#include <cstdint>
|
||||
#include <string>
|
||||
|
||||
// StarDict dictionary lookup library
|
||||
// Supports the .ifo/.idx/.dict.dz format; lookups jump via a generated two-letter prefix index, then scan linearly
|
||||
class StarDict {
|
||||
public:
|
||||
struct DictInfo {
|
||||
std::string bookname;
|
||||
uint32_t wordcount = 0;
|
||||
uint32_t idxfilesize = 0;
|
||||
char sametypesequence = '\0'; // 'h' for HTML, 'm' for plain text, etc.
|
||||
uint32_t synwordcount = 0;
|
||||
bool loaded = false;
|
||||
};
|
||||
|
||||
struct LookupResult {
|
||||
std::string word;
|
||||
std::string definition;
|
||||
bool found = false;
|
||||
};
|
||||
|
||||
private:
|
||||
std::string basePath; // Path without extension (e.g., "/dictionaries/dict-data")
|
||||
DictInfo info;
|
||||
|
||||
// Dictzip chunk info for random access decompression
|
||||
struct DictzipInfo {
|
||||
uint32_t chunkLength = 0; // Uncompressed chunk size (usually 58315)
|
||||
uint16_t chunkCount = 0;
|
||||
uint32_t headerSize = 0; // Total header size to skip
|
||||
uint16_t* chunkSizes = nullptr; // Array of compressed chunk sizes
|
||||
bool loaded = false;
|
||||
};
|
||||
DictzipInfo dzInfo;
|
||||
|
||||
// Parse .ifo file
|
||||
bool loadInfo();
|
||||
|
||||
// Load dictzip header for random access
|
||||
bool loadDictzipHeader();
|
||||
|
||||
// Read word at given index file position, returns word and advances position
|
||||
bool readWordAtPosition(FsFile& idxFile, uint32_t& position, std::string& word, uint32_t& dictOffset,
|
||||
uint32_t& dictSize);
|
||||
|
||||
// Decompress a portion of the .dict.dz file
|
||||
bool decompressDefinition(uint32_t offset, uint32_t size, std::string& definition);
|
||||
|
||||
// Convert 4-byte big-endian to uint32
|
||||
static uint32_t readBE32(const uint8_t* data);
|
||||
|
||||
public:
|
||||
explicit StarDict(const std::string& basePath);
|
||||
~StarDict();
|
||||
|
||||
// Initialize dictionary (loads .ifo)
|
||||
bool begin();
|
||||
|
||||
// Get dictionary info
|
||||
const DictInfo& getInfo() const { return info; }
|
||||
|
||||
// Look up a word (case-insensitive)
|
||||
LookupResult lookup(const std::string& word);
|
||||
|
||||
// Check if dictionary is ready
|
||||
bool isReady() const { return info.loaded; }
|
||||
|
||||
// Strip HTML tags from definition for plain text display
|
||||
static std::string stripHtml(const std::string& html);
|
||||
|
||||
// Normalize word for comparison (lowercase, trim)
|
||||
static std::string normalizeWord(const std::string& word);
|
||||
|
||||
// StarDict comparison (case-insensitive first, then case-sensitive tiebreaker)
|
||||
static int stardictStrcmp(const std::string& a, const std::string& b);
|
||||
};
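
A minimal end-to-end sketch (not part of this commit), using the example base path from the basePath comment above; the lookup word is made up and error handling is reduced to early returns.

#include "StarDict.h"

#include <HardwareSerial.h>

void lookupExample() {
  // Expects dict-data.ifo / .idx / .dict.dz (and optionally .syn) on the SD card.
  StarDict dict("/dictionaries/dict-data");
  if (!dict.begin()) return;  // loads the .ifo and the dictzip chunk table

  const StarDict::LookupResult res = dict.lookup("serendipity");  // case-insensitive
  if (!res.found) return;

  // Definitions are typically HTML (sametypesequence == 'h'); strip for plain display.
  Serial.println(StarDict::stripHtml(res.definition).c_str());
}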