Merge branch 'master' into hyphenation-v2

This commit is contained in:
Arthur Tazhitdinov
2026-01-07 20:42:53 +05:00
32 changed files with 2386 additions and 288 deletions

View File

@@ -1,163 +0,0 @@
// from
// https://github.com/atomic14/diy-esp32-epub-reader/blob/2c2f57fdd7e2a788d14a0bcb26b9e845a47aac42/lib/Epub/RubbishHtmlParser/htmlEntities.cpp
#include "htmlEntities.h"
#include <cstring>
#include <unordered_map>
const int MAX_ENTITY_LENGTH = 10;
// Use book: entities_ww2.epub to test this (Page 7: Entities parser test)
// Note the supported keys are only in lowercase
// Store the mappings in a unordered hash map
static std::unordered_map<std::string, std::string> entity_lookup(
{{"&quot;", "\""}, {"&frasl;", ""}, {"&amp;", "&"}, {"&lt;", "<"}, {"&gt;", ">"},
{"&Agrave;", "À"}, {"&Aacute;", "Á"}, {"&Acirc;", "Â"}, {"&Atilde;", "Ã"}, {"&Auml;", "Ä"},
{"&Aring;", "Å"}, {"&AElig;", "Æ"}, {"&Ccedil;", "Ç"}, {"&Egrave;", "È"}, {"&Eacute;", "É"},
{"&Ecirc;", "Ê"}, {"&Euml;", "Ë"}, {"&Igrave;", "Ì"}, {"&Iacute;", "Í"}, {"&Icirc;", "Î"},
{"&Iuml;", "Ï"}, {"&ETH;", "Ð"}, {"&Ntilde;", "Ñ"}, {"&Ograve;", "Ò"}, {"&Oacute;", "Ó"},
{"&Ocirc;", "Ô"}, {"&Otilde;", "Õ"}, {"&Ouml;", "Ö"}, {"&Oslash;", "Ø"}, {"&Ugrave;", "Ù"},
{"&Uacute;", "Ú"}, {"&Ucirc;", "Û"}, {"&Uuml;", "Ü"}, {"&Yacute;", "Ý"}, {"&THORN;", "Þ"},
{"&szlig;", "ß"}, {"&agrave;", "à"}, {"&aacute;", "á"}, {"&acirc;", "â"}, {"&atilde;", "ã"},
{"&auml;", "ä"}, {"&aring;", "å"}, {"&aelig;", "æ"}, {"&ccedil;", "ç"}, {"&egrave;", "è"},
{"&eacute;", "é"}, {"&ecirc;", "ê"}, {"&euml;", "ë"}, {"&igrave;", "ì"}, {"&iacute;", "í"},
{"&icirc;", "î"}, {"&iuml;", "ï"}, {"&eth;", "ð"}, {"&ntilde;", "ñ"}, {"&ograve;", "ò"},
{"&oacute;", "ó"}, {"&ocirc;", "ô"}, {"&otilde;", "õ"}, {"&ouml;", "ö"}, {"&oslash;", "ø"},
{"&ugrave;", "ù"}, {"&uacute;", "ú"}, {"&ucirc;", "û"}, {"&uuml;", "ü"}, {"&yacute;", "ý"},
{"&thorn;", "þ"}, {"&yuml;", "ÿ"}, {"&nbsp;", " "}, {"&iexcl;", "¡"}, {"&cent;", "¢"},
{"&pound;", "£"}, {"&curren;", "¤"}, {"&yen;", "¥"}, {"&brvbar;", "¦"}, {"&sect;", "§"},
{"&uml;", "¨"}, {"&copy;", "©"}, {"&ordf;", "ª"}, {"&laquo;", "«"}, {"&not;", "¬"},
{"&shy;", "­"}, {"&reg;", "®"}, {"&macr;", "¯"}, {"&deg;", "°"}, {"&plusmn;", "±"},
{"&sup2;", "²"}, {"&sup3;", "³"}, {"&acute;", "´"}, {"&micro;", "µ"}, {"&para;", ""},
{"&cedil;", "¸"}, {"&sup1;", "¹"}, {"&ordm;", "º"}, {"&raquo;", "»"}, {"&frac14;", "¼"},
{"&frac12;", "½"}, {"&frac34;", "¾"}, {"&iquest;", "¿"}, {"&times;", "×"}, {"&divide;", "÷"},
{"&forall;", ""}, {"&part;", ""}, {"&exist;", ""}, {"&empty;", ""}, {"&nabla;", ""},
{"&isin;", ""}, {"&notin;", ""}, {"&ni;", ""}, {"&prod;", ""}, {"&sum;", ""},
{"&minus;", ""}, {"&lowast;", ""}, {"&radic;", ""}, {"&prop;", ""}, {"&infin;", ""},
{"&ang;", ""}, {"&and;", ""}, {"&or;", ""}, {"&cap;", ""}, {"&cup;", ""},
{"&int;", ""}, {"&there4;", ""}, {"&sim;", ""}, {"&cong;", ""}, {"&asymp;", ""},
{"&ne;", ""}, {"&equiv;", ""}, {"&le;", ""}, {"&ge;", ""}, {"&sub;", ""},
{"&sup;", ""}, {"&nsub;", ""}, {"&sube;", ""}, {"&supe;", ""}, {"&oplus;", ""},
{"&otimes;", ""}, {"&perp;", ""}, {"&sdot;", ""}, {"&Alpha;", "Α"}, {"&Beta;", "Β"},
{"&Gamma;", "Γ"}, {"&Delta;", "Δ"}, {"&Epsilon;", "Ε"}, {"&Zeta;", "Ζ"}, {"&Eta;", "Η"},
{"&Theta;", "Θ"}, {"&Iota;", "Ι"}, {"&Kappa;", "Κ"}, {"&Lambda;", "Λ"}, {"&Mu;", "Μ"},
{"&Nu;", "Ν"}, {"&Xi;", "Ξ"}, {"&Omicron;", "Ο"}, {"&Pi;", "Π"}, {"&Rho;", "Ρ"},
{"&Sigma;", "Σ"}, {"&Tau;", "Τ"}, {"&Upsilon;", "Υ"}, {"&Phi;", "Φ"}, {"&Chi;", "Χ"},
{"&Psi;", "Ψ"}, {"&Omega;", "Ω"}, {"&alpha;", "α"}, {"&beta;", "β"}, {"&gamma;", "γ"},
{"&delta;", "δ"}, {"&epsilon;", "ε"}, {"&zeta;", "ζ"}, {"&eta;", "η"}, {"&theta;", "θ"},
{"&iota;", "ι"}, {"&kappa;", "κ"}, {"&lambda;", "λ"}, {"&mu;", "μ"}, {"&nu;", "ν"},
{"&xi;", "ξ"}, {"&omicron;", "ο"}, {"&pi;", "π"}, {"&rho;", "ρ"}, {"&sigmaf;", "ς"},
{"&sigma;", "σ"}, {"&tau;", "τ"}, {"&upsilon;", "υ"}, {"&phi;", "φ"}, {"&chi;", "χ"},
{"&psi;", "ψ"}, {"&omega;", "ω"}, {"&thetasym;", "ϑ"}, {"&upsih;", "ϒ"}, {"&piv;", "ϖ"},
{"&OElig;", "Œ"}, {"&oelig;", "œ"}, {"&Scaron;", "Š"}, {"&scaron;", "š"}, {"&Yuml;", "Ÿ"},
{"&fnof;", "ƒ"}, {"&circ;", "ˆ"}, {"&tilde;", "˜"}, {"&ensp;", ""}, {"&emsp;", ""},
{"&thinsp;", ""}, {"&zwnj;", ""}, {"&zwj;", ""}, {"&lrm;", ""}, {"&rlm;", ""},
{"&ndash;", ""}, {"&mdash;", ""}, {"&lsquo;", ""}, {"&rsquo;", ""}, {"&sbquo;", ""},
{"&ldquo;", ""}, {"&rdquo;", ""}, {"&bdquo;", ""}, {"&dagger;", ""}, {"&Dagger;", ""},
{"&bull;", ""}, {"&hellip;", ""}, {"&permil;", ""}, {"&prime;", ""}, {"&Prime;", ""},
{"&lsaquo;", ""}, {"&rsaquo;", ""}, {"&oline;", ""}, {"&euro;", ""}, {"&trade;", ""},
{"&larr;", ""}, {"&uarr;", ""}, {"&rarr;", ""}, {"&darr;", ""}, {"&harr;", ""},
{"&crarr;", ""}, {"&lceil;", ""}, {"&rceil;", ""}, {"&lfloor;", ""}, {"&rfloor;", ""},
{"&loz;", ""}, {"&spades;", ""}, {"&clubs;", ""}, {"&hearts;", ""}, {"&diams;", ""}});
// converts from a unicode code point to the utf8 equivalent
void convert_to_utf8(const int code, std::string& res) {
// convert to a utf8 sequence
if (code < 0x80) {
res += static_cast<char>(code);
} else if (code < 0x800) {
res += static_cast<char>(0xc0 | (code >> 6));
res += static_cast<char>(0x80 | (code & 0x3f));
} else if (code < 0x10000) {
res += static_cast<char>(0xe0 | (code >> 12));
res += static_cast<char>(0x80 | ((code >> 6) & 0x3f));
res += static_cast<char>(0x80 | (code & 0x3f));
} else if (code < 0x200000) {
res += static_cast<char>(0xf0 | (code >> 18));
res += static_cast<char>(0x80 | ((code >> 12) & 0x3f));
res += static_cast<char>(0x80 | ((code >> 6) & 0x3f));
res += static_cast<char>(0x80 | (code & 0x3f));
} else if (code < 0x4000000) {
res += static_cast<char>(0xf8 | (code >> 24));
res += static_cast<char>(0x80 | ((code >> 18) & 0x3f));
res += static_cast<char>(0x80 | ((code >> 12) & 0x3f));
res += static_cast<char>(0x80 | ((code >> 6) & 0x3f));
res += static_cast<char>(0x80 | (code & 0x3f));
} else if (code < 0x80000000) {
res += static_cast<char>(0xfc | (code >> 30));
res += static_cast<char>(0x80 | ((code >> 24) & 0x3f));
res += static_cast<char>(0x80 | ((code >> 18) & 0x3f));
res += static_cast<char>(0x80 | ((code >> 12) & 0x3f));
res += static_cast<char>(0x80 | ((code >> 6) & 0x3f));
}
}
// handles numeric entities - e.g. &#1234; or &#x1234;
bool process_numeric_entity(const std::string& entity, std::string& res) {
int code = 0;
// is it hex?
if (entity[2] == 'x' || entity[2] == 'X') {
// parse the hex code
code = strtol(entity.substr(3, entity.size() - 3).c_str(), nullptr, 16);
} else {
code = strtol(entity.substr(2, entity.size() - 3).c_str(), nullptr, 10);
}
if (code != 0) {
// special handling for nbsp
if (code == 0xA0) {
res += " ";
} else {
convert_to_utf8(code, res);
}
return true;
}
return false;
}
// handles named entities - e.g. &amp;
bool process_string_entity(const std::string& entity, std::string& res) {
// it's a named entity - find it in the lookup table
// find it in the map
const auto it = entity_lookup.find(entity);
if (it != entity_lookup.end()) {
res += it->second;
return true;
}
return false;
}
// replace all the entities in the string
std::string replaceHtmlEntities(const char* text) {
std::string res;
res.reserve(strlen(text));
for (int i = 0; i < strlen(text); ++i) {
bool flag = false;
// do we have a potential entity?
if (text[i] == '&') {
// find the end of the entity
int j = i + 1;
while (j < strlen(text) && text[j] != ';' && j - i < MAX_ENTITY_LENGTH) {
j++;
}
if (j - i > 2) {
char entity[j - i + 1];
strncpy(entity, text + i, j - i);
// is it a numeric code?
if (entity[1] == '#') {
flag = process_numeric_entity(entity, res);
} else {
flag = process_string_entity(entity, res);
}
// skip past the entity if we successfully decoded it
if (flag) {
i = j;
}
}
}
if (!flag) {
res += text[i];
}
}
return res;
}

View File

@@ -1,7 +0,0 @@
// from
// https://github.com/atomic14/diy-esp32-epub-reader/blob/2c2f57fdd7e2a788d14a0bcb26b9e845a47aac42/lib/Epub/RubbishHtmlParser/htmlEntities.cpp
#pragma once
#include <string>
std::string replaceHtmlEntities(const char* text);

View File

@@ -6,7 +6,6 @@
#include <expat.h>
#include "../Page.h"
#include "../htmlEntities.h"
const char* HEADER_TAGS[] = {"h1", "h2", "h3", "h4", "h5", "h6"};
constexpr int NUM_HEADER_TAGS = sizeof(HEADER_TAGS) / sizeof(HEADER_TAGS[0]);
@@ -130,7 +129,7 @@ void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char
// Currently looking at whitespace, if there's anything in the partWordBuffer, flush it
if (self->partWordBufferIndex > 0) {
self->partWordBuffer[self->partWordBufferIndex] = '\0';
self->currentTextBlock->addWord(std::move(replaceHtmlEntities(self->partWordBuffer)), fontStyle);
self->currentTextBlock->addWord(self->partWordBuffer, fontStyle);
self->partWordBufferIndex = 0;
}
// Skip the whitespace char
@@ -155,7 +154,7 @@ void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char
// If we're about to run out of space, then cut the word off and start a new one
if (self->partWordBufferIndex >= MAX_WORD_SIZE) {
self->partWordBuffer[self->partWordBufferIndex] = '\0';
self->currentTextBlock->addWord(std::move(replaceHtmlEntities(self->partWordBuffer)), fontStyle);
self->currentTextBlock->addWord(self->partWordBuffer, fontStyle);
self->partWordBufferIndex = 0;
}
@@ -197,7 +196,7 @@ void XMLCALL ChapterHtmlSlimParser::endElement(void* userData, const XML_Char* n
}
self->partWordBuffer[self->partWordBufferIndex] = '\0';
self->currentTextBlock->addWord(std::move(replaceHtmlEntities(self->partWordBuffer)), fontStyle);
self->currentTextBlock->addWord(self->partWordBuffer, fontStyle);
self->partWordBufferIndex = 0;
}
}

View File

@@ -0,0 +1,219 @@
#include "OpdsParser.h"
#include <HardwareSerial.h>
#include <cstring>
OpdsParser::~OpdsParser() {
if (parser) {
XML_StopParser(parser, XML_FALSE);
XML_SetElementHandler(parser, nullptr, nullptr);
XML_SetCharacterDataHandler(parser, nullptr);
XML_ParserFree(parser);
parser = nullptr;
}
}
bool OpdsParser::parse(const char* xmlData, const size_t length) {
clear();
parser = XML_ParserCreate(nullptr);
if (!parser) {
Serial.printf("[%lu] [OPDS] Couldn't allocate memory for parser\n", millis());
return false;
}
XML_SetUserData(parser, this);
XML_SetElementHandler(parser, startElement, endElement);
XML_SetCharacterDataHandler(parser, characterData);
// Parse in chunks to avoid large buffer allocations
const char* currentPos = xmlData;
size_t remaining = length;
constexpr size_t chunkSize = 1024;
while (remaining > 0) {
void* const buf = XML_GetBuffer(parser, chunkSize);
if (!buf) {
Serial.printf("[%lu] [OPDS] Couldn't allocate memory for buffer\n", millis());
XML_ParserFree(parser);
parser = nullptr;
return false;
}
const size_t toRead = remaining < chunkSize ? remaining : chunkSize;
memcpy(buf, currentPos, toRead);
const bool isFinal = (remaining == toRead);
if (XML_ParseBuffer(parser, static_cast<int>(toRead), isFinal) == XML_STATUS_ERROR) {
Serial.printf("[%lu] [OPDS] Parse error at line %lu: %s\n", millis(), XML_GetCurrentLineNumber(parser),
XML_ErrorString(XML_GetErrorCode(parser)));
XML_ParserFree(parser);
parser = nullptr;
return false;
}
currentPos += toRead;
remaining -= toRead;
}
// Clean up parser
XML_ParserFree(parser);
parser = nullptr;
Serial.printf("[%lu] [OPDS] Parsed %zu entries\n", millis(), entries.size());
return true;
}
void OpdsParser::clear() {
entries.clear();
currentEntry = OpdsEntry{};
currentText.clear();
inEntry = false;
inTitle = false;
inAuthor = false;
inAuthorName = false;
inId = false;
}
std::vector<OpdsEntry> OpdsParser::getBooks() const {
std::vector<OpdsEntry> books;
for (const auto& entry : entries) {
if (entry.type == OpdsEntryType::BOOK) {
books.push_back(entry);
}
}
return books;
}
const char* OpdsParser::findAttribute(const XML_Char** atts, const char* name) {
for (int i = 0; atts[i]; i += 2) {
if (strcmp(atts[i], name) == 0) {
return atts[i + 1];
}
}
return nullptr;
}
void XMLCALL OpdsParser::startElement(void* userData, const XML_Char* name, const XML_Char** atts) {
auto* self = static_cast<OpdsParser*>(userData);
// Check for entry element (with or without namespace prefix)
if (strcmp(name, "entry") == 0 || strstr(name, ":entry") != nullptr) {
self->inEntry = true;
self->currentEntry = OpdsEntry{};
return;
}
if (!self->inEntry) return;
// Check for title element
if (strcmp(name, "title") == 0 || strstr(name, ":title") != nullptr) {
self->inTitle = true;
self->currentText.clear();
return;
}
// Check for author element
if (strcmp(name, "author") == 0 || strstr(name, ":author") != nullptr) {
self->inAuthor = true;
return;
}
// Check for author name element
if (self->inAuthor && (strcmp(name, "name") == 0 || strstr(name, ":name") != nullptr)) {
self->inAuthorName = true;
self->currentText.clear();
return;
}
// Check for id element
if (strcmp(name, "id") == 0 || strstr(name, ":id") != nullptr) {
self->inId = true;
self->currentText.clear();
return;
}
// Check for link element
if (strcmp(name, "link") == 0 || strstr(name, ":link") != nullptr) {
const char* rel = findAttribute(atts, "rel");
const char* type = findAttribute(atts, "type");
const char* href = findAttribute(atts, "href");
if (href) {
// Check for acquisition link with epub type (this is a downloadable book)
if (rel && type && strstr(rel, "opds-spec.org/acquisition") != nullptr &&
strcmp(type, "application/epub+zip") == 0) {
self->currentEntry.type = OpdsEntryType::BOOK;
self->currentEntry.href = href;
}
// Check for navigation link (subsection or no rel specified with atom+xml type)
else if (type && strstr(type, "application/atom+xml") != nullptr) {
// Only set navigation link if we don't already have an epub link
if (self->currentEntry.type != OpdsEntryType::BOOK) {
self->currentEntry.type = OpdsEntryType::NAVIGATION;
self->currentEntry.href = href;
}
}
}
}
}
void XMLCALL OpdsParser::endElement(void* userData, const XML_Char* name) {
auto* self = static_cast<OpdsParser*>(userData);
// Check for entry end
if (strcmp(name, "entry") == 0 || strstr(name, ":entry") != nullptr) {
// Only add entry if it has required fields (title and href)
if (!self->currentEntry.title.empty() && !self->currentEntry.href.empty()) {
self->entries.push_back(self->currentEntry);
}
self->inEntry = false;
self->currentEntry = OpdsEntry{};
return;
}
if (!self->inEntry) return;
// Check for title end
if (strcmp(name, "title") == 0 || strstr(name, ":title") != nullptr) {
if (self->inTitle) {
self->currentEntry.title = self->currentText;
}
self->inTitle = false;
return;
}
// Check for author end
if (strcmp(name, "author") == 0 || strstr(name, ":author") != nullptr) {
self->inAuthor = false;
return;
}
// Check for author name end
if (self->inAuthor && (strcmp(name, "name") == 0 || strstr(name, ":name") != nullptr)) {
if (self->inAuthorName) {
self->currentEntry.author = self->currentText;
}
self->inAuthorName = false;
return;
}
// Check for id end
if (strcmp(name, "id") == 0 || strstr(name, ":id") != nullptr) {
if (self->inId) {
self->currentEntry.id = self->currentText;
}
self->inId = false;
return;
}
}
void XMLCALL OpdsParser::characterData(void* userData, const XML_Char* s, const int len) {
auto* self = static_cast<OpdsParser*>(userData);
// Only accumulate text when in a text element
if (self->inTitle || self->inAuthorName || self->inId) {
self->currentText.append(s, len);
}
}

View File

@@ -0,0 +1,99 @@
#pragma once
#include <expat.h>
#include <string>
#include <vector>
/**
* Type of OPDS entry.
*/
enum class OpdsEntryType {
NAVIGATION, // Link to another catalog
BOOK // Downloadable book
};
/**
* Represents an entry from an OPDS feed (either a navigation link or a book).
*/
struct OpdsEntry {
OpdsEntryType type = OpdsEntryType::NAVIGATION;
std::string title;
std::string author; // Only for books
std::string href; // Navigation URL or epub download URL
std::string id;
};
// Legacy alias for backward compatibility
using OpdsBook = OpdsEntry;
/**
* Parser for OPDS (Open Publication Distribution System) Atom feeds.
* Uses the Expat XML parser to parse OPDS catalog entries.
*
* Usage:
* OpdsParser parser;
* if (parser.parse(xmlData, xmlLength)) {
* for (const auto& entry : parser.getEntries()) {
* if (entry.type == OpdsEntryType::BOOK) {
* // Downloadable book
* } else {
* // Navigation link to another catalog
* }
* }
* }
*/
class OpdsParser {
public:
OpdsParser() = default;
~OpdsParser();
// Disable copy
OpdsParser(const OpdsParser&) = delete;
OpdsParser& operator=(const OpdsParser&) = delete;
/**
* Parse an OPDS XML feed.
* @param xmlData Pointer to the XML data
* @param length Length of the XML data
* @return true if parsing succeeded, false on error
*/
bool parse(const char* xmlData, size_t length);
/**
* Get the parsed entries (both navigation and book entries).
* @return Vector of OpdsEntry entries
*/
const std::vector<OpdsEntry>& getEntries() const { return entries; }
/**
* Get only book entries (legacy compatibility).
* @return Vector of book entries
*/
std::vector<OpdsEntry> getBooks() const;
/**
* Clear all parsed entries.
*/
void clear();
private:
// Expat callbacks
static void XMLCALL startElement(void* userData, const XML_Char* name, const XML_Char** atts);
static void XMLCALL endElement(void* userData, const XML_Char* name);
static void XMLCALL characterData(void* userData, const XML_Char* s, int len);
// Helper to find attribute value
static const char* findAttribute(const XML_Char** atts, const char* name);
XML_Parser parser = nullptr;
std::vector<OpdsEntry> entries;
OpdsEntry currentEntry;
std::string currentText;
// Parser state
bool inEntry = false;
bool inTitle = false;
bool inAuthor = false;
bool inAuthorName = false;
bool inId = false;
};