feat: dict based Hyphenation (#305)

## Summary

* Adds (optional) Hyphenation for English, French, German, Russian
languages

## Additional Context

* Included hyphenation dictionaries add approximately 280kb to the flash
usage (German alone takes 200kb)
* Trie encoded dictionaries are adopted from hypher project
(https://github.com/typst/hypher)
* Soft hyphens (and other explicit hyphens) take precedence over
dict-based hyphenation. Overall, the hyphenation rules are quite
aggressive, as I believe it makes more sense on our smaller screen.

---------

Co-authored-by: Dave Allie <dave@daveallie.com>
This commit is contained in:
Arthur Tazhitdinov
2026-01-19 17:56:26 +05:00
committed by GitHub
parent 5fef99c641
commit 8824c87490
40 changed files with 36465 additions and 52 deletions

View File

@@ -51,7 +51,7 @@ void ChapterHtmlSlimParser::startNewTextBlock(const TextBlock::Style style) {
makePages();
}
currentTextBlock.reset(new ParsedText(style, extraParagraphSpacing));
currentTextBlock.reset(new ParsedText(style, extraParagraphSpacing, hyphenationEnabled));
}
void XMLCALL ChapterHtmlSlimParser::startElement(void* userData, const XML_Char* name, const XML_Char** atts) {
@@ -170,21 +170,6 @@ void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char
continue;
}
// Skip soft-hyphen with UTF-8 representation (U+00AD) = 0xC2 0xAD
const XML_Char SHY_BYTE_1 = static_cast<XML_Char>(0xC2);
const XML_Char SHY_BYTE_2 = static_cast<XML_Char>(0xAD);
// 1. Check for the start of the 2-byte Soft Hyphen sequence
if (s[i] == SHY_BYTE_1) {
// 2. Check if the next byte exists AND if it completes the sequence
// We must check i + 1 < len to prevent reading past the end of the buffer.
if ((i + 1 < len) && (s[i + 1] == SHY_BYTE_2)) {
// Sequence 0xC2 0xAD found!
// Skip the current byte (0xC2) and the next byte (0xAD)
i++; // Increment 'i' one more time to skip the 0xAD byte
continue; // Skip the rest of the loop and move to the next iteration
}
}
// Skip Zero Width No-Break Space / BOM (U+FEFF) = 0xEF 0xBB 0xBF
const XML_Char FEFF_BYTE_1 = static_cast<XML_Char>(0xEF);
const XML_Char FEFF_BYTE_2 = static_cast<XML_Char>(0xBB);

View File

@@ -36,6 +36,7 @@ class ChapterHtmlSlimParser {
uint8_t paragraphAlignment;
uint16_t viewportWidth;
uint16_t viewportHeight;
bool hyphenationEnabled;
void startNewTextBlock(TextBlock::Style style);
void makePages();
@@ -48,7 +49,7 @@ class ChapterHtmlSlimParser {
explicit ChapterHtmlSlimParser(const std::string& filepath, GfxRenderer& renderer, const int fontId,
const float lineCompression, const bool extraParagraphSpacing,
const uint8_t paragraphAlignment, const uint16_t viewportWidth,
const uint16_t viewportHeight,
const uint16_t viewportHeight, const bool hyphenationEnabled,
const std::function<void(std::unique_ptr<Page>)>& completePageFn,
const std::function<void(int)>& progressFn = nullptr)
: filepath(filepath),
@@ -59,6 +60,7 @@ class ChapterHtmlSlimParser {
paragraphAlignment(paragraphAlignment),
viewportWidth(viewportWidth),
viewportHeight(viewportHeight),
hyphenationEnabled(hyphenationEnabled),
completePageFn(completePageFn),
progressFn(progressFn) {}
~ChapterHtmlSlimParser() = default;

View File

@@ -107,6 +107,11 @@ void XMLCALL ContentOpfParser::startElement(void* userData, const XML_Char* name
return;
}
if (self->state == IN_METADATA && strcmp(name, "dc:language") == 0) {
self->state = IN_BOOK_LANGUAGE;
return;
}
if (self->state == IN_PACKAGE && (strcmp(name, "manifest") == 0 || strcmp(name, "opf:manifest") == 0)) {
self->state = IN_MANIFEST;
if (!SdMan.openFileForWrite("COF", self->cachePath + itemCacheFile, self->tempItemStore)) {
@@ -266,6 +271,11 @@ void XMLCALL ContentOpfParser::characterData(void* userData, const XML_Char* s,
self->author.append(s, len);
return;
}
if (self->state == IN_BOOK_LANGUAGE) {
self->language.append(s, len);
return;
}
}
void XMLCALL ContentOpfParser::endElement(void* userData, const XML_Char* name) {
@@ -300,6 +310,11 @@ void XMLCALL ContentOpfParser::endElement(void* userData, const XML_Char* name)
return;
}
if (self->state == IN_BOOK_LANGUAGE && strcmp(name, "dc:language") == 0) {
self->state = IN_METADATA;
return;
}
if (self->state == IN_METADATA && (strcmp(name, "metadata") == 0 || strcmp(name, "opf:metadata") == 0)) {
self->state = IN_PACKAGE;
return;

View File

@@ -13,6 +13,7 @@ class ContentOpfParser final : public Print {
IN_METADATA,
IN_BOOK_TITLE,
IN_BOOK_AUTHOR,
IN_BOOK_LANGUAGE,
IN_MANIFEST,
IN_SPINE,
IN_GUIDE,
@@ -34,6 +35,7 @@ class ContentOpfParser final : public Print {
public:
std::string title;
std::string author;
std::string language;
std::string tocNcxPath;
std::string tocNavPath; // EPUB 3 nav document path
std::string coverItemHref;