feat: dict based Hyphenation (#305)
## Summary * Adds (optional) Hyphenation for English, French, German, Russian languages ## Additional Context * Included hyphenation dictionaries add approximately 280kb to the flash usage (German alone takes 200kb) * Trie encoded dictionaries are adopted from hypher project (https://github.com/typst/hypher) * Soft hyphens (and other explicit hyphens) take precedence over dict-based hyphenation. Overall, the hyphenation rules are quite aggressive, as I believe it makes more sense on our smaller screen. --------- Co-authored-by: Dave Allie <dave@daveallie.com>
This commit is contained in:
committed by
GitHub
parent
5fef99c641
commit
8824c87490
@@ -51,7 +51,7 @@ void ChapterHtmlSlimParser::startNewTextBlock(const TextBlock::Style style) {
|
||||
|
||||
makePages();
|
||||
}
|
||||
currentTextBlock.reset(new ParsedText(style, extraParagraphSpacing));
|
||||
currentTextBlock.reset(new ParsedText(style, extraParagraphSpacing, hyphenationEnabled));
|
||||
}
|
||||
|
||||
void XMLCALL ChapterHtmlSlimParser::startElement(void* userData, const XML_Char* name, const XML_Char** atts) {
|
||||
@@ -170,21 +170,6 @@ void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip soft-hyphen with UTF-8 representation (U+00AD) = 0xC2 0xAD
|
||||
const XML_Char SHY_BYTE_1 = static_cast<XML_Char>(0xC2);
|
||||
const XML_Char SHY_BYTE_2 = static_cast<XML_Char>(0xAD);
|
||||
// 1. Check for the start of the 2-byte Soft Hyphen sequence
|
||||
if (s[i] == SHY_BYTE_1) {
|
||||
// 2. Check if the next byte exists AND if it completes the sequence
|
||||
// We must check i + 1 < len to prevent reading past the end of the buffer.
|
||||
if ((i + 1 < len) && (s[i + 1] == SHY_BYTE_2)) {
|
||||
// Sequence 0xC2 0xAD found!
|
||||
// Skip the current byte (0xC2) and the next byte (0xAD)
|
||||
i++; // Increment 'i' one more time to skip the 0xAD byte
|
||||
continue; // Skip the rest of the loop and move to the next iteration
|
||||
}
|
||||
}
|
||||
|
||||
// Skip Zero Width No-Break Space / BOM (U+FEFF) = 0xEF 0xBB 0xBF
|
||||
const XML_Char FEFF_BYTE_1 = static_cast<XML_Char>(0xEF);
|
||||
const XML_Char FEFF_BYTE_2 = static_cast<XML_Char>(0xBB);
|
||||
|
||||
@@ -36,6 +36,7 @@ class ChapterHtmlSlimParser {
|
||||
uint8_t paragraphAlignment;
|
||||
uint16_t viewportWidth;
|
||||
uint16_t viewportHeight;
|
||||
bool hyphenationEnabled;
|
||||
|
||||
void startNewTextBlock(TextBlock::Style style);
|
||||
void makePages();
|
||||
@@ -48,7 +49,7 @@ class ChapterHtmlSlimParser {
|
||||
explicit ChapterHtmlSlimParser(const std::string& filepath, GfxRenderer& renderer, const int fontId,
|
||||
const float lineCompression, const bool extraParagraphSpacing,
|
||||
const uint8_t paragraphAlignment, const uint16_t viewportWidth,
|
||||
const uint16_t viewportHeight,
|
||||
const uint16_t viewportHeight, const bool hyphenationEnabled,
|
||||
const std::function<void(std::unique_ptr<Page>)>& completePageFn,
|
||||
const std::function<void(int)>& progressFn = nullptr)
|
||||
: filepath(filepath),
|
||||
@@ -59,6 +60,7 @@ class ChapterHtmlSlimParser {
|
||||
paragraphAlignment(paragraphAlignment),
|
||||
viewportWidth(viewportWidth),
|
||||
viewportHeight(viewportHeight),
|
||||
hyphenationEnabled(hyphenationEnabled),
|
||||
completePageFn(completePageFn),
|
||||
progressFn(progressFn) {}
|
||||
~ChapterHtmlSlimParser() = default;
|
||||
|
||||
@@ -107,6 +107,11 @@ void XMLCALL ContentOpfParser::startElement(void* userData, const XML_Char* name
|
||||
return;
|
||||
}
|
||||
|
||||
if (self->state == IN_METADATA && strcmp(name, "dc:language") == 0) {
|
||||
self->state = IN_BOOK_LANGUAGE;
|
||||
return;
|
||||
}
|
||||
|
||||
if (self->state == IN_PACKAGE && (strcmp(name, "manifest") == 0 || strcmp(name, "opf:manifest") == 0)) {
|
||||
self->state = IN_MANIFEST;
|
||||
if (!SdMan.openFileForWrite("COF", self->cachePath + itemCacheFile, self->tempItemStore)) {
|
||||
@@ -266,6 +271,11 @@ void XMLCALL ContentOpfParser::characterData(void* userData, const XML_Char* s,
|
||||
self->author.append(s, len);
|
||||
return;
|
||||
}
|
||||
|
||||
if (self->state == IN_BOOK_LANGUAGE) {
|
||||
self->language.append(s, len);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
void XMLCALL ContentOpfParser::endElement(void* userData, const XML_Char* name) {
|
||||
@@ -300,6 +310,11 @@ void XMLCALL ContentOpfParser::endElement(void* userData, const XML_Char* name)
|
||||
return;
|
||||
}
|
||||
|
||||
if (self->state == IN_BOOK_LANGUAGE && strcmp(name, "dc:language") == 0) {
|
||||
self->state = IN_METADATA;
|
||||
return;
|
||||
}
|
||||
|
||||
if (self->state == IN_METADATA && (strcmp(name, "metadata") == 0 || strcmp(name, "opf:metadata") == 0)) {
|
||||
self->state = IN_PACKAGE;
|
||||
return;
|
||||
|
||||
@@ -13,6 +13,7 @@ class ContentOpfParser final : public Print {
|
||||
IN_METADATA,
|
||||
IN_BOOK_TITLE,
|
||||
IN_BOOK_AUTHOR,
|
||||
IN_BOOK_LANGUAGE,
|
||||
IN_MANIFEST,
|
||||
IN_SPINE,
|
||||
IN_GUIDE,
|
||||
@@ -34,6 +35,7 @@ class ContentOpfParser final : public Print {
|
||||
public:
|
||||
std::string title;
|
||||
std::string author;
|
||||
std::string language;
|
||||
std::string tocNcxPath;
|
||||
std::string tocNavPath; // EPUB 3 nav document path
|
||||
std::string coverItemHref;
|
||||
|
||||
Reference in New Issue
Block a user