feat: dict based Hyphenation (#305)

## Summary

* Adds (optional) Hyphenation for English, French, German, Russian
languages

## Additional Context

* Included hyphenation dictionaries add approximately 280kb to the flash
usage (German alone takes 200kb)
* Trie encoded dictionaries are adopted from hypher project
(https://github.com/typst/hypher)
* Soft hyphens (and other explicit hyphens) take precedence over
dict-based hyphenation. Overall, the hyphenation rules are quite
aggressive, as I believe it makes more sense on our smaller screen.

---------

Co-authored-by: Dave Allie <dave@daveallie.com>
This commit is contained in:
Arthur Tazhitdinov
2026-01-19 17:56:26 +05:00
committed by GitHub
parent 5fef99c641
commit 8824c87490
40 changed files with 36465 additions and 52 deletions

View File

@@ -4,12 +4,14 @@
#include <Serialization.h>
#include "Page.h"
#include "hyphenation/Hyphenator.h"
#include "parsers/ChapterHtmlSlimParser.h"
namespace {
constexpr uint8_t SECTION_FILE_VERSION = 9;
constexpr uint8_t SECTION_FILE_VERSION = 10;
constexpr uint32_t HEADER_SIZE = sizeof(uint8_t) + sizeof(int) + sizeof(float) + sizeof(bool) + sizeof(uint8_t) +
sizeof(uint16_t) + sizeof(uint16_t) + sizeof(uint16_t) + sizeof(uint32_t);
sizeof(uint16_t) + sizeof(uint16_t) + sizeof(uint16_t) + sizeof(bool) +
sizeof(uint32_t);
} // namespace
uint32_t Section::onPageComplete(std::unique_ptr<Page> page) {
@@ -31,14 +33,15 @@ uint32_t Section::onPageComplete(std::unique_ptr<Page> page) {
void Section::writeSectionFileHeader(const int fontId, const float lineCompression, const bool extraParagraphSpacing,
const uint8_t paragraphAlignment, const uint16_t viewportWidth,
const uint16_t viewportHeight) {
const uint16_t viewportHeight, const bool hyphenationEnabled) {
if (!file) {
Serial.printf("[%lu] [SCT] File not open for writing header\n", millis());
return;
}
static_assert(HEADER_SIZE == sizeof(SECTION_FILE_VERSION) + sizeof(fontId) + sizeof(lineCompression) +
sizeof(extraParagraphSpacing) + sizeof(paragraphAlignment) + sizeof(viewportWidth) +
sizeof(viewportHeight) + sizeof(pageCount) + sizeof(uint32_t),
sizeof(viewportHeight) + sizeof(pageCount) + sizeof(hyphenationEnabled) +
sizeof(uint32_t),
"Header size mismatch");
serialization::writePod(file, SECTION_FILE_VERSION);
serialization::writePod(file, fontId);
@@ -47,13 +50,14 @@ void Section::writeSectionFileHeader(const int fontId, const float lineCompressi
serialization::writePod(file, paragraphAlignment);
serialization::writePod(file, viewportWidth);
serialization::writePod(file, viewportHeight);
serialization::writePod(file, hyphenationEnabled);
serialization::writePod(file, pageCount); // Placeholder for page count (will be initially 0 when written)
serialization::writePod(file, static_cast<uint32_t>(0)); // Placeholder for LUT offset
}
bool Section::loadSectionFile(const int fontId, const float lineCompression, const bool extraParagraphSpacing,
const uint8_t paragraphAlignment, const uint16_t viewportWidth,
const uint16_t viewportHeight) {
const uint16_t viewportHeight, const bool hyphenationEnabled) {
if (!SdMan.openFileForRead("SCT", filePath, file)) {
return false;
}
@@ -74,16 +78,19 @@ bool Section::loadSectionFile(const int fontId, const float lineCompression, con
float fileLineCompression;
bool fileExtraParagraphSpacing;
uint8_t fileParagraphAlignment;
bool fileHyphenationEnabled;
serialization::readPod(file, fileFontId);
serialization::readPod(file, fileLineCompression);
serialization::readPod(file, fileExtraParagraphSpacing);
serialization::readPod(file, fileParagraphAlignment);
serialization::readPod(file, fileViewportWidth);
serialization::readPod(file, fileViewportHeight);
serialization::readPod(file, fileHyphenationEnabled);
if (fontId != fileFontId || lineCompression != fileLineCompression ||
extraParagraphSpacing != fileExtraParagraphSpacing || paragraphAlignment != fileParagraphAlignment ||
viewportWidth != fileViewportWidth || viewportHeight != fileViewportHeight) {
viewportWidth != fileViewportWidth || viewportHeight != fileViewportHeight ||
hyphenationEnabled != fileHyphenationEnabled) {
file.close();
Serial.printf("[%lu] [SCT] Deserialization failed: Parameters do not match\n", millis());
clearCache();
@@ -115,7 +122,8 @@ bool Section::clearCache() const {
bool Section::createSectionFile(const int fontId, const float lineCompression, const bool extraParagraphSpacing,
const uint8_t paragraphAlignment, const uint16_t viewportWidth,
const uint16_t viewportHeight, const std::function<void()>& progressSetupFn,
const uint16_t viewportHeight, const bool hyphenationEnabled,
const std::function<void()>& progressSetupFn,
const std::function<void(int)>& progressFn) {
constexpr uint32_t MIN_SIZE_FOR_PROGRESS = 50 * 1024; // 50KB
const auto localPath = epub->getSpineItem(spineIndex).href;
@@ -172,14 +180,15 @@ bool Section::createSectionFile(const int fontId, const float lineCompression, c
return false;
}
writeSectionFileHeader(fontId, lineCompression, extraParagraphSpacing, paragraphAlignment, viewportWidth,
viewportHeight);
viewportHeight, hyphenationEnabled);
std::vector<uint32_t> lut = {};
ChapterHtmlSlimParser visitor(
tmpHtmlPath, renderer, fontId, lineCompression, extraParagraphSpacing, paragraphAlignment, viewportWidth,
viewportHeight,
viewportHeight, hyphenationEnabled,
[this, &lut](std::unique_ptr<Page> page) { lut.emplace_back(this->onPageComplete(std::move(page))); },
progressFn);
Hyphenator::setPreferredLanguage(epub->getLanguage());
success = visitor.parseAndBuildPages();
SdMan.remove(tmpHtmlPath.c_str());