feat: dict based Hyphenation (#305)

## Summary

* Adds (optional) Hyphenation for English, French, German, Russian
languages

## Additional Context

* Included hyphenation dictionaries add approximately 280kb to the flash
usage (German alone takes 200kb)
* Trie encoded dictionaries are adopted from hypher project
(https://github.com/typst/hypher)
* Soft hyphens (and other explicit hyphens) take precedence over
dict-based hyphenation. Overall, the hyphenation rules are quite
aggressive, as I believe it makes more sense on our smaller screen.

---------

Co-authored-by: Dave Allie <dave@daveallie.com>
This commit is contained in:
Arthur Tazhitdinov
2026-01-19 17:56:26 +05:00
committed by GitHub
parent 5fef99c641
commit 8824c87490
40 changed files with 36465 additions and 52 deletions

View File

@@ -0,0 +1,179 @@
#include "HyphenationCommon.h"
#include <Utf8.h>
namespace {
// Convert Latin uppercase letters (ASCII plus Latin-1 supplement) to lowercase
uint32_t toLowerLatinImpl(const uint32_t cp) {
if (cp >= 'A' && cp <= 'Z') {
return cp - 'A' + 'a';
}
if ((cp >= 0x00C0 && cp <= 0x00D6) || (cp >= 0x00D8 && cp <= 0x00DE)) {
return cp + 0x20;
}
switch (cp) {
case 0x0152: // Œ
return 0x0153; // œ
case 0x0178: // Ÿ
return 0x00FF; // ÿ
case 0x1E9E: // ẞ
return 0x00DF; // ß
default:
return cp;
}
}
// Convert Cyrillic uppercase letters to lowercase
// Cyrillic uppercase range 0x0410-0x042F maps to lowercase by adding 0x20
// Special case: Cyrillic capital IO (0x0401) maps to lowercase io (0x0451)
uint32_t toLowerCyrillicImpl(const uint32_t cp) {
if (cp >= 0x0410 && cp <= 0x042F) {
return cp + 0x20;
}
if (cp == 0x0401) {
return 0x0451;
}
return cp;
}
} // namespace
uint32_t toLowerLatin(const uint32_t cp) { return toLowerLatinImpl(cp); }
uint32_t toLowerCyrillic(const uint32_t cp) { return toLowerCyrillicImpl(cp); }
bool isLatinLetter(const uint32_t cp) {
if ((cp >= 'A' && cp <= 'Z') || (cp >= 'a' && cp <= 'z')) {
return true;
}
if (((cp >= 0x00C0 && cp <= 0x00D6) || (cp >= 0x00D8 && cp <= 0x00F6) || (cp >= 0x00F8 && cp <= 0x00FF)) &&
cp != 0x00D7 && cp != 0x00F7) {
return true;
}
switch (cp) {
case 0x0152: // Œ
case 0x0153: // œ
case 0x0178: // Ÿ
case 0x1E9E: // ẞ
return true;
default:
return false;
}
}
bool isCyrillicLetter(const uint32_t cp) { return (cp >= 0x0400 && cp <= 0x052F); }
bool isAlphabetic(const uint32_t cp) { return isLatinLetter(cp) || isCyrillicLetter(cp); }
bool isPunctuation(const uint32_t cp) {
switch (cp) {
case '-':
case '.':
case ',':
case '!':
case '?':
case ';':
case ':':
case '"':
case '\'':
case ')':
case '(':
case 0x00AB: // «
case 0x00BB: // »
case 0x2018: //
case 0x2019: //
case 0x201C: // “
case 0x201D: // ”
case 0x00A0: // no-break space
case '{':
case '}':
case '[':
case ']':
case '/':
case 0x203A: //
case 0x2026: // …
return true;
default:
return false;
}
}
bool isAsciiDigit(const uint32_t cp) { return cp >= '0' && cp <= '9'; }
bool isExplicitHyphen(const uint32_t cp) {
switch (cp) {
case '-':
case 0x00AD: // soft hyphen
case 0x058A: // Armenian hyphen
case 0x2010: // hyphen
case 0x2011: // non-breaking hyphen
case 0x2012: // figure dash
case 0x2013: // en dash
case 0x2014: // em dash
case 0x2015: // horizontal bar
case 0x2043: // hyphen bullet
case 0x207B: // superscript minus
case 0x208B: // subscript minus
case 0x2212: // minus sign
case 0x2E17: // double oblique hyphen
case 0x2E3A: // two-em dash
case 0x2E3B: // three-em dash
case 0xFE58: // small em dash
case 0xFE63: // small hyphen-minus
case 0xFF0D: // fullwidth hyphen-minus
return true;
default:
return false;
}
}
bool isSoftHyphen(const uint32_t cp) { return cp == 0x00AD; }
void trimSurroundingPunctuationAndFootnote(std::vector<CodepointInfo>& cps) {
if (cps.empty()) {
return;
}
// Remove trailing footnote references like [12], even if punctuation trails after the closing bracket.
if (cps.size() >= 3) {
int end = static_cast<int>(cps.size()) - 1;
while (end >= 0 && isPunctuation(cps[end].value)) {
--end;
}
int pos = end;
if (pos >= 0 && isAsciiDigit(cps[pos].value)) {
while (pos >= 0 && isAsciiDigit(cps[pos].value)) {
--pos;
}
if (pos >= 0 && cps[pos].value == '[' && end - pos > 1) {
cps.erase(cps.begin() + pos, cps.end());
}
}
}
while (!cps.empty() && isPunctuation(cps.front().value)) {
cps.erase(cps.begin());
}
while (!cps.empty() && isPunctuation(cps.back().value)) {
cps.pop_back();
}
}
std::vector<CodepointInfo> collectCodepoints(const std::string& word) {
std::vector<CodepointInfo> cps;
cps.reserve(word.size());
const unsigned char* base = reinterpret_cast<const unsigned char*>(word.c_str());
const unsigned char* ptr = base;
while (*ptr != 0) {
const unsigned char* current = ptr;
const uint32_t cp = utf8NextCodepoint(&ptr);
cps.push_back({cp, static_cast<size_t>(current - base)});
}
return cps;
}