feat: dict based Hyphenation (#305)
## Summary

* Adds optional dictionary-based hyphenation for English, French, German, and Russian.

## Additional Context

* The included hyphenation dictionaries add approximately 280 KB to flash usage (German alone takes 200 KB).
* The trie-encoded dictionaries are adopted from the hypher project (https://github.com/typst/hypher).
* Soft hyphens (and other explicit hyphens) take precedence over dictionary-based hyphenation. Overall, the hyphenation rules are quite aggressive, as I believe that makes more sense on our smaller screen.

---------

Co-authored-by: Dave Allie <dave@daveallie.com>
This commit is contained in:
committed by
GitHub
parent
5fef99c641
commit
8824c87490
179
lib/Epub/Epub/hyphenation/HyphenationCommon.cpp
Normal file
179
lib/Epub/Epub/hyphenation/HyphenationCommon.cpp
Normal file
@@ -0,0 +1,179 @@
|
||||
#include "HyphenationCommon.h"
|
||||
|
||||
#include <Utf8.h>
|
||||
|
||||
namespace {
|
||||
|
||||
// Lowercases a Latin capital letter; every other code point is returned unchanged.
// Handled capitals: ASCII A-Z, the Latin-1 Supplement capitals U+00C0-U+00DE
// (the multiplication sign U+00D7 is skipped), and the three extras Œ, Ÿ and ẞ.
uint32_t toLowerLatinImpl(const uint32_t cp) {
  // Both ASCII and Latin-1 capitals sit exactly 0x20 below their lowercase forms.
  const bool asciiUpper = cp >= 'A' && cp <= 'Z';
  const bool latin1Upper = cp >= 0x00C0 && cp <= 0x00DE && cp != 0x00D7;
  if (asciiUpper || latin1Upper) {
    return cp + 0x20;
  }

  // Capitals whose lowercase form is not at a fixed offset.
  if (cp == 0x0152) {  // Œ
    return 0x0153;     // œ
  }
  if (cp == 0x0178) {  // Ÿ
    return 0x00FF;     // ÿ
  }
  if (cp == 0x1E9E) {  // ẞ
    return 0x00DF;     // ß
  }
  return cp;
}
|
||||
|
||||
// Convert Cyrillic uppercase letters to lowercase; any other code point is returned unchanged.
//  - U+0410-U+042F (А-Я) map to U+0430-U+044F by adding 0x20.
//  - U+0400-U+040F (Ѐ-Џ, which includes capital IO U+0401) map to U+0450-U+045F by adding 0x50.
// Capitals elsewhere in the Cyrillic blocks (U+0460 and above) are not folded.
uint32_t toLowerCyrillicImpl(const uint32_t cp) {
  if (cp >= 0x0410 && cp <= 0x042F) {
    return cp + 0x20;
  }
  // The sixteen capitals preceding the basic alphabet (Ё, Є, І, Ї, Ў, ...) all share the
  // same +0x50 offset, so handle the whole row rather than IO alone.
  if (cp >= 0x0400 && cp <= 0x040F) {
    return cp + 0x50;
  }
  return cp;
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// Public entry point for Latin lowercasing; forwards to the file-local implementation.
uint32_t toLowerLatin(const uint32_t cp) {
  const uint32_t lowered = toLowerLatinImpl(cp);
  return lowered;
}
|
||||
|
||||
// Public entry point for Cyrillic lowercasing; forwards to the file-local implementation.
uint32_t toLowerCyrillic(const uint32_t cp) {
  const uint32_t lowered = toLowerCyrillicImpl(cp);
  return lowered;
}
|
||||
|
||||
// Reports whether cp is one of the Latin letters this module recognises:
// ASCII A-Z / a-z, the Latin-1 Supplement letters U+00C0-U+00FF (the multiplication
// sign U+00D7 and division sign U+00F7 are excluded), plus Œ, œ, Ÿ and ẞ.
bool isLatinLetter(const uint32_t cp) {
  const bool asciiLetter = (cp >= 'A' && cp <= 'Z') || (cp >= 'a' && cp <= 'z');
  if (asciiLetter) {
    return true;
  }

  const bool latin1Letter = cp >= 0x00C0 && cp <= 0x00FF && cp != 0x00D7 && cp != 0x00F7;
  if (latin1Letter) {
    return true;
  }

  // Letters outside Latin-1 that are still accepted.
  return cp == 0x0152     // Œ
         || cp == 0x0153  // œ
         || cp == 0x0178  // Ÿ
         || cp == 0x1E9E; // ẞ
}
|
||||
|
||||
// True for every code point in U+0400-U+052F. This is a plain range test: no
// per-code-point check is made, so anything inside the range counts as a letter.
bool isCyrillicLetter(const uint32_t cp) {
  if (cp < 0x0400) {
    return false;
  }
  return cp <= 0x052F;
}
|
||||
|
||||
// A code point is alphabetic when either the Latin or the Cyrillic letter test accepts it.
bool isAlphabetic(const uint32_t cp) {
  if (isLatinLetter(cp)) {
    return true;
  }
  return isCyrillicLetter(cp);
}
|
||||
|
||||
// Reports whether cp is treated as punctuation surrounding a word: ASCII sentence
// punctuation, brackets, slash, the common typographic quotation marks, the ellipsis,
// and the no-break space.
// Quotation marks are listed in matching pairs, including the low-9 opening quotes
// (U+201A, U+201E) and both single angle quotes (U+2039, U+203A).
bool isPunctuation(const uint32_t cp) {
  switch (cp) {
    // ASCII sentence punctuation
    case '-':
    case '.':
    case ',':
    case '!':
    case '?':
    case ';':
    case ':':
    case '"':
    case '\'':
    // ASCII brackets and slash
    case '(':
    case ')':
    case '{':
    case '}':
    case '[':
    case ']':
    case '/':
    // Typographic quotation marks
    case 0x00AB:  // «
    case 0x00BB:  // »
    case 0x2018:  // ‘
    case 0x2019:  // ’
    case 0x201A:  // ‚ (single low-9 opening quote)
    case 0x201C:  // “
    case 0x201D:  // ”
    case 0x201E:  // „ (double low-9 opening quote)
    case 0x2039:  // ‹
    case 0x203A:  // ›
    // Miscellaneous
    case 0x00A0:  // no-break space
    case 0x2026:  // …
      return true;
    default:
      return false;
  }
}
|
||||
|
||||
// True only for the ten ASCII digits '0'..'9'.
bool isAsciiDigit(const uint32_t cp) {
  if (cp < '0') {
    return false;
  }
  return cp <= '9';
}
|
||||
|
||||
// Reports whether cp is one of the hyphen, dash or minus characters listed below.
bool isExplicitHyphen(const uint32_t cp) {
  static constexpr uint32_t kExplicitHyphens[] = {
      '-',     // hyphen-minus
      0x00AD,  // soft hyphen
      0x058A,  // Armenian hyphen
      0x2010,  // hyphen
      0x2011,  // non-breaking hyphen
      0x2012,  // figure dash
      0x2013,  // en dash
      0x2014,  // em dash
      0x2015,  // horizontal bar
      0x2043,  // hyphen bullet
      0x207B,  // superscript minus
      0x208B,  // subscript minus
      0x2212,  // minus sign
      0x2E17,  // double oblique hyphen
      0x2E3A,  // two-em dash
      0x2E3B,  // three-em dash
      0xFE58,  // small em dash
      0xFE63,  // small hyphen-minus
      0xFF0D,  // fullwidth hyphen-minus
  };

  for (const uint32_t candidate : kExplicitHyphens) {
    if (candidate == cp) {
      return true;
    }
  }
  return false;
}
|
||||
|
||||
// True only for U+00AD SOFT HYPHEN.
bool isSoftHyphen(const uint32_t cp) {
  constexpr uint32_t kSoftHyphen = 0x00AD;
  return cp == kSoftHyphen;
}
|
||||
|
||||
void trimSurroundingPunctuationAndFootnote(std::vector<CodepointInfo>& cps) {
|
||||
if (cps.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Remove trailing footnote references like [12], even if punctuation trails after the closing bracket.
|
||||
if (cps.size() >= 3) {
|
||||
int end = static_cast<int>(cps.size()) - 1;
|
||||
while (end >= 0 && isPunctuation(cps[end].value)) {
|
||||
--end;
|
||||
}
|
||||
int pos = end;
|
||||
if (pos >= 0 && isAsciiDigit(cps[pos].value)) {
|
||||
while (pos >= 0 && isAsciiDigit(cps[pos].value)) {
|
||||
--pos;
|
||||
}
|
||||
if (pos >= 0 && cps[pos].value == '[' && end - pos > 1) {
|
||||
cps.erase(cps.begin() + pos, cps.end());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
while (!cps.empty() && isPunctuation(cps.front().value)) {
|
||||
cps.erase(cps.begin());
|
||||
}
|
||||
while (!cps.empty() && isPunctuation(cps.back().value)) {
|
||||
cps.pop_back();
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<CodepointInfo> collectCodepoints(const std::string& word) {
|
||||
std::vector<CodepointInfo> cps;
|
||||
cps.reserve(word.size());
|
||||
|
||||
const unsigned char* base = reinterpret_cast<const unsigned char*>(word.c_str());
|
||||
const unsigned char* ptr = base;
|
||||
while (*ptr != 0) {
|
||||
const unsigned char* current = ptr;
|
||||
const uint32_t cp = utf8NextCodepoint(&ptr);
|
||||
cps.push_back({cp, static_cast<size_t>(current - base)});
|
||||
}
|
||||
|
||||
return cps;
|
||||
}
|
||||
Reference in New Issue
Block a user