feat: dict based Hyphenation (#305)

## Summary

* Adds (optional) hyphenation for English, French, German, and Russian.

## Additional Context

* The included hyphenation dictionaries add approximately 280 KB to flash usage (German alone takes 200 KB).
* The trie-encoded dictionaries are adopted from the hypher project (https://github.com/typst/hypher).
* Soft hyphens (and other explicit hyphens) take precedence over dictionary-based hyphenation.

Overall, the hyphenation rules are quite aggressive, as I believe that makes more sense on our smaller screen.

---------

Co-authored-by: Dave Allie <dave@daveallie.com>
2026-01-19 17:56:26 +05:00
parent 5fef99c641
commit 8824c87490
40 changed files with 36465 additions and 52 deletions
--- a/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp
+++ b/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp
@@ -0,0 +1,179 @@
+#include "HyphenationCommon.h"
+
+#include <Utf8.h>
+
+namespace {
+
+// Lowercases a single Latin codepoint. Handles ASCII A-Z, the Latin-1 Supplement capitals
+// (U+00C0..U+00DE except the multiplication sign U+00D7), and the out-of-block capitals Œ, Ÿ and ẞ.
+// Any other codepoint is returned unchanged.
+uint32_t toLowerLatinImpl(const uint32_t cp) {
+  // In both ASCII and Latin-1 the lowercase form sits exactly 0x20 above the capital.
+  const uint32_t caseOffset = 0x20;
+
+  const bool asciiUpper = cp >= 'A' && cp <= 'Z';
+  const bool latin1Upper = cp >= 0x00C0 && cp <= 0x00DE && cp != 0x00D7;
+  if (asciiUpper || latin1Upper) {
+    return cp + caseOffset;
+  }
+
+  // Capitals whose lowercase form is not a fixed offset away.
+  if (cp == 0x0152) {  // Œ
+    return 0x0153;     // œ
+  }
+  if (cp == 0x0178) {  // Ÿ
+    return 0x00FF;     // ÿ
+  }
+  if (cp == 0x1E9E) {  // ẞ
+    return 0x00DF;     // ß
+  }
+
+  return cp;
+}
+
+// Convert Cyrillic uppercase letters to lowercase.
+//  - U+0410..U+042F (А..Я) map to U+0430..U+044F by adding 0x20.
+//  - U+0400..U+040F (Ѐ..Џ, which includes Ё at U+0401) map to U+0450..U+045F by adding 0x50.
+// Every other codepoint is returned unchanged; case pairs above U+045F are not handled here.
+uint32_t toLowerCyrillicImpl(const uint32_t cp) {
+  if (cp >= 0x0410 && cp <= 0x042F) {
+    return cp + 0x20;
+  }
+  // All sixteen capitals in this row share the same offset, so Ё -> ё (0x0401 -> 0x0451) is one
+  // instance of the general rule rather than a special case.
+  if (cp >= 0x0400 && cp <= 0x040F) {
+    return cp + 0x50;
+  }
+  return cp;
+}
+
+}  // namespace
+
+// Public entry point for Latin lowercasing; forwards to the file-local toLowerLatinImpl.
+uint32_t toLowerLatin(const uint32_t cp) {
+  return toLowerLatinImpl(cp);
+}
+
+// Public entry point for Cyrillic lowercasing; forwards to the file-local toLowerCyrillicImpl.
+uint32_t toLowerCyrillic(const uint32_t cp) {
+  return toLowerCyrillicImpl(cp);
+}
+
+// Reports whether cp is a letter this module treats as Latin: ASCII A-Z / a-z, the Latin-1
+// Supplement letters in U+00C0..U+00FF, and the additional letters Œ, œ, Ÿ and ẞ.
+bool isLatinLetter(const uint32_t cp) {
+  const bool asciiLetter = (cp >= 'a' && cp <= 'z') || (cp >= 'A' && cp <= 'Z');
+
+  // U+00C0..U+00FF is all letters apart from the two math signs × (U+00D7) and ÷ (U+00F7).
+  const bool latin1Letter = cp >= 0x00C0 && cp <= 0x00FF && cp != 0x00D7 && cp != 0x00F7;
+
+  const bool extraLetter = cp == 0x0152 ||  // Œ
+                           cp == 0x0153 ||  // œ
+                           cp == 0x0178 ||  // Ÿ
+                           cp == 0x1E9E;    // ẞ
+
+  return asciiLetter || latin1Letter || extraLetter;
+}
+
+// True for any codepoint in U+0400..U+052F. This is a plain range check, so every codepoint in
+// that span is reported as a letter.
+bool isCyrillicLetter(const uint32_t cp) {
+  const uint32_t rangeStart = 0x0400;
+  const uint32_t rangeEnd = 0x052F;
+  return !(cp < rangeStart || cp > rangeEnd);
+}
+
+// A codepoint counts as alphabetic when either script-specific check (Latin or Cyrillic) accepts it.
+bool isAlphabetic(const uint32_t cp) {
+  if (isLatinLetter(cp)) {
+    return true;
+  }
+  return isCyrillicLetter(cp);
+}
+
+// Reports whether cp is one of the punctuation marks (or the no-break space) recognised by this
+// module; trimSurroundingPunctuationAndFootnote uses it to strip such marks from around a word.
+// Quotation marks are listed in opening/closing pairs, including the low-9 opening quotes used in
+// German typography, so that a quoted word is recognised on both sides.
+bool isPunctuation(const uint32_t cp) {
+  switch (cp) {
+    // ASCII punctuation
+    case '-':
+    case '.':
+    case ',':
+    case '!':
+    case '?':
+    case ';':
+    case ':':
+    case '"':
+    case '\'':
+    case ')':
+    case '(':
+    case '{':
+    case '}':
+    case '[':
+    case ']':
+    case '/':
+    // Quotation marks
+    case 0x00AB:  // «
+    case 0x00BB:  // »
+    case 0x2018:  // ‘
+    case 0x2019:  // ’
+    case 0x201A:  // ‚ (single low-9 opening quote)
+    case 0x201C:  // “
+    case 0x201D:  // ”
+    case 0x201E:  // „ (double low-9 opening quote)
+    case 0x2039:  // ‹
+    case 0x203A:  // ›
+    // Miscellaneous
+    case 0x00A0:  // no-break space
+    case 0x2026:  // …
+      return true;
+    default:
+      return false;
+  }
+}
+
+// True only for the ASCII digits '0'..'9'.
+bool isAsciiDigit(const uint32_t cp) {
+  // Unsigned subtraction wraps for anything below '0', so a single comparison covers both bounds.
+  return cp - '0' < 10u;
+}
+
+// Reports whether cp is one of the hyphen, dash or minus characters listed in the table below
+// (the soft hyphen U+00AD included).
+bool isExplicitHyphen(const uint32_t cp) {
+  static const uint32_t hyphenLike[] = {
+      '-',     // hyphen-minus
+      0x00AD,  // soft hyphen
+      0x058A,  // Armenian hyphen
+      0x2010,  // hyphen
+      0x2011,  // non-breaking hyphen
+      0x2012,  // figure dash
+      0x2013,  // en dash
+      0x2014,  // em dash
+      0x2015,  // horizontal bar
+      0x2043,  // hyphen bullet
+      0x207B,  // superscript minus
+      0x208B,  // subscript minus
+      0x2212,  // minus sign
+      0x2E17,  // double oblique hyphen
+      0x2E3A,  // two-em dash
+      0x2E3B,  // three-em dash
+      0xFE58,  // small em dash
+      0xFE63,  // small hyphen-minus
+      0xFF0D,  // fullwidth hyphen-minus
+  };
+
+  for (const uint32_t candidate : hyphenLike) {
+    if (candidate == cp) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// True only for U+00AD SOFT HYPHEN.
+bool isSoftHyphen(const uint32_t cp) {
+  const uint32_t softHyphen = 0x00AD;
+  return softHyphen == cp;
+}
+
+// Strips, in place, what surrounds the core of a word:
+//   1. a trailing footnote reference such as "[3]" or "[12]", even if punctuation trails after
+//      the closing bracket, then
+//   2. any leading and trailing punctuation, as classified by isPunctuation.
+// `cps` may end up empty when the input holds nothing but punctuation and/or a footnote reference.
+void trimSurroundingPunctuationAndFootnote(std::vector<CodepointInfo>& cps) {
+  if (cps.empty()) {
+    return;
+  }
+
+  // The shortest footnote reference is "[d]", hence the minimum length of 3.
+  if (cps.size() >= 3) {
+    // Skip trailing punctuation. This also steps over the closing ']' because isPunctuation
+    // accepts it, so `end` lands on the last digit of a reference.
+    int end = static_cast<int>(cps.size()) - 1;
+    while (end >= 0 && isPunctuation(cps[end].value)) {
+      --end;
+    }
+
+    // Walk back over the digit run that precedes the closing bracket.
+    int pos = end;
+    while (pos >= 0 && isAsciiDigit(cps[pos].value)) {
+      --pos;
+    }
+
+    // `pos < end` means at least one digit was consumed, so single-digit references like "[1]"
+    // are removed as well as multi-digit ones.
+    if (pos < end && pos >= 0 && cps[pos].value == '[') {
+      cps.erase(cps.begin() + pos, cps.end());
+    }
+  }
+
+  // Drop leading punctuation with one range erase rather than shifting the vector per codepoint.
+  auto firstKept = cps.begin();
+  while (firstKept != cps.end() && isPunctuation(firstKept->value)) {
+    ++firstKept;
+  }
+  cps.erase(cps.begin(), firstKept);
+
+  while (!cps.empty() && isPunctuation(cps.back().value)) {
+    cps.pop_back();
+  }
+}
+
+// Decodes `word` into a sequence of CodepointInfo entries, each pairing a codepoint returned by
+// utf8NextCodepoint with the byte offset (from the start of `word`) at which decoding of that
+// codepoint began. Decoding stops at the first NUL byte.
+std::vector<CodepointInfo> collectCodepoints(const std::string& word) {
+  const unsigned char* const start = reinterpret_cast<const unsigned char*>(word.c_str());
+
+  std::vector<CodepointInfo> result;
+  // One entry per byte is an upper bound on the number of codepoints.
+  result.reserve(word.size());
+
+  // NOTE(review): relies on utf8NextCodepoint advancing `cursor` on every call — confirm in Utf8.h.
+  for (const unsigned char* cursor = start; *cursor != 0;) {
+    const size_t byteOffset = static_cast<size_t>(cursor - start);
+    const uint32_t decoded = utf8NextCodepoint(&cursor);
+    result.push_back({decoded, byteOffset});
+  }
+
+  return result;
+}