Files
crosspoint-reader-mod/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp
cottongin 60a3e21c0e mod: Phase 3 — Re-port unmerged upstream PRs
Re-applied upstream PRs not yet merged to upstream/master:

- #1055: Byte-level framebuffer writes (fillPhysicalHSpan*,
  optimized fillRect/drawLine/fillRectDither/fillPolygon)
- #1027: Word-width cache (FNV-1a, 128-entry) and hyphenation
  early exit in ParsedText for 7-9% layout speedup
- #1068: Already present in upstream — URL hyphenation fix
- #1019: Already present in upstream — file extensions in browser
- #1090/#1185/#1217: KOReader sync improvements — binary credential
  store, document hash caching, ChapterXPathIndexer integration
- #1209: OPDS multi-server — OpdsBookBrowserActivity accepts
  OpdsServer, directory picker for downloads, download-complete
  prompt with open/back options
- #857: Dictionary activities already ported in Phase 1/2
- #1003: Placeholder cover already integrated in Phase 2

Also fixed: STR_OFF i18n string, include paths, replaced
Epub::isValidThumbnailBmp with Storage.exists, replaced
StringUtils::checkFileExtension with FsHelpers equivalents.

Made-with: Cursor
2026-03-07 16:15:42 -05:00

391 lines
10 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#include "HyphenationCommon.h"
#include <Utf8.h>
namespace {
// Convert Latin uppercase letters (ASCII plus Latin-1 supplement) to lowercase
uint32_t toLowerLatinImpl(const uint32_t cp) {
if (cp >= 'A' && cp <= 'Z') {
return cp - 'A' + 'a';
}
if ((cp >= 0x00C0 && cp <= 0x00D6) || (cp >= 0x00D8 && cp <= 0x00DE)) {
return cp + 0x20;
}
switch (cp) {
case 0x0152: // Œ
return 0x0153; // œ
case 0x0178: // Ÿ
return 0x00FF; // ÿ
case 0x1E9E: // ẞ
return 0x00DF; // ß
default:
return cp;
}
}
// Convert Cyrillic uppercase letters to lowercase
// Cyrillic uppercase range 0x0410-0x042F maps to lowercase by adding 0x20
// Special case: Cyrillic capital IO (0x0401) maps to lowercase io (0x0451)
uint32_t toLowerCyrillicImpl(const uint32_t cp) {
if (cp >= 0x0410 && cp <= 0x042F) {
return cp + 0x20;
}
if (cp == 0x0401) {
return 0x0451;
}
return cp;
}
} // namespace
uint32_t toLowerLatin(const uint32_t cp) { return toLowerLatinImpl(cp); }
uint32_t toLowerCyrillic(const uint32_t cp) { return toLowerCyrillicImpl(cp); }
bool isLatinLetter(const uint32_t cp) {
if ((cp >= 'A' && cp <= 'Z') || (cp >= 'a' && cp <= 'z')) {
return true;
}
if (((cp >= 0x00C0 && cp <= 0x00D6) || (cp >= 0x00D8 && cp <= 0x00F6) || (cp >= 0x00F8 && cp <= 0x00FF)) &&
cp != 0x00D7 && cp != 0x00F7) {
return true;
}
switch (cp) {
case 0x0152: // Œ
case 0x0153: // œ
case 0x0178: // Ÿ
case 0x1E9E: // ẞ
return true;
default:
return false;
}
}
bool isCyrillicLetter(const uint32_t cp) { return (cp >= 0x0400 && cp <= 0x052F); }
bool isAlphabetic(const uint32_t cp) { return isLatinLetter(cp) || isCyrillicLetter(cp); }
bool isPunctuation(const uint32_t cp) {
switch (cp) {
case '-':
case '.':
case ',':
case '!':
case '?':
case ';':
case ':':
case '"':
case '\'':
case ')':
case '(':
case 0x00AB: // «
case 0x00BB: // »
case 0x2018: //
case 0x2019: //
case 0x201A: //
case 0x201C: // “
case 0x201D: // ”
case 0x201E: // „
case 0x00A0: // no-break space
case '{':
case '}':
case '[':
case ']':
case 0x203A: //
case 0x2026: // …
return true;
default:
return false;
}
}
bool isAsciiDigit(const uint32_t cp) { return cp >= '0' && cp <= '9'; }
bool isExplicitHyphen(const uint32_t cp) {
switch (cp) {
case '/':
case '-':
case 0x00AD: // soft hyphen
case 0x058A: // Armenian hyphen
case 0x2010: // hyphen
case 0x2011: // non-breaking hyphen
case 0x2012: // figure dash
case 0x2013: // en dash
case 0x2014: // em dash
case 0x2015: // horizontal bar
case 0x2043: // hyphen bullet
case 0x207B: // superscript minus
case 0x208B: // subscript minus
case 0x2212: // minus sign
case 0x2E17: // double oblique hyphen
case 0x2E3A: // two-em dash
case 0x2E3B: // three-em dash
case 0xFE58: // small em dash
case 0xFE63: // small hyphen-minus
case 0xFF0D: // fullwidth hyphen-minus
case 0x005F: // Underscore
case 0x2026: // Ellipsis
return true;
default:
return false;
}
}
bool isSoftHyphen(const uint32_t cp) { return cp == 0x00AD; }
void trimSurroundingPunctuationAndFootnote(std::vector<CodepointInfo>& cps) {
if (cps.empty()) {
return;
}
// Remove trailing footnote references like [12], even if punctuation trails after the closing bracket.
if (cps.size() >= 3) {
int end = static_cast<int>(cps.size()) - 1;
while (end >= 0 && isPunctuation(cps[end].value)) {
--end;
}
int pos = end;
if (pos >= 0 && isAsciiDigit(cps[pos].value)) {
while (pos >= 0 && isAsciiDigit(cps[pos].value)) {
--pos;
}
if (pos >= 0 && cps[pos].value == '[' && end - pos > 1) {
cps.erase(cps.begin() + pos, cps.end());
}
}
}
while (!cps.empty() && isPunctuation(cps.front().value)) {
cps.erase(cps.begin());
}
while (!cps.empty() && isPunctuation(cps.back().value)) {
cps.pop_back();
}
}
std::vector<CodepointInfo> collectCodepoints(const std::string& word) {
std::vector<CodepointInfo> cps;
cps.reserve(word.size());
const unsigned char* base = reinterpret_cast<const unsigned char*>(word.c_str());
const unsigned char* ptr = base;
while (*ptr != 0) {
const unsigned char* current = ptr;
const uint32_t cp = utf8NextCodepoint(&ptr);
// If this is a combining diacritic (e.g., U+0301 = acute) and there's
// a previous base character that can be composed into a single
// precomposed Unicode scalar (Latin-1 / Latin-Extended), do that
// composition here. This provides lightweight NFC-like behavior for
// common Western European diacritics (acute, grave, circumflex, tilde,
// diaeresis, cedilla) without pulling in a full Unicode normalization
// library.
if (!cps.empty()) {
uint32_t prev = cps.back().value;
uint32_t composed = 0;
switch (cp) {
case 0x0300: // grave
switch (prev) {
case 0x0041:
composed = 0x00C0;
break; // A -> À
case 0x0061:
composed = 0x00E0;
break; // a -> à
case 0x0045:
composed = 0x00C8;
break; // E -> È
case 0x0065:
composed = 0x00E8;
break; // e -> è
case 0x0049:
composed = 0x00CC;
break; // I -> Ì
case 0x0069:
composed = 0x00EC;
break; // i -> ì
case 0x004F:
composed = 0x00D2;
break; // O -> Ò
case 0x006F:
composed = 0x00F2;
break; // o -> ò
case 0x0055:
composed = 0x00D9;
break; // U -> Ù
case 0x0075:
composed = 0x00F9;
break; // u -> ù
default:
break;
}
break;
case 0x0301: // acute
switch (prev) {
case 0x0041:
composed = 0x00C1;
break; // A -> Á
case 0x0061:
composed = 0x00E1;
break; // a -> á
case 0x0045:
composed = 0x00C9;
break; // E -> É
case 0x0065:
composed = 0x00E9;
break; // e -> é
case 0x0049:
composed = 0x00CD;
break; // I -> Í
case 0x0069:
composed = 0x00ED;
break; // i -> í
case 0x004F:
composed = 0x00D3;
break; // O -> Ó
case 0x006F:
composed = 0x00F3;
break; // o -> ó
case 0x0055:
composed = 0x00DA;
break; // U -> Ú
case 0x0075:
composed = 0x00FA;
break; // u -> ú
case 0x0059:
composed = 0x00DD;
break; // Y -> Ý
case 0x0079:
composed = 0x00FD;
break; // y -> ý
default:
break;
}
break;
case 0x0302: // circumflex
switch (prev) {
case 0x0041:
composed = 0x00C2;
break; // A -> Â
case 0x0061:
composed = 0x00E2;
break; // a -> â
case 0x0045:
composed = 0x00CA;
break; // E -> Ê
case 0x0065:
composed = 0x00EA;
break; // e -> ê
case 0x0049:
composed = 0x00CE;
break; // I -> Î
case 0x0069:
composed = 0x00EE;
break; // i -> î
case 0x004F:
composed = 0x00D4;
break; // O -> Ô
case 0x006F:
composed = 0x00F4;
break; // o -> ô
case 0x0055:
composed = 0x00DB;
break; // U -> Û
case 0x0075:
composed = 0x00FB;
break; // u -> û
default:
break;
}
break;
case 0x0303: // tilde
switch (prev) {
case 0x0041:
composed = 0x00C3;
break; // A -> Ã
case 0x0061:
composed = 0x00E3;
break; // a -> ã
case 0x004E:
composed = 0x00D1;
break; // N -> Ñ
case 0x006E:
composed = 0x00F1;
break; // n -> ñ
default:
break;
}
break;
case 0x0308: // diaeresis/umlaut
switch (prev) {
case 0x0041:
composed = 0x00C4;
break; // A -> Ä
case 0x0061:
composed = 0x00E4;
break; // a -> ä
case 0x0045:
composed = 0x00CB;
break; // E -> Ë
case 0x0065:
composed = 0x00EB;
break; // e -> ë
case 0x0049:
composed = 0x00CF;
break; // I -> Ï
case 0x0069:
composed = 0x00EF;
break; // i -> ï
case 0x004F:
composed = 0x00D6;
break; // O -> Ö
case 0x006F:
composed = 0x00F6;
break; // o -> ö
case 0x0055:
composed = 0x00DC;
break; // U -> Ü
case 0x0075:
composed = 0x00FC;
break; // u -> ü
case 0x0059:
composed = 0x0178;
break; // Y -> Ÿ
case 0x0079:
composed = 0x00FF;
break; // y -> ÿ
default:
break;
}
break;
case 0x0327: // cedilla
switch (prev) {
case 0x0043:
composed = 0x00C7;
break; // C -> Ç
case 0x0063:
composed = 0x00E7;
break; // c -> ç
default:
break;
}
break;
default:
break;
}
if (composed != 0) {
cps.back().value = composed;
continue; // skip pushing the combining mark itself
}
}
cps.push_back({cp, static_cast<size_t>(current - base)});
}
return cps;
}