fix: Port upstream PRs #1038, #1037, #1045, #1019

- #1038 (partial): Add .erase() for consumed words in layoutAndExtractLines
  to fix redundant early flush bug; fix wordContinues flag in hyphenateWordAtIndex
- #1037: Add combining mark handling for hyphenation (NFC-like precomposition)
  and rendering (base glyph tracking in EpdFont, GfxRenderer including CCW)
- #1045: Shorten STR_FORGET_BUTTON labels across all 9 translation files
- #1019: Display file extensions in File Browser via getFileExtension helper
- Pull romanian.yaml from upstream/master (merged PR #987)

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
cottongin
2026-02-20 16:27:59 -05:00
parent 55a1fef01a
commit 406c3aeace
17 changed files with 725 additions and 25 deletions

View File

@@ -100,6 +100,15 @@ void ParsedText::layoutAndExtractLines(const GfxRenderer& renderer, const int fo
for (size_t i = 0; i < lineCount; ++i) {
extractLine(i, pageWidth, spaceWidth, wordWidths, wordContinues, lineBreakIndices, processLine);
}
// Remove consumed words so size() reflects only remaining words.
// lineBreakIndices[k] is used here as the count of words consumed through line k,
// so the last entry is the total number of words handed to processLine above.
if (lineCount > 0) {
  const size_t consumed = lineBreakIndices[lineCount - 1];
  words.erase(words.begin(), words.begin() + consumed);
  wordStyles.erase(wordStyles.begin(), wordStyles.begin() + consumed);
  wordContinues.erase(wordContinues.begin(), wordContinues.begin() + consumed);
  // forceBreakAfter is not guaranteed to be as long as words: hyphenateWordAtIndex
  // checks !forceBreakAfter.empty() before touching it. Clamp the erase count so an
  // empty or shorter vector never yields an iterator past end() (undefined behaviour).
  const size_t forcedConsumed = consumed < forceBreakAfter.size() ? consumed : forceBreakAfter.size();
  forceBreakAfter.erase(forceBreakAfter.begin(), forceBreakAfter.begin() + forcedConsumed);
}
}
std::vector<uint16_t> ParsedText::calculateWordWidths(const GfxRenderer& renderer, const int fontId) {
@@ -392,11 +401,8 @@ bool ParsedText::hyphenateWordAtIndex(const size_t wordIndex, const int availabl
words.insert(words.begin() + wordIndex + 1, remainder);
wordStyles.insert(wordStyles.begin() + wordIndex + 1, style);
// The remainder inherits whatever continuation status the original word had with the word after it.
const bool originalContinuedToNext = wordContinues[wordIndex];
// The original word (now prefix) does NOT continue to remainder (hyphen separates them)
wordContinues[wordIndex] = false;
wordContinues.insert(wordContinues.begin() + wordIndex + 1, originalContinuedToNext);
// Preserve the prefix's attach-to-previous flag; allow a break between prefix and remainder.
wordContinues.insert(wordContinues.begin() + wordIndex + 1, false);
// Forced break belongs to the original whole word; transfer it to the remainder (last part).
if (!forceBreakAfter.empty()) {

View File

@@ -174,6 +174,213 @@ std::vector<CodepointInfo> collectCodepoints(const std::string& word) {
// Lightweight NFC-like precomposition table: (combining mark, base letter) -> single
// precomposed scalar. Covers the common Western European diacritics (grave, acute,
// circumflex, tilde, diaeresis, cedilla) whose composed forms live in Latin-1 /
// Latin Extended-A, without pulling in a full Unicode normalization library.
struct ComposedPair {
  uint32_t mark;      // combining diacritic that follows the base letter
  uint32_t base;      // preceding base letter
  uint32_t composed;  // precomposed replacement for base + mark
};
static const ComposedPair kComposedPairs[] = {
    // U+0300 combining grave
    {0x0300, 0x0041, 0x00C0},  // A -> À
    {0x0300, 0x0061, 0x00E0},  // a -> à
    {0x0300, 0x0045, 0x00C8},  // E -> È
    {0x0300, 0x0065, 0x00E8},  // e -> è
    {0x0300, 0x0049, 0x00CC},  // I -> Ì
    {0x0300, 0x0069, 0x00EC},  // i -> ì
    {0x0300, 0x004F, 0x00D2},  // O -> Ò
    {0x0300, 0x006F, 0x00F2},  // o -> ò
    {0x0300, 0x0055, 0x00D9},  // U -> Ù
    {0x0300, 0x0075, 0x00F9},  // u -> ù
    // U+0301 combining acute
    {0x0301, 0x0041, 0x00C1},  // A -> Á
    {0x0301, 0x0061, 0x00E1},  // a -> á
    {0x0301, 0x0045, 0x00C9},  // E -> É
    {0x0301, 0x0065, 0x00E9},  // e -> é
    {0x0301, 0x0049, 0x00CD},  // I -> Í
    {0x0301, 0x0069, 0x00ED},  // i -> í
    {0x0301, 0x004F, 0x00D3},  // O -> Ó
    {0x0301, 0x006F, 0x00F3},  // o -> ó
    {0x0301, 0x0055, 0x00DA},  // U -> Ú
    {0x0301, 0x0075, 0x00FA},  // u -> ú
    {0x0301, 0x0059, 0x00DD},  // Y -> Ý
    {0x0301, 0x0079, 0x00FD},  // y -> ý
    // U+0302 combining circumflex
    {0x0302, 0x0041, 0x00C2},  // A -> Â
    {0x0302, 0x0061, 0x00E2},  // a -> â
    {0x0302, 0x0045, 0x00CA},  // E -> Ê
    {0x0302, 0x0065, 0x00EA},  // e -> ê
    {0x0302, 0x0049, 0x00CE},  // I -> Î
    {0x0302, 0x0069, 0x00EE},  // i -> î
    {0x0302, 0x004F, 0x00D4},  // O -> Ô
    {0x0302, 0x006F, 0x00F4},  // o -> ô
    {0x0302, 0x0055, 0x00DB},  // U -> Û
    {0x0302, 0x0075, 0x00FB},  // u -> û
    // U+0303 combining tilde
    {0x0303, 0x0041, 0x00C3},  // A -> Ã
    {0x0303, 0x0061, 0x00E3},  // a -> ã
    {0x0303, 0x004E, 0x00D1},  // N -> Ñ
    {0x0303, 0x006E, 0x00F1},  // n -> ñ
    {0x0303, 0x004F, 0x00D5},  // O -> Õ (Portuguese)
    {0x0303, 0x006F, 0x00F5},  // o -> õ (Portuguese)
    // U+0308 combining diaeresis/umlaut
    {0x0308, 0x0041, 0x00C4},  // A -> Ä
    {0x0308, 0x0061, 0x00E4},  // a -> ä
    {0x0308, 0x0045, 0x00CB},  // E -> Ë
    {0x0308, 0x0065, 0x00EB},  // e -> ë
    {0x0308, 0x0049, 0x00CF},  // I -> Ï
    {0x0308, 0x0069, 0x00EF},  // i -> ï
    {0x0308, 0x004F, 0x00D6},  // O -> Ö
    {0x0308, 0x006F, 0x00F6},  // o -> ö
    {0x0308, 0x0055, 0x00DC},  // U -> Ü
    {0x0308, 0x0075, 0x00FC},  // u -> ü
    {0x0308, 0x0059, 0x0178},  // Y -> Ÿ
    {0x0308, 0x0079, 0x00FF},  // y -> ÿ
    // U+0327 combining cedilla
    {0x0327, 0x0043, 0x00C7},  // C -> Ç
    {0x0327, 0x0063, 0x00E7},  // c -> ç
};
while (*ptr != 0) {
  const unsigned char* current = ptr;
  const uint32_t cp = utf8NextCodepoint(&ptr);
  // Every mark in the table lies in the Combining Diacritical Marks block
  // (U+0300..U+036F), so the table scan is skipped for ordinary characters.
  if (!cps.empty() && cp >= 0x0300 && cp <= 0x036F) {
    const uint32_t prev = cps.back().value;
    uint32_t composed = 0;
    for (const ComposedPair& pair : kComposedPairs) {
      if (pair.mark == cp && pair.base == prev) {
        composed = pair.composed;
        break;
      }
    }
    if (composed != 0) {
      // Fold the mark into the previous entry. That entry keeps the byte offset of
      // the base letter, and the combining mark itself is not pushed.
      cps.back().value = composed;
      continue;
    }
  }
  // Unhandled marks and all other codepoints are recorded as-is with their byte offset.
  cps.push_back({cp, static_cast<size_t>(current - base)});
}