fix: Fix hyphenation and rendering of decomposed characters (#1037)

## Summary * This PR fixes decomposed diacritic handling end-to-end: - Hyphenation: normalize common Latin base+combining sequences to precomposed codepoints before Liang pattern matching, so decomposed words hyphenate correctly. - Rendering: correct combining-mark placement logic so non-spacing marks are attached to the preceding base glyph in normal and rotated text rendering paths, with corresponding text-bounds consistency updates. - Hyphenation around non-breaking space variants has been fixed (and extended to also cover U+202F, the narrow no-break space). - Hyphenation of terms that already include hyphens was fixed to also apply Liang patterns (e.g. "US-Satellitensystem" was previously broken *exclusively* at the existing hyphen). ## Additional Context * Before <img width="800" height="480" alt="2" src="https://github.com/user-attachments/assets/b9c515c4-ab75-45cc-8b52-f4d86bce519d" /> * After <img width="480" height="800" alt="fix1" src="https://github.com/user-attachments/assets/4999f6a8-f51c-4c0a-b144-f153f77ddb57" /> <img width="800" height="480" alt="fix2" src="https://github.com/user-attachments/assets/7355126b-80c7-441f-b390-4e0897ee3fb6" /> * Note 1: the hyphenation fix is not a 100% bulletproof implementation. It adds composition of *common* base+combining sequences (e.g. O + U+0308 -> Ö) during codepoint collection. A complete solution would require implementing proper Unicode normalization (at least NFC, possibly NFKC in specific cases) before hyphenation and rendering, instead of hand-mapping a few combining marks. That was beyond the scope of this fix. * Note 2: the render fix should be universal and not limited to the constraints outlined above: it properly x-centers the combining glyph over the preceding base glyph, and it keeps at least 1 px of visual distance in y.
Before: <img width="478" height="167" alt="Image" src="https://github.com/user-attachments/assets/f8db60d5-35b1-4477-96d0-5003b4e4a2a1" /> After: <img width="479" height="180" alt="Image" src="https://github.com/user-attachments/assets/1b48ef97-3a77-475a-8522-23f4aca8e904" /> * This should resolve the issues described in #998 --- ### AI Usage While CrossPoint doesn't have restrictions on AI tools in contributing, please be transparent about their usage as it helps set the right context for reviewers. Did you use AI tools to help write this code? _**PARTIALLY**_
2026-02-22 03:11:07 +01:00
parent 10a2678584
commit 5f5561b684
8 changed files with 459 additions and 29 deletions
--- a/lib/EpdFont/EpdFont.cpp
+++ b/lib/EpdFont/EpdFont.cpp
@@ -17,6 +17,11 @@ void EpdFont::getTextBounds(const char* string, const int startX, const int star

  int cursorX = startX;
  const int cursorY = startY;
+  int lastBaseX = startX;
+  int lastBaseAdvance = 0;
+  int lastBaseTop = 0;
+  bool hasBaseGlyph = false;
+  constexpr int MIN_COMBINING_GAP_PX = 1;
  uint32_t cp;
  while ((cp = utf8NextCodepoint(reinterpret_cast<const uint8_t**>(&string)))) {
    const EpdGlyph* glyph = getGlyph(cp);
@@ -30,11 +35,30 @@ void EpdFont::getTextBounds(const char* string, const int startX, const int star
      continue;
    }

-    *minX = std::min(*minX, cursorX + glyph->left);
-    *maxX = std::max(*maxX, cursorX + glyph->left + glyph->width);
-    *minY = std::min(*minY, cursorY + glyph->top - glyph->height);
-    *maxY = std::max(*maxY, cursorY + glyph->top);
-    cursorX += glyph->advanceX;
+    const bool isCombining = utf8IsCombiningMark(cp);
+    int raiseBy = 0;
+    if (isCombining && hasBaseGlyph) {
+      const int currentGap = glyph->top - glyph->height - lastBaseTop;
+      if (currentGap < MIN_COMBINING_GAP_PX) {
+        raiseBy = MIN_COMBINING_GAP_PX - currentGap;
+      }
+    }
+
+    const int glyphBaseX = (isCombining && hasBaseGlyph) ? (lastBaseX + lastBaseAdvance / 2) : cursorX;
+    const int glyphBaseY = cursorY - raiseBy;
+
+    *minX = std::min(*minX, glyphBaseX + glyph->left);
+    *maxX = std::max(*maxX, glyphBaseX + glyph->left + glyph->width);
+    *minY = std::min(*minY, glyphBaseY + glyph->top - glyph->height);
+    *maxY = std::max(*maxY, glyphBaseY + glyph->top);
+
+    if (!isCombining) {
+      lastBaseX = cursorX;
+      lastBaseAdvance = glyph->advanceX;
+      lastBaseTop = glyph->top;
+      hasBaseGlyph = true;
+      cursorX += glyph->advanceX;
+    }
  }
 }

--- a/lib/Epub/Epub/ParsedText.cpp
+++ b/lib/Epub/Epub/ParsedText.cpp
@@ -378,20 +378,35 @@ bool ParsedText::hyphenateWordAtIndex(const size_t wordIndex, const int availabl
  words.insert(insertWordIt, remainder);
  wordStyles.insert(insertStyleIt, style);

-  // The remainder inherits whatever continuation status the original word had with the word after it.
-  // Find the continues entry for the original word and insert the remainder's entry after it.
+  // Continuation flag handling after splitting a word into prefix + remainder.
+  //
+  // The prefix keeps the original word's continuation flag so that no-break-space groups
+  // stay linked. The remainder always gets continues=false because it starts on the next
+  // line and is not attached to the prefix.
+  //
+  // Example: "200&#xA0;Quadratkilometer" produces tokens:
+  //   [0] "200"               continues=false
+  //   [1] " "                 continues=true
+  //   [2] "Quadratkilometer"  continues=true   <-- the word being split
+  //
+  // After splitting "Quadratkilometer" at "Quadrat-" / "kilometer":
+  //   [0] "200"         continues=false
+  //   [1] " "           continues=true
+  //   [2] "Quadrat-"    continues=true   (KEPT — still attached to the no-break group)
+  //   [3] "kilometer"   continues=false  (NEW — starts fresh on the next line)
+  //
+  // This lets the backtracking loop keep the entire prefix group ("200 Quadrat-") on one
+  // line, while "kilometer" moves to the next line.
  auto continuesIt = wordContinues.begin();
  std::advance(continuesIt, wordIndex);
-  const bool originalContinuedToNext = *continuesIt;
-  // The original word (now prefix) does NOT continue to remainder (hyphen separates them)
-  *continuesIt = false;
+  // *continuesIt is intentionally left unchanged — the prefix keeps its original attachment.
  const auto insertContinuesIt = std::next(continuesIt);
-  wordContinues.insert(insertContinuesIt, originalContinuedToNext);
+  wordContinues.insert(insertContinuesIt, false);

-  // Keep the indexed vector in sync if provided
+  // Keep the indexed vector in sync if provided.
  if (continuesVec) {
-    (*continuesVec)[wordIndex] = false;
-    continuesVec->insert(continuesVec->begin() + wordIndex + 1, originalContinuedToNext);
+    // (*continuesVec)[wordIndex] stays unchanged — prefix keeps its attachment.
+    continuesVec->insert(continuesVec->begin() + wordIndex + 1, false);
  }

  // Update cached widths to reflect the new prefix/remainder pairing.
--- a/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp
+++ b/lib/Epub/Epub/hyphenation/HyphenationCommon.cpp
@@ -174,6 +174,213 @@ std::vector<CodepointInfo> collectCodepoints(const std::string& word) {
  while (*ptr != 0) {
    const unsigned char* current = ptr;
    const uint32_t cp = utf8NextCodepoint(&ptr);
+    // If this is a combining diacritic (e.g., U+0301 = acute) and there's
+    // a previous base character that can be composed into a single
+    // precomposed Unicode scalar (Latin-1 / Latin-Extended), do that
+    // composition here. This provides lightweight NFC-like behavior for
+    // common Western European diacritics (acute, grave, circumflex, tilde,
+    // diaeresis, cedilla) without pulling in a full Unicode normalization
+    // library.
+    if (!cps.empty()) {
+      uint32_t prev = cps.back().value;
+      uint32_t composed = 0;
+      switch (cp) {
+        case 0x0300:  // grave
+          switch (prev) {
+            case 0x0041:
+              composed = 0x00C0;
+              break;  // A -> À
+            case 0x0061:
+              composed = 0x00E0;
+              break;  // a -> à
+            case 0x0045:
+              composed = 0x00C8;
+              break;  // E -> È
+            case 0x0065:
+              composed = 0x00E8;
+              break;  // e -> è
+            case 0x0049:
+              composed = 0x00CC;
+              break;  // I -> Ì
+            case 0x0069:
+              composed = 0x00EC;
+              break;  // i -> ì
+            case 0x004F:
+              composed = 0x00D2;
+              break;  // O -> Ò
+            case 0x006F:
+              composed = 0x00F2;
+              break;  // o -> ò
+            case 0x0055:
+              composed = 0x00D9;
+              break;  // U -> Ù
+            case 0x0075:
+              composed = 0x00F9;
+              break;  // u -> ù
+            default:
+              break;
+          }
+          break;
+        case 0x0301:  // acute
+          switch (prev) {
+            case 0x0041:
+              composed = 0x00C1;
+              break;  // A -> Á
+            case 0x0061:
+              composed = 0x00E1;
+              break;  // a -> á
+            case 0x0045:
+              composed = 0x00C9;
+              break;  // E -> É
+            case 0x0065:
+              composed = 0x00E9;
+              break;  // e -> é
+            case 0x0049:
+              composed = 0x00CD;
+              break;  // I -> Í
+            case 0x0069:
+              composed = 0x00ED;
+              break;  // i -> í
+            case 0x004F:
+              composed = 0x00D3;
+              break;  // O -> Ó
+            case 0x006F:
+              composed = 0x00F3;
+              break;  // o -> ó
+            case 0x0055:
+              composed = 0x00DA;
+              break;  // U -> Ú
+            case 0x0075:
+              composed = 0x00FA;
+              break;  // u -> ú
+            case 0x0059:
+              composed = 0x00DD;
+              break;  // Y -> Ý
+            case 0x0079:
+              composed = 0x00FD;
+              break;  // y -> ý
+            default:
+              break;
+          }
+          break;
+        case 0x0302:  // circumflex
+          switch (prev) {
+            case 0x0041:
+              composed = 0x00C2;
+              break;  // A -> Â
+            case 0x0061:
+              composed = 0x00E2;
+              break;  // a -> â
+            case 0x0045:
+              composed = 0x00CA;
+              break;  // E -> Ê
+            case 0x0065:
+              composed = 0x00EA;
+              break;  // e -> ê
+            case 0x0049:
+              composed = 0x00CE;
+              break;  // I -> Î
+            case 0x0069:
+              composed = 0x00EE;
+              break;  // i -> î
+            case 0x004F:
+              composed = 0x00D4;
+              break;  // O -> Ô
+            case 0x006F:
+              composed = 0x00F4;
+              break;  // o -> ô
+            case 0x0055:
+              composed = 0x00DB;
+              break;  // U -> Û
+            case 0x0075:
+              composed = 0x00FB;
+              break;  // u -> û
+            default:
+              break;
+          }
+          break;
+        case 0x0303:  // tilde
+          switch (prev) {
+            case 0x0041:
+              composed = 0x00C3;
+              break;  // A -> Ã
+            case 0x0061:
+              composed = 0x00E3;
+              break;  // a -> ã
+            case 0x004E:
+              composed = 0x00D1;
+              break;  // N -> Ñ
+            case 0x006E:
+              composed = 0x00F1;
+              break;  // n -> ñ
+            default:
+              break;
+          }
+          break;
+        case 0x0308:  // diaeresis/umlaut
+          switch (prev) {
+            case 0x0041:
+              composed = 0x00C4;
+              break;  // A -> Ä
+            case 0x0061:
+              composed = 0x00E4;
+              break;  // a -> ä
+            case 0x0045:
+              composed = 0x00CB;
+              break;  // E -> Ë
+            case 0x0065:
+              composed = 0x00EB;
+              break;  // e -> ë
+            case 0x0049:
+              composed = 0x00CF;
+              break;  // I -> Ï
+            case 0x0069:
+              composed = 0x00EF;
+              break;  // i -> ï
+            case 0x004F:
+              composed = 0x00D6;
+              break;  // O -> Ö
+            case 0x006F:
+              composed = 0x00F6;
+              break;  // o -> ö
+            case 0x0055:
+              composed = 0x00DC;
+              break;  // U -> Ü
+            case 0x0075:
+              composed = 0x00FC;
+              break;  // u -> ü
+            case 0x0059:
+              composed = 0x0178;
+              break;  // Y -> Ÿ
+            case 0x0079:
+              composed = 0x00FF;
+              break;  // y -> ÿ
+            default:
+              break;
+          }
+          break;
+        case 0x0327:  // cedilla
+          switch (prev) {
+            case 0x0043:
+              composed = 0x00C7;
+              break;  // C -> Ç
+            case 0x0063:
+              composed = 0x00E7;
+              break;  // c -> ç
+            default:
+              break;
+          }
+          break;
+        default:
+          break;
+      }
+
+      if (composed != 0) {
+        cps.back().value = composed;
+        continue;  // skip pushing the combining mark itself
+      }
+    }
+
    cps.push_back({cp, static_cast<size_t>(current - base)});
  }

--- a/lib/Epub/Epub/hyphenation/Hyphenator.cpp
+++ b/lib/Epub/Epub/hyphenation/Hyphenator.cpp
@@ -1,8 +1,10 @@
 #include "Hyphenator.h"

+#include <algorithm>
 #include <vector>

 #include "HyphenationCommon.h"
+#include "LanguageHyphenator.h"
 #include "LanguageRegistry.h"

 const LanguageHyphenator* Hyphenator::cachedHyphenator_ = nullptr;
@@ -32,10 +34,19 @@ size_t byteOffsetForIndex(const std::vector<CodepointInfo>& cps, const size_t in
 }

 // Builds a vector of break information from explicit hyphen markers in the given codepoints.
+// Only hyphens that appear between two alphabetic characters are considered valid breaks.
+//
+// Example: "US-Satellitensystems" (cps: U, S, -, S, a, t, ...)
+//   -> finds '-' at index 2 with alphabetic neighbors 'S' and 'S'
+//   -> returns one BreakInfo at the byte offset of 'S' (the char after '-'),
+//      with requiresInsertedHyphen=false because '-' is already visible.
+//
+// Example: "Satel\u00ADliten" (soft-hyphen between 'l' and 'l')
+//   -> returns one BreakInfo with requiresInsertedHyphen=true (soft-hyphen
+//      is invisible and needs a visible '-' when the break is used).
 std::vector<Hyphenator::BreakInfo> buildExplicitBreakInfos(const std::vector<CodepointInfo>& cps) {
  std::vector<Hyphenator::BreakInfo> breaks;

-  // Scan every codepoint looking for explicit/soft hyphen markers that are surrounded by letters.
  for (size_t i = 1; i + 1 < cps.size(); ++i) {
    const uint32_t cp = cps[i].value;
    if (!isExplicitHyphen(cp) || !isAlphabetic(cps[i - 1].value) || !isAlphabetic(cps[i + 1].value)) {
@@ -63,6 +74,43 @@ std::vector<Hyphenator::BreakInfo> Hyphenator::breakOffsets(const std::string& w
  // Explicit hyphen markers (soft or hard) take precedence over language breaks.
  auto explicitBreakInfos = buildExplicitBreakInfos(cps);
  if (!explicitBreakInfos.empty()) {
+    // When a word contains explicit hyphens we also run Liang patterns on each alphabetic
+    // segment between them. Without this, "US-Satellitensystems" would only offer one split
+    // point (after "US-"), making it impossible to break mid-"Satellitensystems" even when
+    // "US-Satelliten-" would fit on the line.
+    //
+    // Example: "US-Satellitensystems"
+    //   Segments: ["US", "Satellitensystems"]
+    //   Explicit break: after "US-"           -> @3  (no inserted hyphen)
+    //   Pattern breaks on "Satellitensystems" -> @5  Sa|tel  (+hyphen)
+    //                                            @8  Satel|li  (+hyphen)
+    //                                            @10 Satelli|ten  (+hyphen)
+    //                                            @13 Satelliten|sys  (+hyphen)
+    //                                            @16 Satellitensys|tems  (+hyphen)
+    //   Result: 6 sorted break points; the line-breaker picks the widest prefix that fits.
+    if (hyphenator) {
+      size_t segStart = 0;
+      for (size_t i = 0; i <= cps.size(); ++i) {
+        const bool atEnd = (i == cps.size());
+        const bool atHyphen = !atEnd && isExplicitHyphen(cps[i].value);
+        if (atEnd || atHyphen) {
+          if (i > segStart) {
+            std::vector<CodepointInfo> segment(cps.begin() + segStart, cps.begin() + i);
+            auto segIndexes = hyphenator->breakIndexes(segment);
+            for (const size_t idx : segIndexes) {
+              const size_t cpIdx = segStart + idx;
+              if (cpIdx < cps.size()) {
+                explicitBreakInfos.push_back({cps[cpIdx].byteOffset, true});
+              }
+            }
+          }
+          segStart = i + 1;
+        }
+      }
+      // Merge explicit and pattern breaks into ascending byte-offset order.
+      std::sort(explicitBreakInfos.begin(), explicitBreakInfos.end(),
+                [](const BreakInfo& a, const BreakInfo& b) { return a.byteOffset < b.byteOffset; });
+    }
    return explicitBreakInfos;
  }

--- a/lib/Epub/Epub/hyphenation/Hyphenator.h
+++ b/lib/Epub/Epub/hyphenation/Hyphenator.h
@@ -9,11 +9,24 @@ class LanguageHyphenator;
 class Hyphenator {
 public:
  struct BreakInfo {
-    size_t byteOffset;
-    bool requiresInsertedHyphen;
+    size_t byteOffset;            // Byte position inside the UTF-8 word where a break may occur.
+    bool requiresInsertedHyphen;  // true = a visible '-' must be rendered at the break (pattern/fallback breaks).
+                                  // false = the word already contains a hyphen at this position (explicit '-').
  };
-  // Returns byte offsets where the word may be hyphenated. When includeFallback is true, all positions obeying the
-  // minimum prefix/suffix constraints are returned even if no language-specific rule matches.
+
+  // Returns byte offsets where the word may be hyphenated.
+  //
+  // Break sources (in priority order):
+  //   1. Explicit hyphens already present in the word (e.g. '-' or soft-hyphen U+00AD).
+  //      When found, language patterns are additionally run on each alphabetic segment
+  //      between hyphens so compound words can break within their parts.
+  //      Example: "US-Satellitensystems" yields breaks after "US-" (no inserted hyphen)
+  //               plus pattern breaks inside "Satellitensystems" (Sa|tel|li|ten|sys|tems).
+  //   2. Language-specific Liang patterns (e.g. German de_patterns).
+  //      Example: "Quadratkilometer" -> Qua|drat|ki|lo|me|ter.
+  //   3. Fallback every-N-chars splitting (only when includeFallback is true AND no
+  //      pattern breaks were found). Used as a last resort to prevent a single oversized
+  //      word from overflowing the page width.
  static std::vector<BreakInfo> breakOffsets(const std::string& word, bool includeFallback);

  // Provide a publication-level language hint (e.g. "en", "en-US", "ru") used to select hyphenation rules.
--- a/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp
+++ b/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.cpp
@@ -594,28 +594,60 @@ void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char
      continue;
    }

-    // Detect U+00A0 (non-breaking space): UTF-8 encoding is 0xC2 0xA0
-    // Render a visible space without allowing a line break around it.
+    // Detect U+00A0 (non-breaking space, UTF-8: 0xC2 0xA0) or
+    //        U+202F (narrow no-break space, UTF-8: 0xE2 0x80 0xAF).
+    //
+    // Both are rendered as a visible space but must never allow a line break around them.
+    // We split the no-break space into its own word token and link the surrounding words
+    // with continuation flags so the layout engine treats them as an indivisible group.
+    //
+    // Example: "200&#xA0;Quadratkilometer" or "200&#x202F;Quadratkilometer"
+    //   Input bytes:  "200\xC2\xA0Quadratkilometer"  (or 0xE2 0x80 0xAF for U+202F)
+    //   Tokens produced:
+    //     [0] "200"               continues=false
+    //     [1] " "                 continues=true   (attaches to "200", no gap)
+    //     [2] "Quadratkilometer"  continues=true   (attaches to " ", no gap)
+    //
+    //   The continuation flags prevent the line-breaker from inserting a line break
+    //   between "200" and "Quadratkilometer". However, "Quadratkilometer" is now a
+    //   standalone word for hyphenation purposes, so Liang patterns can produce
+    //   "200 Quadrat-" / "kilometer" instead of the unusable "200" / "Quadratkilometer".
    if (static_cast<uint8_t>(s[i]) == 0xC2 && i + 1 < len && static_cast<uint8_t>(s[i + 1]) == 0xA0) {
-      // Flush any pending text so style is applied correctly.
      if (self->partWordBufferIndex > 0) {
        self->flushPartWordBuffer();
      }

-      // Add a standalone space that attaches to the previous word.
      self->partWordBuffer[0] = ' ';
      self->partWordBuffer[1] = '\0';
      self->partWordBufferIndex = 1;
      self->nextWordContinues = true;  // Attach space to previous word (no break).
      self->flushPartWordBuffer();

-      // Ensure the next real word attaches to this space (no break).
-      self->nextWordContinues = true;
+      self->nextWordContinues = true;  // Next real word attaches to this space (no break).

      i++;  // Skip the second byte (0xA0)
      continue;
    }

+    // U+202F (narrow no-break space) — identical logic to U+00A0 above.
+    if (static_cast<uint8_t>(s[i]) == 0xE2 && i + 2 < len && static_cast<uint8_t>(s[i + 1]) == 0x80 &&
+        static_cast<uint8_t>(s[i + 2]) == 0xAF) {
+      if (self->partWordBufferIndex > 0) {
+        self->flushPartWordBuffer();
+      }
+
+      self->partWordBuffer[0] = ' ';
+      self->partWordBuffer[1] = '\0';
+      self->partWordBufferIndex = 1;
+      self->nextWordContinues = true;
+      self->flushPartWordBuffer();
+
+      self->nextWordContinues = true;
+
+      i += 2;  // Skip the remaining two bytes (0x80 0xAF)
+      continue;
+    }
+
    // Skip Zero Width No-Break Space / BOM (U+FEFF) = 0xEF 0xBB 0xBF
    const XML_Char FEFF_BYTE_1 = static_cast<XML_Char>(0xEF);
    const XML_Char FEFF_BYTE_2 = static_cast<XML_Char>(0xBB);
--- a/lib/GfxRenderer/GfxRenderer.cpp
+++ b/lib/GfxRenderer/GfxRenderer.cpp
@@ -157,10 +157,12 @@ static void renderCharImpl(const GfxRenderer& renderer, GfxRenderer::RenderMode
    }
  }

-  if constexpr (rotation == TextRotation::Rotated90CW) {
-    *cursorY -= glyph->advanceX;
-  } else {
-    *cursorX += glyph->advanceX;
+  if (!utf8IsCombiningMark(cp)) {
+    if constexpr (rotation == TextRotation::Rotated90CW) {
+      *cursorY -= glyph->advanceX;
+    } else {
+      *cursorX += glyph->advanceX;
+    }
  }
 }

@@ -212,6 +214,11 @@ void GfxRenderer::drawText(const int fontId, const int x, const int y, const cha
                           const EpdFontFamily::Style style) const {
  int yPos = y + getFontAscenderSize(fontId);
  int xpos = x;
+  int lastBaseX = x;
+  int lastBaseY = yPos;
+  int lastBaseAdvance = 0;
+  int lastBaseTop = 0;
+  bool hasBaseGlyph = false;

  // cannot draw a NULL / empty string
  if (text == nullptr || *text == '\0') {
@@ -224,9 +231,43 @@ void GfxRenderer::drawText(const int fontId, const int x, const int y, const cha
    return;
  }
  const auto& font = fontIt->second;
+  constexpr int MIN_COMBINING_GAP_PX = 1;

  uint32_t cp;
  while ((cp = utf8NextCodepoint(reinterpret_cast<const uint8_t**>(&text)))) {
+    if (utf8IsCombiningMark(cp) && hasBaseGlyph) {
+      const EpdGlyph* combiningGlyph = font.getGlyph(cp, style);
+      if (!combiningGlyph) {
+        combiningGlyph = font.getGlyph(REPLACEMENT_GLYPH, style);
+      }
+
+      int raiseBy = 0;
+      if (combiningGlyph) {
+        const int currentGap = combiningGlyph->top - combiningGlyph->height - lastBaseTop;
+        if (currentGap < MIN_COMBINING_GAP_PX) {
+          raiseBy = MIN_COMBINING_GAP_PX - currentGap;
+        }
+      }
+
+      int combiningX = lastBaseX + lastBaseAdvance / 2;
+      int combiningY = lastBaseY - raiseBy;
+      renderChar(font, cp, &combiningX, &combiningY, black, style);
+      continue;
+    }
+
+    const EpdGlyph* glyph = font.getGlyph(cp, style);
+    if (!glyph) {
+      glyph = font.getGlyph(REPLACEMENT_GLYPH, style);
+    }
+
+    if (!utf8IsCombiningMark(cp)) {
+      lastBaseX = xpos;
+      lastBaseY = yPos;
+      lastBaseAdvance = glyph ? glyph->advanceX : 0;
+      lastBaseTop = glyph ? glyph->top : 0;
+      hasBaseGlyph = true;
+    }
+
    renderChar(font, cp, &xpos, &yPos, black, style);
  }
 }
@@ -864,6 +905,9 @@ int GfxRenderer::getTextAdvanceX(const int fontId, const char* text, const EpdFo
  int width = 0;
  const auto& font = fontIt->second;
  while ((cp = utf8NextCodepoint(reinterpret_cast<const uint8_t**>(&text)))) {
+    if (utf8IsCombiningMark(cp)) {
+      continue;
+    }
    const EpdGlyph* glyph = font.getGlyph(cp, style);
    if (!glyph) glyph = font.getGlyph(REPLACEMENT_GLYPH, style);
    if (glyph) width += glyph->advanceX;
@@ -917,9 +961,48 @@ void GfxRenderer::drawTextRotated90CW(const int fontId, const int x, const int y

  int xPos = x;
  int yPos = y;
+  int lastBaseX = x;
+  int lastBaseY = y;
+  int lastBaseAdvance = 0;
+  int lastBaseTop = 0;
+  bool hasBaseGlyph = false;
+  constexpr int MIN_COMBINING_GAP_PX = 1;

  uint32_t cp;
  while ((cp = utf8NextCodepoint(reinterpret_cast<const uint8_t**>(&text)))) {
+    if (utf8IsCombiningMark(cp) && hasBaseGlyph) {
+      const EpdGlyph* combiningGlyph = font.getGlyph(cp, style);
+      if (!combiningGlyph) {
+        combiningGlyph = font.getGlyph(REPLACEMENT_GLYPH, style);
+      }
+
+      int raiseBy = 0;
+      if (combiningGlyph) {
+        const int currentGap = combiningGlyph->top - combiningGlyph->height - lastBaseTop;
+        if (currentGap < MIN_COMBINING_GAP_PX) {
+          raiseBy = MIN_COMBINING_GAP_PX - currentGap;
+        }
+      }
+
+      int combiningX = lastBaseX - raiseBy;
+      int combiningY = lastBaseY - lastBaseAdvance / 2;
+      renderCharImpl<TextRotation::Rotated90CW>(*this, renderMode, font, cp, &combiningX, &combiningY, black, style);
+      continue;
+    }
+
+    const EpdGlyph* glyph = font.getGlyph(cp, style);
+    if (!glyph) {
+      glyph = font.getGlyph(REPLACEMENT_GLYPH, style);
+    }
+
+    if (!utf8IsCombiningMark(cp)) {
+      lastBaseX = xPos;
+      lastBaseY = yPos;
+      lastBaseAdvance = glyph ? glyph->advanceX : 0;
+      lastBaseTop = glyph ? glyph->top : 0;
+      hasBaseGlyph = true;
+    }
+
    renderCharImpl<TextRotation::Rotated90CW>(*this, renderMode, font, cp, &xPos, &yPos, black, style);
  }
 }
--- a/lib/Utf8/Utf8.h
+++ b/lib/Utf8/Utf8.h
@@ -9,3 +9,11 @@ uint32_t utf8NextCodepoint(const unsigned char** string);
 size_t utf8RemoveLastChar(std::string& str);
 // Truncate string by removing N UTF-8 codepoints from the end.
 void utf8TruncateChars(std::string& str, size_t numChars);
+
+// Returns true if cp lies in one of the four combining-mark blocks listed below (such marks must not advance the cursor). Not exhaustive: e.g. U+1AB0-U+1AFF (Combining Diacritical Marks Extended) is not covered.
+inline bool utf8IsCombiningMark(const uint32_t cp) {
+  return (cp >= 0x0300 && cp <= 0x036F)      // Combining Diacritical Marks
+         || (cp >= 0x1DC0 && cp <= 0x1DFF)   // Combining Diacritical Marks Supplement
+         || (cp >= 0x20D0 && cp <= 0x20FF)   // Combining Diacritical Marks for Symbols
+         || (cp >= 0xFE20 && cp <= 0xFE2F);  // Combining Half Marks
+}