crosspoint-reader/lib/ThaiShaper/ThaiClusterBuilder.cpp
2026-01-22 00:07:29 +07:00

322 lines
9.2 KiB
C++

#include "ThaiClusterBuilder.h"
#include <Utf8.h>
// Debug logging for Thai rendering investigation
// Set to 1 to enable verbose cluster building logging
#define THAI_CLUSTER_DEBUG_LOGGING 0
#if THAI_CLUSTER_DEBUG_LOGGING
#include <Arduino.h>
#endif
namespace ThaiShaper {
// Returns true when `cp` is one of the Thai consonants whose shape has a tall
// ascender rising above the normal consonant height. buildNextCluster() uses
// this to decide whether marks placed above the base need a horizontal shift.
bool ThaiClusterBuilder::isAscenderConsonant(uint32_t cp) {
  return cp == 0x0E1B ||  // PO PLA (ป)
         cp == 0x0E1D ||  // FO FA (ฝ)
         cp == 0x0E1F ||  // FO FAN (ฟ)
         cp == 0x0E2C;    // LO CHULA (ฬ)
}
// Returns true when `cp` is one of the Thai letters listed below, whose shape
// has a descender extending under the baseline. Such letters may affect where
// a below-vowel should be placed.
bool ThaiClusterBuilder::isDescenderConsonant(uint32_t cp) {
  return cp == 0x0E0E ||  // DO CHADA (ฎ)
         cp == 0x0E0F ||  // TO PATAK (ฏ)
         cp == 0x0E24 ||  // RU (ฤ)
         cp == 0x0E26;    // LU (ฦ)
}
std::vector<ThaiCluster> ThaiClusterBuilder::buildClusters(const char* text) {
std::vector<ThaiCluster> clusters;
if (text == nullptr || *text == '\0') {
return clusters;
}
#if THAI_CLUSTER_DEBUG_LOGGING
Serial.printf("[THAI] buildClusters input bytes: ");
const uint8_t* debugPtr = reinterpret_cast<const uint8_t*>(text);
for (int i = 0; i < 32 && debugPtr[i] != '\0'; i++) {
Serial.printf("%02X ", debugPtr[i]);
}
Serial.printf("\n");
#endif
const uint8_t* ptr = reinterpret_cast<const uint8_t*>(text);
while (*ptr != '\0') {
ThaiCluster cluster = buildNextCluster(&ptr);
if (!cluster.glyphs.empty()) {
clusters.push_back(std::move(cluster));
}
}
#if THAI_CLUSTER_DEBUG_LOGGING
Serial.printf("[THAI] Built %zu clusters\n", clusters.size());
#endif
return clusters;
}
// Consumes one rendering cluster from a UTF-8 string and returns its glyphs.
//
// A non-Thai codepoint, a Thai digit or a Thai symbol forms a single-glyph
// cluster. Otherwise the cluster is assembled from at most one of each of:
// leading vowel, base consonant, above vowel, below vowel, tone mark,
// nikhahit/yamakkan-class sign, follow vowel. Glyphs are emitted in drawing
// order: leading vowel, base, above vowel, below vowel, tone mark, sign,
// follow vowel. Marks carry zeroAdvance = true and ThaiOffset-based offsets.
//
// Guarantees:
//  - If **text is not NUL on entry, *text is advanced by at least one
//    codepoint, so callers looping until NUL always terminate.
//  - Every consumed codepoint appears in the returned cluster; marks that have
//    no base consonant are emitted rather than dropped.
//  - Marks only ever attach to a consonant that precedes them in the input.
//
// @param text  In/out cursor into a NUL-terminated UTF-8 string; advanced past
//              the consumed cluster.
// @return      The cluster; empty only when `text`/`*text` is null or already
//              at the terminator.
ThaiCluster ThaiClusterBuilder::buildNextCluster(const uint8_t** text) {
  ThaiCluster cluster;
  if (text == nullptr || *text == nullptr || **text == '\0') {
    return cluster;
  }
#if THAI_CLUSTER_DEBUG_LOGGING
  // Log raw bytes at current position
  Serial.printf("[THAI] buildNextCluster at ptr=%p, bytes: ", (void*)*text);
  for (int i = 0; i < 6 && (*text)[i] != '\0'; i++) {
    Serial.printf("%02X ", (*text)[i]);
  }
  Serial.printf("\n");
#endif
  // Remembered so we can tell whether anything has been consumed yet.
  const uint8_t* const clusterStart = *text;

  // Appends a glyph with neutral offsets and returns it so the caller can
  // adjust the offsets. The reference is only valid until the next append.
  auto appendGlyph = [&cluster](uint32_t codepoint, bool zeroAdvance) -> PositionedGlyph& {
    PositionedGlyph glyph;
    glyph.codepoint = codepoint;
    glyph.xOffset = 0;
    glyph.yOffset = 0;
    glyph.zeroAdvance = zeroAdvance;
    cluster.glyphs.push_back(glyph);
    return cluster.glyphs.back();
  };

  // Peek at first codepoint to determine cluster type
  const uint8_t* peekPtr = *text;
  const uint32_t firstCp = utf8NextCodepoint(&peekPtr);
#if THAI_CLUSTER_DEBUG_LOGGING
  Serial.printf("[THAI] First codepoint: U+%04X\n", firstCp);
#endif
  // Non-Thai character: return as single-glyph cluster
  if (!isThaiCodepoint(firstCp)) {
    utf8NextCodepoint(text);  // Consume the codepoint
    appendGlyph(firstCp, false);
#if THAI_CLUSTER_DEBUG_LOGGING
    Serial.printf("[THAI] Non-Thai cluster: U+%04X\n", firstCp);
#endif
    return cluster;
  }

  // One slot per component of the cluster; 0 means "not present".
  uint32_t leadingVowel = 0;
  uint32_t baseConsonant = 0;
  uint32_t aboveVowel = 0;
  uint32_t belowVowel = 0;
  uint32_t toneMark = 0;
  uint32_t followVowel = 0;
  uint32_t extraSign = 0;  // NIKHAHIT / YAMAKKAN class sign

  // Parse the cluster: peek at each codepoint and consume it only if it still
  // belongs to the current cluster.
  bool clusterComplete = false;
  while (!clusterComplete && **text != '\0') {
    peekPtr = *text;
    const uint32_t cp = utf8NextCodepoint(&peekPtr);
    if (!isThaiCodepoint(cp)) {
      break;  // Non-Thai ends the cluster
    }
    const bool hasMark = aboveVowel != 0 || belowVowel != 0 || toneMark != 0 || extraSign != 0;
    switch (getThaiCharType(cp)) {
      case ThaiCharType::LEADING_VOWEL:
        // A leading vowel can only open a cluster.
        if (*text != clusterStart) {
          clusterComplete = true;
          break;
        }
        leadingVowel = cp;
        utf8NextCodepoint(text);
        break;
      case ThaiCharType::CONSONANT:
        // A second consonant starts a new cluster. So does a consonant that
        // follows already-collected marks: marks belong to the base before
        // them and must not migrate onto the next consonant.
        if (baseConsonant != 0 || hasMark) {
          clusterComplete = true;
          break;
        }
        baseConsonant = cp;
        utf8NextCodepoint(text);
        break;
      case ThaiCharType::ABOVE_VOWEL:
        if (aboveVowel != 0) {
          // Multiple above vowels - take first, new cluster for next
          clusterComplete = true;
          break;
        }
        aboveVowel = cp;
        utf8NextCodepoint(text);
        break;
      case ThaiCharType::BELOW_VOWEL:
        if (belowVowel != 0) {
          clusterComplete = true;
          break;
        }
        belowVowel = cp;
        utf8NextCodepoint(text);
        break;
      case ThaiCharType::TONE_MARK:
        if (toneMark != 0) {
          clusterComplete = true;
          break;
        }
        toneMark = cp;
        utf8NextCodepoint(text);
        break;
      case ThaiCharType::FOLLOW_VOWEL:
        // A follow vowel is taken and always closes the cluster.
        followVowel = cp;
        utf8NextCodepoint(text);
        clusterComplete = true;
        break;
      case ThaiCharType::NIKHAHIT:
      case ThaiCharType::YAMAKKAN:
        if (extraSign != 0) {
          clusterComplete = true;
          break;
        }
        extraSign = cp;
        utf8NextCodepoint(text);
        break;
      case ThaiCharType::THAI_DIGIT:
      case ThaiCharType::THAI_SYMBOL:
        // Digits and symbols are standalone clusters. Only take one when
        // nothing has been consumed yet; otherwise close the current cluster
        // first so previously consumed codepoints are not discarded.
        if (*text == clusterStart) {
          utf8NextCodepoint(text);
          appendGlyph(cp, false);
          return cluster;
        }
        clusterComplete = true;
        break;
      default:
        // Unclassified Thai-range codepoint - treat as cluster boundary
        clusterComplete = true;
        break;
    }
  }

  // Forward-progress guarantee: if the very first codepoint was rejected above
  // (unclassified Thai-range codepoint), consume it as a standalone glyph so
  // callers that loop until NUL cannot hang on it.
  if (*text == clusterStart) {
    utf8NextCodepoint(text);
    if (*text == clusterStart) {
      // Last resort in case the decoder did not advance; **text is non-NUL
      // here, so stepping one byte stays within the string.
      ++(*text);
    }
    appendGlyph(firstCp, false);
  }

  // Now build positioned glyphs from the collected codepoints.
  // 1. Leading vowel (if any). Thai stores it before the consonant, in visual
  //    order, so it is simply emitted first with its own advance.
  if (leadingVowel != 0) {
    appendGlyph(leadingVowel, false);
  }
  // 2. Base consonant
  if (baseConsonant != 0) {
    appendGlyph(baseConsonant, false);
  }
  // Marks over an ascender consonant are shifted horizontally. Marks without
  // any base are still emitted, positioned as on a non-ascender base.
  const bool hasAscender = baseConsonant != 0 && isAscenderConsonant(baseConsonant);
  // 3. Above vowel (positioned above base, does not advance the cursor)
  if (aboveVowel != 0) {
    PositionedGlyph& aboveGlyph = appendGlyph(aboveVowel, true);
    aboveGlyph.xOffset = hasAscender ? ThaiOffset::ASCENDER_X_SHIFT : 0;
    aboveGlyph.yOffset = ThaiOffset::ABOVE_VOWEL;
  }
  // 4. Below vowel (positioned below base)
  if (belowVowel != 0) {
    PositionedGlyph& belowGlyph = appendGlyph(belowVowel, true);
    belowGlyph.yOffset = ThaiOffset::BELOW_VOWEL;
  }
  // 5. Tone mark: above the above-vowel if present, otherwise just above base
  if (toneMark != 0) {
    PositionedGlyph& toneGlyph = appendGlyph(toneMark, true);
    toneGlyph.xOffset = hasAscender ? ThaiOffset::ASCENDER_X_SHIFT : 0;
    toneGlyph.yOffset = aboveVowel != 0 ? ThaiOffset::TONE_MARK : ThaiOffset::TONE_MARK_ALONE;
  }
  // 6. Nikhahit/yamakkan-class sign: stacked on top of whatever is above
  if (extraSign != 0) {
    PositionedGlyph& signGlyph = appendGlyph(extraSign, true);
    if (toneMark != 0) {
      signGlyph.yOffset = ThaiOffset::TONE_MARK - 2;  // Above tone mark
    } else if (aboveVowel != 0) {
      signGlyph.yOffset = ThaiOffset::TONE_MARK;  // Above above-vowel
    } else {
      signGlyph.yOffset = ThaiOffset::TONE_MARK_ALONE;
    }
  }
  // 7. Follow vowel (displayed after base, advances the cursor)
  if (followVowel != 0) {
    appendGlyph(followVowel, false);
  }
#if THAI_CLUSTER_DEBUG_LOGGING
  Serial.printf("[THAI] Cluster built with %zu glyphs: ", cluster.glyphs.size());
  for (const auto& g : cluster.glyphs) {
    Serial.printf("U+%04X ", g.codepoint);
  }
  Serial.printf("(lead=%04X base=%04X above=%04X below=%04X tone=%04X follow=%04X)\n",
                leadingVowel, baseConsonant, aboveVowel, belowVowel, toneMark, followVowel);
#endif
  return cluster;
}
} // namespace ThaiShaper