crosspoint-reader/lib/ThaiShaper/ThaiCharacter.h

96 lines
3.7 KiB
C
Raw Normal View History

2026-01-22 00:07:29 +07:00
#pragma once
#include <cstdint>
/**
* Thai Character Classification
*
* Thai Unicode block (U+0E00-U+0E7F) contains:
* - Consonants (ก-ฮ): U+0E01-U+0E2E
* - Vowels that can appear in various positions
* - Tone marks that stack above consonants/vowels
* - Thai digits and punctuation
*
* Thai text rendering requires special handling because:
* 1. Leading vowels (เ แ โ ไ ใ) display BEFORE the consonant but
* appear AFTER in Unicode codepoint order
* 2. Above vowels (ั ิ ี ึ ื etc.) must be positioned above consonants
* 3. Below vowels (ุ ู) must be positioned below consonants
* 4. Tone marks must stack above vowels/consonants
*/
namespace ThaiShaper {
// Thai character types for positioning and cluster formation.
// Each enumerator names one category of codepoint; the value is stored in a uint8_t.
enum class ThaiCharType : uint8_t {
NON_THAI, // Not a Thai character
CONSONANT, // Base consonant (ก-ฮ)
LEADING_VOWEL, // Vowels that display before consonant (เ แ โ ไ ใ)
ABOVE_VOWEL, // Vowels/marks above consonant (ั ิ ี ึ ื ็)
BELOW_VOWEL, // Vowels/marks below consonant (ุ ู ฺ)
FOLLOW_VOWEL, // Vowels that follow consonant (ะ า ำ)
TONE_MARK, // Tone marks (่ ้ ๊ ๋)
NIKHAHIT, // Nikhahit (ํ) - special combining mark
// NOTE(review): the glyph shown here (์) is the one isThaiCombining labels
// THANTHAKHAT (U+0E4C), while it labels U+0E4E as YAMAKKAN - confirm which
// codepoint(s) getThaiCharType maps to this enumerator.
YAMAKKAN, // Yamakkan (์) - cancellation mark
THAI_DIGIT, // Thai digits (๐-๙)
THAI_SYMBOL, // Thai punctuation and symbols
};
// Report whether `cp` lies inside the Thai Unicode block (U+0E00..U+0E7F, inclusive).
inline bool isThaiCodepoint(uint32_t cp) {
  constexpr uint32_t kThaiBlockFirst = 0x0E00;
  constexpr uint32_t kThaiBlockLast = 0x0E7F;
  if (cp < kThaiBlockFirst) {
    return false;
  }
  return cp <= kThaiBlockLast;
}
// Get the character type for a Thai codepoint.
// Declaration only; the definition is not visible in this header.
// @param cp Unicode codepoint to classify.
// @return The ThaiCharType category assigned to `cp`.
// NOTE(review): presumably returns NON_THAI for codepoints outside
// U+0E00-U+0E7F - confirm against the implementation.
ThaiCharType getThaiCharType(uint32_t cp);
// Report whether `cp` is a Thai consonant, i.e. a codepoint that can serve as
// the base of a cluster. Consonants occupy U+0E01..U+0E2E, inclusive.
inline bool isThaiConsonant(uint32_t cp) {
  constexpr uint32_t kFirstConsonant = 0x0E01;  // KO KAI (ก)
  constexpr uint32_t kLastConsonant = 0x0E2E;   // HO NOKHUK (ฮ)
  return kFirstConsonant <= cp && cp <= kLastConsonant;
}
// Report whether `cp` is one of the five leading vowels, which are drawn
// before the consonant they belong to (needs reordering).
inline bool isThaiLeadingVowel(uint32_t cp) {
  switch (cp) {
    case 0x0E40:  // SARA E (เ)
    case 0x0E41:  // SARA AE (แ)
    case 0x0E42:  // SARA O (โ)
    case 0x0E43:  // SARA AI MAIMUAN (ใ)
    case 0x0E44:  // SARA AI MAIMALAI (ไ)
      return true;
    default:
      return false;
  }
}
// Report whether `cp` is a vowel or mark that is positioned above its base:
//   U+0E31          MAI HAN-AKAT (ั)
//   U+0E34..U+0E37  SARA I (ิ), SARA II (ี), SARA UE (ึ), SARA UEE (ื)
//   U+0E47          MAITAIKHU (็)
inline bool isThaiAboveVowel(uint32_t cp) {
  if (cp == 0x0E31 || cp == 0x0E47) {
    return true;
  }
  // The four SARA I..SARA UEE codepoints are contiguous.
  return cp >= 0x0E34 && cp <= 0x0E37;
}
// Check if codepoint is a below vowel/mark (positioned below base).
// Matches exactly U+0E38..U+0E3A.
// @param cp Unicode codepoint to test.
// @return true for SARA U, SARA UU or PHINTHU; false for anything else.
inline bool isThaiBelowVowel(uint32_t cp) {
  return cp == 0x0E38 ||  // SARA U (ุ)
         cp == 0x0E39 ||  // SARA UU (ู)
         cp == 0x0E3A;    // PHINTHU (ฺ)
}
// Legacy spelling (lowercase 'b'), kept so existing callers keep compiling.
// Forwards to isThaiBelowVowel; prefer the correctly cased name in new code.
inline bool isThaibelowVowel(uint32_t cp) { return isThaiBelowVowel(cp); }
// Report whether `cp` is one of the four tone marks (positioned above).
// They are contiguous in the block:
//   U+0E48 MAI EK (่), U+0E49 MAI THO (้), U+0E4A MAI TRI (๊), U+0E4B MAI CHATTAWA (๋)
inline bool isThaiToneMark(uint32_t cp) {
  constexpr uint32_t kMaiEk = 0x0E48;
  constexpr uint32_t kMaiChattawa = 0x0E4B;
  return kMaiEk <= cp && cp <= kMaiChattawa;
}
// Report whether `cp` is a Thai digit (U+0E50..U+0E59, inclusive).
inline bool isThaiDigit(uint32_t cp) {
  if (cp < 0x0E50) {
    return false;
  }
  return cp <= 0x0E59;
}
// Report whether `cp` is a combining character, i.e. one that must attach to
// a base rather than stand on its own.
inline bool isThaiCombining(uint32_t cp) {
  // Vowels/marks placed above or below the base, and tone marks.
  if (isThaiAboveVowel(cp) || isThaibelowVowel(cp) || isThaiToneMark(cp)) {
    return true;
  }
  // Remaining marks are contiguous:
  // U+0E4C THANTHAKHAT (์), U+0E4D NIKHAHIT (ํ), U+0E4E YAMAKKAN.
  return cp >= 0x0E4C && cp <= 0x0E4E;
}
// Check if text contains any Thai codepoints (for fast path detection).
// Declaration only; the definition is not visible in this header.
// @param text Input string to scan.
// NOTE(review): presumably NUL-terminated UTF-8, and null-pointer handling
// is not visible here - confirm against the implementation.
// @return true if the text contains at least one Thai codepoint.
bool containsThai(const char* text);
} // namespace ThaiShaper