96 lines
3.7 KiB
C++
96 lines
3.7 KiB
C++
#pragma once
|
||
|
||
#include <cstdint>
|
||
|
||
/**
 * Thai Character Classification
 *
 * Thai Unicode block (U+0E00-U+0E7F) contains:
 * - Consonants (ก-ฮ): U+0E01-U+0E2E
 * - Vowels that can appear in various positions
 * - Tone marks that stack above consonants/vowels
 * - Thai digits and punctuation
 *
 * Thai text rendering requires special handling because:
 * 1. Leading vowels (เ แ โ ไ ใ) display BEFORE (to the left of) the
 *    consonant they belong to. Unicode stores Thai in visual order, so they
 *    also come before that consonant in the text, even though their
 *    codepoint values (U+0E40-U+0E44) are numerically AFTER the consonant
 *    range
 * 2. Above vowels (ิ ี ึ ื etc.) must be positioned above consonants
 * 3. Below vowels (ุ ู) must be positioned below consonants
 * 4. Tone marks must stack above vowels/consonants
 */
|
||
|
||
namespace ThaiShaper {
|
||
|
||
// Thai character types for positioning and cluster formation.
//
// Stored in a single byte (underlying type uint8_t). Enumerators carry no
// explicit values, so they are numbered 0..10 in the order listed; do not
// reorder them if anything depends on the numeric value.
enum class ThaiCharType : uint8_t {
  NON_THAI,       // Not a Thai character
  CONSONANT,      // Base consonant (ก-ฮ)
  LEADING_VOWEL,  // Vowels that display before consonant (เ แ โ ไ ใ)
  ABOVE_VOWEL,    // Vowels above consonant (ั ิ ี ึ ื ็)
  BELOW_VOWEL,    // Vowels below consonant (ุ ู ฺ)
  FOLLOW_VOWEL,   // Vowels that follow consonant (ะ า ำ)
  TONE_MARK,      // Tone marks (่ ้ ๊ ๋)
  NIKHAHIT,       // Nikhahit (ํ) - special combining mark
  // Cancellation mark, shown here with the glyph ์.
  // NOTE(review): isThaiCombining labels ์ (U+0E4C) as THANTHAKHAT and
  // U+0E4E as YAMAKKAN — confirm which codepoints getThaiCharType maps to
  // this enumerator.
  YAMAKKAN,
  THAI_DIGIT,     // Thai digits (๐-๙)
  THAI_SYMBOL,    // Thai punctuation and symbols
};
|
||
|
||
// Check if a codepoint is in the Thai Unicode block
|
||
inline bool isThaiCodepoint(uint32_t cp) {
  // Both ends of the block are inclusive: U+0E00 through U+0E7F.
  if (cp < 0x0E00) {
    return false;
  }
  return cp <= 0x0E7F;
}
|
||
|
||
// Get the character type for a Thai codepoint
|
||
// Declaration only; no definition is visible in this header. Takes the
// codepoint by value and returns one ThaiCharType enumerator.
// NOTE(review): presumably yields NON_THAI for input outside
// U+0E00-U+0E7F — confirm against the definition.
ThaiCharType getThaiCharType(uint32_t cp);
|
||
|
||
// Check if codepoint is a Thai consonant (can be a cluster base)
|
||
inline bool isThaiConsonant(uint32_t cp) {
  // Inclusive span U+0E01 (ก) .. U+0E2E (ฮ).
  const bool atOrAfterFirst = cp >= 0x0E01;
  const bool atOrBeforeLast = cp <= 0x0E2E;
  return atOrAfterFirst && atOrBeforeLast;
}
|
||
|
||
// Check if codepoint is a leading vowel, i.e. one of the five vowel signs
// U+0E40..U+0E44 that are drawn before the consonant.
//
// cp: Unicode codepoint to test.
// Returns true only for the five codepoints listed below.
inline bool isThaiLeadingVowel(uint32_t cp) {
  switch (cp) {
    case 0x0E40:  // SARA E (เ)
    case 0x0E41:  // SARA AE (แ)
    case 0x0E42:  // SARA O (โ)
    case 0x0E43:  // SARA AI MAIMUAN (ใ)
    case 0x0E44:  // SARA AI MAIMALAI (ไ)
      return true;
    default:
      return false;
  }
}
|
||
|
||
// Check if codepoint is an above vowel/mark (positioned above base).
//
// cp: Unicode codepoint to test.
// Returns true only for the six codepoints listed below. The set is not
// contiguous: U+0E32 and U+0E33 are deliberately absent.
inline bool isThaiAboveVowel(uint32_t cp) {
  switch (cp) {
    case 0x0E31:  // MAI HAN-AKAT (ั)
    case 0x0E34:  // SARA I (ิ)
    case 0x0E35:  // SARA II (ี)
    case 0x0E36:  // SARA UE (ึ)
    case 0x0E37:  // SARA UEE (ื)
    case 0x0E47:  // MAITAIKHU (็)
      return true;
    default:
      return false;
  }
}
|
||
|
||
// Check if codepoint is a below vowel (positioned below base).
//
// cp: Unicode codepoint to test.
// Returns true only for U+0E38, U+0E39 and U+0E3A.
//
// This is the spelling that follows the isThai<Xxx> convention used by the
// other predicates in this header.
inline bool isThaiBelowVowel(uint32_t cp) {
  switch (cp) {
    case 0x0E38:  // SARA U (ุ)
    case 0x0E39:  // SARA UU (ู)
    case 0x0E3A:  // PHINTHU (ฺ)
      return true;
    default:
      return false;
  }
}

// Original spelling with a lowercase 'b'. Kept as a forwarding alias so
// existing callers continue to compile; prefer isThaiBelowVowel in new code.
inline bool isThaibelowVowel(uint32_t cp) { return isThaiBelowVowel(cp); }
|
||
|
||
// Check if codepoint is a tone mark (positioned above).
//
// cp: Unicode codepoint to test.
// Returns true only for the four tone marks U+0E48..U+0E4B.
inline bool isThaiToneMark(uint32_t cp) {
  switch (cp) {
    case 0x0E48:  // MAI EK (่)
    case 0x0E49:  // MAI THO (้)
    case 0x0E4A:  // MAI TRI (๊)
    case 0x0E4B:  // MAI CHATTAWA (๋)
      return true;
    default:
      return false;
  }
}
|
||
|
||
// Check if codepoint is a Thai digit
|
||
inline bool isThaiDigit(uint32_t cp) {
  // Inclusive span U+0E50 (๐) .. U+0E59 (๙).
  if (cp < 0x0E50) {
    return false;
  }
  return cp <= 0x0E59;
}
|
||
|
||
// Check if a codepoint is a combining character (needs to attach to base)
|
||
inline bool isThaiCombining(uint32_t cp) {
|
||
return isThaiAboveVowel(cp) || isThaibelowVowel(cp) || isThaiToneMark(cp) || cp == 0x0E4C || // THANTHAKHAT (์)
|
||
cp == 0x0E4D || // NIKHAHIT (ํ)
|
||
cp == 0x0E4E; // YAMAKKAN
|
||
}
|
||
|
||
// Check if text contains any Thai codepoints (for fast path detection)
|
||
// Declaration only; no definition is visible in this header.
// NOTE(review): presumably `text` is a NUL-terminated UTF-8 string and the
// result is true when any decoded codepoint lies in U+0E00-U+0E7F — confirm
// the encoding and the handling of a null pointer against the definition.
bool containsThai(const char* text);
|
||
|
||
} // namespace ThaiShaper
|