96 lines
3.7 KiB
C
96 lines
3.7 KiB
C
|
|
#pragma once
|
|||
|
|
|
|||
|
|
#include <cstdint>
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* Thai Character Classification
|
|||
|
|
*
|
|||
|
|
* Thai Unicode block (U+0E00-U+0E7F) contains:
|
|||
|
|
* - Consonants (ก-ฮ): U+0E01-U+0E2E
|
|||
|
|
* - Vowels that can appear in various positions
|
|||
|
|
* - Tone marks that stack above consonants/vowels
|
|||
|
|
* - Thai digits and punctuation
|
|||
|
|
*
|
|||
|
|
* Thai text rendering requires special handling because:
|
|||
|
|
* 1. Leading vowels (เ แ โ ไ ใ) display BEFORE the consonant but
|
|||
|
|
* appear AFTER in Unicode codepoint order
|
|||
|
|
* 2. Above vowels (ิ ี ึ ื etc.) must be positioned above consonants
|
|||
|
|
* 3. Below vowels (ุ ู) must be positioned below consonants
|
|||
|
|
* 4. Tone marks must stack above vowels/consonants
|
|||
|
|
*/
|
|||
|
|
|
|||
|
|
namespace ThaiShaper {
|
|||
|
|
|
|||
|
|
// Thai character types for positioning and cluster formation
|
|||
|
|
enum class ThaiCharType : uint8_t {
|
|||
|
|
NON_THAI, // Not a Thai character
|
|||
|
|
CONSONANT, // Base consonant (ก-ฮ)
|
|||
|
|
LEADING_VOWEL, // Vowels that display before consonant (เ แ โ ไ ใ)
|
|||
|
|
ABOVE_VOWEL, // Vowels above consonant (ั ิ ี ึ ื ็)
|
|||
|
|
BELOW_VOWEL, // Vowels below consonant (ุ ู ฺ)
|
|||
|
|
FOLLOW_VOWEL, // Vowels that follow consonant (ะ า ำ)
|
|||
|
|
TONE_MARK, // Tone marks (่ ้ ๊ ๋)
|
|||
|
|
NIKHAHIT, // Nikhahit (ํ) - special combining mark
|
|||
|
|
YAMAKKAN, // Yamakkan (์) - cancellation mark
|
|||
|
|
THAI_DIGIT, // Thai digits (๐-๙)
|
|||
|
|
THAI_SYMBOL, // Thai punctuation and symbols
|
|||
|
|
};
|
|||
|
|
|
|||
|
|
// Check if a codepoint is in the Thai Unicode block
|
|||
|
|
inline bool isThaiCodepoint(uint32_t cp) { return cp >= 0x0E00 && cp <= 0x0E7F; }
|
|||
|
|
|
|||
|
|
// Get the character type for a Thai codepoint
|
|||
|
|
ThaiCharType getThaiCharType(uint32_t cp);
|
|||
|
|
|
|||
|
|
// Check if codepoint is a Thai consonant (can be a cluster base)
|
|||
|
|
inline bool isThaiConsonant(uint32_t cp) { return cp >= 0x0E01 && cp <= 0x0E2E; }
|
|||
|
|
|
|||
|
|
// Check if codepoint is a leading vowel (needs reordering)
|
|||
|
|
inline bool isThaiLeadingVowel(uint32_t cp) {
|
|||
|
|
return cp == 0x0E40 || // SARA E (เ)
|
|||
|
|
cp == 0x0E41 || // SARA AE (แ)
|
|||
|
|
cp == 0x0E42 || // SARA O (โ)
|
|||
|
|
cp == 0x0E43 || // SARA AI MAIMUAN (ใ)
|
|||
|
|
cp == 0x0E44; // SARA AI MAIMALAI (ไ)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Check if codepoint is an above vowel/mark (positioned above base)
|
|||
|
|
inline bool isThaiAboveVowel(uint32_t cp) {
|
|||
|
|
return cp == 0x0E31 || // MAI HAN-AKAT (ั)
|
|||
|
|
cp == 0x0E34 || // SARA I (ิ)
|
|||
|
|
cp == 0x0E35 || // SARA II (ี)
|
|||
|
|
cp == 0x0E36 || // SARA UE (ึ)
|
|||
|
|
cp == 0x0E37 || // SARA UEE (ื)
|
|||
|
|
cp == 0x0E47; // MAITAIKHU (็)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Check if codepoint is a below vowel (positioned below base)
|
|||
|
|
inline bool isThaibelowVowel(uint32_t cp) {
|
|||
|
|
return cp == 0x0E38 || // SARA U (ุ)
|
|||
|
|
cp == 0x0E39 || // SARA UU (ู)
|
|||
|
|
cp == 0x0E3A; // PHINTHU (ฺ)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Check if codepoint is a tone mark (positioned above)
|
|||
|
|
inline bool isThaiToneMark(uint32_t cp) {
|
|||
|
|
return cp == 0x0E48 || // MAI EK (่)
|
|||
|
|
cp == 0x0E49 || // MAI THO (้)
|
|||
|
|
cp == 0x0E4A || // MAI TRI (๊)
|
|||
|
|
cp == 0x0E4B; // MAI CHATTAWA (๋)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Check if codepoint is a Thai digit
|
|||
|
|
inline bool isThaiDigit(uint32_t cp) { return cp >= 0x0E50 && cp <= 0x0E59; }
|
|||
|
|
|
|||
|
|
// Check if a codepoint is a combining character (needs to attach to base)
|
|||
|
|
inline bool isThaiCombining(uint32_t cp) {
|
|||
|
|
return isThaiAboveVowel(cp) || isThaibelowVowel(cp) || isThaiToneMark(cp) || cp == 0x0E4C || // THANTHAKHAT (์)
|
|||
|
|
cp == 0x0E4D || // NIKHAHIT (ํ)
|
|||
|
|
cp == 0x0E4E; // YAMAKKAN
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Check if text contains any Thai codepoints (for fast path detection)
|
|||
|
|
bool containsThai(const char* text);
|
|||
|
|
|
|||
|
|
} // namespace ThaiShaper
|