crosspoint-reader/lib/ThaiShaper/ThaiCharacter.h
2026-01-22 00:07:29 +07:00

96 lines
3.7 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#pragma once
#include <cstdint>
/**
* Thai Character Classification
*
* Thai Unicode block (U+0E00-U+0E7F) contains:
* - Consonants (ก-ฮ): U+0E01-U+0E2E
* - Vowels that can appear in various positions
* - Tone marks that stack above consonants/vowels
* - Thai digits and punctuation
*
* Thai text rendering requires special handling because:
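* 1. Leading vowels (เ แ โ ไ ใ) display BEFORE (to the left of) the consonant.
* Unicode stores Thai in visual order, so they also precede the consonant in
* the text stream even though they are pronounced after it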
* 2. Above vowels (ิ ี ึ ื etc.) must be positioned above consonants
* 3. Below vowels (ุ ู) must be positioned below consonants
* 4. Tone marks must stack above vowels/consonants
*/
namespace ThaiShaper {
// Thai character types for positioning and cluster formation.
// The underlying type is uint8_t, so a value occupies a single byte.
// The glyphs in the comments below are examples of each class.
enum class ThaiCharType : uint8_t {
NON_THAI, // Not a Thai character
CONSONANT, // Base consonant (ก-ฮ)
LEADING_VOWEL, // Vowels that display before consonant (เ แ โ ไ ใ)
ABOVE_VOWEL, // Vowels above consonant (ั ิ ี ึ ื ็)
BELOW_VOWEL, // Vowels below consonant (ุ ู ฺ)
FOLLOW_VOWEL, // Vowels that follow consonant (ะ า ำ)
TONE_MARK, // Tone marks (่ ้ ๊ ๋)
NIKHAHIT, // Nikhahit (ํ) - special combining mark
// NOTE(review): the glyph originally shown for this entry (์) is U+0E4C THANTHAKHAT,
// the cancellation mark; YAMAKKAN itself is U+0E4E (๎). Which of the two codepoints
// getThaiCharType() maps to this enumerator is not visible in this header - confirm.
YAMAKKAN, // Yamakkan (๎) / Thanthakhat (์) - cancellation mark
THAI_DIGIT, // Thai digits (๐-๙)
THAI_SYMBOL, // Thai punctuation and symbols
};
// Report whether cp falls inside the Thai Unicode block.
// The block spans U+0E00 through U+0E7F, both ends inclusive.
inline bool isThaiCodepoint(uint32_t cp) {
  constexpr uint32_t kBlockFirst = 0x0E00;
  constexpr uint32_t kBlockLast = 0x0E7F;
  // Inside the block means "not below the first and not above the last".
  return !(cp < kBlockFirst || cp > kBlockLast);
}
// Get the character type for a Thai codepoint.
// Declaration only: the definition is not part of this header.
//   cp      - Unicode codepoint to classify.
//   returns - the ThaiCharType assigned to cp.
// NOTE(review): presumably returns ThaiCharType::NON_THAI for codepoints outside
// U+0E00-U+0E7F - confirm against the definition.
ThaiCharType getThaiCharType(uint32_t cp);
// Report whether cp is a Thai consonant, i.e. a character that can serve as the
// base of a cluster. Consonants occupy U+0E01 (ก) through U+0E2E (ฮ) inclusive.
inline bool isThaiConsonant(uint32_t cp) {
  if (cp < 0x0E01) {
    return false;  // below the first consonant (ก)
  }
  return cp <= 0x0E2E;  // at or below the last consonant (ฮ)
}
// Report whether cp is one of the five leading vowels, which are drawn before
// (to the left of) the consonant and are flagged by callers for reordering.
inline bool isThaiLeadingVowel(uint32_t cp) {
  switch (cp) {
    case 0x0E40:  // SARA E (เ)
    case 0x0E41:  // SARA AE (แ)
    case 0x0E42:  // SARA O (โ)
    case 0x0E43:  // SARA AI MAIMUAN (ใ)
    case 0x0E44:  // SARA AI MAIMALAI (ไ)
      return true;
    default:
      return false;
  }
}
// Report whether cp is a vowel sign or mark that is positioned above its base.
// Accepted codepoints:
//   U+0E31 MAI HAN-AKAT (ั)
//   U+0E34 SARA I (ิ), U+0E35 SARA II (ี), U+0E36 SARA UE (ึ), U+0E37 SARA UEE (ื)
//   U+0E47 MAITAIKHU (็)
inline bool isThaiAboveVowel(uint32_t cp) {
  // SARA I through SARA UEE form one contiguous run.
  const bool inSaraRun = cp >= 0x0E34 && cp <= 0x0E37;
  return cp == 0x0E31 || inSaraRun || cp == 0x0E47;
}
// Check if codepoint is a below vowel/mark (positioned below base).
// Accepted codepoints form one contiguous run:
//   U+0E38 SARA U (ุ), U+0E39 SARA UU (ู), U+0E3A PHINTHU (ฺ)
// This spelling matches the isThai<Upper> casing of the sibling predicates.
inline bool isThaiBelowVowel(uint32_t cp) { return cp >= 0x0E38 && cp <= 0x0E3A; }
// Original spelling with a lowercase 'b', kept so existing callers keep compiling.
// Forwards to isThaiBelowVowel(); prefer that name in new code.
inline bool isThaibelowVowel(uint32_t cp) { return isThaiBelowVowel(cp); }
// Report whether cp is one of the four tone marks (positioned above the base).
// They occupy a contiguous run:
//   U+0E48 MAI EK (่), U+0E49 MAI THO (้), U+0E4A MAI TRI (๊), U+0E4B MAI CHATTAWA (๋)
inline bool isThaiToneMark(uint32_t cp) {
  if (cp < 0x0E48) {
    return false;  // below MAI EK
  }
  if (cp > 0x0E4B) {
    return false;  // above MAI CHATTAWA
  }
  return true;
}
// Report whether cp is a Thai digit, U+0E50 (๐) through U+0E59 (๙) inclusive.
inline bool isThaiDigit(uint32_t cp) {
  const bool atOrAboveZero = cp >= 0x0E50;  // Thai digit zero (๐)
  const bool atOrBelowNine = cp <= 0x0E59;  // Thai digit nine (๙)
  return atOrAboveZero && atOrBelowNine;
}
// Report whether cp is a combining character, i.e. one that attaches to a base
// rather than standing on its own. That covers every above vowel, below vowel
// and tone mark, plus three further marks listed in the switch below.
inline bool isThaiCombining(uint32_t cp) {
  // Vowel signs and tone marks are delegated to their dedicated predicates.
  if (isThaiAboveVowel(cp)) {
    return true;
  }
  if (isThaibelowVowel(cp)) {
    return true;
  }
  if (isThaiToneMark(cp)) {
    return true;
  }
  // Remaining combining marks not covered by the predicates above.
  switch (cp) {
    case 0x0E4C:  // THANTHAKHAT (์)
    case 0x0E4D:  // NIKHAHIT (ํ)
    case 0x0E4E:  // YAMAKKAN (๎)
      return true;
    default:
      return false;
  }
}
// Check if text contains any Thai codepoints (for fast path detection).
// Declaration only: the definition is not part of this header.
//   text    - character buffer to scan.
//   returns - true if at least one Thai codepoint is found.
// NOTE(review): presumably text is a NUL-terminated UTF-8 string; whether a null
// pointer is tolerated is not visible here - confirm against the definition.
bool containsThai(const char* text);
} // namespace ThaiShaper