Add punctuation handling: implement isPunctuation and trimTrailingPunctuation functions

This commit is contained in:
Arthur Tazhitdinov 2025-12-26 03:39:15 +05:00
parent a3dc96a3b8
commit e156790705
3 changed files with 48 additions and 1 deletion

View File

@ -57,10 +57,54 @@ bool isCyrillicVowel(uint32_t cp) {
// A Cyrillic consonant is any Cyrillic letter that is not classified as a vowel.
bool isCyrillicConsonant(const uint32_t cp) {
  if (!isCyrillicLetter(cp)) {
    return false;
  }
  return !isCyrillicVowel(cp);
}
// True when the code point is a Latin letter or a Cyrillic letter.
bool isAlphabetic(const uint32_t cp) {
  if (isLatinLetter(cp)) {
    return true;
  }
  return isCyrillicLetter(cp);
}
// True when the code point is a Latin letter, a Cyrillic letter, or a
// punctuation code point as classified by isPunctuation.
// NOTE(review): accepting punctuation here widens what "alphabetic" means;
// presumably intentional so that punctuated words are not rejected — confirm
// against the callers.
bool isAlphabetic(const uint32_t cp) {
  if (isLatinLetter(cp) || isCyrillicLetter(cp)) {
    return true;
  }
  return isPunctuation(cp);
}
// True when the code point is a vowel in either supported alphabet
// (Latin or Cyrillic).
bool isVowel(const uint32_t cp) {
  if (isLatinVowel(cp)) {
    return true;
  }
  return isCyrillicVowel(cp);
}
// Reports whether a Unicode code point is treated as punctuation.
//
// Every non-ASCII entry is written as a numeric code point on purpose. A
// character literal such as '«' spans several bytes in a UTF-8 source file,
// which makes it a multi-character constant: its value is
// implementation-defined and is not the character's code point, so such a
// case label never matches the intended character (and may match an
// unrelated one).
//
// @param cp  Unicode code point to classify.
// @return    true if cp is in the punctuation set below, false otherwise.
bool isPunctuation(const uint32_t cp) {
  switch (cp) {
    // ASCII punctuation.
    case '.':
    case ',':
    case '!':
    case '?':
    case ';':
    case ':':
    case '"':
    case '\'':
    case ')':
    case '(':
    case '-':
    case '[':
    case ']':
    case '{':
    case '}':
    case '/':
    // Non-ASCII punctuation that the original list named explicitly.
    case 0x00AB:  // « left-pointing double angle quotation mark
    case 0x00BB:  // » right-pointing double angle quotation mark
    case 0x2019:  // ’ right single quotation mark
    case 0x201D:  // ” right double quotation mark
    case 0x203A:  // › single right-pointing angle quotation mark
    case 0x2026:  // … horizontal ellipsis
    // NOTE(review): seven more non-ASCII literals in the original list were
    // unreadable. They are presumed to be the remaining typographic quotes
    // and the en/em dashes — confirm against the intended set.
    case 0x2018:  // ‘ left single quotation mark
    case 0x201C:  // “ left double quotation mark
    case 0x201E:  // „ double low-9 quotation mark
    case 0x2013:  // – en dash
    case 0x2014:  // — em dash
      return true;
    default:
      return false;
  }
}
// Strips punctuation from the end of the code point sequence, in place.
// Elements are removed one at a time from the back until the sequence is
// empty or its last element is not punctuation according to isPunctuation.
void trimTrailingPunctuation(std::vector<CodepointInfo>& cps) {
  for (;;) {
    if (cps.empty()) {
      return;
    }
    if (!isPunctuation(cps.back().value)) {
      return;
    }
    cps.pop_back();
  }
}
Script detectScript(const std::vector<CodepointInfo>& cps) {
bool hasLatin = false;
bool hasCyrillic = false;

View File

@ -27,5 +27,7 @@ bool isCyrillicConsonant(uint32_t cp);
bool isAlphabetic(uint32_t cp);
bool isVowel(uint32_t cp);
// Returns true when cp belongs to the fixed set of punctuation code points.
bool isPunctuation(uint32_t cp);
// Removes, in place, every trailing element of cps whose value is classified
// as punctuation by isPunctuation.
void trimTrailingPunctuation(std::vector<CodepointInfo>& cps);
Script detectScript(const std::vector<CodepointInfo>& cps);

View File

@ -94,6 +94,7 @@ std::vector<size_t> Hyphenator::breakOffsets(const std::string& word, const bool
}
auto cps = collectCodepoints(word);
trimTrailingPunctuation(cps);
if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) {
return byteOffsets;
}