Rename trimTrailingPunctuation to trimSurroundingPunctuation and update logic to remove surrounding punctuation; add explicit hyphen handling in breakOffsets function.
This commit is contained in:
parent
23183a6270
commit
f6767c857f
@ -92,7 +92,10 @@ bool isPunctuation(const uint32_t cp) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void trimTrailingPunctuation(std::vector<CodepointInfo>& cps) {
|
void trimSurroundingPunctuation(std::vector<CodepointInfo>& cps) {
|
||||||
|
while (!cps.empty() && isPunctuation(cps.front().value)) {
|
||||||
|
cps.erase(cps.begin());
|
||||||
|
}
|
||||||
while (!cps.empty() && isPunctuation(cps.back().value)) {
|
while (!cps.empty() && isPunctuation(cps.back().value)) {
|
||||||
cps.pop_back();
|
cps.pop_back();
|
||||||
}
|
}
|
||||||
|
|||||||
@ -28,6 +28,6 @@ bool isCyrillicConsonant(uint32_t cp);
|
|||||||
bool isAlphabetic(uint32_t cp);
|
bool isAlphabetic(uint32_t cp);
|
||||||
bool isVowel(uint32_t cp);
|
bool isVowel(uint32_t cp);
|
||||||
bool isPunctuation(uint32_t cp);
|
bool isPunctuation(uint32_t cp);
|
||||||
void trimTrailingPunctuation(std::vector<CodepointInfo>& cps);
|
void trimSurroundingPunctuation(std::vector<CodepointInfo>& cps);
|
||||||
|
|
||||||
Script detectScript(const std::vector<CodepointInfo>& cps);
|
Script detectScript(const std::vector<CodepointInfo>& cps);
|
||||||
|
|||||||
@ -48,6 +48,32 @@ std::vector<CodepointInfo> collectCodepoints(const std::string& word) {
|
|||||||
return cps;
|
return cps;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reports whether `cp` is one of the two hyphen codepoints treated as an explicit hyphen:
// ASCII hyphen-minus (U+002D) or Unicode HYPHEN (U+2010).
bool isExplicitHyphen(const uint32_t cp) {
  switch (cp) {
    case 0x002D:  // '-'
    case 0x2010:
      return true;
    default:
      return false;
  }
}
|
||||||
|
|
||||||
|
std::vector<size_t> collectExplicitHyphenIndexes(const std::vector<CodepointInfo>& cps) {
|
||||||
|
std::vector<size_t> indexes;
|
||||||
|
for (size_t i = 0; i < cps.size(); ++i) {
|
||||||
|
if (!isExplicitHyphen(cps[i].value)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (i == 0 || i + 1 >= cps.size()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (!isAlphabetic(cps[i - 1].value) || !isAlphabetic(cps[i + 1].value)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
const size_t breakIndex = i + 1;
|
||||||
|
if (breakIndex >= cps.size()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (breakIndex == 0) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
indexes.push_back(breakIndex);
|
||||||
|
}
|
||||||
|
return indexes;
|
||||||
|
}
|
||||||
|
|
||||||
// Rejects words containing punctuation or digits unless forced.
|
// Rejects words containing punctuation or digits unless forced.
|
||||||
bool hasOnlyAlphabetic(const std::vector<CodepointInfo>& cps) {
|
bool hasOnlyAlphabetic(const std::vector<CodepointInfo>& cps) {
|
||||||
if (cps.empty()) {
|
if (cps.empty()) {
|
||||||
@ -93,11 +119,22 @@ std::vector<size_t> Hyphenator::breakOffsets(const std::string& word, const bool
|
|||||||
}
|
}
|
||||||
|
|
||||||
auto cps = collectCodepoints(word);
|
auto cps = collectCodepoints(word);
|
||||||
trimTrailingPunctuation(cps);
|
trimSurroundingPunctuation(cps);
|
||||||
if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) {
|
if (cps.size() < MIN_PREFIX_CP + MIN_SUFFIX_CP) {
|
||||||
return {};
|
return {};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (auto explicitIndexes = collectExplicitHyphenIndexes(cps); !explicitIndexes.empty()) {
|
||||||
|
std::sort(explicitIndexes.begin(), explicitIndexes.end());
|
||||||
|
explicitIndexes.erase(std::unique(explicitIndexes.begin(), explicitIndexes.end()), explicitIndexes.end());
|
||||||
|
std::vector<size_t> byteOffsets;
|
||||||
|
byteOffsets.reserve(explicitIndexes.size());
|
||||||
|
for (const size_t idx : explicitIndexes) {
|
||||||
|
byteOffsets.push_back(byteOffsetForIndex(cps, idx));
|
||||||
|
}
|
||||||
|
return byteOffsets;
|
||||||
|
}
|
||||||
|
|
||||||
std::vector<size_t> indexes = hasOnlyAlphabetic(cps) ? collectBreakIndexes(cps) : std::vector<size_t>();
|
std::vector<size_t> indexes = hasOnlyAlphabetic(cps) ? collectBreakIndexes(cps) : std::vector<size_t>();
|
||||||
if (includeFallback) {
|
if (includeFallback) {
|
||||||
for (size_t idx = MIN_PREFIX_CP; idx + MIN_SUFFIX_CP <= cps.size(); ++idx) {
|
for (size_t idx = MIN_PREFIX_CP; idx + MIN_SUFFIX_CP <= cps.size(); ++idx) {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user