Refactor hyphenation logic to return detailed break information, enhancing line breaking capabilities

This commit is contained in:
Arthur Tazhitdinov
2026-01-07 03:54:43 +05:00
parent f998180353
commit 2315513ca1
3 changed files with 34 additions and 20 deletions

View File

@@ -135,9 +135,20 @@ size_t byteOffsetForIndex(const std::vector<CodepointInfo>& cps, const size_t in
return cps[index].byteOffset;
}
std::vector<Hyphenator::BreakInfo> buildBreakInfoVector(const std::vector<size_t>& indexes,
const std::vector<CodepointInfo>& cps,
const bool requiresHyphen) {
std::vector<Hyphenator::BreakInfo> breaks;
breaks.reserve(indexes.size());
for (const size_t idx : indexes) {
breaks.push_back({byteOffsetForIndex(cps, idx), requiresHyphen});
}
return breaks;
}
} // namespace
std::vector<size_t> Hyphenator::breakOffsets(const std::string& word, const bool includeFallback) {
std::vector<Hyphenator::BreakInfo> Hyphenator::breakOffsets(const std::string& word, const bool includeFallback) {
if (word.empty()) {
return {};
}
@@ -153,12 +164,7 @@ std::vector<size_t> Hyphenator::breakOffsets(const std::string& word, const bool
if (!explicitIndexes.empty()) {
std::sort(explicitIndexes.begin(), explicitIndexes.end());
explicitIndexes.erase(std::unique(explicitIndexes.begin(), explicitIndexes.end()), explicitIndexes.end());
std::vector<size_t> byteOffsets;
byteOffsets.reserve(explicitIndexes.size());
for (const size_t idx : explicitIndexes) {
byteOffsets.push_back(byteOffsetForIndex(cps, idx));
}
return byteOffsets;
return buildBreakInfoVector(explicitIndexes, cps, false);
}
std::vector<size_t> indexes = hasOnlyAlphabetic(cps) ? collectBreakIndexes(cps) : std::vector<size_t>();
@@ -175,10 +181,5 @@ std::vector<size_t> Hyphenator::breakOffsets(const std::string& word, const bool
std::sort(indexes.begin(), indexes.end());
indexes.erase(std::unique(indexes.begin(), indexes.end()), indexes.end());
std::vector<size_t> byteOffsets;
byteOffsets.reserve(indexes.size());
for (const size_t idx : indexes) {
byteOffsets.push_back(byteOffsetForIndex(cps, idx));
}
return byteOffsets;
return buildBreakInfoVector(indexes, cps, true);
}

View File

@@ -6,7 +6,11 @@
// Entry point for looking up the positions at which a single word may be broken across lines.
class Hyphenator {
public:
// Describes one permissible break position inside a word.
struct BreakInfo {
// Byte offset into the word at which the break may be placed.
size_t byteOffset;
// Whether a hyphen has to be inserted when breaking here. The implementation reports false for breaks
// derived from explicit break indexes and true for all other (rule-based or fallback) breaks.
// NOTE(review): presumably "explicit" means the word already contains a hyphen-like character at that
// position - confirm against the code that collects the explicit indexes.
bool requiresInsertedHyphen;
};
// Returns the positions (as byte offsets into `word`) where the word may be hyphenated. When includeFallback is
// true, all positions obeying the minimum prefix/suffix constraints are returned even if no language-specific rule
// matches. An empty word yields an empty result.
static std::vector<size_t> breakOffsets(const std::string& word, bool includeFallback);
static std::vector<BreakInfo> breakOffsets(const std::string& word, bool includeFallback);
};