crosspoint-reader/lib/Epub/Epub/hyphenation/RussianHyphenator.cpp
Arthur Tazhitdinov 3ef2448f72 optimization
2026-01-08 03:27:27 +05:00

411 lines
14 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include "RussianHyphenator.h"
#include <algorithm>
#include <array>
#include <limits>
#include <vector>
#include "HyphenationLiterals.h"
namespace {
// Literal type used for the prefix/suffix tables below. Each table entry is brace-initialized from
// a codepoint array and that array's element count.
// NOTE(review): HyphenLiteralT is presumably declared in HyphenationLiterals.h — confirm.
using CyrillicLiteral = HyphenLiteralT<uint32_t>;
// Lowercase Russian prefixes spelled out as Unicode codepoints (one array per prefix).
constexpr uint32_t PFX_BEZ[3] = {0x0431, 0x0435, 0x0437};  // без
constexpr uint32_t PFX_RAZ[3] = {0x0440, 0x0430, 0x0437};  // раз
constexpr uint32_t PFX_POD[3] = {0x043F, 0x043E, 0x0434};  // под
constexpr uint32_t PFX_NAD[3] = {0x043D, 0x0430, 0x0434};  // над
constexpr uint32_t PFX_PERE[4] = {0x043F, 0x0435, 0x0440, 0x0435};  // пере
constexpr uint32_t PFX_SVERH[5] = {0x0441, 0x0432, 0x0435, 0x0440, 0x0445};  // сверх
constexpr uint32_t PFX_MEZH[3] = {0x043C, 0x0435, 0x0436};  // меж
constexpr uint32_t PFX_SUPER[5] = {0x0441, 0x0443, 0x043F, 0x0435, 0x0440};  // супер
constexpr uint32_t PFX_PRED[4] = {0x043F, 0x0440, 0x0435, 0x0434};  // пред
constexpr uint32_t PFX_SAMO[4] = {0x0441, 0x0430, 0x043C, 0x043E};  // само
constexpr uint32_t PFX_OBO[3] = {0x043E, 0x0431, 0x043E};  // обо
constexpr uint32_t PFX_PROTIV[6] = {0x043F, 0x0440, 0x043E, 0x0442, 0x0438, 0x0432};  // против
// Prefix table handed to appendLiteralBreaks by appendMorphologyBreaks.
// The second member of every entry must equal the length of the array it refers to.
constexpr std::array<CyrillicLiteral, 12> RUSSIAN_PREFIXES = {{{PFX_BEZ, 3},
{PFX_RAZ, 3},
{PFX_POD, 3},
{PFX_NAD, 3},
{PFX_PERE, 4},
{PFX_SVERH, 5},
{PFX_MEZH, 3},
{PFX_SUPER, 5},
{PFX_PRED, 4},
{PFX_SAMO, 4},
{PFX_OBO, 3},
{PFX_PROTIV, 6}}};
// Lowercase Russian suffixes spelled out as Unicode codepoints (one array per suffix).
constexpr uint32_t SFX_NOST[4] = {0x043D, 0x043E, 0x0441, 0x0442};  // ност
constexpr uint32_t SFX_STVO[4] = {0x0441, 0x0442, 0x0432, 0x043E};  // ство
constexpr uint32_t SFX_ENIE[4] = {0x0435, 0x043D, 0x0438, 0x0435};  // ение
constexpr uint32_t SFX_ATION[4] = {0x0430, 0x0446, 0x0438, 0x044F};  // ация
constexpr uint32_t SFX_CHIK[3] = {0x0447, 0x0438, 0x043A};  // чик
constexpr uint32_t SFX_NIK[3] = {0x043D, 0x0438, 0x043A};  // ник
constexpr uint32_t SFX_TEL[4] = {0x0442, 0x0435, 0x043B, 0x044C};  // тель
constexpr uint32_t SFX_SKII[4] = {0x0441, 0x043A, 0x0438, 0x0439};  // ский
constexpr uint32_t SFX_AL[6] = {0x0430, 0x043B, 0x044C, 0x043D, 0x044B, 0x0439};  // альный
constexpr uint32_t SFX_ISM[3] = {0x0438, 0x0437, 0x043C};  // изм
constexpr uint32_t SFX_LIV[5] = {0x043B, 0x0438, 0x0432, 0x044B, 0x0439};  // ливый
constexpr uint32_t SFX_OST[4] = {0x043E, 0x0441, 0x0442, 0x044C};  // ость
// Suffix table handed to appendLiteralBreaks by appendMorphologyBreaks.
// The second member of every entry must equal the length of the array it refers to.
constexpr std::array<CyrillicLiteral, 12> RUSSIAN_SUFFIXES = {{{SFX_NOST, 4},
{SFX_STVO, 4},
{SFX_ENIE, 4},
{SFX_ATION, 4},
{SFX_CHIK, 3},
{SFX_NIK, 3},
{SFX_TEL, 4},
{SFX_SKII, 4},
{SFX_AL, 6},
{SFX_ISM, 3},
{SFX_LIV, 5},
{SFX_OST, 4}}};
std::vector<uint32_t> lowercaseCyrillicWord(const std::vector<CodepointInfo>& cps) {
std::vector<uint32_t> lower;
lower.reserve(cps.size());
for (const auto& info : cps) {
lower.push_back(isCyrillicLetter(info.value) ? toLowerCyrillic(info.value) : info.value);
}
return lower;
}
// Reports whether the half-open range [start, end) of the word contains at least one codepoint
// accepted by isCyrillicVowel. The range is clipped to the word, so an out-of-range start or an
// empty range simply yields false.
bool russianSegmentHasVowel(const std::vector<CodepointInfo>& cps, const size_t start, const size_t end) {
  const size_t total = cps.size();
  const size_t stop = end < total ? end : total;
  size_t pos = start;
  while (pos < stop) {
    if (isCyrillicVowel(cps[pos].value)) {
      return true;
    }
    ++pos;
  }
  return false;
}
// True when cps[index] and cps[index + 1] are the same consonant (compared case-insensitively)
// and that pair sits between two vowels, i.e. a break at `index` would leave the doubled consonant
// at the start of the right-hand part.
bool exposesLeadingDoubleConsonant(const std::vector<CodepointInfo>& cps, const size_t index) {
  const size_t total = cps.size();
  if (index + 1 >= total) {
    return false;
  }
  const auto head = cps[index].value;
  const auto tail = cps[index + 1].value;
  const bool doubled =
      isCyrillicConsonant(head) && isCyrillicConsonant(tail) && toLowerCyrillic(head) == toLowerCyrillic(tail);
  if (!doubled) {
    return false;
  }
  // The pair only counts when it is flanked by vowels on both sides.
  if (index == 0 || !isCyrillicVowel(cps[index - 1].value)) {
    return false;
  }
  return index + 2 < total && isCyrillicVowel(cps[index + 2].value);
}
// True when cps[index - 2] and cps[index - 1] are the same consonant (compared case-insensitively)
// and that pair sits between two vowels, i.e. a break at `index` would leave the doubled consonant
// at the end of the left-hand part.
bool exposesTrailingDoubleConsonant(const std::vector<CodepointInfo>& cps, const size_t index) {
  if (index < 2) {
    return false;
  }
  const auto outer = cps[index - 1].value;
  const auto inner = cps[index - 2].value;
  const bool doubled =
      isCyrillicConsonant(outer) && isCyrillicConsonant(inner) && toLowerCyrillic(outer) == toLowerCyrillic(inner);
  if (!doubled) {
    return false;
  }
  // The pair only counts when it is flanked by vowels on both sides.
  if (index < 3 || !isCyrillicVowel(cps[index - 3].value)) {
    return false;
  }
  return index < cps.size() && isCyrillicVowel(cps[index].value);
}
// A break at `index` violates the rule when a vowel-flanked doubled consonant would end up whole
// on either side of it (see exposesLeadingDoubleConsonant / exposesTrailingDoubleConsonant).
bool violatesDoubleConsonantRule(const std::vector<CodepointInfo>& cps, const size_t index) {
  if (exposesLeadingDoubleConsonant(cps, index)) {
    return true;
  }
  return exposesTrailingDoubleConsonant(cps, index);
}
// True for the Cyrillic soft sign (ь), compared after toLowerCyrillic folding.
bool isSoftSign(uint32_t cp) {
  constexpr uint32_t kSoftSign = 0x044C;  // ь
  return toLowerCyrillic(cp) == kSoftSign;
}
// True for the Cyrillic hard sign (ъ), compared after toLowerCyrillic folding.
bool isHardSign(uint32_t cp) {
  constexpr uint32_t kHardSign = 0x044A;  // ъ
  return toLowerCyrillic(cp) == kHardSign;
}
// True when the codepoint is one of the two Cyrillic signs: soft (ь) or hard (ъ).
bool isSoftOrHardSign(uint32_t cp) {
  if (isSoftSign(cp)) {
    return true;
  }
  return isHardSign(cp);
}
// True for the Cyrillic short i (й), compared after toLowerCyrillic folding.
bool isCyrillicShortI(uint32_t cp) {
  constexpr uint32_t kShortI = 0x0439;  // й
  return toLowerCyrillic(cp) == kShortI;
}
// True for the Cyrillic yeru (ы), compared after toLowerCyrillic folding.
bool isCyrillicYeru(uint32_t cp) {
  constexpr uint32_t kYeru = 0x044B;  // ы
  return toLowerCyrillic(cp) == kYeru;
}
// True for в, з and с (either case). russianClusterIsValidOnset lets these consonants open a
// cluster even when the following consonant has a lower sonority rank.
bool isRussianPrefixConsonant(uint32_t cp) {
  const uint32_t lowered = toLowerCyrillic(cp);
  switch (lowered) {
    case 0x0432:  // в
    case 0x0437:  // з
    case 0x0441:  // с
      return true;
    default:
      return false;
  }
}
// True for the consonants this file treats as sibilants: з, с, ж, ш, щ, ч, ц (either case).
bool isRussianSibilant(uint32_t cp) {
  const uint32_t lowered = toLowerCyrillic(cp);
  return lowered == 0x0437 ||  // з
         lowered == 0x0441 ||  // с
         lowered == 0x0436 ||  // ж
         lowered == 0x0448 ||  // ш
         lowered == 0x0449 ||  // щ
         lowered == 0x0447 ||  // ч
         lowered == 0x0446;    // ц
}
// True for the stop consonants б, г, д, п, т, к (either case).
bool isRussianStop(uint32_t cp) {
  const uint32_t lowered = toLowerCyrillic(cp);
  return lowered == 0x0431 ||  // б
         lowered == 0x0433 ||  // г
         lowered == 0x0434 ||  // д
         lowered == 0x043F ||  // п
         lowered == 0x0442 ||  // т
         lowered == 0x043A;    // к
}
// Returns the sonority rank used for onset validation (higher = more sonorous):
//   4 — л, р, й
//   3 — м, н
//   2 — в, з, ж
//   1 — ф, с, ш, щ, ч, ц, х, and every codepoint not listed in another group
//   0 — б, г, д, п, т, к
int russianSonority(uint32_t cp) {
  const uint32_t lowered = toLowerCyrillic(cp);
  if (lowered == 0x043B || lowered == 0x0440 || lowered == 0x0439) {  // л, р, й
    return 4;
  }
  if (lowered == 0x043C || lowered == 0x043D) {  // м, н
    return 3;
  }
  if (lowered == 0x0432 || lowered == 0x0437 || lowered == 0x0436) {  // в, з, ж
    return 2;
  }
  if (lowered == 0x0431 || lowered == 0x0433 || lowered == 0x0434 ||  // б, г, д
      lowered == 0x043F || lowered == 0x0442 || lowered == 0x043A) {  // п, т, к
    return 0;
  }
  // ф, с, ш, щ, ч, ц, х share rank 1 with all unlisted codepoints.
  return 1;
}
// Decides whether the consonants in [start, end) may open a syllable.
// The range must be non-empty and consist solely of consonants other than ь/ъ. Sonority must not
// fall from one consonant to the next, with two exceptions: the very first consonant may be
// в/з/с, and a sibilant may be followed by a stop.
bool russianClusterIsValidOnset(const std::vector<CodepointInfo>& cps, const size_t start, const size_t end) {
  if (end <= start) {
    return false;
  }

  // Reject the cluster outright if any member is not a plain consonant.
  size_t pos = start;
  while (pos < end) {
    const auto letter = cps[pos].value;
    const bool plainConsonant = isCyrillicConsonant(letter) && !isSoftOrHardSign(letter);
    if (!plainConsonant) {
      return false;
    }
    ++pos;
  }

  // Walk adjacent pairs; a single-consonant cluster has no pairs and is accepted as is.
  for (size_t left = start, right = start + 1; right < end; ++left, ++right) {
    const uint32_t leftCp = cps[left].value;
    const uint32_t rightCp = cps[right].value;
    if (russianSonority(leftCp) <= russianSonority(rightCp)) {
      continue;  // Sonority does not fall here.
    }
    if (left == start && isRussianPrefixConsonant(leftCp)) {
      continue;  // в/з/с may lead the cluster.
    }
    if (isRussianSibilant(leftCp) && isRussianStop(rightCp)) {
      continue;  // Sibilant + stop is tolerated anywhere.
    }
    return false;
  }
  return true;
}
// Looks for the first doubled consonant inside [clusterStart, clusterEnd) and returns the index of
// its second letter, so that a break there separates the pair. Returns
// std::numeric_limits<size_t>::max() when the cluster holds no such pair.
size_t doubleConsonantSplit(const std::vector<CodepointInfo>& cps, const size_t clusterStart, const size_t clusterEnd) {
  constexpr size_t kNoSplit = std::numeric_limits<size_t>::max();
  for (size_t second = clusterStart + 1; second < clusterEnd; ++second) {
    const auto former = cps[second - 1].value;
    const auto latter = cps[second].value;
    if (!isCyrillicConsonant(former)) {
      continue;
    }
    if (toLowerCyrillic(former) != toLowerCyrillic(latter)) {
      continue;
    }
    if (isSoftOrHardSign(latter)) {
      continue;
    }
    return second;
  }
  return kNoSplit;
}
// True when the part starting at `index` may not begin a line: it would open with ь, ъ, й or ы.
// An index at or past the end of the word is also reported as forbidden.
bool beginsWithForbiddenSuffix(const std::vector<CodepointInfo>& cps, const size_t index) {
  if (index < cps.size()) {
    const auto opener = cps[index].value;
    if (isSoftOrHardSign(opener)) {
      return true;
    }
    if (isCyrillicShortI(opener)) {
      return true;
    }
    return isCyrillicYeru(opener);
  }
  return true;
}
// Validates a candidate break placed before cps[breakIndex]. The break is accepted only when
//   - it lies strictly inside the word,
//   - both resulting parts are at least two codepoints long,
//   - both parts contain a vowel,
//   - the right-hand part does not start with ь/ъ/й/ы, and
//   - it does not violate the double-consonant rule.
bool russianBreakAllowed(const std::vector<CodepointInfo>& cps, const size_t breakIndex) {
  const size_t total = cps.size();
  const bool insideWord = breakIndex != 0 && breakIndex < total;
  if (!insideWord) {
    return false;
  }
  const size_t headLen = breakIndex;
  const size_t tailLen = total - breakIndex;
  const bool longEnough = headLen >= 2 && tailLen >= 2;
  return longEnough &&
         russianSegmentHasVowel(cps, 0, breakIndex) &&
         russianSegmentHasVowel(cps, breakIndex, total) &&
         !beginsWithForbiddenSuffix(cps, breakIndex) &&
         !violatesDoubleConsonantRule(cps, breakIndex);
}
// Picks how many trailing consonants of the inter-vowel cluster [clusterStart, clusterEnd) go to
// the following syllable: the longest tail of at most four consonants that
// russianClusterIsValidOnset accepts. Returns 0 for an empty cluster and falls back to 1 when no
// tail is accepted.
size_t russianOnsetLength(const std::vector<CodepointInfo>& cps, const size_t clusterStart, const size_t clusterEnd) {
  if (clusterEnd == clusterStart) {
    return 0;
  }
  const size_t span = clusterEnd - clusterStart;
  size_t candidate = span < 4 ? span : 4;
  // Try the longest tail first and shrink until one is accepted.
  while (candidate > 0) {
    if (russianClusterIsValidOnset(cps, clusterEnd - candidate, clusterEnd)) {
      return candidate;
    }
    --candidate;
  }
  return 1;
}
// True when a break before cps[index] would touch a ь/ъ on either side. Positions at the word
// edges (index 0 or past the last codepoint) have no pair of neighbours and yield false.
bool nextToSoftSign(const std::vector<CodepointInfo>& cps, const size_t index) {
  const bool interior = index > 0 && index < cps.size();
  if (!interior) {
    return false;
  }
  const auto before = cps[index - 1].value;
  const auto after = cps[index].value;
  if (isSoftOrHardSign(before)) {
    return true;
  }
  return isSoftOrHardSign(after);
}
// Forwards the lowercased word together with the Russian prefix and suffix tables to
// appendLiteralBreaks, supplying russianBreakAllowed (bound to `cps`) as the predicate that
// approves a break index. Results are written into `indexes`.
void appendMorphologyBreaks(const std::vector<CodepointInfo>& cps, const std::vector<uint32_t>& lowerWord,
                            std::vector<size_t>& indexes) {
  auto breakIsLegal = [&cps](const size_t breakIndex) { return russianBreakAllowed(cps, breakIndex); };
  appendLiteralBreaks(lowerWord, RUSSIAN_PREFIXES, RUSSIAN_SUFFIXES, breakIsLegal, indexes);
}
// Produces syllable break indexes tailored to Russian phonotactics.
//
// One candidate break is derived for every pair of consecutive vowels: directly between the vowels
// when they are adjacent, otherwise inside the consonant cluster that separates them (a doubled
// consonant is split first; failing that, the longest valid onset stays with the right-hand
// syllable). A candidate is kept only if it respects MIN_PREFIX_CP / MIN_SUFFIX_CP, does not touch
// ь/ъ, and passes russianBreakAllowed. Breaks proposed by appendMorphologyBreaks are merged in
// afterwards; when any were added, the list is sorted and deduplicated.
//
// Returns the codepoint indexes before which the word may be split. A word with fewer than two
// vowels yields an empty list, and the morphology pass is skipped for it.
std::vector<size_t> russianBreakIndexes(const std::vector<CodepointInfo>& cps) {
  std::vector<size_t> indexes;
  const size_t wordSize = cps.size();

  // Collect vowel positions.
  std::vector<size_t> vowelPositions;
  vowelPositions.reserve(wordSize / 2);  // Typical estimate: ~50% vowels
  for (size_t i = 0; i < wordSize; ++i) {
    if (isCyrillicVowel(cps[i].value)) {
      vowelPositions.push_back(i);
    }
  }

  // Need at least 2 vowels to create a syllable break.
  if (vowelPositions.size() < 2) {
    return indexes;
  }

  constexpr size_t kNoSplit = std::numeric_limits<size_t>::max();

  // Process each pair of consecutive vowels.
  for (size_t v = 0; v + 1 < vowelPositions.size(); ++v) {
    const size_t leftVowel = vowelPositions[v];
    const size_t rightVowel = vowelPositions[v + 1];

    // Adjacent vowels: the only candidate is the boundary between them.
    size_t breakIndex = rightVowel;

    if (rightVowel - leftVowel > 1) {
      // Consonant cluster between the vowels: prefer splitting a doubled consonant, otherwise
      // keep the longest valid onset with the right-hand syllable.
      const size_t clusterStart = leftVowel + 1;
      const size_t clusterEnd = rightVowel;
      breakIndex = doubleConsonantSplit(cps, clusterStart, clusterEnd);
      if (breakIndex == kNoSplit) {
        breakIndex = clusterEnd - russianOnsetLength(cps, clusterStart, clusterEnd);
      }
    }

    // Measure the tail from the break itself, as russianBreakAllowed does: onset consonants that
    // move to the right-hand part count towards its length. breakIndex never exceeds rightVowel,
    // so the subtraction cannot wrap.
    const size_t suffixLen = wordSize - breakIndex;

    // Validate candidate break point.
    if (breakIndex < MIN_PREFIX_CP || suffixLen < MIN_SUFFIX_CP || nextToSoftSign(cps, breakIndex) ||
        !russianBreakAllowed(cps, breakIndex)) {
      continue;
    }
    indexes.push_back(breakIndex);
  }

  // Merge in prefix/suffix based breaks; re-sort and deduplicate only if something was appended.
  const auto lowerWord = lowercaseCyrillicWord(cps);
  const size_t preDedupeCount = indexes.size();
  appendMorphologyBreaks(cps, lowerWord, indexes);
  if (indexes.size() > preDedupeCount) {
    std::sort(indexes.begin(), indexes.end());
    indexes.erase(std::unique(indexes.begin(), indexes.end()), indexes.end());
  }
  return indexes;
}
} // namespace
// Returns the shared RussianHyphenator, held in a function-local static that is constructed the
// first time this function runs.
const RussianHyphenator& RussianHyphenator::instance() {
  static RussianHyphenator singleton;
  return singleton;
}
// Computes the break indexes for one word by delegating to the file-local russianBreakIndexes.
std::vector<size_t> RussianHyphenator::breakIndexes(const std::vector<CodepointInfo>& cps) const {
  std::vector<size_t> breaks = russianBreakIndexes(cps);
  return breaks;
}