format fix

This commit is contained in:
Arthur Tazhitdinov 2025-12-26 05:12:26 +05:00
parent 3cf52d8bd1
commit 23183a6270
3 changed files with 44 additions and 29 deletions

View File

@@ -1,5 +1,4 @@
#include "EnglishHyphenator.h" #include "EnglishHyphenator.h"
#include "HyphenationLiterals.h"
#include <algorithm> #include <algorithm>
#include <array> #include <array>
@@ -7,6 +6,8 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include "HyphenationLiterals.h"
namespace { namespace {
char lowerLatinChar(const uint32_t cp) { char lowerLatinChar(const uint32_t cp) {
@@ -50,18 +51,15 @@ bool isEnglishFricativeChar(const char c) {
using LatinLiteral = HyphenLiteralT<char>; using LatinLiteral = HyphenLiteralT<char>;
constexpr std::array<LatinLiteral, 20> ENGLISH_PREFIXES = {{{"anti", 4}, {"auto", 4}, {"counter", 7}, {"de", 2}, constexpr std::array<LatinLiteral, 20> ENGLISH_PREFIXES = {
{"dis", 3}, {"hyper", 5}, {"inter", 5}, {"micro", 5}, {{"anti", 4}, {"auto", 4}, {"counter", 7}, {"de", 2}, {"dis", 3}, {"hyper", 5}, {"inter", 5},
{"mis", 3}, {"mono", 4}, {"multi", 5}, {"non", 3}, {"micro", 5}, {"mis", 3}, {"mono", 4}, {"multi", 5}, {"non", 3}, {"over", 4}, {"post", 4},
{"over", 4}, {"post", 4}, {"pre", 3}, {"pro", 3}, {"pre", 3}, {"pro", 3}, {"re", 2}, {"sub", 3}, {"super", 5}, {"trans", 5}}};
{"re", 2}, {"sub", 3}, {"super", 5}, {"trans", 5}}};
constexpr std::array<LatinLiteral, 24> ENGLISH_SUFFIXES = {{{"able", 4}, {"ible", 4}, {"ing", 3}, {"ings", 4}, constexpr std::array<LatinLiteral, 24> ENGLISH_SUFFIXES = {
{"ed", 2}, {"er", 2}, {"ers", 3}, {"est", 3}, {{"able", 4}, {"ible", 4}, {"ing", 3}, {"ings", 4}, {"ed", 2}, {"er", 2}, {"ers", 3}, {"est", 3},
{"ful", 3}, {"hood", 4}, {"less", 4}, {"lessly", 6}, {"ful", 3}, {"hood", 4}, {"less", 4}, {"lessly", 6}, {"ly", 2}, {"ment", 4}, {"ments", 5}, {"ness", 4},
{"ly", 2}, {"ment", 4}, {"ments", 5},{"ness", 4}, {"ous", 3}, {"tion", 4}, {"sion", 4}, {"ward", 4}, {"wards", 5}, {"ship", 4}, {"ships", 5}, {"y", 1}}};
{"ous", 3}, {"tion", 4}, {"sion", 4}, {"ward", 4},
{"wards", 5},{"ship", 4}, {"ships", 5},{"y", 1}}};
bool nextToApostrophe(const std::vector<CodepointInfo>& cps, size_t index); bool nextToApostrophe(const std::vector<CodepointInfo>& cps, size_t index);
@@ -111,8 +109,9 @@ bool englishBreakAllowed(const std::vector<CodepointInfo>& cps, const size_t bre
void appendMorphologyBreaks(const std::vector<CodepointInfo>& cps, const std::string& lowerWord, void appendMorphologyBreaks(const std::vector<CodepointInfo>& cps, const std::string& lowerWord,
std::vector<size_t>& indexes) { std::vector<size_t>& indexes) {
appendLiteralBreaks(lowerWord, ENGLISH_PREFIXES, ENGLISH_SUFFIXES, appendLiteralBreaks(
[&](const size_t breakIndex) { return englishBreakAllowed(cps, breakIndex); }, indexes); lowerWord, ENGLISH_PREFIXES, ENGLISH_SUFFIXES,
[&](const size_t breakIndex) { return englishBreakAllowed(cps, breakIndex); }, indexes);
} }
struct CharPair { struct CharPair {
@@ -313,8 +312,7 @@ std::vector<size_t> englishBreakIndexes(const std::vector<CodepointInfo>& cps) {
const size_t rightVowel = vowelPositions[v + 1]; const size_t rightVowel = vowelPositions[v + 1];
if (rightVowel - leftVowel == 1) { if (rightVowel - leftVowel == 1) {
if (!isEnglishDiphthong(cps[leftVowel].value, cps[rightVowel].value) && if (!isEnglishDiphthong(cps[leftVowel].value, cps[rightVowel].value) && englishBreakAllowed(cps, rightVowel)) {
englishBreakAllowed(cps, rightVowel)) {
indexes.push_back(rightVowel); indexes.push_back(rightVowel);
} }
continue; continue;

View File

@@ -30,8 +30,7 @@ bool matchesLiteralAt(const WordContainer& word, const size_t start, const Liter
template <typename WordContainer, typename PrefixContainer, typename SuffixContainer, typename BreakAllowedFn> template <typename WordContainer, typename PrefixContainer, typename SuffixContainer, typename BreakAllowedFn>
void appendLiteralBreaks(const WordContainer& lowerWord, const PrefixContainer& prefixes, void appendLiteralBreaks(const WordContainer& lowerWord, const PrefixContainer& prefixes,
const SuffixContainer& suffixes, BreakAllowedFn&& breakAllowed, const SuffixContainer& suffixes, BreakAllowedFn&& breakAllowed, std::vector<size_t>& indexes) {
std::vector<size_t>& indexes) {
const size_t length = lowerWord.size(); const size_t length = lowerWord.size();
const auto tryPush = [&](const size_t breakIndex) { const auto tryPush = [&](const size_t breakIndex) {

View File

@@ -1,11 +1,12 @@
#include "RussianHyphenator.h" #include "RussianHyphenator.h"
#include "HyphenationLiterals.h"
#include <algorithm> #include <algorithm>
#include <array> #include <array>
#include <limits> #include <limits>
#include <vector> #include <vector>
#include "HyphenationLiterals.h"
namespace { namespace {
using CyrillicLiteral = HyphenLiteralT<uint32_t>; using CyrillicLiteral = HyphenLiteralT<uint32_t>;
@@ -23,10 +24,18 @@ constexpr uint32_t PFX_SAMO[4] = {0x0441, 0x0430, 0x043C, 0x043E};
constexpr uint32_t PFX_OBO[3] = {0x043E, 0x0431, 0x043E}; constexpr uint32_t PFX_OBO[3] = {0x043E, 0x0431, 0x043E};
constexpr uint32_t PFX_PROTIV[6] = {0x043F, 0x0440, 0x043E, 0x0442, 0x0438, 0x0432}; constexpr uint32_t PFX_PROTIV[6] = {0x043F, 0x0440, 0x043E, 0x0442, 0x0438, 0x0432};
constexpr std::array<CyrillicLiteral, 12> RUSSIAN_PREFIXES = {{{PFX_BEZ, 3}, {PFX_RAZ, 3}, {PFX_POD, 3}, constexpr std::array<CyrillicLiteral, 12> RUSSIAN_PREFIXES = {{{PFX_BEZ, 3},
{PFX_NAD, 3}, {PFX_PERE, 4}, {PFX_SVERH, 5}, {PFX_RAZ, 3},
{PFX_MEZH, 3}, {PFX_SUPER, 5},{PFX_PRED, 4}, {PFX_POD, 3},
{PFX_SAMO, 4}, {PFX_OBO, 3}, {PFX_PROTIV, 6}}}; {PFX_NAD, 3},
{PFX_PERE, 4},
{PFX_SVERH, 5},
{PFX_MEZH, 3},
{PFX_SUPER, 5},
{PFX_PRED, 4},
{PFX_SAMO, 4},
{PFX_OBO, 3},
{PFX_PROTIV, 6}}};
constexpr uint32_t SFX_NOST[4] = {0x043D, 0x043E, 0x0441, 0x0442}; constexpr uint32_t SFX_NOST[4] = {0x043D, 0x043E, 0x0441, 0x0442};
constexpr uint32_t SFX_STVO[4] = {0x0441, 0x0442, 0x0432, 0x043E}; constexpr uint32_t SFX_STVO[4] = {0x0441, 0x0442, 0x0432, 0x043E};
@@ -41,10 +50,18 @@ constexpr uint32_t SFX_ISM[3] = {0x0438, 0x0437, 0x043C};
constexpr uint32_t SFX_LIV[5] = {0x043B, 0x0438, 0x0432, 0x044B, 0x0439}; constexpr uint32_t SFX_LIV[5] = {0x043B, 0x0438, 0x0432, 0x044B, 0x0439};
constexpr uint32_t SFX_OST[4] = {0x043E, 0x0441, 0x0442, 0x044C}; constexpr uint32_t SFX_OST[4] = {0x043E, 0x0441, 0x0442, 0x044C};
constexpr std::array<CyrillicLiteral, 12> RUSSIAN_SUFFIXES = {{{SFX_NOST, 4}, {SFX_STVO, 4}, {SFX_ENIE, 4}, constexpr std::array<CyrillicLiteral, 12> RUSSIAN_SUFFIXES = {{{SFX_NOST, 4},
{SFX_ATION, 4}, {SFX_CHIK, 3}, {SFX_NIK, 3}, {SFX_STVO, 4},
{SFX_TEL, 4}, {SFX_SKII, 4}, {SFX_AL, 6}, {SFX_ENIE, 4},
{SFX_ISM, 3}, {SFX_LIV, 5}, {SFX_OST, 4}}}; {SFX_ATION, 4},
{SFX_CHIK, 3},
{SFX_NIK, 3},
{SFX_TEL, 4},
{SFX_SKII, 4},
{SFX_AL, 6},
{SFX_ISM, 3},
{SFX_LIV, 5},
{SFX_OST, 4}}};
std::vector<uint32_t> lowercaseCyrillicWord(const std::vector<CodepointInfo>& cps) { std::vector<uint32_t> lowercaseCyrillicWord(const std::vector<CodepointInfo>& cps) {
std::vector<uint32_t> lower; std::vector<uint32_t> lower;
@@ -308,8 +325,9 @@ bool nextToSoftSign(const std::vector<CodepointInfo>& cps, const size_t index) {
void appendMorphologyBreaks(const std::vector<CodepointInfo>& cps, const std::vector<uint32_t>& lowerWord, void appendMorphologyBreaks(const std::vector<CodepointInfo>& cps, const std::vector<uint32_t>& lowerWord,
std::vector<size_t>& indexes) { std::vector<size_t>& indexes) {
appendLiteralBreaks(lowerWord, RUSSIAN_PREFIXES, RUSSIAN_SUFFIXES, appendLiteralBreaks(
[&](const size_t breakIndex) { return russianBreakAllowed(cps, breakIndex); }, indexes); lowerWord, RUSSIAN_PREFIXES, RUSSIAN_SUFFIXES,
[&](const size_t breakIndex) { return russianBreakAllowed(cps, breakIndex); }, indexes);
} }
// Produces syllable break indexes tailored to Russian phonotactics. // Produces syllable break indexes tailored to Russian phonotactics.