refactor: unify punctuation trimming to handle footnotes in hyphenation logic
This commit is contained in:
parent
32cffaf504
commit
f02872542f
@ -67,10 +67,16 @@ bool isLatinLetter(const uint32_t cp) {
|
||||
|
||||
// Returns true when `cp` lies in the inclusive codepoint range U+0400..U+052F,
// which this module treats as Cyrillic letters.
bool isCyrillicLetter(const uint32_t cp) {
  constexpr uint32_t kRangeFirst = 0x0400;
  constexpr uint32_t kRangeLast = 0x052F;
  if (cp < kRangeFirst) {
    return false;
  }
  return cp <= kRangeLast;
}
|
||||
|
||||
// Returns true when `cp` is accepted by either the Latin or the Cyrillic letter predicate.
bool isAlphabetic(const uint32_t cp) {
  if (isLatinLetter(cp)) {
    return true;
  }
  return isCyrillicLetter(cp);
}
|
||||
bool isAlphabetic(const uint32_t cp) {
|
||||
if (isLatinLetter(cp) || isCyrillicLetter(cp) || isAsciiDigit(cp)) {
|
||||
return true;
|
||||
}
|
||||
return cp > 0x7F && !isPunctuation(cp);
|
||||
}
|
||||
|
||||
bool isPunctuation(const uint32_t cp) {
|
||||
switch (cp) {
|
||||
case '-':
|
||||
case '.':
|
||||
case ',':
|
||||
case '!':
|
||||
@ -87,8 +93,11 @@ bool isPunctuation(const uint32_t cp) {
|
||||
case 0x2019: // ’
|
||||
case 0x201C: // “
|
||||
case 0x201D: // ”
|
||||
case 0x00A0: // no-break space
|
||||
case '{':
|
||||
case '}':
|
||||
case '[':
|
||||
case ']':
|
||||
case '/':
|
||||
case 0x203A: // ›
|
||||
case 0x2026: // …
|
||||
@ -107,18 +116,6 @@ bool isExplicitHyphen(const uint32_t cp) {
|
||||
case 0x058A: // Armenian hyphen
|
||||
case 0x2010: // hyphen
|
||||
case 0x2011: // non-breaking hyphen
|
||||
case 0x2012: // figure dash
|
||||
case 0x2013: // en dash
|
||||
case 0x2014: // em dash
|
||||
case 0x2015: // horizontal bar
|
||||
case 0x2043: // hyphen bullet
|
||||
case 0x207B: // superscript minus
|
||||
case 0x208B: // subscript minus
|
||||
case 0x2212: // minus sign
|
||||
case 0x2E17: // double oblique hyphen
|
||||
case 0x2E3A: // two-em dash
|
||||
case 0x2E3B: // three-em dash
|
||||
case 0xFE58: // small em dash
|
||||
case 0xFE63: // small hyphen-minus
|
||||
case 0xFF0D: // fullwidth hyphen-minus
|
||||
return true;
|
||||
@ -129,7 +126,28 @@ bool isExplicitHyphen(const uint32_t cp) {
|
||||
|
||||
// Returns true only for U+00AD, the codepoint this module treats as a soft hyphen.
bool isSoftHyphen(const uint32_t cp) {
  constexpr uint32_t kSoftHyphen = 0x00AD;
  return cp == kSoftHyphen;
}
|
||||
|
||||
void trimSurroundingPunctuation(std::vector<CodepointInfo>& cps) {
|
||||
void trimSurroundingPunctuationAndFootnote(std::vector<CodepointInfo>& cps) {
|
||||
if (cps.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Remove trailing footnote references like [12], even if punctuation trails after the closing bracket.
|
||||
if (cps.size() >= 3) {
|
||||
int end = static_cast<int>(cps.size()) - 1;
|
||||
while (end >= 0 && isPunctuation(cps[end].value)) {
|
||||
--end;
|
||||
}
|
||||
int pos = end;
|
||||
if (pos >= 0 && isAsciiDigit(cps[pos].value)) {
|
||||
while (pos >= 0 && isAsciiDigit(cps[pos].value)) {
|
||||
--pos;
|
||||
}
|
||||
if (pos >= 0 && cps[pos].value == '[' && end - pos > 1) {
|
||||
cps.erase(cps.begin() + pos, cps.end());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
while (!cps.empty() && isPunctuation(cps.front().value)) {
|
||||
cps.erase(cps.begin());
|
||||
}
|
||||
@ -152,27 +170,3 @@ std::vector<CodepointInfo> collectCodepoints(const std::string& word) {
|
||||
|
||||
return cps;
|
||||
}
|
||||
|
||||
void trimTrailingFootnoteReference(std::vector<CodepointInfo>& cps) {
|
||||
if (cps.size() < 3) {
|
||||
return;
|
||||
}
|
||||
int closing = static_cast<int>(cps.size()) - 1;
|
||||
if (cps[closing].value != ']') {
|
||||
return;
|
||||
}
|
||||
int pos = closing - 1;
|
||||
if (pos < 0 || !isAsciiDigit(cps[pos].value)) {
|
||||
return;
|
||||
}
|
||||
while (pos >= 0 && isAsciiDigit(cps[pos].value)) {
|
||||
--pos;
|
||||
}
|
||||
if (pos < 0 || cps[pos].value != '[') {
|
||||
return;
|
||||
}
|
||||
if (closing - pos <= 1) {
|
||||
return;
|
||||
}
|
||||
cps.erase(cps.begin() + pos, cps.end());
|
||||
}
|
||||
|
||||
@ -21,6 +21,5 @@ bool isPunctuation(uint32_t cp);
|
||||
bool isAsciiDigit(uint32_t cp);
|
||||
bool isExplicitHyphen(uint32_t cp);
|
||||
bool isSoftHyphen(uint32_t cp);
|
||||
void trimSurroundingPunctuation(std::vector<CodepointInfo>& cps);
|
||||
void trimSurroundingPunctuationAndFootnote(std::vector<CodepointInfo>& cps);
|
||||
std::vector<CodepointInfo> collectCodepoints(const std::string& word);
|
||||
void trimTrailingFootnoteReference(std::vector<CodepointInfo>& cps);
|
||||
|
||||
@ -1,8 +1,5 @@
|
||||
#include "Hyphenator.h"
|
||||
|
||||
#include <Utf8.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
|
||||
#include "HyphenationCommon.h"
|
||||
@ -60,13 +57,10 @@ std::vector<Hyphenator::BreakInfo> Hyphenator::breakOffsets(const std::string& w
|
||||
|
||||
// Convert to codepoints and normalize word boundaries.
|
||||
auto cps = collectCodepoints(word);
|
||||
trimSurroundingPunctuation(cps);
|
||||
trimTrailingFootnoteReference(cps);
|
||||
trimSurroundingPunctuationAndFootnote(cps);
|
||||
const auto* hyphenator = cachedHyphenator_;
|
||||
const size_t minPrefix = hyphenator ? hyphenator->minPrefix() : LiangWordConfig::kDefaultMinPrefix;
|
||||
const size_t minSuffix = hyphenator ? hyphenator->minSuffix() : LiangWordConfig::kDefaultMinSuffix;
|
||||
|
||||
// Explicit hyphen markers (soft or hard) take precedence over heuristic breaks.
|
||||
// Explicit hyphen markers (soft or hard) take precedence over language breaks.
|
||||
auto explicitBreakInfos = buildExplicitBreakInfos(cps);
|
||||
if (!explicitBreakInfos.empty()) {
|
||||
return explicitBreakInfos;
|
||||
@ -80,6 +74,8 @@ std::vector<Hyphenator::BreakInfo> Hyphenator::breakOffsets(const std::string& w
|
||||
|
||||
// Only add fallback breaks if needed
|
||||
if (includeFallback && indexes.empty()) {
|
||||
const size_t minPrefix = hyphenator ? hyphenator->minPrefix() : LiangWordConfig::kDefaultMinPrefix;
|
||||
const size_t minSuffix = hyphenator ? hyphenator->minSuffix() : LiangWordConfig::kDefaultMinSuffix;
|
||||
for (size_t idx = minPrefix; idx + minSuffix <= cps.size(); ++idx) {
|
||||
indexes.push_back(idx);
|
||||
}
|
||||
|
||||
@ -128,8 +128,7 @@ std::string positionsToHyphenated(const std::string& word, const std::vector<siz
|
||||
|
||||
std::vector<size_t> hyphenateWordWithHyphenator(const std::string& word, const LanguageHyphenator& hyphenator) {
|
||||
auto cps = collectCodepoints(word);
|
||||
trimSurroundingPunctuation(cps);
|
||||
trimTrailingFootnoteReference(cps);
|
||||
trimSurroundingPunctuationAndFootnote(cps);
|
||||
|
||||
return hyphenator.breakIndexes(cps);
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user