Skip soft hyphens. (#195)
For now, skip soft hyphens (U+00AD) while parsing; later, they can be handled properly in the layouter. See https://github.com/daveallie/crosspoint-reader/discussions/17#discussioncomment-15378475
This commit is contained in:
parent
9e59a5106b
commit
39080c0e51
@@ -137,6 +137,21 @@ void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Skip soft-hyphen with UTF-8 representation (U+00AD) = 0xC2 0xAD
|
||||||
|
const XML_Char SHY_BYTE_1 = static_cast<XML_Char>(0xC2);
|
||||||
|
const XML_Char SHY_BYTE_2 = static_cast<XML_Char>(0xAD);
|
||||||
|
// 1. Check for the start of the 2-byte Soft Hyphen sequence
|
||||||
|
if (s[i] == SHY_BYTE_1) {
|
||||||
|
// 2. Check if the next byte exists AND if it completes the sequence
|
||||||
|
// We must check i + 1 < len to prevent reading past the end of the buffer.
|
||||||
|
if ((i + 1 < len) && (s[i + 1] == SHY_BYTE_2)) {
|
||||||
|
// Sequence 0xC2 0xAD found!
|
||||||
|
// Skip the current byte (0xC2) and the next byte (0xAD)
|
||||||
|
i++; // Increment 'i' one more time to skip the 0xAD byte
|
||||||
|
continue; // Skip the rest of the loop and move to the next iteration
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// If we're about to run out of space, then cut the word off and start a new one
|
// If we're about to run out of space, then cut the word off and start a new one
|
||||||
if (self->partWordBufferIndex >= MAX_WORD_SIZE) {
|
if (self->partWordBufferIndex >= MAX_WORD_SIZE) {
|
||||||
self->partWordBuffer[self->partWordBufferIndex] = '\0';
|
self->partWordBuffer[self->partWordBufferIndex] = '\0';
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user