Files
crosspoint-reader-mod/lib/Epub/Epub/parsers/ChapterHtmlSlimParser.h
Jake Kenneally 6e51afb977 fix: Account for nbsp; character as non-breaking space (#757)
## Summary

Closes #743.

**What is the goal of this PR?**

- Add back handling for HTML entities in expat. This was originally part
of the code that got removed
[here](https://github.com/crosspoint-reader/crosspoint-reader/pull/274)
- Handle ` ` characters to resolve issue #743 

**What changes are included?**

- Brought back HTML entity table from previous commit and refactored it
to use a static const char * table with linear lookup to reduce heap
allocations.
- Used `XML_SetDefaultHandlerExpand` in expat to parse out the entities
correctly, without needing them defined in DOCTYPE
- Added handling for ` ` so that the text stays together and
doesn't break onto a new line with text separated by an ` `

## Additional Context

- This supersedes [this
PR](https://github.com/crosspoint-reader/crosspoint-reader/pull/751)
that simply handled `nbsp;` as whitespace. Instead, we want that
character to serve its true purpose and affect the line-breaking
algorithm.
- Updated my test EPUB [here](https://github.com/jdk2pq/css-test-epub)
with ` ` characters examples at the end of the book

---

### AI Usage

While CrossPoint doesn't have restrictions on AI tools in contributing,
please be transparent about their usage as it
helps set the right context for reviewers.

Did you use AI tools to help write this code? _**YES**_, Claude Code
2026-02-13 15:46:46 +01:00

97 lines
3.4 KiB
C++

#pragma once
#include <expat.h>
#include <climits>
#include <functional>
#include <memory>
#include "../ParsedText.h"
#include "../blocks/TextBlock.h"
#include "../css/CssParser.h"
#include "../css/CssStyle.h"
class Page;
class GfxRenderer;
#define MAX_WORD_SIZE 200
class ChapterHtmlSlimParser {
const std::string& filepath;
GfxRenderer& renderer;
std::function<void(std::unique_ptr<Page>)> completePageFn;
std::function<void()> popupFn; // Popup callback
int depth = 0;
int skipUntilDepth = INT_MAX;
int boldUntilDepth = INT_MAX;
int italicUntilDepth = INT_MAX;
int underlineUntilDepth = INT_MAX;
// buffer for building up words from characters, will auto break if longer than this
// leave one char at end for null pointer
char partWordBuffer[MAX_WORD_SIZE + 1] = {};
int partWordBufferIndex = 0;
bool nextWordContinues = false; // true when next flushed word attaches to previous (inline element boundary)
std::unique_ptr<ParsedText> currentTextBlock = nullptr;
std::unique_ptr<Page> currentPage = nullptr;
int16_t currentPageNextY = 0;
int fontId;
float lineCompression;
bool extraParagraphSpacing;
uint8_t paragraphAlignment;
uint16_t viewportWidth;
uint16_t viewportHeight;
bool hyphenationEnabled;
const CssParser* cssParser;
bool embeddedStyle;
// Style tracking (replaces depth-based approach)
struct StyleStackEntry {
int depth = 0;
bool hasBold = false, bold = false;
bool hasItalic = false, italic = false;
bool hasUnderline = false, underline = false;
};
std::vector<StyleStackEntry> inlineStyleStack;
CssStyle currentCssStyle;
bool effectiveBold = false;
bool effectiveItalic = false;
bool effectiveUnderline = false;
void updateEffectiveInlineStyle();
void startNewTextBlock(const BlockStyle& blockStyle);
void flushPartWordBuffer();
void makePages();
// XML callbacks
static void XMLCALL startElement(void* userData, const XML_Char* name, const XML_Char** atts);
static void XMLCALL characterData(void* userData, const XML_Char* s, int len);
static void XMLCALL defaultHandlerExpand(void* userData, const XML_Char* s, int len);
static void XMLCALL endElement(void* userData, const XML_Char* name);
public:
explicit ChapterHtmlSlimParser(const std::string& filepath, GfxRenderer& renderer, const int fontId,
const float lineCompression, const bool extraParagraphSpacing,
const uint8_t paragraphAlignment, const uint16_t viewportWidth,
const uint16_t viewportHeight, const bool hyphenationEnabled,
const std::function<void(std::unique_ptr<Page>)>& completePageFn,
const bool embeddedStyle, const std::function<void()>& popupFn = nullptr,
const CssParser* cssParser = nullptr)
: filepath(filepath),
renderer(renderer),
fontId(fontId),
lineCompression(lineCompression),
extraParagraphSpacing(extraParagraphSpacing),
paragraphAlignment(paragraphAlignment),
viewportWidth(viewportWidth),
viewportHeight(viewportHeight),
hyphenationEnabled(hyphenationEnabled),
completePageFn(completePageFn),
popupFn(popupFn),
cssParser(cssParser),
embeddedStyle(embeddedStyle) {}
~ChapterHtmlSlimParser() = default;
bool parseAndBuildPages();
void addLineToPage(std::shared_ptr<TextBlock> line);
};