## Summary

Closes #743.

**What is the goal of this PR?**

- Add back handling for HTML entities in expat. This was originally part of the code that got removed [here](https://github.com/crosspoint-reader/crosspoint-reader/pull/274)
- Handle `&nbsp;` characters to resolve issue #743

**What changes are included?**

- Brought back the HTML entity table from a previous commit and refactored it to use a static `const char*` table with linear lookup to reduce heap allocations.
- Used `XML_SetDefaultHandlerExpand` in expat to parse out the entities correctly, without needing them defined in DOCTYPE
- Added handling for `&nbsp;` so that the text stays together and doesn't break onto a new line with text separated by an `&nbsp;`

## Additional Context

- This supersedes [this PR](https://github.com/crosspoint-reader/crosspoint-reader/pull/751) that simply handled `&nbsp;` as whitespace. Instead, we want that character to serve its true purpose and affect the line-breaking algorithm.
- Updated my test EPUB [here](https://github.com/jdk2pq/css-test-epub) with `&nbsp;` character examples at the end of the book

---

### AI Usage

While CrossPoint doesn't have restrictions on AI tools in contributing, please be transparent about their usage as it helps set the right context for reviewers.

Did you use AI tools to help write this code? _**YES**_, Claude Code
97 lines
3.4 KiB
C++
97 lines
3.4 KiB
C++
#pragma once
|
|
|
|
#include <expat.h>
|
|
|
|
#include <climits>
|
|
#include <functional>
|
|
#include <memory>
|
|
|
|
#include "../ParsedText.h"
|
|
#include "../blocks/TextBlock.h"
|
|
#include "../css/CssParser.h"
|
|
#include "../css/CssStyle.h"
|
|
|
|
// Forward declaration: this header only names Page as the std::unique_ptr<Page> element type.
class Page;
|
|
// Forward declaration: this header only holds and accepts GfxRenderer by reference.
class GfxRenderer;
|
|
|
|
// Capacity, in chars, of the partial-word buffer (partWordBuffer is sized MAX_WORD_SIZE + 1
// so there is always room for the terminating '\0').
#define MAX_WORD_SIZE 200
|
|
|
|
class ChapterHtmlSlimParser {
|
|
const std::string& filepath;
|
|
GfxRenderer& renderer;
|
|
std::function<void(std::unique_ptr<Page>)> completePageFn;
|
|
std::function<void()> popupFn; // Popup callback
|
|
int depth = 0;
|
|
int skipUntilDepth = INT_MAX;
|
|
int boldUntilDepth = INT_MAX;
|
|
int italicUntilDepth = INT_MAX;
|
|
int underlineUntilDepth = INT_MAX;
|
|
// buffer for building up words from characters, will auto break if longer than this
|
|
// leave one char at end for null pointer
|
|
char partWordBuffer[MAX_WORD_SIZE + 1] = {};
|
|
int partWordBufferIndex = 0;
|
|
bool nextWordContinues = false; // true when next flushed word attaches to previous (inline element boundary)
|
|
std::unique_ptr<ParsedText> currentTextBlock = nullptr;
|
|
std::unique_ptr<Page> currentPage = nullptr;
|
|
int16_t currentPageNextY = 0;
|
|
int fontId;
|
|
float lineCompression;
|
|
bool extraParagraphSpacing;
|
|
uint8_t paragraphAlignment;
|
|
uint16_t viewportWidth;
|
|
uint16_t viewportHeight;
|
|
bool hyphenationEnabled;
|
|
const CssParser* cssParser;
|
|
bool embeddedStyle;
|
|
|
|
// Style tracking (replaces depth-based approach)
|
|
struct StyleStackEntry {
|
|
int depth = 0;
|
|
bool hasBold = false, bold = false;
|
|
bool hasItalic = false, italic = false;
|
|
bool hasUnderline = false, underline = false;
|
|
};
|
|
std::vector<StyleStackEntry> inlineStyleStack;
|
|
CssStyle currentCssStyle;
|
|
bool effectiveBold = false;
|
|
bool effectiveItalic = false;
|
|
bool effectiveUnderline = false;
|
|
|
|
void updateEffectiveInlineStyle();
|
|
void startNewTextBlock(const BlockStyle& blockStyle);
|
|
void flushPartWordBuffer();
|
|
void makePages();
|
|
// XML callbacks
|
|
static void XMLCALL startElement(void* userData, const XML_Char* name, const XML_Char** atts);
|
|
static void XMLCALL characterData(void* userData, const XML_Char* s, int len);
|
|
static void XMLCALL defaultHandlerExpand(void* userData, const XML_Char* s, int len);
|
|
static void XMLCALL endElement(void* userData, const XML_Char* name);
|
|
|
|
public:
|
|
explicit ChapterHtmlSlimParser(const std::string& filepath, GfxRenderer& renderer, const int fontId,
|
|
const float lineCompression, const bool extraParagraphSpacing,
|
|
const uint8_t paragraphAlignment, const uint16_t viewportWidth,
|
|
const uint16_t viewportHeight, const bool hyphenationEnabled,
|
|
const std::function<void(std::unique_ptr<Page>)>& completePageFn,
|
|
const bool embeddedStyle, const std::function<void()>& popupFn = nullptr,
|
|
const CssParser* cssParser = nullptr)
|
|
|
|
: filepath(filepath),
|
|
renderer(renderer),
|
|
fontId(fontId),
|
|
lineCompression(lineCompression),
|
|
extraParagraphSpacing(extraParagraphSpacing),
|
|
paragraphAlignment(paragraphAlignment),
|
|
viewportWidth(viewportWidth),
|
|
viewportHeight(viewportHeight),
|
|
hyphenationEnabled(hyphenationEnabled),
|
|
completePageFn(completePageFn),
|
|
popupFn(popupFn),
|
|
cssParser(cssParser),
|
|
embeddedStyle(embeddedStyle) {}
|
|
|
|
~ChapterHtmlSlimParser() = default;
|
|
bool parseAndBuildPages();
|
|
void addLineToPage(std::shared_ptr<TextBlock> line);
|
|
};
|