Remove HTML entity parsing (#274)

## Summary

* Remove HTML entity parsing
  * This has been completely useless since the introduction of expat
* expat tries to parse all entities in the document, but only knows of
HTML ones
* Parsing will never end with HTML entities in the text, so the
additional step to parse them that we had went completely unused
* We should figure out the best way to parse that content in the future,
but for now remove that module as it generates a lot of heap allocations
with its map and strings
This commit is contained in:
Dave Allie
2026-01-07 22:08:43 +10:00
committed by GitHub
parent 46fa186b82
commit 2b12a65011
3 changed files with 3 additions and 174 deletions

View File

@@ -6,7 +6,6 @@
#include <expat.h>
#include "../Page.h"
#include "../htmlEntities.h"
const char* HEADER_TAGS[] = {"h1", "h2", "h3", "h4", "h5", "h6"};
constexpr int NUM_HEADER_TAGS = sizeof(HEADER_TAGS) / sizeof(HEADER_TAGS[0]);
@@ -130,7 +129,7 @@ void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char
// Currently looking at whitespace, if there's anything in the partWordBuffer, flush it
if (self->partWordBufferIndex > 0) {
self->partWordBuffer[self->partWordBufferIndex] = '\0';
self->currentTextBlock->addWord(std::move(replaceHtmlEntities(self->partWordBuffer)), fontStyle);
self->currentTextBlock->addWord(self->partWordBuffer, fontStyle);
self->partWordBufferIndex = 0;
}
// Skip the whitespace char
@@ -155,7 +154,7 @@ void XMLCALL ChapterHtmlSlimParser::characterData(void* userData, const XML_Char
// If we're about to run out of space, then cut the word off and start a new one
if (self->partWordBufferIndex >= MAX_WORD_SIZE) {
self->partWordBuffer[self->partWordBufferIndex] = '\0';
self->currentTextBlock->addWord(std::move(replaceHtmlEntities(self->partWordBuffer)), fontStyle);
self->currentTextBlock->addWord(self->partWordBuffer, fontStyle);
self->partWordBufferIndex = 0;
}
@@ -197,7 +196,7 @@ void XMLCALL ChapterHtmlSlimParser::endElement(void* userData, const XML_Char* n
}
self->partWordBuffer[self->partWordBufferIndex] = '\0';
self->currentTextBlock->addWord(std::move(replaceHtmlEntities(self->partWordBuffer)), fontStyle);
self->currentTextBlock->addWord(self->partWordBuffer, fontStyle);
self->partWordBufferIndex = 0;
}
}