crosspoint-reader/lib/StarDict/DictHtmlParser.cpp

#include "DictHtmlParser.h"

#include <Epub/ParsedText.h>
#include <GfxRenderer.h>

#include <algorithm>
#include <cctype>
#include <stack>

// Decodes the HTML character reference that begins at html[i].
//
// Supports decimal (&#NNN;) and hexadecimal (&#xHHH;) numeric references plus a fixed table of
// common named entities.
//
// html: text being parsed; html[i] must be the '&' that introduces the reference.
// i:    in  - index of the '&'.
//       out - index of the terminating ';' when a reference was recognised (the caller's loop
//             increment then steps past it); left unchanged otherwise.
//
// Returns the UTF-8 replacement text. The result is empty for invisible entities (e.g. &shy;) and
// for numeric references that do not name an encodable character (NUL, UTF-16 surrogates, values
// above U+10FFFF); those references are consumed and dropped. Unrecognised input yields a literal
// "&" and consumes nothing beyond it.
std::string DictHtmlParser::decodeEntity(const std::string& html, size_t& i) {
  const size_t start = i;  // Position of '&'
  const size_t length = html.length();
  const size_t remaining = length - start;

  // Numeric entities: &#NNN; or &#xHHH;
  if (remaining > 2 && html[start + 1] == '#') {
    size_t pos = start + 2;
    const bool isHex = remaining > 3 && (html[pos] == 'x' || html[pos] == 'X');
    if (isHex) pos++;
    const size_t digitsStart = pos;

    constexpr unsigned long kMaxCodepoint = 0x10FFFF;
    const unsigned long base = isHex ? 16 : 10;
    unsigned long codepoint = 0;
    for (; pos < length; pos++) {
      const char c = html[pos];
      unsigned long digit = 0;
      if (c >= '0' && c <= '9') {
        digit = static_cast<unsigned long>(c - '0');
      } else if (isHex && c >= 'a' && c <= 'f') {
        digit = static_cast<unsigned long>(c - 'a') + 10;
      } else if (isHex && c >= 'A' && c <= 'F') {
        digit = static_cast<unsigned long>(c - 'A') + 10;
      } else {
        break;  // ';' or any other non-digit ends the number
      }
      // Saturate just above the Unicode range: arbitrarily long digit runs can never overflow,
      // and they still register as out of range below.
      if (codepoint <= kMaxCodepoint) codepoint = codepoint * base + digit;
    }

    if (pos > digitsStart && pos < length && html[pos] == ';') {
      i = pos;  // Will be incremented by caller's loop

      // NUL would embed a terminator in the word and surrogates have no valid UTF-8 form, so
      // treat both like out-of-range values: consume the reference but emit nothing.
      const bool isSurrogate = codepoint >= 0xD800 && codepoint <= 0xDFFF;
      if (codepoint == 0 || codepoint > kMaxCodepoint || isSurrogate) {
        return std::string();
      }

      // Convert codepoint to UTF-8
      std::string utf8;
      if (codepoint < 0x80) {
        utf8 += static_cast<char>(codepoint);
      } else if (codepoint < 0x800) {
        utf8 += static_cast<char>(0xC0 | (codepoint >> 6));
        utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
      } else if (codepoint < 0x10000) {
        utf8 += static_cast<char>(0xE0 | (codepoint >> 12));
        utf8 += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
        utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
      } else {
        utf8 += static_cast<char>(0xF0 | (codepoint >> 18));
        utf8 += static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F));
        utf8 += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
        utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
      }
      return utf8;
    }
  }

  // Named entities - look for the terminating ';' within the next few characters only. Bounding
  // the scan keeps a stray '&' in long text from triggering a search to the end of the string.
  const size_t scanEnd = std::min(length, start + 12);
  size_t semicolon = start + 1;
  while (semicolon < scanEnd && html[semicolon] != ';') {
    semicolon++;
  }

  if (semicolon < scanEnd) {
    const size_t entityLength = semicolon - start + 1;  // Includes '&' and ';'

    // Common named entities
    struct EntityMapping {
      const char* entity;
      const char* replacement;
    };
    static const EntityMapping entities[] = {
        {"&nbsp;", " "},
        {"&lt;", "<"},
        {"&gt;", ">"},
        {"&amp;", "&"},
        {"&quot;", "\""},
        {"&apos;", "'"},
        {"&mdash;", "\xe2\x80\x94"},   // —
        {"&ndash;", "\xe2\x80\x93"},   // –
        {"&hellip;", "\xe2\x80\xa6"},  // …
        {"&rsquo;", "\xe2\x80\x99"},   // ’
        {"&lsquo;", "\xe2\x80\x98"},   // ‘
        {"&rdquo;", "\xe2\x80\x9d"},   // ”
        {"&ldquo;", "\xe2\x80\x9c"},   // “
        {"&deg;", "\xc2\xb0"},         // °
        {"&times;", "\xc3\x97"},       // ×
        {"&divide;", "\xc3\xb7"},      // ÷
        {"&plusmn;", "\xc2\xb1"},      // ±
        {"&frac12;", "\xc2\xbd"},      // ½
        {"&frac14;", "\xc2\xbc"},      // ¼
        {"&frac34;", "\xc2\xbe"},      // ¾
        {"&cent;", "\xc2\xa2"},        // ¢
        {"&pound;", "\xc2\xa3"},       // £
        {"&euro;", "\xe2\x82\xac"},    // €
        {"&yen;", "\xc2\xa5"},         // ¥
        {"&copy;", "\xc2\xa9"},        // ©
        {"&reg;", "\xc2\xae"},         // ®
        {"&trade;", "\xe2\x84\xa2"},   // ™
        {"&bull;", "\xe2\x80\xa2"},    // •
        {"&middot;", "\xc2\xb7"},      // ·
        {"&sect;", "\xc2\xa7"},        // §
        {"&para;", "\xc2\xb6"},        // ¶
        {"&dagger;", "\xe2\x80\xa0"},  // †
        {"&Dagger;", "\xe2\x80\xa1"},  // ‡
        {"&iexcl;", "\xc2\xa1"},       // ¡
        {"&iquest;", "\xc2\xbf"},      // ¿
        {"&laquo;", "\xc2\xab"},       // «
        {"&raquo;", "\xc2\xbb"},       // »
        {"&lrm;", ""},                 // Left-to-right mark (invisible)
        {"&rlm;", ""},                 // Right-to-left mark (invisible)
        {"&shy;", ""},                 // Soft hyphen
        {"&ensp;", " "},
        {"&emsp;", " "},
        {"&thinsp;", " "},
        {"&zwj;", ""},
        {"&zwnj;", ""},
    };

    for (const auto& mapping : entities) {
      // Compare in place; matches only when the whole "&name;" span equals the table key.
      if (html.compare(start, entityLength, mapping.entity) == 0) {
        i = semicolon;  // Will be incremented by caller's loop
        return mapping.replacement;
      }
    }
  }

  // Unknown entity - return just the ampersand
  return "&";
}

// Reads the element name of the tag whose contents begin at html[start] (the character right
// after '<').
//
// Whitespace before the name is skipped and a leading '/' marks a closing tag, which is reported
// through isClosing. The name is the following run of alphanumeric characters (with '!' also
// accepted) and is returned lowercased; it is empty when no such run exists.
std::string DictHtmlParser::extractTagName(const std::string& html, size_t start, bool& isClosing) {
  const size_t length = html.length();
  size_t cursor = start;

  // Tolerate blanks between '<' and the name.
  for (; cursor < length; ++cursor) {
    if (!std::isspace(static_cast<unsigned char>(html[cursor]))) break;
  }

  // A slash at this point means an end tag such as </p>.
  isClosing = cursor < length && html[cursor] == '/';
  if (isClosing) {
    ++cursor;
  }

  // Gather the name characters, folding them to lowercase on the way.
  std::string name;
  for (; cursor < length; ++cursor) {
    const unsigned char ch = static_cast<unsigned char>(html[cursor]);
    if (!std::isalnum(ch) && ch != '!') break;
    name += static_cast<char>(std::tolower(ch));
  }
  return name;
}

// Reports whether tagName (already lowercased) is one of the elements that end the current
// paragraph when encountered.
bool DictHtmlParser::isBlockTag(const std::string& tagName) {
  static const char* const kBlockTags[] = {"p", "div", "br", "hr", "li", "ol", "ul", "dt", "dd", "html"};

  for (const char* candidate : kBlockTags) {
    if (tagName == candidate) {
      return true;
    }
  }
  return false;
}

// True for the elements rendered in bold: <b> and <strong>.
bool DictHtmlParser::isBoldTag(const std::string& tagName) {
  if (tagName == "b") {
    return true;
  }
  return tagName == "strong";
}

// True for the elements rendered in italics: <i> and <em>.
bool DictHtmlParser::isItalicTag(const std::string& tagName) {
  if (tagName == "i") {
    return true;
  }
  return tagName == "em";
}

// True for the elements rendered underlined: <u> and <ins>.
bool DictHtmlParser::isUnderlineTag(const std::string& tagName) {
  if (tagName == "u") {
    return true;
  }
  return tagName == "ins";
}

// True only for the superscript element <sup>.
bool DictHtmlParser::isSuperscriptTag(const std::string& tagName) {
  return tagName.compare("sup") == 0;
}

// True only for the list item element <li>.
bool DictHtmlParser::isListItemTag(const std::string& tagName) {
  return tagName.compare("li") == 0;
}

// True only for the ordered list element <ol>.
bool DictHtmlParser::isOrderedListTag(const std::string& tagName) {
  return tagName.compare("ol") == 0;
}

void DictHtmlParser::parse(const std::string& html, int fontId, const GfxRenderer& renderer, uint16_t viewportWidth,
                           const std::function<void(std::shared_ptr<TextBlock>)>& onTextBlock) {
  // Current paragraph being built
  ParsedText currentParagraph(TextBlock::Style::LEFT_ALIGN, false, false);

  // State tracking
  int boldDepth = 0;
  int italicDepth = 0;
  int underlineDepth = 0;
  bool inSuperscript = false;
  bool inTag = false;

  // List tracking
  std::stack<int> listCounters;  // Stack for nested lists (0 = unordered, >0 = ordered counter)

  // Current word being accumulated
  std::string currentWord;
  bool lastWasSpace = true;  // Start true to skip leading spaces

  // Helper to flush current word to paragraph
  auto flushWord = [&]() {
    if (currentWord.empty()) return;

    // Determine font style
    EpdFontFamily::Style fontStyle = EpdFontFamily::REGULAR;
    if (boldDepth > 0 && italicDepth > 0) {
      fontStyle = EpdFontFamily::BOLD_ITALIC;
    } else if (boldDepth > 0) {
      fontStyle = EpdFontFamily::BOLD;
    } else if (italicDepth > 0) {
      fontStyle = EpdFontFamily::ITALIC;
    }

    currentParagraph.addWord(currentWord, fontStyle, underlineDepth > 0);
    currentWord.clear();
    lastWasSpace = false;
  };

  // Helper to flush current paragraph (create TextBlocks)
  auto flushParagraph = [&]() {
    flushWord();
    if (!currentParagraph.isEmpty()) {
      currentParagraph.layoutAndExtractLines(renderer, fontId, viewportWidth, onTextBlock);
      currentParagraph = ParsedText(TextBlock::Style::LEFT_ALIGN, false, false);
    }
    lastWasSpace = true;
  };

  // Parse the HTML
  for (size_t i = 0; i < html.length(); i++) {
    const char c = html[i];

    if (c == '<') {
      // Start of tag - flush current word first
      flushWord();

      // Find end of tag
      size_t tagEnd = html.find('>', i);
      if (tagEnd == std::string::npos) {
        // Malformed HTML - treat rest as text
        currentWord += c;
        continue;
      }

      // Extract tag name
      bool isClosing = false;
      std::string tagName = extractTagName(html, i + 1, isClosing);

      // Handle different tag types
      if (isBoldTag(tagName)) {
        if (isClosing) {
          boldDepth = std::max(0, boldDepth - 1);
        } else {
          boldDepth++;
        }
      } else if (isItalicTag(tagName)) {
        if (isClosing) {
          italicDepth = std::max(0, italicDepth - 1);
        } else {
          italicDepth++;
        }
      } else if (isUnderlineTag(tagName)) {
        if (isClosing) {
          underlineDepth = std::max(0, underlineDepth - 1);
        } else {
          underlineDepth++;
        }
      } else if (isSuperscriptTag(tagName)) {
        if (isClosing) {
          inSuperscript = false;
        } else {
          inSuperscript = true;
          // Add caret prefix for superscript
          currentWord += '^';
        }
      } else if (isOrderedListTag(tagName)) {
        if (isClosing) {
          if (!listCounters.empty()) {
            listCounters.pop();
          }
        } else {
          // Check if it's an unordered list style
          std::string tagContent = html.substr(i, tagEnd - i);
          if (tagContent.find("list-style-type:lower-alpha") != std::string::npos) {
            listCounters.push(-1);  // -1 = alphabetic
          } else {
            listCounters.push(1);  // Start at 1 for ordered
          }
        }
      } else if (tagName == "ul") {
        if (isClosing) {
          if (!listCounters.empty()) {
            listCounters.pop();
          }
        } else {
          listCounters.push(0);  // 0 = unordered (bullet)
        }
      } else if (isListItemTag(tagName) && !isClosing) {
        // Start of list item - flush paragraph and add bullet/number
        flushParagraph();

        std::string prefix;
        if (!listCounters.empty()) {
          int counter = listCounters.top();
          if (counter == 0) {
            // Unordered - bullet point
            prefix = "\xe2\x80\xa2 ";  // • bullet
          } else if (counter == -1) {
            // Alphabetic - not fully supported, just use bullet
            prefix = "  ";
          } else {
            // Ordered - number
            char numBuf[8];
            snprintf(numBuf, sizeof(numBuf), "%d. ", counter);
            prefix = numBuf;
            listCounters.pop();
            listCounters.push(counter + 1);  // Increment for next item
          }
        } else {
          // No list context - just indent
          prefix = "\xe2\x80\xa2 ";  // • bullet
        }

        // Add prefix as a word (em-space for indent + prefix)
        currentParagraph.addWord("\xe2\x80\x83" + prefix, EpdFontFamily::REGULAR, false);
        lastWasSpace = true;
      } else if (isBlockTag(tagName)) {
        // Block element - flush paragraph
        flushParagraph();

        // Special handling for </html> which separates dictionary entries
        if (tagName == "html" && isClosing) {
          // Add extra spacing between entries
          flushParagraph();
        }
      }

      // Skip to end of tag
      i = tagEnd;
    } else if (c == '&') {
      // HTML entity
      std::string decoded = decodeEntity(html, i);
      if (!decoded.empty()) {
        if (decoded == " ") {
          // Space entity - treat as space
          if (!lastWasSpace) {
            flushWord();
            lastWasSpace = true;
          }
        } else {
          currentWord += decoded;
          lastWasSpace = false;
        }
      }
    } else if (std::isspace(static_cast<unsigned char>(c))) {
      // Whitespace - flush word and collapse
      if (!lastWasSpace) {
        flushWord();
        lastWasSpace = true;
      }
    } else {
      // Regular character
      currentWord += c;
      lastWasSpace = false;
    }
  }

  // Flush any remaining content
  flushParagraph();
}