- Add uncompressed dictionary (.dict) file support to avoid decompression memory issues
- Implement chunked on-demand parsing for large definitions
- Add backward navigation with re-parse capability
- Limit cached pages to MAX_CACHED_PAGES (4) to prevent memory exhaustion
- Add helper script for extracting/recompressing dictzip files
#include "StarDict.h"

#include <HardwareSerial.h>
#include <SDCardManager.h>
#include <miniz.h>

#include <algorithm>
#include <cctype>
#include <cstdlib>  // malloc, free, strtoul
#include <cstring>  // strchr, strcmp

#include "DictPrefixIndex.generated.h"
StarDict::StarDict(const std::string& basePath) : basePath(basePath) {}

StarDict::~StarDict() {
  if (dzInfo.chunkSizes) {
    free(dzInfo.chunkSizes);
    dzInfo.chunkSizes = nullptr;
  }
}
uint32_t StarDict::readBE32(const uint8_t* data) {
  return (static_cast<uint32_t>(data[0]) << 24) | (static_cast<uint32_t>(data[1]) << 16) |
         (static_cast<uint32_t>(data[2]) << 8) | static_cast<uint32_t>(data[3]);
}
bool StarDict::loadInfo() {
  const std::string ifoPath = basePath + ".ifo";
  FsFile file;
  if (!SdMan.openFileForRead("DICT", ifoPath, file)) {
    Serial.printf("[%lu] [DICT] Failed to open .ifo file: %s\n", millis(), ifoPath.c_str());
    return false;
  }

  char buffer[256];
  while (file.available()) {
    const int len = file.fgets(buffer, sizeof(buffer));
    if (len <= 0) break;

    // Remove newline
    char* newline = strchr(buffer, '\n');
    if (newline) *newline = '\0';
    newline = strchr(buffer, '\r');
    if (newline) *newline = '\0';

    // Parse key=value
    char* eq = strchr(buffer, '=');
    if (!eq) continue;

    *eq = '\0';
    const char* key = buffer;
    const char* value = eq + 1;

    if (strcmp(key, "bookname") == 0) {
      info.bookname = value;
    } else if (strcmp(key, "wordcount") == 0) {
      info.wordcount = strtoul(value, nullptr, 10);
    } else if (strcmp(key, "idxfilesize") == 0) {
      info.idxfilesize = strtoul(value, nullptr, 10);
    } else if (strcmp(key, "sametypesequence") == 0) {
      info.sametypesequence = value[0];
    } else if (strcmp(key, "synwordcount") == 0) {
      info.synwordcount = strtoul(value, nullptr, 10);
    }
  }

  file.close();
  info.loaded = true;

  Serial.printf("[%lu] [DICT] Loaded dictionary: %s (%u words)\n", millis(), info.bookname.c_str(), info.wordcount);
  return true;
}
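// For reference, the .ifo parsed above is a small key=value text file. A typical
// file looks roughly like this (illustrative values only):
//
//   StarDict's dict ifo file
//   version=3.0.0
//   bookname=Example Dictionary
//   wordcount=123456
//   synwordcount=4567
//   idxfilesize=2345678
//   sametypesequence=h
//
// Only the keys handled in loadInfo() above are used; everything else is ignored.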
bool StarDict::loadDictzipHeader() {
  if (dzInfo.loaded) return true;

  const std::string dzPath = basePath + ".dict.dz";
  FsFile file;
  if (!SdMan.openFileForRead("DICT", dzPath, file)) {
    Serial.printf("[%lu] [DICT] Failed to open .dict.dz file\n", millis());
    return false;
  }

  // Read gzip header
  uint8_t header[10];
  if (file.read(header, 10) != 10) {
    file.close();
    return false;
  }

  // Verify gzip magic number
  if (header[0] != 0x1f || header[1] != 0x8b) {
    Serial.printf("[%lu] [DICT] Not a valid gzip file\n", millis());
    file.close();
    return false;
  }

  // Check for extra field flag (bit 2)
  const uint8_t flags = header[3];
  if (!(flags & 0x04)) {
    Serial.printf("[%lu] [DICT] No extra field - not a dictzip file\n", millis());
    file.close();
    return false;
  }

  // Read extra field length
  uint8_t xlenBuf[2];
  if (file.read(xlenBuf, 2) != 2) {
    file.close();
    return false;
  }
  const uint16_t xlen = xlenBuf[0] | (xlenBuf[1] << 8);

  // Read extra field
  auto* extraField = static_cast<uint8_t*>(malloc(xlen));
  if (!extraField) {
    file.close();
    return false;
  }

  if (file.read(extraField, xlen) != xlen) {
    free(extraField);
    file.close();
    return false;
  }

  // Parse dictzip subfield (SI1='R', SI2='A')
  bool foundDictzip = false;
  uint16_t pos = 0;
  while (pos + 4 <= xlen) {
    const uint8_t si1 = extraField[pos];
    const uint8_t si2 = extraField[pos + 1];
    const uint16_t slen = extraField[pos + 2] | (extraField[pos + 3] << 8);

    if (si1 == 'R' && si2 == 'A' && pos + 4 + slen <= xlen) {
      // Dictzip subfield found
      // Format: ver(2) + chlen(2) + count(2) + sizes[count](2 each)
      const uint8_t* data = &extraField[pos + 4];
      // uint16_t version = data[0] | (data[1] << 8); // Usually 1
      dzInfo.chunkLength = data[2] | (data[3] << 8);
      dzInfo.chunkCount = data[4] | (data[5] << 8);

      dzInfo.chunkSizes = static_cast<uint16_t*>(malloc(dzInfo.chunkCount * sizeof(uint16_t)));
      if (!dzInfo.chunkSizes) {
        free(extraField);
        file.close();
        return false;
      }

      for (uint16_t i = 0; i < dzInfo.chunkCount; i++) {
        dzInfo.chunkSizes[i] = data[6 + i * 2] | (data[7 + i * 2] << 8);
      }

      foundDictzip = true;
      break;
    }

    pos += 4 + slen;
  }

  free(extraField);

  if (!foundDictzip) {
    Serial.printf("[%lu] [DICT] Dictzip subfield not found\n", millis());
    file.close();
    return false;
  }

  // Calculate header size (10 + 2 + xlen + optional fields)
  dzInfo.headerSize = 10 + 2 + xlen;

  // Skip FNAME if present (bit 3)
  if (flags & 0x08) {
    file.seek(dzInfo.headerSize);
    while (file.available()) {
      uint8_t c;
      file.read(&c, 1);
      dzInfo.headerSize++;
      if (c == 0) break;
    }
  }

  // Skip FCOMMENT if present (bit 4)
  if (flags & 0x10) {
    file.seek(dzInfo.headerSize);
    while (file.available()) {
      uint8_t c;
      file.read(&c, 1);
      dzInfo.headerSize++;
      if (c == 0) break;
    }
  }

  // Skip FHCRC if present (bit 1)
  if (flags & 0x02) {
    dzInfo.headerSize += 2;
  }

  file.close();
  dzInfo.loaded = true;

  Serial.printf("[%lu] [DICT] Dictzip: %u chunks of %u bytes, header size %u\n", millis(), dzInfo.chunkCount,
                dzInfo.chunkLength, dzInfo.headerSize);
  return true;
}
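// For reference, the gzip FEXTRA payload parsed above has this layout in a
// dictzip file (multi-byte fields little-endian):
//
//   SI1  SI2  LEN   VER  CHLEN  CHCNT  CHSIZE[0] ... CHSIZE[CHCNT-1]
//   'R'  'A'  2 B   2 B  2 B    2 B    2 B each (compressed size of chunk i)
//
// Every chunk decompresses to CHLEN bytes (except possibly the last), which is
// what makes random access into the .dict.dz possible: the byte at uncompressed
// offset N lives in chunk N / CHLEN at offset N % CHLEN.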
bool StarDict::begin() {
  if (!loadInfo()) return false;

  // Try uncompressed .dict file first (preferred - no memory overhead)
  const std::string dictPath = basePath + ".dict";
  FsFile testFile;
  if (SdMan.openFileForRead("DICT", dictPath, testFile)) {
    testFile.close();
    useUncompressed = true;
    Serial.printf("[%lu] [DICT] Using uncompressed .dict file (no decompression needed)\n", millis());
    return true;
  }

  // Fall back to compressed .dict.dz
  useUncompressed = false;
  if (!loadDictzipHeader()) return false;
  return true;
}
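// File set assumed by begin(): all paths share the same base, so a given
// basePath expands to
//   <basePath>.ifo      - metadata (required)
//   <basePath>.idx      - sorted word index (required)
//   <basePath>.dict     - uncompressed definitions (preferred when present)
//   <basePath>.dict.dz  - dictzip-compressed definitions (fallback)
//   <basePath>.syn      - synonyms (optional, used only when synwordcount > 0)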
bool StarDict::readWordAtPosition(FsFile& idxFile, uint32_t& position, std::string& word, uint32_t& dictOffset,
                                  uint32_t& dictSize) {
  idxFile.seek(position);

  // Read null-terminated word
  word.clear();
  char c;
  while (idxFile.read(&c, 1) == 1) {
    if (c == '\0') break;
    word += c;
    if (word.length() > 256) {
      // Safety limit
      return false;
    }
  }

  if (word.empty()) return false;

  // Read 4-byte big-endian offset followed by 4-byte big-endian size
  uint8_t buf[8];
  if (idxFile.read(buf, 8) != 8) return false;

  dictOffset = readBE32(buf);
  dictSize = readBE32(buf + 4);

  position = idxFile.position();
  return true;
}
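// Each .idx record read above is laid out as:
//
//   word bytes ... 0x00 | offset (4 bytes, big-endian) | size (4 bytes, big-endian)
//
// where offset/size address the definition inside the .dict (or the uncompressed
// stream of the .dict.dz). Records are stored sorted, which is what allows the
// prefix jump table in lookup() to start scanning partway through the file.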
bool StarDict::readDefinitionDirect(uint32_t offset, uint32_t size, std::string& definition) {
  // Read directly from uncompressed .dict file - no decompression needed!
  const std::string dictPath = basePath + ".dict";
  FsFile file;
  if (!SdMan.openFileForRead("DICT", dictPath, file)) {
    Serial.printf("[DICT-DBG] Failed to open .dict file\n");
    return false;
  }

  // Seek to the definition offset
  if (!file.seek(offset)) {
    Serial.printf("[DICT-DBG] Failed to seek to offset %lu\n", offset);
    file.close();
    return false;
  }

  // Read the definition directly into the string
  definition.resize(size);
  const int bytesRead = file.read(&definition[0], size);
  file.close();

  if (bytesRead != static_cast<int>(size)) {
    Serial.printf("[DICT-DBG] Read %d bytes, expected %lu\n", bytesRead, size);
    definition.clear();
    return false;
  }

  return true;
}
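// Note: at least for dictionaries that set sametypesequence (the case this reader
// handles), the .dict payload is simply every definition concatenated back to back
// with no framing or length prefixes, so the (offset, size) pair from the .idx is
// all that is needed to slice one definition out of it.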
bool StarDict::decompressDefinition(uint32_t offset, uint32_t size, std::string& definition) {
  if (!dzInfo.loaded) {
    Serial.printf("[DICT-DBG] dzInfo not loaded!\n");
    return false;
  }

  const std::string dzPath = basePath + ".dict.dz";
  FsFile file;
  if (!SdMan.openFileForRead("DICT", dzPath, file)) {
    Serial.printf("[DICT-DBG] Failed to open dict.dz file\n");
    return false;
  }

  // Calculate which chunk(s) we need
  const uint32_t startChunk = offset / dzInfo.chunkLength;
  const uint32_t endChunk = (offset + size - 1) / dzInfo.chunkLength;
  const uint32_t startOffsetInChunk = offset % dzInfo.chunkLength;

  Serial.printf("[DICT-DBG] Chunks: start=%lu, end=%lu, total=%u\n",
                startChunk, endChunk, dzInfo.chunkCount);

  if (endChunk >= dzInfo.chunkCount) {
    Serial.printf("[DICT-DBG] endChunk %lu >= chunkCount %u\n", endChunk, dzInfo.chunkCount);
    file.close();
    return false;
  }

  // Calculate file offset for start chunk
  uint32_t fileOffset = dzInfo.headerSize;
  for (uint32_t i = 0; i < startChunk; i++) {
    fileOffset += dzInfo.chunkSizes[i];
  }

  // Calculate actual max compressed size needed for the chunks we'll process
  uint32_t maxCompressedSize = 0;
  for (uint32_t i = startChunk; i <= endChunk; i++) {
    if (dzInfo.chunkSizes[i] > maxCompressedSize) {
      maxCompressedSize = dzInfo.chunkSizes[i];
    }
  }

  // Allocate buffers - allocate inflator FIRST (smallest) to reduce fragmentation impact
  // tinfl_decompressor is ~11KB, so total allocations are ~85KB
  Serial.printf("[DICT-DBG] Allocating inflator=%u, comp=%lu, decomp=%u bytes\n",
                sizeof(tinfl_decompressor), maxCompressedSize, dzInfo.chunkLength);

  auto* inflator = static_cast<tinfl_decompressor*>(malloc(sizeof(tinfl_decompressor)));
  if (!inflator) {
    Serial.printf("[DICT-DBG] inflator alloc failed! (need %u bytes)\n", sizeof(tinfl_decompressor));
    file.close();
    return false;
  }

  auto* compressedBuf = static_cast<uint8_t*>(malloc(maxCompressedSize));
  if (!compressedBuf) {
    Serial.printf("[DICT-DBG] compressedBuf alloc failed!\n");
    free(inflator);
    file.close();
    return false;
  }
  auto* decompressedBuf = static_cast<uint8_t*>(malloc(dzInfo.chunkLength));
  if (!decompressedBuf) {
    Serial.printf("[DICT-DBG] decompressedBuf alloc failed!\n");
    free(inflator);
    free(compressedBuf);
    file.close();
    return false;
  }

  definition.clear();
  definition.reserve(size);

  // Process each needed chunk (reusing inflator allocation)
  for (uint32_t chunk = startChunk; chunk <= endChunk; chunk++) {
    const uint16_t compressedSize = dzInfo.chunkSizes[chunk];

    // Seek and read compressed data
    file.seek(fileOffset);
    if (file.read(compressedBuf, compressedSize) != compressedSize) {
      Serial.printf("[DICT-DBG] File read failed at offset %lu, size %u\n", fileOffset, compressedSize);
      free(inflator);
      free(compressedBuf);
      free(decompressedBuf);
      file.close();
      return false;
    }

    // Decompress: try zlib-wrapped data first, fall back to raw deflate below
    tinfl_init(inflator);

    size_t inBytes = compressedSize;
    size_t outBytes = dzInfo.chunkLength;
    const tinfl_status status =
        tinfl_decompress(inflator, compressedBuf, &inBytes, decompressedBuf, decompressedBuf, &outBytes,
                         TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF | TINFL_FLAG_PARSE_ZLIB_HEADER);

    if (status != TINFL_STATUS_DONE && status != TINFL_STATUS_HAS_MORE_OUTPUT) {
      // Try again as raw deflate (no zlib header)
      tinfl_init(inflator);
      inBytes = compressedSize;
      outBytes = dzInfo.chunkLength;
      tinfl_decompress(inflator, compressedBuf, &inBytes, decompressedBuf, decompressedBuf, &outBytes,
                       TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF);
    }

    // Extract the portion we need from this chunk
    uint32_t copyStart = 0;
    uint32_t copyEnd = outBytes;

    if (chunk == startChunk) {
      copyStart = startOffsetInChunk;
    }
    if (chunk == endChunk) {
      const uint32_t endOffsetInChunk = (offset + size) - (endChunk * dzInfo.chunkLength);
      if (endOffsetInChunk < copyEnd) {
        copyEnd = endOffsetInChunk;
      }
    }

    if (copyEnd > copyStart) {
      definition.append(reinterpret_cast<char*>(decompressedBuf + copyStart), copyEnd - copyStart);
    }

    fileOffset += compressedSize;
  }

  free(inflator);
  free(compressedBuf);
  free(decompressedBuf);
  file.close();

  return true;
}
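// Worked example of the chunk arithmetic above (made-up numbers): with
// chunkLength = 50000, a definition at offset = 145000 with size = 10000 gives
//   startChunk         = 145000 / 50000 = 2
//   endChunk           = (145000 + 10000 - 1) / 50000 = 3
//   startOffsetInChunk = 145000 % 50000 = 45000
// so chunk 2 contributes its last 5000 decompressed bytes and chunk 3 its first
// 5000, and only those two chunks are read and inflated for this lookup.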
// StarDict comparison function: case-insensitive matching
int StarDict::stardictStrcmp(const std::string& a, const std::string& b) {
  // Case-insensitive comparison (like g_ascii_strcasecmp)
  size_t i = 0;
  while (i < a.length() && i < b.length()) {
    const int ca = std::tolower(static_cast<unsigned char>(a[i]));
    const int cb = std::tolower(static_cast<unsigned char>(b[i]));
    if (ca != cb) return ca - cb;
    i++;
  }
  if (a.length() != b.length()) {
    return static_cast<int>(a.length()) - static_cast<int>(b.length());
  }
  // Case-insensitive match found
  return 0;
}
std::string StarDict::normalizeWord(const std::string& word) {
  std::string result;
  result.reserve(word.length());

  // Trim leading whitespace
  size_t start = 0;
  while (start < word.length() && std::isspace(static_cast<unsigned char>(word[start]))) {
    start++;
  }

  // Trim trailing whitespace
  size_t end = word.length();
  while (end > start && std::isspace(static_cast<unsigned char>(word[end - 1]))) {
    end--;
  }

  // Convert to lowercase
  for (size_t i = start; i < end; i++) {
    result += static_cast<char>(std::tolower(static_cast<unsigned char>(word[i])));
  }

  return result;
}
StarDict::LookupResult StarDict::lookup(const std::string& word) {
  LookupResult result;
  result.word = word;

  if (!info.loaded) {
    return result;
  }

  const std::string normalizedSearch = normalizeWord(word);
  if (normalizedSearch.empty()) {
    return result;
  }

  Serial.printf("[DICT-DBG] Searching for: '%s' (normalized: '%s')\n", word.c_str(), normalizedSearch.c_str());

  // First try .idx (main entries) - use prefix jump table for fast lookup
  const std::string idxPath = basePath + ".idx";
  FsFile idxFile;
  if (!SdMan.openFileForRead("DICT", idxPath, idxFile)) {
    Serial.printf("[%lu] [DICT] Failed to open index file\n", millis());
    return result;
  }

  // Jump to the relevant section using prefix index (if word has 2+ alpha chars)
  uint32_t position = 0;
  if (normalizedSearch.length() >= 2 && DictPrefixIndex::isAlpha(normalizedSearch[0]) &&
      DictPrefixIndex::isAlpha(normalizedSearch[1])) {
    const uint16_t prefixIdx = DictPrefixIndex::prefixToIndex(normalizedSearch[0], normalizedSearch[1]);
    position = DictPrefixIndex::dictPrefixOffsets[prefixIdx];
  }
  Serial.printf("[DICT-DBG] Starting at position %lu (prefix: %c%c)\n",
                position, normalizedSearch[0], normalizedSearch[1]);
  bool found = false;
  uint32_t wordCount = 0;

  while (position < info.idxfilesize) {
    std::string currentWord;
    uint32_t dictOffset, dictSize;

    if (!readWordAtPosition(idxFile, position, currentWord, dictOffset, dictSize)) {
      break;
    }
    wordCount++;
    if (wordCount % 50000 == 0) {
      Serial.printf("[DICT-DBG] Progress: %lu words scanned, pos=%lu, current='%s'\n",
                    wordCount, position, currentWord.c_str());
    }

    // Use stardictStrcmp for case-insensitive matching
    const int cmp = stardictStrcmp(normalizedSearch, currentWord);

    if (cmp == 0) {
      Serial.printf("[DICT-DBG] MATCH: '%s' == '%s' (offset=%lu, size=%lu)\n",
                    normalizedSearch.c_str(), currentWord.c_str(), dictOffset, dictSize);
      std::string definition;
      const bool loaded = useUncompressed
                              ? readDefinitionDirect(dictOffset, dictSize, definition)
                              : decompressDefinition(dictOffset, dictSize, definition);
      if (loaded) {
        Serial.printf("[DICT-DBG] Definition loaded, %u bytes\n", definition.length());
        if (!found) {
          result.word = currentWord;
          result.definition = definition;
          result.found = true;
          found = true;
        } else {
          result.definition += "</html>" + definition;
        }
      } else {
        Serial.printf("[DICT-DBG] Definition load FAILED!\n");
      }
      // Continue scanning for additional matches (same word, different case)
    } else if (found) {
      // We had matches but now moved past them - safe to stop
      break;
    }
    // Note: Cannot use early-break before first match because prefix index
    // may not land exactly at target position
  }

  Serial.printf("[DICT-DBG] Search complete: %lu words scanned, found=%s\n", wordCount, found ? "YES" : "NO");
  idxFile.close();

  // If not found in main index, try synonym file with prefix jump
  if (!found && info.synwordcount > 0) {
    const std::string synPath = basePath + ".syn";
    FsFile synFile;
    if (SdMan.openFileForRead("DICT", synPath, synFile)) {
      const uint32_t synFileSize = synFile.size();

      // Jump to the relevant section using prefix index (if word has 2+ alpha chars)
      uint32_t synPosition = 0;
      if (normalizedSearch.length() >= 2 && DictPrefixIndex::isAlpha(normalizedSearch[0]) &&
          DictPrefixIndex::isAlpha(normalizedSearch[1])) {
        const uint16_t prefixIdx = DictPrefixIndex::prefixToIndex(normalizedSearch[0], normalizedSearch[1]);
        synPosition = DictPrefixIndex::synPrefixOffsets[prefixIdx];
        synFile.seek(synPosition);
      }

      while (synFile.position() < synFileSize) {
        // Read synonym word (null-terminated)
        std::string synWord;
        char c;
        while (synFile.read(&c, 1) == 1 && c != '\0') {
          synWord += c;
        }

        // Read 4-byte big-endian index
        uint8_t idxBytes[4];
        if (synFile.read(idxBytes, 4) != 4) break;
        const uint32_t mainIdx = readBE32(idxBytes);

        // Use stardictStrcmp for case-insensitive comparison
        const int cmp = stardictStrcmp(normalizedSearch, synWord);

        if (cmp == 0) {
          // Found synonym - look up the main entry by index
          FsFile idxFile2;
          if (SdMan.openFileForRead("DICT", idxPath, idxFile2)) {
            uint32_t pos = 0;
            uint32_t entryNum = 0;
            while (entryNum < mainIdx && pos < info.idxfilesize) {
              std::string w;
              uint32_t off, sz;
              if (!readWordAtPosition(idxFile2, pos, w, off, sz)) break;
              entryNum++;
            }
            // Now read the target entry
            if (entryNum == mainIdx) {
              std::string mainWord;
              uint32_t dictOffset, dictSize;
              if (readWordAtPosition(idxFile2, pos, mainWord, dictOffset, dictSize)) {
                std::string definition;
                const bool loaded = useUncompressed
                                        ? readDefinitionDirect(dictOffset, dictSize, definition)
                                        : decompressDefinition(dictOffset, dictSize, definition);
                if (loaded) {
                  result.word = synWord;
                  result.definition = definition;
                  result.found = true;
                  found = true;
                }
              }
            }
            idxFile2.close();
          }
          break;  // Found a match, stop searching
        }
        // Note: Cannot use early-break optimization here because prefix index
        // may not land exactly at target position
      }
      synFile.close();
    }
  }

  return result;
}
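// Typical call sequence (sketch only; the base path is hypothetical and depends
// on where dictionaries live on the SD card):
//
//   StarDict dict("/dictionaries/webster");
//   if (dict.begin()) {
//     StarDict::LookupResult r = dict.lookup("serendipity");
//     if (r.found) {
//       const std::string plain = dict.stripHtml(r.definition);
//       // render `plain` ...
//     }
//   }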
// Helper to decode a single HTML entity; on entry i is the position of the '&'.
// Returns the decoded string and leaves i on the final ';' so the caller's loop
// increment moves past the entity.
static std::string decodeHtmlEntity(const std::string& html, size_t& i) {
  const size_t start = i;  // Position of '&'
  const size_t remaining = html.length() - start;

  // Numeric entities: &#NNN; or &#xHHH;
  if (remaining > 2 && html[start + 1] == '#') {
    size_t numStart = start + 2;
    bool isHex = false;
    if (remaining > 3 && (html[numStart] == 'x' || html[numStart] == 'X')) {
      isHex = true;
      numStart++;
    }

    size_t numEnd = numStart;
    while (numEnd < html.length() && html[numEnd] != ';') {
      const char c = html[numEnd];
      if (isHex) {
        if (!std::isxdigit(static_cast<unsigned char>(c))) break;
      } else {
        if (!std::isdigit(static_cast<unsigned char>(c))) break;
      }
      numEnd++;
    }

    if (numEnd > numStart && numEnd < html.length() && html[numEnd] == ';') {
      const std::string numStr = html.substr(numStart, numEnd - numStart);
      unsigned long codepoint = std::strtoul(numStr.c_str(), nullptr, isHex ? 16 : 10);
      i = numEnd;  // Will be incremented by caller's loop

      // Convert codepoint to UTF-8
      std::string utf8;
      if (codepoint < 0x80) {
        utf8 += static_cast<char>(codepoint);
      } else if (codepoint < 0x800) {
        utf8 += static_cast<char>(0xC0 | (codepoint >> 6));
        utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
      } else if (codepoint < 0x10000) {
        utf8 += static_cast<char>(0xE0 | (codepoint >> 12));
        utf8 += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
        utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
      } else if (codepoint < 0x110000) {
        utf8 += static_cast<char>(0xF0 | (codepoint >> 18));
        utf8 += static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F));
        utf8 += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
        utf8 += static_cast<char>(0x80 | (codepoint & 0x3F));
      }
      return utf8;
    }
  }

  // Named entities - find the semicolon first
  size_t semicolon = html.find(';', start + 1);
  if (semicolon != std::string::npos && semicolon - start < 12) {
    const std::string entity = html.substr(start, semicolon - start + 1);

    // Common named entities (entity name -> UTF-8 replacement)
    struct EntityMapping {
      const char* entity;
      const char* replacement;
    };
    static const EntityMapping entities[] = {
        {"&nbsp;", " "},
        {"&lt;", "<"},
        {"&gt;", ">"},
        {"&amp;", "&"},
        {"&quot;", "\""},
        {"&apos;", "'"},
        {"&mdash;", "\xe2\x80\x94"},   // em dash
        {"&ndash;", "\xe2\x80\x93"},   // en dash
        {"&hellip;", "\xe2\x80\xa6"},  // ellipsis
        {"&rsquo;", "\xe2\x80\x99"},   // right single quote
        {"&lsquo;", "\xe2\x80\x98"},   // left single quote
        {"&rdquo;", "\xe2\x80\x9d"},   // right double quote
        {"&ldquo;", "\xe2\x80\x9c"},   // left double quote
        {"&deg;", "\xc2\xb0"},         // degree sign
        {"&times;", "\xc3\x97"},       // multiplication sign
        {"&divide;", "\xc3\xb7"},      // division sign
        {"&plusmn;", "\xc2\xb1"},      // plus-minus
        {"&frac12;", "\xc2\xbd"},      // one half
        {"&frac14;", "\xc2\xbc"},      // one quarter
        {"&frac34;", "\xc2\xbe"},      // three quarters
        {"&cent;", "\xc2\xa2"},        // cent
        {"&pound;", "\xc2\xa3"},       // pound
        {"&euro;", "\xe2\x82\xac"},    // euro
        {"&yen;", "\xc2\xa5"},         // yen
        {"&copy;", "\xc2\xa9"},        // copyright
        {"&reg;", "\xc2\xae"},         // registered
        {"&trade;", "\xe2\x84\xa2"},   // trademark
        {"&bull;", "\xe2\x80\xa2"},    // bullet
        {"&middot;", "\xc2\xb7"},      // middle dot
        {"&sect;", "\xc2\xa7"},        // section sign
        {"&para;", "\xc2\xb6"},        // pilcrow
        {"&dagger;", "\xe2\x80\xa0"},  // dagger
        {"&Dagger;", "\xe2\x80\xa1"},  // double dagger
        {"&iexcl;", "\xc2\xa1"},       // inverted exclamation
        {"&iquest;", "\xc2\xbf"},      // inverted question mark
        {"&laquo;", "\xc2\xab"},       // left guillemet
        {"&raquo;", "\xc2\xbb"},       // right guillemet
        {"&shy;", ""},                 // soft hyphen (dropped)
        {"&ensp;", " "},
        {"&emsp;", " "},
        {"&thinsp;", " "},
        {"&zwj;", ""},
        {"&zwnj;", ""},
    };

    for (const auto& mapping : entities) {
      if (entity == mapping.entity) {
        i = semicolon;  // Will be incremented by caller's loop
        return mapping.replacement;
      }
    }
  }

  // Unknown entity - return just the ampersand and let the rest be processed normally
  return "&";
}
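// Example of the numeric path above: "&#233;" parses to codepoint 233 (0xE9),
// which is >= 0x80 and < 0x800, so it is emitted as the two UTF-8 bytes
// 0xC3 0xA9 ("é"). Named entities fall through to the lookup table instead.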
// Helper to check if a tag is a block-level element that needs line breaks
static bool isBlockTag(const std::string& tag, bool isClosing) {
  // Normalize to lowercase for comparison
  std::string lowerTag = tag;
  for (char& c : lowerTag) {
    c = std::tolower(static_cast<unsigned char>(c));
  }

  // Block-level tags that should have line breaks
  if (lowerTag == "p" || lowerTag == "div" || lowerTag == "br" || lowerTag == "hr" || lowerTag == "li" ||
      lowerTag == "dt" || lowerTag == "dd" || lowerTag == "tr" || lowerTag == "h1" || lowerTag == "h2" ||
      lowerTag == "h3" || lowerTag == "h4" || lowerTag == "h5" || lowerTag == "h6" || lowerTag == "blockquote" ||
      lowerTag == "pre" || lowerTag == "ol" || lowerTag == "ul") {
    return true;
  }
  return false;
}
std::string StarDict::stripHtml(const std::string& html) {
  std::string result;
  result.reserve(html.length());

  bool inTag = false;
  bool lastWasSpace = false;
  bool lastWasNewline = false;

  for (size_t i = 0; i < html.length(); i++) {
    const char c = html[i];

    if (c == '<') {
      // Parse the tag name
      size_t tagStart = i + 1;
      bool isClosing = false;

      // Skip whitespace after <
      while (tagStart < html.length() && std::isspace(static_cast<unsigned char>(html[tagStart]))) {
        tagStart++;
      }

      // Check for closing tag
      if (tagStart < html.length() && html[tagStart] == '/') {
        isClosing = true;
        tagStart++;
      }

      // Extract tag name
      size_t tagEnd = tagStart;
      while (tagEnd < html.length() && !std::isspace(static_cast<unsigned char>(html[tagEnd])) && html[tagEnd] != '>' &&
             html[tagEnd] != '/') {
        tagEnd++;
      }

      const std::string tagName = html.substr(tagStart, tagEnd - tagStart);

      // Check if this is a block-level element
      if (isBlockTag(tagName, isClosing)) {
        // Add line break for block elements
        if (!result.empty() && !lastWasNewline) {
          result += '\n';
          lastWasNewline = true;
          lastWasSpace = true;
        }
      }

      inTag = true;
    } else if (c == '>') {
      inTag = false;
    } else if (!inTag) {
      // Handle HTML entities
      if (c == '&') {
        const std::string decoded = decodeHtmlEntity(html, i);
        if (!decoded.empty()) {
          // Check if decoded content is whitespace
          bool allSpace = true;
          for (const char dc : decoded) {
            if (!std::isspace(static_cast<unsigned char>(dc))) {
              allSpace = false;
              break;
            }
          }

          if (allSpace) {
            if (!lastWasSpace) {
              result += ' ';
              lastWasSpace = true;
            }
          } else {
            result += decoded;
            lastWasSpace = false;
            lastWasNewline = false;
          }
        }
        continue;
      }

      // Collapse whitespace
      if (std::isspace(static_cast<unsigned char>(c))) {
        if (!lastWasSpace) {
          result += ' ';
          lastWasSpace = true;
        }
      } else {
        result += c;
        lastWasSpace = false;
        lastWasNewline = false;
      }
    }
  }

  // Trim trailing whitespace
  while (!result.empty() && std::isspace(static_cast<unsigned char>(result.back()))) {
    result.pop_back();
  }

  return result;
}
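// Example: stripHtml("<b>cat</b> <i>n.</i><br>a small animal")
// returns "cat n.\na small animal": inline tags are dropped, the block-level
// <br> becomes a newline, and runs of whitespace collapse to a single space.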