// crosspoint-reader-mod/src/util/Dictionary.cpp

#include "Dictionary.h"

#include <HalStorage.h>

#include <algorithm>
#include <cctype>
#include <cstring>

namespace {
// On-card locations of the dictionary files and the sparse-index cache.
constexpr const char* IDX_PATH = "/.dictionary/dictionary.idx";
constexpr const char* DICT_PATH = "/.dictionary/dictionary.dict";
constexpr const char* CACHE_PATH = "/.dictionary/dictionary.cache";
constexpr uint32_t CACHE_MAGIC = 0x44494358;  // "DICX"

// Byte-wise comparison that folds only ASCII 'A'..'Z' to lowercase
// (same idea as g_ascii_strcasecmp).  Bytes outside that range are compared
// unchanged, as unsigned values.
// Returns <0, 0 or >0; the magnitude is the difference of the first
// differing (folded) bytes.
int asciiCaseCmp(const char* s1, const char* s2) {
  const auto foldAscii = [](unsigned char ch) -> int {
    return (ch >= 'A' && ch <= 'Z') ? static_cast<int>(ch) + ('a' - 'A') : static_cast<int>(ch);
  };

  const auto* lhs = reinterpret_cast<const unsigned char*>(s1);
  const auto* rhs = reinterpret_cast<const unsigned char*>(s2);
  for (; *lhs != 0 && *rhs != 0; ++lhs, ++rhs) {
    const int delta = foldAscii(*lhs) - foldAscii(*rhs);
    if (delta != 0) return delta;
  }
  // At least one string ended: the shorter one sorts first.
  return static_cast<int>(*lhs) - static_cast<int>(*rhs);
}

// Ordering used for StarDict .idx entries: ASCII case-insensitive comparison
// decides first; strings that are equal ignoring case are ordered by plain
// strcmp (so "Professor" sorts before "professor").
int stardictCmp(const char* s1, const char* s2) {
  const int folded = asciiCaseCmp(s1, s2);
  return folded != 0 ? folded : std::strcmp(s1, s2);
}
}  // namespace

// Shared lookup state (static members of Dictionary).

// Byte offsets into the .idx file, one for every SPARSE_INTERVAL-th entry.
// Filled by loadIndex() or restored by loadCachedIndex().
std::vector<uint32_t> Dictionary::sparseOffsets;
// Number of .idx entries counted by the scan (or taken from the cache header).
uint32_t Dictionary::totalWords = 0;
// True once the sparse table has been loaded; reset by deleteCache().
bool Dictionary::indexLoaded = false;

// Reports whether the dictionary index file is present on storage.
bool Dictionary::exists() {
  const bool idxPresent = Storage.exists(IDX_PATH);
  return idxPresent;
}

// Reports whether a sparse-index cache file is present on storage.
bool Dictionary::cacheExists() {
  const bool cachePresent = Storage.exists(CACHE_PATH);
  return cachePresent;
}

// Removes the on-disk cache file and forgets the in-memory index, so the
// next lookup has to rebuild the sparse table from the .idx file.
void Dictionary::deleteCache() {
  Storage.remove(CACHE_PATH);

  // Drop everything loadIndex()/loadCachedIndex() populated.
  indexLoaded = false;
  totalWords = 0;
  sparseOffsets.clear();
}

// Normalises a raw token for lookup: strips leading and trailing bytes that
// std::isalnum rejects, then lowercases what is left with std::tolower.
// Returns an empty string when the input contains no alphanumeric byte.
std::string Dictionary::cleanWord(const std::string& word) {
  const auto isWordChar = [](char ch) { return std::isalnum(static_cast<unsigned char>(ch)) != 0; };

  // First alphanumeric byte; none at all means there is nothing to look up.
  const auto first = std::find_if(word.begin(), word.end(), isWordChar);
  if (first == word.end()) return "";

  // One past the last alphanumeric byte (guaranteed to lie after `first`).
  const auto last = std::find_if(word.rbegin(), word.rend(), isWordChar).base();

  std::string cleaned(first, last);
  for (char& ch : cleaned) {
    ch = static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
  }
  return cleaned;
}

// ---------------------------------------------------------------------------
// Cache: persists the sparse offset table to SD card so subsequent boots skip
// the full .idx scan.  The cache is invalidated when the .idx file size changes.
//
// Format: [magic 4B][idxFileSize 4B][totalWords 4B][count 4B][offsets N×4B]
// All values are stored in native byte order (little-endian on ESP32).
// ---------------------------------------------------------------------------

// Tries to restore sparseOffsets/totalWords from the cache file.
//
// Returns true (and sets indexLoaded) only when the cache has the expected
// magic, was built for an .idx file of the current size, describes a
// non-empty index, and is at least as long as its own header claims.
// On any failure it returns false and leaves the in-memory state untouched,
// so the caller can fall back to a full scan.
bool Dictionary::loadCachedIndex() {
  FsFile idx;
  if (!Storage.openFileForRead("DICT", IDX_PATH, idx)) return false;
  const uint32_t idxFileSize = static_cast<uint32_t>(idx.fileSize());
  idx.close();

  FsFile cache;
  if (!Storage.openFileForRead("DICT", CACHE_PATH, cache)) return false;

  const uint64_t cacheFileSize = static_cast<uint64_t>(cache.fileSize());

  // Read and validate header
  constexpr int HEADER_BYTES = 16;
  uint32_t header[4];  // magic, idxFileSize, totalWords, count
  static_assert(sizeof(header) == HEADER_BYTES, "cache header must be 16 bytes");
  if (cache.read(reinterpret_cast<uint8_t*>(header), HEADER_BYTES) != HEADER_BYTES) {
    cache.close();
    return false;
  }

  if (header[0] != CACHE_MAGIC || header[1] != idxFileSize) {
    cache.close();
    return false;
  }

  const uint32_t cachedWords = header[2];
  const uint32_t count = header[3];

  // The count comes from the file, so never size a buffer from it before
  // checking it against the bytes that are really there.  64-bit arithmetic
  // keeps count * 4 from wrapping.  An empty table is rejected as well: it
  // would mark the index as loaded while every search returns nothing.
  const uint64_t payloadBytes = static_cast<uint64_t>(count) * sizeof(uint32_t);
  if (cachedWords == 0 || count == 0 || payloadBytes > static_cast<uint64_t>(INT32_MAX) ||
      static_cast<uint64_t>(HEADER_BYTES) + payloadBytes > cacheFileSize) {
    cache.close();
    return false;
  }

  // Read into a local table and publish it only once the read succeeded.
  std::vector<uint32_t> offsets(count);
  const int bytesToRead = static_cast<int>(payloadBytes);
  const bool readOk = cache.read(reinterpret_cast<uint8_t*>(offsets.data()), bytesToRead) == bytesToRead;
  cache.close();
  if (!readOk) return false;

  sparseOffsets.swap(offsets);
  totalWords = cachedWords;
  indexLoaded = true;
  return true;
}

// Writes the current sparse offset table to the cache file.
//
// idxFileSize is stored in the header; loadCachedIndex() compares it with the
// size of the .idx file to decide whether the cache is still valid.
// Saving is best-effort: if the file cannot be opened nothing happens, and if
// a write comes up short the partial file is removed rather than left behind.
void Dictionary::saveCachedIndex(uint32_t idxFileSize) {
  FsFile cache;
  if (!Storage.openFileForWrite("DICT", CACHE_PATH, cache)) return;

  const uint32_t count = static_cast<uint32_t>(sparseOffsets.size());
  const uint32_t header[4] = {CACHE_MAGIC, idxFileSize, totalWords, count};
  const size_t payloadBytes = static_cast<size_t>(count) * sizeof(uint32_t);

  // write() reports how many bytes were stored; anything other than the full
  // length means the cache on disk is incomplete.
  bool ok = static_cast<size_t>(cache.write(reinterpret_cast<const uint8_t*>(header), sizeof(header))) ==
            sizeof(header);
  if (ok && payloadBytes > 0) {
    ok = static_cast<size_t>(cache.write(reinterpret_cast<const uint8_t*>(sparseOffsets.data()), payloadBytes)) ==
         payloadBytes;
  }
  cache.close();

  if (!ok) Storage.remove(CACHE_PATH);
}

// Builds the sparse offset table used for fast lookups.
//
// First tries the on-disk cache.  If that is unavailable, scans the whole
// .idx file entry by entry (null-terminated word + 8 bytes of offset/size)
// and records the file offset of every SPARSE_INTERVAL-th entry.
//
// onProgress   optional; called with a percentage in 0..90 as the scan
//              advances, in steps of at least 5.
// shouldCancel optional; polled every 100 entries.  When it returns true the
//              scan is abandoned, the partial table is discarded and false is
//              returned.
// Returns true when an index with at least one entry is available.
bool Dictionary::loadIndex(const std::function<void(int percent)>& onProgress,
                           const std::function<bool()>& shouldCancel) {
  // Try loading from cache first (nearly instant)
  if (loadCachedIndex()) return true;

  FsFile idx;
  if (!Storage.openFileForRead("DICT", IDX_PATH, idx)) return false;

  const uint32_t fileSize = static_cast<uint32_t>(idx.fileSize());

  sparseOffsets.clear();
  totalWords = 0;

  uint32_t pos = 0;
  int lastReportedPercent = -1;

  while (pos < fileSize) {
    if (shouldCancel && (totalWords % 100 == 0) && shouldCancel()) {
      idx.close();
      sparseOffsets.clear();
      totalWords = 0;
      return false;
    }

    // Remember where this entry starts, but do not publish it yet: a
    // truncated final entry must not end up in the sparse table, otherwise
    // the binary search could land on an offset with no complete entry.
    const uint32_t entryStart = pos;

    // Skip word (read until null terminator)
    bool hitEndOfFile = false;
    int ch;
    do {
      ch = idx.read();
      if (ch < 0) {
        hitEndOfFile = true;
        break;
      }
      pos++;
    } while (ch != 0);

    // Word ran into end of file, or nothing is left for the 8-byte trailer.
    if (hitEndOfFile || pos >= fileSize) break;

    // Skip 8 bytes (4-byte offset + 4-byte size)
    uint8_t skip[8];
    if (idx.read(skip, 8) != 8) break;
    pos += 8;

    // The entry is complete: record it and count it.
    if (totalWords % SPARSE_INTERVAL == 0) {
      sparseOffsets.push_back(entryStart);
    }
    totalWords++;

    if (onProgress && fileSize > 0) {
      // Scale to 0..90; lookup() reports 100 once the search itself is done.
      int percent = static_cast<int>(static_cast<uint64_t>(pos) * 90 / fileSize);
      if (percent > lastReportedPercent + 4) {
        lastReportedPercent = percent;
        onProgress(percent);
      }
    }
  }

  idx.close();
  indexLoaded = true;

  // Persist to cache so next boot is instant
  if (totalWords > 0) saveCachedIndex(fileSize);

  return totalWords > 0;
}

// Collects bytes from the file's current position until a null terminator
// (read() returns 0) or a read error / end of data (read() returns a negative
// value).  The terminator is consumed but not stored.
std::string Dictionary::readWord(FsFile& file) {
  std::string text;
  for (int byte = file.read(); byte > 0; byte = file.read()) {
    text.push_back(static_cast<char>(byte));
  }
  return text;
}

// Reads a definition from the .dict file.
//
// offset/size are the values stored in the matching .idx entry.  Both are
// checked against the real size of the .dict file before anything is
// allocated: an offset at or past the end yields an empty string, and a size
// that runs past the end is clamped to the bytes that exist.
// Returns the bytes read (possibly fewer than requested), or an empty string
// if the file cannot be opened, the seek fails, or the read fails.
std::string Dictionary::readDefinition(uint32_t offset, uint32_t size) {
  FsFile dict;
  if (!Storage.openFileForRead("DICT", DICT_PATH, dict)) return "";

  const uint64_t dictFileSize = static_cast<uint64_t>(dict.fileSize());

  // A failed seek would leave the file position unchanged and the read below
  // would silently return the wrong text, so treat it as "not found".
  if (size == 0 || offset >= dictFileSize || !dict.seekSet(offset)) {
    dict.close();
    return "";
  }

  // Never allocate more than the file can supply from this offset.
  const uint64_t available = dictFileSize - offset;
  if (size > available) size = static_cast<uint32_t>(available);

  std::string def(size, '\0');
  const int bytesRead = dict.read(reinterpret_cast<uint8_t*>(&def[0]), size);
  dict.close();

  if (bytesRead <= 0) return "";
  if (static_cast<uint32_t>(bytesRead) < size) def.resize(bytesRead);
  return def;
}

// Looks `word` up in the .idx file and returns its definition, or an empty
// string when it is not found, the file cannot be opened, or shouldCancel
// (optional, polled on every step) returns true.
//
// Step 1: binary search the sparse offset table for the last sampled entry
//         that sorts at or before `word` in StarDict order (case-insensitive
//         first, then case-sensitive tiebreaker).
// Step 2: linear scan forward from that entry.
//
// The match is case-insensitive so e.g. "simple" matches "Simple"; an entry
// that matches `word` exactly is preferred over other case variants.
//
// The scan is deliberately not limited to one segment.  The binary search
// orders by the case-sensitive tiebreaker, so a query such as "Simple" lands
// in the segment *before* a sampled entry "simple"; stopping at the segment
// boundary would miss that entry.  The scan still ends as soon as it moves
// past the target in sort order.
std::string Dictionary::searchIndex(const std::string& word, const std::function<bool()>& shouldCancel) {
  if (sparseOffsets.empty()) return "";

  FsFile idx;
  if (!Storage.openFileForRead("DICT", IDX_PATH, idx)) return "";

  // Binary search the sparse offset table to find the right segment.
  int lo = 0, hi = static_cast<int>(sparseOffsets.size()) - 1;

  while (lo < hi) {
    if (shouldCancel && shouldCancel()) {
      idx.close();
      return "";
    }

    // Upper midpoint, so that `lo = mid` always makes progress.
    int mid = lo + (hi - lo + 1) / 2;
    idx.seekSet(sparseOffsets[mid]);
    std::string key = readWord(idx);

    if (stardictCmp(key.c_str(), word.c_str()) <= 0) {
      lo = mid;
    } else {
      hi = mid - 1;
    }
  }

  // Linear scan starting at sparseOffsets[lo].
  idx.seekSet(sparseOffsets[lo]);

  // Upper bound: every entry from the segment start to the end of the index.
  // Guard the subtraction so an inconsistent table cannot wrap around.
  const uint32_t segmentFirstEntry = static_cast<uint32_t>(lo) * SPARSE_INTERVAL;
  const uint32_t maxEntries = segmentFirstEntry < totalWords ? totalWords - segmentFirstEntry : 0;

  // In stardict order, all case variants of a word are adjacent (e.g. "Professor" then "professor"),
  // and they may have different definitions.  Remember the first variant seen as a fallback and
  // replace it if an exact case-sensitive match turns up.
  uint32_t bestOffset = 0, bestSize = 0;
  bool found = false;

  for (uint32_t i = 0; i < maxEntries; i++) {
    if (shouldCancel && shouldCancel()) {
      idx.close();
      return "";
    }

    std::string key = readWord(idx);
    if (key.empty()) break;

    // Read offset and size (4 bytes each, big-endian)
    uint8_t buf[8];
    if (idx.read(buf, 8) != 8) break;

    const uint32_t dictOffset = (static_cast<uint32_t>(buf[0]) << 24) | (static_cast<uint32_t>(buf[1]) << 16) |
                                (static_cast<uint32_t>(buf[2]) << 8) | static_cast<uint32_t>(buf[3]);
    const uint32_t dictSize = (static_cast<uint32_t>(buf[4]) << 24) | (static_cast<uint32_t>(buf[5]) << 16) |
                              (static_cast<uint32_t>(buf[6]) << 8) | static_cast<uint32_t>(buf[7]);

    if (asciiCaseCmp(key.c_str(), word.c_str()) == 0) {
      const bool exact = (key == word);
      if (!found || exact) {
        bestOffset = dictOffset;
        bestSize = dictSize;
        found = true;
      }
      // Exact case-sensitive match — nothing better can follow
      if (exact) break;
    } else if (found || stardictCmp(key.c_str(), word.c_str()) > 0) {
      // Either all case variants have been seen, or the scan has moved past
      // the target in StarDict sort order — stop
      break;
    }
  }

  idx.close();
  return found ? readDefinition(bestOffset, bestSize) : "";
}

// Public entry point: makes sure the index is loaded (building it on first
// use), then searches for `word`.
//
// onProgress   optional; forwarded to loadIndex() and finally called with 100
//              once the search has run.
// shouldCancel optional; forwarded to loadIndex() and searchIndex().
// Returns the definition, or an empty string if the index could not be loaded
// or the word was not found.
std::string Dictionary::lookup(const std::string& word, const std::function<void(int percent)>& onProgress,
                               const std::function<bool()>& shouldCancel) {
  // Short-circuit keeps loadIndex() from running when the index is already there.
  const bool ready = indexLoaded || loadIndex(onProgress, shouldCancel);
  if (!ready) return "";

  // One pass is enough: searchIndex follows StarDict sort order and matches
  // case-insensitively, so every casing variant is considered.
  std::string definition = searchIndex(word, shouldCancel);
  if (onProgress) {
    onProgress(100);
  }
  return definition;
}