#!/usr/bin/env python3
"""Generate prefix jump tables for StarDict dictionary lookup optimization.

This script parses StarDict .idx and .syn files and generates a C++ header
with pre-computed byte offsets for two-letter prefixes (aa-zz). This enables
near-instant lookup by jumping directly to the relevant section of the index.

Usage:
    ./scripts/generate_dict_index.py --idx path/to/dict.idx --syn path/to/dict.syn --output lib/StarDict/DictPrefixIndex.generated.h

Or extract from a zip file:
    ./scripts/generate_dict_index.py --zip dict-en-en.zip --output lib/StarDict/DictPrefixIndex.generated.h
"""

from __future__ import annotations

import argparse
import pathlib
import struct
import zipfile
from typing import BinaryIO


def prefix_to_index(c1: str, c2: str) -> int:
    """Convert two-letter prefix to index (0-675).
    
    'aa' -> 0, 'ab' -> 1, ... 'zz' -> 675
    """
    return (ord(c1.lower()) - ord('a')) * 26 + (ord(c2.lower()) - ord('a'))


def index_to_prefix(idx: int) -> str:
    """Convert index back to two-letter prefix for debugging."""
    c1 = chr(ord('a') + idx // 26)
    c2 = chr(ord('a') + idx % 26)
    return c1 + c2


def is_alpha(c: str) -> bool:
    """Check if character is a-z or A-Z."""
    return ('a' <= c <= 'z') or ('A' <= c <= 'Z')


def read_null_terminated_string(f: BinaryIO) -> tuple[str, int]:
    """Read a null-terminated string from file.
    
    Returns (string, bytes_read including null terminator).
    """
    chars = []
    bytes_read = 0
    while True:
        b = f.read(1)
        if not b:
            break
        bytes_read += 1
        if b == b'\x00':
            break
        chars.append(b.decode('utf-8', errors='replace'))
    return ''.join(chars), bytes_read


def parse_idx_file(f: BinaryIO, file_size: int) -> dict[int, int]:
    """Parse StarDict .idx file and build prefix -> offset mapping.
    
    The .idx file format is:
        [word\0][offset:4 bytes BE][size:4 bytes BE]
        ...repeated for each word...
    
    Returns dict mapping prefix index (0-675) to first byte offset for that prefix.
    """
    prefix_offsets: dict[int, int] = {}
    current_position = 0
    words_processed = 0
    
    while current_position < file_size:
        entry_start = current_position
        
        # Read the word
        word, word_bytes = read_null_terminated_string(f)
        if not word:
            break
        
        current_position += word_bytes
        
        # Read 8 bytes (offset + size, both big-endian)
        data = f.read(8)
        if len(data) != 8:
            break
        current_position += 8
        
        # Extract prefix if word has at least 2 alphabetic characters
        if len(word) >= 2 and is_alpha(word[0]) and is_alpha(word[1]):
            prefix_idx = prefix_to_index(word[0], word[1])
            
            # Only record the first occurrence of each prefix
            if prefix_idx not in prefix_offsets:
                prefix_offsets[prefix_idx] = entry_start
        
        words_processed += 1
        if words_processed % 100000 == 0:
            print(f"  Processed {words_processed} words...")
    
    print(f"  Total words processed: {words_processed}")
    print(f"  Unique prefixes found: {len(prefix_offsets)}")
    
    return prefix_offsets


def parse_syn_file(f: BinaryIO, file_size: int) -> dict[int, int]:
    """Parse StarDict .syn file and build prefix -> offset mapping.
    
    The .syn file format is:
        [synonym_word\0][main_entry_index:4 bytes BE]
        ...repeated for each synonym...
    
    Returns dict mapping prefix index (0-675) to first byte offset for that prefix.
    """
    prefix_offsets: dict[int, int] = {}
    current_position = 0
    synonyms_processed = 0
    
    while current_position < file_size:
        entry_start = current_position
        
        # Read the synonym word
        word, word_bytes = read_null_terminated_string(f)
        if not word:
            break
        
        current_position += word_bytes
        
        # Read 4 bytes (index to main entry, big-endian)
        data = f.read(4)
        if len(data) != 4:
            break
        current_position += 4
        
        # Extract prefix if word has at least 2 alphabetic characters
        if len(word) >= 2 and is_alpha(word[0]) and is_alpha(word[1]):
            prefix_idx = prefix_to_index(word[0], word[1])
            
            # Only record the first occurrence of each prefix
            if prefix_idx not in prefix_offsets:
                prefix_offsets[prefix_idx] = entry_start
        
        synonyms_processed += 1
        if synonyms_processed % 100000 == 0:
            print(f"  Processed {synonyms_processed} synonyms...")
    
    print(f"  Total synonyms processed: {synonyms_processed}")
    print(f"  Unique prefixes found: {len(prefix_offsets)}")
    
    return prefix_offsets


def fill_missing_prefixes(prefix_offsets: dict[int, int], file_size: int) -> list[int]:
    """Fill in missing prefixes with the next available offset.
    
    If a prefix doesn't exist (e.g., no words starting with 'qx'), 
    we set its offset to the next prefix's offset so the scan will
    quickly find nothing and move on.
    """
    result = [0] * 676
    
    # First pass: fill in known offsets
    for idx, offset in prefix_offsets.items():
        result[idx] = offset
    
    # Second pass: fill missing with next known offset (or file_size)
    # Work backwards so each missing entry gets the next valid offset
    next_valid = file_size
    for idx in range(675, -1, -1):
        if idx in prefix_offsets:
            next_valid = prefix_offsets[idx]
        else:
            result[idx] = next_valid
    
    return result


def format_offset_array(offsets: list[int], name: str) -> str:
    """Format offset array as C++ constexpr with nice formatting."""
    lines = [f"// Two-letter prefix jump table: {name}[prefix_to_index(c1, c2)] = byte offset"]
    lines.append(f"// Prefixes: aa=0, ab=1, ... az=25, ba=26, ... zz=675")
    lines.append(f"constexpr uint32_t {name}[676] = {{")
    
    # Format 13 values per line (fits nicely with 10-digit numbers + commas)
    values_per_line = 13
    for i in range(0, 676, values_per_line):
        chunk = offsets[i:i + values_per_line]
        prefix_start = index_to_prefix(i)
        prefix_end = index_to_prefix(min(i + values_per_line - 1, 675))
        values_str = ', '.join(f'{v:>10}' for v in chunk)
        lines.append(f"    {values_str},  // {prefix_start}-{prefix_end}")
    
    lines.append("};")
    return '\n'.join(lines)


def generate_header(idx_offsets: list[int], syn_offsets: list[int] | None, output_path: pathlib.Path) -> None:
    """Generate the C++ header file with prefix offset tables."""
    
    content = '''#pragma once

// Auto-generated by generate_dict_index.py. Do not edit manually.
// This file contains pre-computed prefix jump tables for fast dictionary lookup.

#include <cstdint>

namespace DictPrefixIndex {

// Convert two-letter prefix to index (0-675)
// "aa" -> 0, "ab" -> 1, ... "az" -> 25, "ba" -> 26, ... "zz" -> 675
inline uint16_t prefixToIndex(char c1, char c2) {
  // Convert to lowercase and compute index
  const int i1 = (c1 | 0x20) - 'a';  // tolower via OR with 0x20
  const int i2 = (c2 | 0x20) - 'a';
  // Bounds check (returns 0 for non-alpha characters)
  if (i1 < 0 || i1 > 25 || i2 < 0 || i2 > 25) return 0;
  return static_cast<uint16_t>(i1 * 26 + i2);
}

// Check if character is alphabetic (a-z or A-Z)
inline bool isAlpha(char c) {
  return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
}

'''
    
    content += format_offset_array(idx_offsets, "dictPrefixOffsets")
    content += '\n\n'
    
    if syn_offsets:
        content += format_offset_array(syn_offsets, "synPrefixOffsets")
    else:
        content += "// No synonym file processed - synPrefixOffsets not generated\n"
        content += "constexpr uint32_t synPrefixOffsets[676] = {0};\n"
    
    content += '\n}  // namespace DictPrefixIndex\n'
    
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(content)
    print(f"Generated: {output_path}")


def main() -> None:
    parser = argparse.ArgumentParser(description="Generate StarDict prefix jump tables")
    parser.add_argument('--idx', type=str, help='Path to .idx file')
    parser.add_argument('--syn', type=str, help='Path to .syn file (optional)')
    parser.add_argument('--zip', type=str, help='Path to dictionary zip file (alternative to --idx/--syn)')
    parser.add_argument('--output', type=str, required=True, help='Output header path')
    
    args = parser.parse_args()
    
    idx_offsets: dict[int, int] = {}
    syn_offsets: dict[int, int] | None = None
    idx_file_size = 0
    syn_file_size = 0
    
    if args.zip:
        # Extract from zip file
        zip_path = pathlib.Path(args.zip)
        print(f"Processing zip file: {zip_path}")
        
        with zipfile.ZipFile(zip_path, 'r') as zf:
            # Find .idx file
            idx_name = None
            syn_name = None
            for name in zf.namelist():
                if name.endswith('.idx'):
                    idx_name = name
                elif name.endswith('.syn'):
                    syn_name = name
            
            if not idx_name:
                raise SystemExit("No .idx file found in zip")
            
            print(f"\nParsing index file: {idx_name}")
            with zf.open(idx_name) as f:
                idx_file_size = zf.getinfo(idx_name).file_size
                idx_offsets = parse_idx_file(f, idx_file_size)
            
            if syn_name:
                print(f"\nParsing synonym file: {syn_name}")
                with zf.open(syn_name) as f:
                    syn_file_size = zf.getinfo(syn_name).file_size
                    syn_offsets = parse_syn_file(f, syn_file_size)
    else:
        # Read from individual files
        if not args.idx:
            raise SystemExit("Either --zip or --idx must be provided")
        
        idx_path = pathlib.Path(args.idx)
        print(f"Processing index file: {idx_path}")
        idx_file_size = idx_path.stat().st_size
        with open(idx_path, 'rb') as f:
            idx_offsets = parse_idx_file(f, idx_file_size)
        
        if args.syn:
            syn_path = pathlib.Path(args.syn)
            print(f"\nProcessing synonym file: {syn_path}")
            syn_file_size = syn_path.stat().st_size
            with open(syn_path, 'rb') as f:
                syn_offsets = parse_syn_file(f, syn_file_size)
    
    # Fill in missing prefixes
    print("\nFilling missing prefixes...")
    idx_offsets_filled = fill_missing_prefixes(idx_offsets, idx_file_size)
    syn_offsets_filled = fill_missing_prefixes(syn_offsets, syn_file_size) if syn_offsets else None
    
    # Generate header
    print("\nGenerating header file...")
    generate_header(idx_offsets_filled, syn_offsets_filled, pathlib.Path(args.output))
    
    # Print some statistics
    print("\n=== Statistics ===")
    print(f"Index file size: {idx_file_size:,} bytes")
    if syn_file_size:
        print(f"Synonym file size: {syn_file_size:,} bytes")
    
    # Show distribution of some common prefixes
    print("\nSample prefix offsets:")
    for prefix in ['aa', 'he', 'th', 'wo', 'zz']:
        idx = prefix_to_index(prefix[0], prefix[1])
        offset = idx_offsets_filled[idx]
        pct = (offset / idx_file_size) * 100 if idx_file_size else 0
        print(f"  '{prefix}' (index {idx}): offset {offset:,} ({pct:.1f}% into file)")


if __name__ == '__main__':
    main()