Vocabulary study with FSRS spaced repetition, AI tutoring (Ollama/Claude), essay marking, idioms browser, Anki export, and dashboard. 918 vocabulary entries across 39 categories. 41 tests passing. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
82 lines · 2.5 KiB · Python
#!/usr/bin/env python3
|
|
"""One-time script to generate/update vocabulary.json with AI-assisted transliterations.
|
|
|
|
Usage:
|
|
python scripts/generate_vocab.py
|
|
|
|
This reads an existing vocabulary.json, finds entries missing finglish
|
|
transliterations, and uses Ollama to generate them.
|
|
"""
|
|
|
|
import json
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Make the project root importable so the top-level `ai` module resolves
# when this script is run from the scripts/ directory.
sys.path.insert(0, str(Path(__file__).parent.parent))

from ai import ask_ollama  # project-local Ollama client wrapper

# JSON vocabulary file this script reads and rewrites in place.
VOCAB_PATH = Path(__file__).parent.parent / "data" / "vocabulary.json"
|
|
|
|
|
|
def generate_transliterations(vocab):
    """Fill in missing finglish transliterations using AI.

    Args:
        vocab: List of entry dicts with at least ``"persian"`` and
            ``"english"`` keys. Entries whose ``"finglish"`` key is absent
            or falsy are sent to the model in batches.

    Returns:
        The same list object, with ``"finglish"`` set on entries the model
        answered for. Entries in a failed batch are left untouched so the
        script can be re-run to retry them (best-effort semantics).
    """
    import re  # local import: only this one-time script path needs it

    missing = [e for e in vocab if not e.get("finglish")]
    if not missing:
        print("All entries already have finglish transliterations.")
        return vocab

    print(f"Generating transliterations for {len(missing)} entries...")

    # Process in batches of 20 to keep each prompt small for the local model.
    batch_size = 20
    for i in range(0, len(missing), batch_size):
        batch = missing[i : i + batch_size]
        pairs = "\n".join(f"{e['persian']} = {e['english']}" for e in batch)

        prompt = f"""For each Persian word below, provide the Finglish (romanized) transliteration.
Use these conventions: â for آ, kh for خ, sh for ش, zh for ژ, gh for ق/غ, ch for چ.
Reply with ONLY the transliterations, one per line, in the same order.

{pairs}"""

        try:
            response = ask_ollama(prompt, model="qwen2.5:7b")
            lines = [ln.strip() for ln in response.strip().split("\n") if ln.strip()]

            # Answers are matched to entries by position; a short reply
            # simply leaves the tail of the batch unfilled.
            for j, entry in enumerate(batch):
                if j < len(lines):
                    line = lines[j]
                    # Strip only *leading* list numbering the model may have
                    # added ("1. ", "2) ", "3 - ", "4: ").  The previous
                    # cleanup split on "-" and "." anywhere in the line and
                    # kept the last segment, which truncated valid finglish
                    # containing those characters (e.g. "khosh-hâl").
                    line = re.sub(r"^\s*\d+\s*[.)\-:]\s*", "", line)
                    # If the model echoed "persian = finglish" (or with a
                    # colon), keep only the part after the separator.
                    for sep in ("=", ":"):
                        if sep in line:
                            line = line.split(sep)[-1].strip()
                    entry["finglish"] = line

            print(f" Processed {min(i + batch_size, len(missing))}/{len(missing)}")
        except Exception as e:
            # Best-effort: report and continue so one bad batch does not
            # abort the whole run; unfilled entries are retried next run.
            print(f" Error processing batch: {e}")

    return vocab
|
|
|
|
|
|
def main():
    """Load vocabulary.json, fill in missing transliterations, save it back.

    Exits quietly (with a message) if the vocabulary file does not exist.
    """
    if not VOCAB_PATH.exists():
        print(f"No vocabulary file found at {VOCAB_PATH}")
        return

    vocab = json.loads(VOCAB_PATH.read_text(encoding="utf-8"))
    print(f"Loaded {len(vocab)} entries")

    vocab = generate_transliterations(vocab)

    # Keep Persian script readable in the file (no \u escapes).
    serialized = json.dumps(vocab, ensure_ascii=False, indent=2)
    VOCAB_PATH.write_text(serialized, encoding="utf-8")
    print(f"Saved {len(vocab)} entries to {VOCAB_PATH}")
|
|
|
|
|
|
# Entry-point guard: run the generation only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
|