#!/usr/bin/env python3
"""One-time script to generate/update vocabulary.json with AI-assisted transliterations.

Usage: python scripts/generate_vocab.py

This reads an existing vocabulary.json, finds entries missing finglish
transliterations, and uses Ollama to generate them.
"""
import json
import re
import sys
from pathlib import Path

# Make the project root importable so `from ai import ask_ollama` resolves
# regardless of the directory the script is launched from.
sys.path.insert(0, str(Path(__file__).parent.parent))

VOCAB_PATH = Path(__file__).parent.parent / "data" / "vocabulary.json"

# Leading list markers the model sometimes prepends: "1.", "2)", "-", "*", "•".
_LEADING_MARKER = re.compile(r"^\s*(?:\d+\s*[.)]|[-*•])\s*")


def clean_line(line):
    """Return just the transliteration from one line of the model's reply.

    Strips leading list numbering/bullets, and if the model echoed back
    "persian = finglish" (or used ":"), keeps only the part after the last
    separator. Unlike a naive split on "-" or ".", this preserves hyphens
    and dots that belong to the transliteration itself (e.g. "khodâ-hâfez").
    """
    line = _LEADING_MARKER.sub("", line).strip()
    for sep in ("=", ":"):
        if sep in line:
            # rsplit keeps any earlier separators inside the kept text.
            line = line.rsplit(sep, 1)[1].strip()
    return line


def generate_transliterations(vocab):
    """Fill in missing finglish transliterations using AI.

    Mutates the entry dicts in place and returns the same ``vocab`` list.
    Batches are best-effort: a failed batch is reported and skipped.
    """
    missing = [e for e in vocab if not e.get("finglish")]
    if not missing:
        print("All entries already have finglish transliterations.")
        return vocab

    # Imported lazily (and only when there is work to do) so this module can
    # be imported without the Ollama-backed `ai` package being available.
    from ai import ask_ollama

    print(f"Generating transliterations for {len(missing)} entries...")

    # Process in batches of 20 to keep prompts short and show progress.
    batch_size = 20
    for i in range(0, len(missing), batch_size):
        batch = missing[i : i + batch_size]
        pairs = "\n".join(f"{e['persian']} = {e['english']}" for e in batch)
        prompt = f"""For each Persian word below, provide the Finglish (romanized) transliteration. Use these conventions: â for آ, kh for خ, sh for ش, zh for ژ, gh for ق/غ, ch for چ. Reply with ONLY the transliterations, one per line, in the same order. 
{pairs}"""
        try:
            response = ask_ollama(prompt, model="qwen2.5:7b")
            lines = [l.strip() for l in response.strip().split("\n") if l.strip()]
            for j, entry in enumerate(batch):
                # The model may return fewer lines than requested; leave the
                # remaining entries untouched rather than guessing.
                if j < len(lines):
                    entry["finglish"] = clean_line(lines[j])
            print(f" Processed {min(i + batch_size, len(missing))}/{len(missing)}")
        except Exception as e:
            # Best-effort: report the failure and continue with the next batch.
            print(f" Error processing batch: {e}")

    return vocab


def main():
    """Load vocabulary.json, fill in missing transliterations, write it back."""
    if not VOCAB_PATH.exists():
        print(f"No vocabulary file found at {VOCAB_PATH}")
        return
    with open(VOCAB_PATH, encoding="utf-8") as f:
        vocab = json.load(f)
    print(f"Loaded {len(vocab)} entries")
    vocab = generate_transliterations(vocab)
    # ensure_ascii=False keeps the Persian script human-readable on disk.
    with open(VOCAB_PATH, "w", encoding="utf-8") as f:
        json.dump(vocab, f, ensure_ascii=False, indent=2)
    print(f"Saved {len(vocab)} entries to {VOCAB_PATH}")


if __name__ == "__main__":
    main()