Add persian-tutor: Gradio-based GCSE Persian language learning app

Vocabulary study with FSRS spaced repetition, AI tutoring (Ollama/Claude), essay marking, idioms browser, Anki export, and dashboard. 918 vocabulary entries across 39 categories. 41 tests passing. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-08 01:57:44 +00:00
parent 104da381fb
commit 2e8c2c11d0
22 changed files with 10664 additions and 0 deletions
--- a/python/persian-tutor/scripts/build_vocab.py
+++ b/python/persian-tutor/scripts/build_vocab.py
--- a/python/persian-tutor/scripts/generate_vocab.py
+++ b/python/persian-tutor/scripts/generate_vocab.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python3
+"""One-time script to generate/update vocabulary.json with AI-assisted transliterations.
+
+Usage:
+    python scripts/generate_vocab.py
+
+This reads an existing vocabulary.json, finds entries missing finglish
+transliterations, and uses Ollama to generate them.
+"""
+
+import json
+import sys
+from pathlib import Path
+
+# Put the directory two levels above this file (the parent of scripts/) at the
+# front of the import path so the `ai` module below can be imported when this
+# script is run directly. This must happen before the import.
+sys.path.insert(0, str(Path(__file__).parent.parent))
+from ai import ask_ollama
+
+# Vocabulary file that main() reads and then overwrites in place.
+VOCAB_PATH = Path(__file__).parent.parent / "data" / "vocabulary.json"
+
+
+def generate_transliterations(vocab):
+    """Fill in missing finglish transliterations using AI."""
+    missing = [e for e in vocab if not e.get("finglish")]
+    if not missing:
+        print("All entries already have finglish transliterations.")
+        return vocab
+
+    print(f"Generating transliterations for {len(missing)} entries...")
+
+    # Process in batches of 20
+    batch_size = 20
+    for i in range(0, len(missing), batch_size):
+        batch = missing[i : i + batch_size]
+        pairs = "\n".join(f"{e['persian']} = {e['english']}" for e in batch)
+
+        prompt = f"""For each Persian word below, provide the Finglish (romanized) transliteration.
+Use these conventions: â for آ, kh for خ, sh for ش, zh for ژ, gh for ق/غ, ch for چ.
+Reply with ONLY the transliterations, one per line, in the same order.
+
+{pairs}"""
+
+        try:
+            response = ask_ollama(prompt, model="qwen2.5:7b")
+            lines = [l.strip() for l in response.strip().split("\n") if l.strip()]
+
+            for j, entry in enumerate(batch):
+                if j < len(lines):
+                    # Clean up the response line
+                    line = lines[j]
+                    # Remove any numbering or equals signs
+                    for sep in ["=", ":", "-", "."]:
+                        if sep in line:
+                            line = line.split(sep)[-1].strip()
+                    entry["finglish"] = line
+
+            print(f"  Processed {min(i + batch_size, len(missing))}/{len(missing)}")
+        except Exception as e:
+            print(f"  Error processing batch: {e}")
+
+    return vocab
+
+
+def main():
+    if not VOCAB_PATH.exists():
+        print(f"No vocabulary file found at {VOCAB_PATH}")
+        return
+
+    with open(VOCAB_PATH, encoding="utf-8") as f:
+        vocab = json.load(f)
+
+    print(f"Loaded {len(vocab)} entries")
+    vocab = generate_transliterations(vocab)
+
+    with open(VOCAB_PATH, "w", encoding="utf-8") as f:
+        json.dump(vocab, f, ensure_ascii=False, indent=2)
+
+    print(f"Saved {len(vocab)} entries to {VOCAB_PATH}")
+
+
+if __name__ == "__main__":
+    main()