Add persian-tutor: Gradio-based GCSE Persian language learning app
Vocabulary study with FSRS spaced repetition, AI tutoring (Ollama/Claude), essay marking, idioms browser, Anki export, and dashboard. 918 vocabulary entries across 39 categories. 41 tests passing. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
81
python/persian-tutor/scripts/generate_vocab.py
Normal file
81
python/persian-tutor/scripts/generate_vocab.py
Normal file
@@ -0,0 +1,81 @@
|
||||
#!/usr/bin/env python3
|
||||
"""One-time script to generate/update vocabulary.json with AI-assisted transliterations.
|
||||
|
||||
Usage:
|
||||
python scripts/generate_vocab.py
|
||||
|
||||
This reads an existing vocabulary.json, finds entries missing finglish
|
||||
transliterations, and uses Ollama to generate them.
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
from ai import ask_ollama
|
||||
|
||||
VOCAB_PATH = Path(__file__).parent.parent / "data" / "vocabulary.json"
|
||||
|
||||
|
||||
def generate_transliterations(vocab):
|
||||
"""Fill in missing finglish transliterations using AI."""
|
||||
missing = [e for e in vocab if not e.get("finglish")]
|
||||
if not missing:
|
||||
print("All entries already have finglish transliterations.")
|
||||
return vocab
|
||||
|
||||
print(f"Generating transliterations for {len(missing)} entries...")
|
||||
|
||||
# Process in batches of 20
|
||||
batch_size = 20
|
||||
for i in range(0, len(missing), batch_size):
|
||||
batch = missing[i : i + batch_size]
|
||||
pairs = "\n".join(f"{e['persian']} = {e['english']}" for e in batch)
|
||||
|
||||
prompt = f"""For each Persian word below, provide the Finglish (romanized) transliteration.
|
||||
Use these conventions: â for آ, kh for خ, sh for ش, zh for ژ, gh for ق/غ, ch for چ.
|
||||
Reply with ONLY the transliterations, one per line, in the same order.
|
||||
|
||||
{pairs}"""
|
||||
|
||||
try:
|
||||
response = ask_ollama(prompt, model="qwen2.5:7b")
|
||||
lines = [l.strip() for l in response.strip().split("\n") if l.strip()]
|
||||
|
||||
for j, entry in enumerate(batch):
|
||||
if j < len(lines):
|
||||
# Clean up the response line
|
||||
line = lines[j]
|
||||
# Remove any numbering or equals signs
|
||||
for sep in ["=", ":", "-", "."]:
|
||||
if sep in line:
|
||||
line = line.split(sep)[-1].strip()
|
||||
entry["finglish"] = line
|
||||
|
||||
print(f" Processed {min(i + batch_size, len(missing))}/{len(missing)}")
|
||||
except Exception as e:
|
||||
print(f" Error processing batch: {e}")
|
||||
|
||||
return vocab
|
||||
|
||||
|
||||
def main():
    """Load vocabulary.json, fill in missing transliterations, save it back.

    Exits early (with a message) when the vocabulary file does not exist.
    """
    if not VOCAB_PATH.exists():
        print(f"No vocabulary file found at {VOCAB_PATH}")
        return

    vocab = json.loads(VOCAB_PATH.read_text(encoding="utf-8"))
    print(f"Loaded {len(vocab)} entries")

    vocab = generate_transliterations(vocab)

    # Keep Persian script readable in the file (no \u escapes).
    serialized = json.dumps(vocab, ensure_ascii=False, indent=2)
    VOCAB_PATH.write_text(serialized, encoding="utf-8")

    print(f"Saved {len(vocab)} entries to {VOCAB_PATH}")
|
||||
|
||||
|
||||
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user