Vocabulary study with FSRS spaced repetition, AI tutoring (Ollama/Claude), essay marking, idioms browser, Anki export, and dashboard. 918 vocabulary entries across 39 categories. 41 tests passing. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
82 lines · 2.5 KiB · Python
#!/usr/bin/env python3
|
|
"""One-time script to generate/update vocabulary.json with AI-assisted transliterations.
|
|
|
|
Usage:
|
|
python scripts/generate_vocab.py
|
|
|
|
This reads an existing vocabulary.json, finds entries missing finglish
|
|
transliterations, and uses Ollama to generate them.
|
|
"""
|
|
|
|
import json
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Make the project root importable so the top-level `ai` module resolves
# when this script is run from the scripts/ directory.
sys.path.insert(0, str(Path(__file__).parent.parent))

from ai import ask_ollama  # project-local Ollama client wrapper

# JSON vocabulary file this script reads and rewrites in place.
VOCAB_PATH = Path(__file__).parent.parent / "data" / "vocabulary.json"
|
|
|
|
|
|
def generate_transliterations(vocab):
    """Fill in missing finglish transliterations using AI.

    Args:
        vocab: List of entry dicts with at least ``"persian"`` and
            ``"english"`` keys. Entries whose ``"finglish"`` key is absent
            or falsy are sent to the model in batches.

    Returns:
        The same list object, with ``"finglish"`` set on entries the model
        answered for. Entries in a failed batch are left untouched so the
        script can be re-run to retry them (best-effort semantics).
    """
    import re  # local import: only this one-time script path needs it

    missing = [e for e in vocab if not e.get("finglish")]
    if not missing:
        print("All entries already have finglish transliterations.")
        return vocab

    print(f"Generating transliterations for {len(missing)} entries...")

    # Process in batches of 20 to keep each prompt small for the local model.
    batch_size = 20
    for i in range(0, len(missing), batch_size):
        batch = missing[i : i + batch_size]
        pairs = "\n".join(f"{e['persian']} = {e['english']}" for e in batch)

        prompt = f"""For each Persian word below, provide the Finglish (romanized) transliteration.
Use these conventions: â for آ, kh for خ, sh for ش, zh for ژ, gh for ق/غ, ch for چ.
Reply with ONLY the transliterations, one per line, in the same order.

{pairs}"""

        try:
            response = ask_ollama(prompt, model="qwen2.5:7b")
            lines = [ln.strip() for ln in response.strip().split("\n") if ln.strip()]

            # Answers are matched to entries by position; a short reply
            # simply leaves the tail of the batch unfilled.
            for j, entry in enumerate(batch):
                if j < len(lines):
                    line = lines[j]
                    # Strip only *leading* list numbering the model may have
                    # added ("1. ", "2) ", "3 - ", "4: ").  The previous
                    # cleanup split on "-" and "." anywhere in the line and
                    # kept the last segment, which truncated valid finglish
                    # containing those characters (e.g. "khosh-hâl").
                    line = re.sub(r"^\s*\d+\s*[.)\-:]\s*", "", line)
                    # If the model echoed "persian = finglish" (or with a
                    # colon), keep only the part after the separator.
                    for sep in ("=", ":"):
                        if sep in line:
                            line = line.split(sep)[-1].strip()
                    entry["finglish"] = line

            print(f" Processed {min(i + batch_size, len(missing))}/{len(missing)}")
        except Exception as e:
            # Best-effort: report and continue so one bad batch does not
            # abort the whole run; unfilled entries are retried next run.
            print(f" Error processing batch: {e}")

    return vocab
|
|
|
|
|
|
def main():
    """Load vocabulary.json, fill in missing transliterations, save it back.

    Exits quietly (with a message) if the vocabulary file does not exist.
    """
    if not VOCAB_PATH.exists():
        print(f"No vocabulary file found at {VOCAB_PATH}")
        return

    vocab = json.loads(VOCAB_PATH.read_text(encoding="utf-8"))
    print(f"Loaded {len(vocab)} entries")

    vocab = generate_transliterations(vocab)

    # Keep Persian script readable in the file (no \u escapes).
    serialized = json.dumps(vocab, ensure_ascii=False, indent=2)
    VOCAB_PATH.write_text(serialized, encoding="utf-8")
    print(f"Saved {len(vocab)} entries to {VOCAB_PATH}")
|
|
|
|
|
|
# Entry-point guard: run the generation only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
|