Add persian-tutor: Gradio-based GCSE Persian language learning app
Features: vocabulary study with FSRS spaced repetition, AI tutoring (Ollama/Claude), essay marking, an idioms browser, Anki export, and a dashboard. Ships 918 vocabulary entries across 39 categories, with 41 tests passing. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
65
python/persian-tutor/stt.py
Normal file
65
python/persian-tutor/stt.py
Normal file
@@ -0,0 +1,65 @@
|
||||
"""Persian speech-to-text wrapper using sttlib."""
|
||||
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
|
||||
sys.path.insert(0, "/home/ys/family-repo/Code/python/tool-speechtotext")
|
||||
from sttlib import load_whisper_model, transcribe, is_hallucination
|
||||
|
||||
# Module-level cache slot for the Whisper model; None means "not loaded yet".
_model = None

# Short Persian phrases that Whisper commonly hallucinates (e.g. on silence).
# English glosses, in list order: "thank you", "goodbye", "watch", "like".
PERSIAN_HALLUCINATIONS = [
    "ممنون",
    "خداحافظ",
    "تماشا کنید",
    "لایک کنید",
]
|
||||
|
||||
|
||||
def get_model(size="medium"):
    """Return the shared Whisper model, loading it on first use.

    Args:
        size: Model size handed to ``load_whisper_model``. It is only
            consulted when no model has been cached yet; later calls return
            the already-loaded model regardless of the value passed here.

    Returns:
        The cached model object produced by ``load_whisper_model``.
    """
    global _model
    # Fast path: a model was loaded by an earlier call.
    if _model is not None:
        return _model
    _model = load_whisper_model(size)
    return _model
|
||||
|
||||
|
||||
def _to_float32_mono(audio):
    """Convert a raw sample array to a mono float32 array for transcription."""
    samples = np.asarray(audio)
    dtype = samples.dtype

    if dtype == np.int16:
        # 16-bit PCM: scale into [-1, 1).
        converted = samples.astype(np.float32) / 32768.0
    elif np.issubdtype(dtype, np.floating):
        # Floating-point input is passed through unscaled (as float32 always
        # was); np.iinfo() below would raise ValueError for float dtypes.
        converted = samples.astype(np.float32, copy=False)
    else:
        # Other integer widths: scale by the dtype's maximum value.
        converted = samples.astype(np.float32) / np.iinfo(dtype).max

    # Multi-channel input is averaged across axis 1 to get a single channel.
    if converted.ndim > 1:
        converted = converted.mean(axis=1)
    return converted


def transcribe_persian(audio_tuple):
    """Transcribe Persian audio from a Gradio audio component.

    Args:
        audio_tuple: ``(sample_rate, numpy_array)`` as produced by a
            ``gr.Audio`` component, or ``None`` when nothing was recorded.

    Returns:
        The transcribed text, or an empty string when there is no audio,
        the transcription is empty, or the result is recognised as a
        hallucination (by ``is_hallucination`` or by an exact match against
        ``PERSIAN_HALLUCINATIONS`` after stripping whitespace).
    """
    if audio_tuple is None:
        return ""

    # NOTE(review): the sample rate is not forwarded to transcribe();
    # presumably sttlib expects a fixed rate -- confirm whether resampling
    # is needed for Gradio recordings.
    _sample_rate, audio = audio_tuple

    # Nothing to transcribe: bail out before paying for a model load.
    if audio is None or np.size(audio) == 0:
        return ""

    audio_float = _to_float32_mono(audio)
    text = transcribe(get_model(), audio_float)

    # An empty/None result cannot be a valid transcription and would break
    # the .strip() check below.
    if not text:
        return ""

    # Filter hallucinations (sttlib's generic check + Persian phrase list).
    if is_hallucination(text):
        return ""
    if text.strip() in PERSIAN_HALLUCINATIONS:
        return ""

    return text
|
||||
Reference in New Issue
Block a user