- Replace inline __import__("datetime").timedelta hack with proper import
- Remove unused import random in anki_export.py
- Add error handling for Claude CLI subprocess failures in ai.py
- Fix hardcoded absolute path in stt.py with relative Path resolution
- Fix N+1 DB queries in vocab.get_flashcard_batch and dashboard.get_category_breakdown
by adding db.get_all_word_progress() batch query
- Wire Ollama model and Whisper size settings to actually update config
via ai.set_ollama_model() and stt.set_whisper_size()
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
78 lines
2.0 KiB
Python
"""Persian speech-to-text wrapper using sttlib."""
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import numpy as np
|
|
|
|
# sttlib lives in sibling project tool-speechtotext
|
|
_sttlib_path = str(Path(__file__).resolve().parent.parent / "tool-speechtotext")
|
|
sys.path.insert(0, _sttlib_path)
|
|
from sttlib import load_whisper_model, transcribe, is_hallucination
|
|
|
|
# Cached Whisper model instance; loaded lazily by get_model() and reset to
# None by set_whisper_size() so the next transcription reloads it.
_model = None

# Currently selected Whisper model size; change via set_whisper_size().
_whisper_size = "medium"

# Common Whisper hallucinations in Persian/silence
# A transcription whose stripped text exactly matches one of these phrases is
# discarded by transcribe_persian() (in addition to sttlib's is_hallucination
# check, which covers the English cases).
PERSIAN_HALLUCINATIONS = [
    "ممنون",  # "thank you" hallucination
    "خداحافظ",  # "goodbye" hallucination
    "تماشا کنید",  # "watch" hallucination
    "لایک کنید",  # "like" hallucination
]
|
|
|
|
|
|
def set_whisper_size(size):
    """Select the Whisper model size to use for future transcriptions.

    Drops the cached model so get_model() lazily loads the new size on
    the next call; a no-op when the size is unchanged.
    """
    global _whisper_size, _model
    if size == _whisper_size:
        return
    _whisper_size = size
    _model = None
|
|
|
|
|
|
def get_model():
    """Return the Whisper model, loading it on first use (cached singleton)."""
    global _model
    if _model is not None:
        return _model
    _model = load_whisper_model(_whisper_size)
    return _model
|
|
|
|
|
|
def transcribe_persian(audio_tuple):
    """Transcribe Persian audio from Gradio audio component.

    Args:
        audio_tuple: (sample_rate, numpy_array) from gr.Audio component.

    Returns:
        Transcribed text string, or empty string on failure/hallucination.
    """
    if audio_tuple is None:
        return ""

    sr, audio = audio_tuple
    model = get_model()

    # Convert to float32 normalized [-1, 1]
    if audio.dtype == np.int16:
        audio_float = audio.astype(np.float32) / 32768.0
    elif np.issubdtype(audio.dtype, np.floating):
        # Covers float32 (passes through unchanged) and float64. Float dtypes
        # must not reach the integer branches: np.iinfo raises ValueError on
        # them. Assumes float audio is already in [-1, 1] — TODO confirm
        # against what gr.Audio actually delivers.
        audio_float = audio.astype(np.float32, copy=False)
    elif np.issubdtype(audio.dtype, np.unsignedinteger):
        # Unsigned PCM (e.g. uint8) is centered at half-scale, not zero;
        # dividing by iinfo.max alone would land in [0, 1]. Recenter first.
        half = (np.iinfo(audio.dtype).max + 1) / 2.0
        audio_float = (audio.astype(np.float32) - half) / half
    else:
        # Remaining signed integer dtypes (e.g. int32).
        audio_float = audio.astype(np.float32) / np.iinfo(audio.dtype).max

    # Mono conversion if stereo
    if audio_float.ndim > 1:
        audio_float = audio_float.mean(axis=1)

    # Use sttlib transcribe
    text = transcribe(model, audio_float)

    # Filter hallucinations (English + Persian)
    if is_hallucination(text):
        return ""
    if text.strip() in PERSIAN_HALLUCINATIONS:
        return ""

    return text
|