"""Persian speech-to-text wrapper using sttlib.""" import sys from pathlib import Path import numpy as np # sttlib lives in sibling project tool-speechtotext _sttlib_path = str(Path(__file__).resolve().parent.parent / "tool-speechtotext") sys.path.insert(0, _sttlib_path) from sttlib import load_whisper_model, transcribe, is_hallucination _model = None _whisper_size = "medium" # Common Whisper hallucinations in Persian/silence PERSIAN_HALLUCINATIONS = [ "ممنون", # "thank you" hallucination "خداحافظ", # "goodbye" hallucination "تماشا کنید", # "watch" hallucination "لایک کنید", # "like" hallucination ] def set_whisper_size(size): """Change the Whisper model size. Reloads on next transcription.""" global _whisper_size, _model if size != _whisper_size: _whisper_size = size _model = None def get_model(): """Load Whisper model (cached singleton).""" global _model if _model is None: _model = load_whisper_model(_whisper_size) return _model def transcribe_persian(audio_tuple): """Transcribe Persian audio from Gradio audio component. Args: audio_tuple: (sample_rate, numpy_array) from gr.Audio component. Returns: Transcribed text string, or empty string on failure/hallucination. """ if audio_tuple is None: return "" sr, audio = audio_tuple model = get_model() # Convert to float32 normalized [-1, 1] if audio.dtype == np.int16: audio_float = audio.astype(np.float32) / 32768.0 elif audio.dtype == np.float32: audio_float = audio else: audio_float = audio.astype(np.float32) / np.iinfo(audio.dtype).max # Mono conversion if stereo if audio_float.ndim > 1: audio_float = audio_float.mean(axis=1) # Use sttlib transcribe text = transcribe(model, audio_float) # Filter hallucinations (English + Persian) if is_hallucination(text): return "" if text.strip() in PERSIAN_HALLUCINATIONS: return "" return text