"""Persian speech-to-text wrapper using sttlib.""" import sys import numpy as np sys.path.insert(0, "/home/ys/family-repo/Code/python/tool-speechtotext") from sttlib import load_whisper_model, transcribe, is_hallucination _model = None # Common Whisper hallucinations in Persian/silence PERSIAN_HALLUCINATIONS = [ "ممنون", # "thank you" hallucination "خداحافظ", # "goodbye" hallucination "تماشا کنید", # "watch" hallucination "لایک کنید", # "like" hallucination ] def get_model(size="medium"): """Load Whisper model (cached singleton).""" global _model if _model is None: _model = load_whisper_model(size) return _model def transcribe_persian(audio_tuple): """Transcribe Persian audio from Gradio audio component. Args: audio_tuple: (sample_rate, numpy_array) from gr.Audio component. Returns: Transcribed text string, or empty string on failure/hallucination. """ if audio_tuple is None: return "" sr, audio = audio_tuple model = get_model() # Convert to float32 normalized [-1, 1] if audio.dtype == np.int16: audio_float = audio.astype(np.float32) / 32768.0 elif audio.dtype == np.float32: audio_float = audio else: audio_float = audio.astype(np.float32) / np.iinfo(audio.dtype).max # Mono conversion if stereo if audio_float.ndim > 1: audio_float = audio_float.mean(axis=1) # Use sttlib transcribe text = transcribe(model, audio_float) # Filter hallucinations (English + Persian) if is_hallucination(text): return "" if text.strip() in PERSIAN_HALLUCINATIONS: return "" return text