Files
Code/python/persian-tutor/stt.py
dl92 3a8705ece8 Fix bugs, N+1 queries, and wire settings in persian-tutor
- Replace inline __import__("datetime").timedelta hack with proper import
- Remove unused import random in anki_export.py
- Add error handling for Claude CLI subprocess failures in ai.py
- Fix hardcoded absolute path in stt.py with relative Path resolution
- Fix N+1 DB queries in vocab.get_flashcard_batch and dashboard.get_category_breakdown
  by adding db.get_all_word_progress() batch query
- Wire Ollama model and Whisper size settings to actually update config
  via ai.set_ollama_model() and stt.set_whisper_size()

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-08 15:40:24 +00:00

78 lines
2.0 KiB
Python

"""Persian speech-to-text wrapper using sttlib."""
import sys
from pathlib import Path
import numpy as np
# sttlib lives in sibling project tool-speechtotext
_sttlib_path = str(Path(__file__).resolve().parent.parent / "tool-speechtotext")
sys.path.insert(0, _sttlib_path)
from sttlib import load_whisper_model, transcribe, is_hallucination
# Cached Whisper model instance; populated lazily by get_model() and
# reset to None by set_whisper_size() to force a reload.
_model = None
# Current Whisper model size; changed via set_whisper_size().
_whisper_size = "medium"
# Common Whisper hallucinations in Persian/silence
PERSIAN_HALLUCINATIONS = [
    "ممنون",  # "thank you" hallucination
    "خداحافظ",  # "goodbye" hallucination
    "تماشا کنید",  # "watch" hallucination
    "لایک کنید",  # "like" hallucination
]
def set_whisper_size(size):
    """Select a new Whisper model size.

    The cached model is dropped so the next get_model() call loads
    the requested size lazily; a no-op when the size is unchanged.
    """
    global _whisper_size, _model
    if size == _whisper_size:
        return
    _whisper_size = size
    _model = None
def get_model():
    """Return the singleton Whisper model, loading it on first use."""
    global _model
    if _model is not None:
        return _model
    _model = load_whisper_model(_whisper_size)
    return _model
def transcribe_persian(audio_tuple):
    """Transcribe Persian audio from Gradio audio component.

    Args:
        audio_tuple: (sample_rate, numpy_array) from gr.Audio component,
            or None when no audio was recorded.

    Returns:
        Transcribed text string, or empty string on failure/hallucination.
    """
    if audio_tuple is None:
        return ""
    sr, audio = audio_tuple
    model = get_model()
    # Convert to float32 normalized [-1, 1].
    if audio.dtype == np.int16:
        # Divide by 32768 (not iinfo max 32767) to match the common PCM
        # convention and the original behavior exactly.
        audio_float = audio.astype(np.float32) / 32768.0
    elif np.issubdtype(audio.dtype, np.floating):
        # Accept any float dtype (float32, float64, ...). The previous
        # code only special-cased float32 and fell through to np.iinfo,
        # which raises ValueError on float dtypes; Gradio may hand back
        # float64 arrays. Float input is assumed already normalized.
        audio_float = audio.astype(np.float32, copy=False)
    else:
        # Remaining cases are integer dtypes (e.g. int32): scale by the
        # dtype's maximum value.
        audio_float = audio.astype(np.float32) / np.iinfo(audio.dtype).max
    # Downmix stereo to mono
    if audio_float.ndim > 1:
        audio_float = audio_float.mean(axis=1)
    # Use sttlib transcribe
    text = transcribe(model, audio_float)
    # Filter hallucinations: sttlib covers English; the module-level
    # Persian list covers silence artifacts specific to this app.
    if is_hallucination(text) or text.strip() in PERSIAN_HALLUCINATIONS:
        return ""
    return text