# Code/python/tool-speechtotext/tests/test_vad.py

from unittest.mock import patch, MagicMock
from sttlib.vad import VADProcessor, FRAME_DURATION_MS, MIN_UTTERANCE_FRAMES


def _make_vad_processor(aggressiveness=3, silence_threshold=0.8):
    """Build a VADProcessor while ``sttlib.vad.webrtcvad.Vad`` is mocked out.

    The patch is active only for the duration of the ``VADProcessor(...)``
    call; during that window, calling ``webrtcvad.Vad`` returns the
    ``MagicMock`` created here (presumably the constructor instantiates its
    detector there -- confirm against ``sttlib.vad``).

    Args:
        aggressiveness: Forwarded as the first positional argument.
        silence_threshold: Forwarded as the second positional argument.

    Returns:
        A ``(processor, mock_vad)`` tuple. Tests steer the detector through
        ``mock_vad.is_speech.return_value``.
    """
    fake_vad = MagicMock()
    # Keyword arguments given to patch() configure the replacement mock, so
    # the patched class hands back ``fake_vad`` whenever it is called.
    with patch("sttlib.vad.webrtcvad.Vad", return_value=fake_vad):
        processor = VADProcessor(aggressiveness, silence_threshold)
    return processor, fake_vad


def _frame(label="x"):
    """Return a fake 30ms frame (just needs to be distinct bytes)."""
    return label.encode() * 960  # 480 samples * 2 bytes


def test_no_speech_returns_none():
    """When the VAD never reports speech, every processed frame yields None."""
    processor, vad = _make_vad_processor()
    vad.is_speech.return_value = False

    frames_fed = 0
    while frames_fed < 100:
        outcome = processor.process_frame(_frame())
        assert outcome is None
        frames_fed += 1


def test_speech_then_silence_triggers_utterance():
    """Speech followed by sustained silence produces a non-empty utterance."""
    processor, vad = _make_vad_processor(silence_threshold=0.3)

    # Speech phase: comfortably more frames than the minimum utterance
    # length, and nothing may be emitted while speech is still ongoing.
    vad.is_speech.return_value = True
    speech_frames_left = MIN_UTTERANCE_FRAMES + 5
    while speech_frames_left > 0:
        assert processor.process_frame(_frame("s")) is None  # not done yet
        speech_frames_left -= 1

    # Silence phase: feed at most 20 frames (the 0.3s threshold corresponds
    # to 10 frames at 30ms). The generator is lazy, so frames stop being fed
    # as soon as the first non-None result appears.
    vad.is_speech.return_value = False
    silence_outputs = (processor.process_frame(_frame("q")) for _ in range(20))
    utterance = next((out for out in silence_outputs if out is not None), None)

    assert utterance is not None
    assert len(utterance) > 0


def test_short_utterance_filtered():
    """A single speech frame followed by silence must not yield an utterance."""
    # The silence threshold is deliberately tiny so the trailing silence
    # cannot push the total past MIN_UTTERANCE_FRAMES. With 0.09s (3 frames
    # of silence) the intended arithmetic is:
    # 0 pre-buffer + 1 speech + 3 silence = 4 total < MIN_UTTERANCE_FRAMES (10)
    processor, vad = _make_vad_processor(silence_threshold=0.09)

    # One speech frame is enough to trigger the VAD.
    vad.is_speech.return_value = True
    processor.process_frame(_frame("s"))

    # Go silent straight away; keep feeding silence (20 frames at most) and
    # stop early only if something is emitted.
    vad.is_speech.return_value = False
    emitted = None
    attempts_left = 20
    while emitted is None and attempts_left > 0:
        emitted = processor.process_frame(_frame("q"))
        attempts_left -= 1

    # Too short (only 4 total frames), so nothing should have come out.
    assert emitted is None


def test_pre_buffer_included():
    """Frames seen before speech onset must appear in the emitted utterance."""
    processor, vad = _make_vad_processor(silence_threshold=0.3)
    leading_frame = _frame("p")
    voiced_frame = _frame("s")

    # Non-speech frames first, so they land in the pre-buffer.
    vad.is_speech.return_value = False
    for frame in [leading_frame] * 10:
        processor.process_frame(frame)

    # Speech begins.
    vad.is_speech.return_value = True
    for frame in [voiced_frame] * MIN_UTTERANCE_FRAMES:
        processor.process_frame(frame)

    # Trailing silence (20 frames at most) to close the utterance; stop at
    # the first frame that produces a result.
    vad.is_speech.return_value = False
    utterance = None
    silence_fed = 0
    while silence_fed < 20 and utterance is None:
        utterance = processor.process_frame(_frame("q"))
        silence_fed += 1

    assert utterance is not None
    # The pre-buffered frame bytes must be part of what was emitted.
    assert leading_frame in utterance


def test_reset_after_utterance():
    """After one utterance is emitted, state is cleared and a second works."""
    processor, vad = _make_vad_processor(silence_threshold=0.3)

    def speak_then_pause():
        """Feed a speech burst, then up to 20 silence frames.

        Returns the first non-None result from the silence phase, or None
        if no silence frame produced one.
        """
        vad.is_speech.return_value = True
        for _ in range(MIN_UTTERANCE_FRAMES + 5):
            processor.process_frame(_frame("s"))

        vad.is_speech.return_value = False
        for _ in range(20):
            emitted = processor.process_frame(_frame("q"))
            if emitted is not None:
                return emitted
        return None

    # First utterance.
    first = speak_then_pause()
    assert first is not None

    # The processor should be back in its idle state, ready to collect again.
    assert not processor.triggered
    assert processor.utterance_frames == []

    # Second utterance.
    second = speak_then_pause()
    assert second is not None


def test_silence_threshold_boundary():
    """The utterance is emitted on exactly the 10th silence frame, not before."""
    # With a 0.3s threshold and 0.03s per frame, exactly 10 silence frames
    # are needed (0.3 / 0.03).
    threshold = 0.3
    frames_needed = 10
    processor, vad = _make_vad_processor(silence_threshold=threshold)

    # Open with a speech burst longer than the minimum utterance length.
    vad.is_speech.return_value = True
    speech_frames = MIN_UTTERANCE_FRAMES + 5
    for _ in range(speech_frames):
        processor.process_frame(_frame("s"))

    vad.is_speech.return_value = False

    # One frame short of the threshold: nothing may be emitted yet.
    i = 0
    while i < frames_needed - 1:
        early = processor.process_frame(_frame("q"))
        assert early is None, f"Triggered too early at frame {i}"
        i += 1

    # The 10th silence frame reaches the threshold (silence_duration = 0.3 >= 0.3).
    final = processor.process_frame(_frame("q"))
    assert final is not None