Files
Code/python/tool-speechtotext/tests/test_vad.py
local 104da381fb Refactor tool-speechtotext: extract sttlib shared library and add tests
Extract duplicated code (Whisper loading, audio recording, transcription,
VAD processing) into reusable sttlib/ package. Rewrite all 3 scripts as
thin wrappers. Add 24 unit tests with mocked hardware. Fix GPU fallback
bug in assistant.py and args.system assignment bug.
2026-02-08 00:40:31 +00:00

152 lines
4.7 KiB
Python

from unittest.mock import patch, MagicMock
from sttlib.vad import VADProcessor, FRAME_DURATION_MS, MIN_UTTERANCE_FRAMES
def _make_vad_processor(aggressiveness=3, silence_threshold=0.8):
    """Build a VADProcessor whose webrtcvad.Vad backend is a MagicMock.

    ``sttlib.vad.webrtcvad.Vad`` is patched only while the processor is
    being constructed, so no real webrtcvad object is created.

    Args:
        aggressiveness: Forwarded as the first positional argument of
            ``VADProcessor``.
        silence_threshold: Forwarded as the second positional argument of
            ``VADProcessor``.

    Returns:
        A ``(processor, vad_double)`` tuple. ``vad_double`` is the object
        the patched ``Vad`` class returns when called; tests script its
        ``is_speech`` return value (presumably the processor keeps a
        reference to it -- verify against ``sttlib.vad``).
    """
    vad_double = MagicMock()
    with patch("sttlib.vad.webrtcvad.Vad") as vad_factory:
        vad_factory.return_value = vad_double
        processor = VADProcessor(aggressiveness, silence_threshold)
    return processor, vad_double
def _frame(label="x"):
"""Return a fake 30ms frame (just needs to be distinct bytes)."""
return label.encode() * 960 # 480 samples * 2 bytes
def test_no_speech_returns_none():
    """A stream the VAD never classifies as speech must yield no utterance."""
    processor, vad = _make_vad_processor()
    vad.is_speech.return_value = False

    # all() short-circuits, so processing stops at the first unexpected
    # non-None result, just like an assert inside an explicit loop.
    assert all(processor.process_frame(_frame()) is None for _ in range(100))
def test_speech_then_silence_triggers_utterance():
    """Sufficient speech followed by silence must emit a non-empty utterance."""
    processor, vad = _make_vad_processor(silence_threshold=0.3)

    # Speech phase: nothing may be emitted while speech is still ongoing.
    vad.is_speech.return_value = True
    for _ in range(MIN_UTTERANCE_FRAMES + 5):
        assert processor.process_frame(_frame("s")) is None  # not done yet

    # Silence phase: feed up to 20 frames (0.3s = 10 frames at 30ms) and
    # stop at the first frame that produces a result.
    vad.is_speech.return_value = False
    silence_outputs = (processor.process_frame(_frame("q")) for _ in range(20))
    utterance = next((out for out in silence_outputs if out is not None), None)

    assert utterance is not None
    assert len(utterance) > 0
def test_short_utterance_filtered():
    """An utterance shorter than MIN_UTTERANCE_FRAMES must be discarded."""
    # A very short silence threshold keeps the trailing silence from pushing
    # the total past MIN_UTTERANCE_FRAMES. With threshold=0.09s (3 frames of
    # silence): 0 pre-buffer + 1 speech + 3 silence = 4 total frames, which
    # is below MIN_UTTERANCE_FRAMES (10).
    processor, vad = _make_vad_processor(silence_threshold=0.09)

    # One speech frame is enough to trigger the VAD.
    vad.is_speech.return_value = True
    processor.process_frame(_frame("s"))

    # Go silent right away; scan up to 20 frames for any emitted result.
    vad.is_speech.return_value = False
    silence_outputs = (processor.process_frame(_frame("q")) for _ in range(20))
    emitted = next((out for out in silence_outputs if out is not None), None)

    # Too short (only 4 total frames), so nothing should have been emitted.
    assert emitted is None
def test_pre_buffer_included():
    """Frames received before speech starts must appear in the utterance."""
    processor, vad = _make_vad_processor(silence_threshold=0.3)
    pre_frame = _frame("p")
    speech_frame = _frame("s")

    # Non-speech frames first, to populate the pre-buffer.
    vad.is_speech.return_value = False
    for _ in range(10):
        processor.process_frame(pre_frame)

    # Then speech begins.
    vad.is_speech.return_value = True
    for _ in range(MIN_UTTERANCE_FRAMES):
        processor.process_frame(speech_frame)

    # Finally silence, until the processor hands back a result.
    vad.is_speech.return_value = False
    silence_outputs = (processor.process_frame(_frame("q")) for _ in range(20))
    utterance = next((out for out in silence_outputs if out is not None), None)

    assert utterance is not None
    # The emitted utterance must contain the pre-buffered frame.
    assert pre_frame in utterance
def test_reset_after_utterance():
    """After emitting an utterance the processor must accept a second one."""
    processor, vad = _make_vad_processor(silence_threshold=0.3)

    def speak_then_pause():
        """Feed speech frames, then silence; return the first non-None result."""
        vad.is_speech.return_value = True
        for _ in range(MIN_UTTERANCE_FRAMES + 5):
            processor.process_frame(_frame("s"))
        vad.is_speech.return_value = False
        silence_outputs = (
            processor.process_frame(_frame("q")) for _ in range(20)
        )
        return next((out for out in silence_outputs if out is not None), None)

    # First utterance.
    first = speak_then_pause()
    assert first is not None

    # The processor state must be cleared once the utterance is emitted.
    assert not processor.triggered
    assert processor.utterance_frames == []

    # A second utterance must be collectable after the reset.
    second = speak_then_pause()
    assert second is not None
def test_silence_threshold_boundary():
    """The utterance must be emitted on exactly the threshold-reaching frame.

    With a 0.3s silence threshold, every silence frame before the last
    required one must return None, and the frame that brings the silence
    duration up to the threshold must return the utterance.
    """
    threshold = 0.3
    proc, mock_vad = _make_vad_processor(silence_threshold=threshold)

    # Start with enough speech that the utterance is not filtered as short.
    mock_vad.is_speech.return_value = True
    for _ in range(MIN_UTTERANCE_FRAMES + 5):
        proc.process_frame(_frame("s"))

    # Derive the frame count from the library constant instead of
    # hard-coding it, so the test follows any change in frame duration.
    # Assumes FRAME_DURATION_MS is the per-frame duration in milliseconds
    # (30 according to this file's comments, giving 0.3s / 0.03s = 10).
    # round() absorbs floating-point noise from the division.
    frames_needed = round(threshold * 1000 / FRAME_DURATION_MS)

    mock_vad.is_speech.return_value = False

    # One frame short of the threshold: must NOT trigger.
    for i in range(frames_needed - 1):
        result = proc.process_frame(_frame("q"))
        assert result is None, f"Triggered too early at frame {i}"

    # The final frame reaches the threshold (silence duration >= threshold).
    result = proc.process_frame(_frame("q"))
    assert result is not None