Extract duplicated code (Whisper loading, audio recording, transcription, VAD processing) into a reusable sttlib/ package. Rewrite all 3 scripts as thin wrappers. Add 24 unit tests with mocked hardware. Fix the GPU fallback bug in assistant.py and the args.system assignment bug.
152 lines
4.7 KiB
Python
152 lines
4.7 KiB
Python
from unittest.mock import patch, MagicMock
|
|
from sttlib.vad import VADProcessor, FRAME_DURATION_MS, MIN_UTTERANCE_FRAMES
|
|
|
|
|
|
def _make_vad_processor(aggressiveness=3, silence_threshold=0.8):
    """Build a VADProcessor while ``sttlib.vad.webrtcvad.Vad`` is mocked.

    Returns a ``(processor, vad_mock)`` tuple, where ``vad_mock`` is the
    MagicMock that the patched ``Vad`` constructor hands back.
    """
    fake_vad = MagicMock()
    # The patch is only active while the processor is being constructed;
    # presumably the processor keeps the object returned by Vad(...), which
    # is why the tests can steer it through ``fake_vad`` afterwards.
    with patch("sttlib.vad.webrtcvad.Vad", return_value=fake_vad):
        processor = VADProcessor(aggressiveness, silence_threshold)
    return processor, fake_vad
|
|
|
|
|
|
def _frame(label="x"):
|
|
"""Return a fake 30ms frame (just needs to be distinct bytes)."""
|
|
return label.encode() * 960 # 480 samples * 2 bytes
|
|
|
|
|
|
def test_no_speech_returns_none():
    """process_frame returns None for every frame when the VAD sees no speech."""
    processor, vad = _make_vad_processor()
    vad.is_speech.return_value = False

    # 100 consecutive non-speech frames, each checked individually.
    for silent_frame in (_frame() for _ in range(100)):
        assert processor.process_frame(silent_frame) is None
|
|
|
|
|
|
def test_speech_then_silence_triggers_utterance():
    """Speech followed by silence makes process_frame return a non-empty result."""
    processor, vad = _make_vad_processor(silence_threshold=0.3)

    # Speech phase: a few frames more than the minimum utterance length.
    vad.is_speech.return_value = True
    for _ in range(MIN_UTTERANCE_FRAMES + 5):
        # Nothing may be returned while speech is still being reported.
        assert processor.process_frame(_frame("s")) is None

    # Silence phase: feed at most 20 frames and stop at the first non-None
    # return value (0.3s = 10 frames at 30ms). The generator is lazy, so no
    # further frames are processed once a result has been found.
    vad.is_speech.return_value = False
    silent_outputs = (processor.process_frame(_frame("q")) for _ in range(20))
    utterance = next((out for out in silent_outputs if out is not None), None)

    assert utterance is not None
    assert len(utterance) > 0
|
|
|
|
|
|
def test_short_utterance_filtered():
    """A single speech frame followed by silence must not produce a result."""
    # The silence threshold is kept very short so the silence frames do not
    # push the total past MIN_UTTERANCE_FRAMES. With threshold=0.09s
    # (3 frames of silence):
    # 0 pre-buffer + 1 speech + 3 silence = 4 total < MIN_UTTERANCE_FRAMES (10)
    processor, vad = _make_vad_processor(silence_threshold=0.09)

    # One speech frame is enough to trigger the VAD.
    vad.is_speech.return_value = True
    processor.process_frame(_frame("s"))

    # Go silent right away; the threshold is reached within 3 frames.
    # Up to 20 silence frames are fed, stopping early at the first non-None
    # return value.
    vad.is_speech.return_value = False
    silent_outputs = (processor.process_frame(_frame("q")) for _ in range(20))
    emitted = next((out for out in silent_outputs if out is not None), None)

    # Expected to be filtered out (too short, only 4 total frames).
    assert emitted is None
|
|
|
|
|
|
def test_pre_buffer_included():
    """Frames seen before speech starts must show up in the returned result."""
    processor, vad = _make_vad_processor(silence_threshold=0.3)

    # Non-speech frames first, to fill the pre-buffer.
    leading_frame = _frame("p")
    vad.is_speech.return_value = False
    for _ in range(10):
        processor.process_frame(leading_frame)

    # Then exactly MIN_UTTERANCE_FRAMES frames of speech.
    spoken_frame = _frame("s")
    vad.is_speech.return_value = True
    for _ in range(MIN_UTTERANCE_FRAMES):
        processor.process_frame(spoken_frame)

    # Finally silence: at most 20 frames, stopping at the first non-None
    # return value.
    vad.is_speech.return_value = False
    silent_outputs = (processor.process_frame(_frame("q")) for _ in range(20))
    utterance = next((out for out in silent_outputs if out is not None), None)

    assert utterance is not None
    # The pre-buffer frame has to be contained in the result.
    assert leading_frame in utterance
|
|
|
|
|
|
def test_reset_after_utterance():
    """After one utterance the processor is reset and can return a second one."""
    processor, vad = _make_vad_processor(silence_threshold=0.3)

    def speak_then_pause():
        """Feed speech, then silence; return the first non-None result or None."""
        vad.is_speech.return_value = True
        for _ in range(MIN_UTTERANCE_FRAMES + 5):
            processor.process_frame(_frame("s"))

        vad.is_speech.return_value = False
        for _ in range(20):
            output = processor.process_frame(_frame("q"))
            if output is not None:
                return output
        return None

    # First utterance.
    first = speak_then_pause()
    assert first is not None

    # The processor state must be cleared before the next utterance.
    assert not processor.triggered
    assert processor.utterance_frames == []

    # Second utterance, collected with the same processor instance.
    second = speak_then_pause()
    assert second is not None
|
|
|
|
|
|
def test_silence_threshold_boundary():
    """The result appears on the 10th silence frame and not on any earlier one."""
    # With a 0.3s threshold, 0.3 / 0.03 = exactly 10 frames are needed.
    silence_seconds = 0.3
    processor, vad = _make_vad_processor(silence_threshold=silence_seconds)

    # Begin with speech.
    vad.is_speech.return_value = True
    for _ in range(MIN_UTTERANCE_FRAMES + 5):
        processor.process_frame(_frame("s"))

    vad.is_speech.return_value = False
    frames_needed = 10  # 0.3s / 0.03s per frame

    # One frame fewer than needed: none of these may return a result.
    for i in range(frames_needed - 1):
        early = processor.process_frame(_frame("q"))
        assert early is None, f"Triggered too early at frame {i}"

    # The 10th frame is expected to trigger (silence_duration = 0.3 >= 0.3).
    final = processor.process_frame(_frame("q"))
    assert final is not None
|