Refactor tool-speechtotext: extract sttlib shared library and add tests
Extract duplicated code (Whisper loading, audio recording, transcription, VAD processing) into reusable sttlib/ package. Rewrite all 3 scripts as thin wrappers. Add 24 unit tests with mocked hardware. Fix GPU fallback bug in assistant.py and args.system assignment bug.
This commit is contained in:
0
python/tool-speechtotext/tests/__init__.py
Normal file
0
python/tool-speechtotext/tests/__init__.py
Normal file
38
python/tool-speechtotext/tests/test_audio.py
Normal file
38
python/tool-speechtotext/tests/test_audio.py
Normal file
@@ -0,0 +1,38 @@
|
||||
import struct
|
||||
import numpy as np
|
||||
from sttlib.audio import pcm_bytes_to_float32
|
||||
|
||||
|
||||
def test_known_value():
    """A half-scale int16 sample (16384) converts to roughly 0.5."""
    half_scale_sample = struct.pack("<h", 16384)
    converted = pcm_bytes_to_float32(half_scale_sample)
    deviation = abs(converted[0] - 0.5)
    assert deviation < 1e-5
|
||||
|
||||
|
||||
def test_silence():
    """Ten all-zero PCM samples convert to an all-zero result."""
    silent_pcm = bytes(20)  # 10 samples * 2 zero bytes each
    converted = pcm_bytes_to_float32(silent_pcm)
    is_zero = converted == 0.0
    assert np.all(is_zero)
|
||||
|
||||
|
||||
def test_full_scale():
    """The largest int16 value (32767) converts to 32767/32768, i.e. ~1.0."""
    max_int16 = 32767
    expected = max_int16 / 32768.0
    converted = pcm_bytes_to_float32(struct.pack("<h", max_int16))
    assert abs(converted[0] - expected) < 1e-5
|
||||
|
||||
|
||||
def test_negative():
    """The smallest int16 value (-32768) converts to exactly -1.0."""
    min_int16 = -32768
    converted = pcm_bytes_to_float32(struct.pack("<h", min_int16))
    first_sample = converted[0]
    assert first_sample == -1.0
|
||||
|
||||
|
||||
def test_round_trip_shape():
    """100 samples of PCM bytes yield a 1-D float32 array of length 100."""
    sample_count = 100
    converted = pcm_bytes_to_float32(b"\x00\x00" * sample_count)
    assert converted.shape == (sample_count,)
    assert converted.dtype == np.float32
|
||||
78
python/tool-speechtotext/tests/test_transcription.py
Normal file
78
python/tool-speechtotext/tests/test_transcription.py
Normal file
@@ -0,0 +1,78 @@
|
||||
from unittest.mock import MagicMock
|
||||
from sttlib.transcription import transcribe, is_hallucination
|
||||
|
||||
|
||||
# --- is_hallucination tests ---
|
||||
|
||||
def test_known_hallucinations():
    """Each of the listed stock phrases is flagged as a hallucination."""
    stock_phrases = (
        "Thank you",
        "thanks for watching",
        "Subscribe",
        "the end",
    )
    for phrase in stock_phrases:
        assert is_hallucination(phrase)
|
||||
|
||||
|
||||
def test_short_text():
    """Strings of zero, one, or two characters are flagged."""
    for too_short in ("hi", "", "a"):
        assert is_hallucination(too_short)
|
||||
|
||||
|
||||
def test_normal_text():
    """Ordinary sentences are not flagged as hallucinations."""
    ordinary_sentences = ("Hello how are you", "Please open the terminal")
    for sentence in ordinary_sentences:
        assert not is_hallucination(sentence)
|
||||
|
||||
|
||||
def test_case_insensitivity():
    """Upper-case and title-case variants of stock phrases are still flagged."""
    for variant in ("THANK YOU", "Thank You For Watching"):
        assert is_hallucination(variant)
|
||||
|
||||
|
||||
def test_substring_match():
    """A sentence that merely contains "subscribe" is flagged."""
    sentence = "I want to subscribe to your channel"
    assert is_hallucination(sentence)
|
||||
|
||||
|
||||
def test_exactly_three_chars():
    """A three-character word ("hey") is not flagged."""
    flagged = is_hallucination("hey")
    assert not flagged
|
||||
|
||||
|
||||
# --- transcribe tests ---
|
||||
|
||||
def _make_segment(text):
    """Return a MagicMock whose ``text`` attribute is set to ``text``."""
    # Keyword arguments to MagicMock are applied as attributes on the mock.
    return MagicMock(text=text)
|
||||
|
||||
|
||||
def test_transcribe_joins_segments():
    """Text from two consecutive segments comes back as one joined string."""
    segments = [_make_segment(piece) for piece in ("Hello ", "world")]
    fake_model = MagicMock()
    fake_model.transcribe.return_value = (segments, None)

    transcript = transcribe(fake_model, MagicMock())

    assert transcript == "Hello world"
|
||||
|
||||
|
||||
def test_transcribe_empty():
    """With no segments from the model, the transcript is the empty string."""
    no_segments = ([], None)
    fake_model = MagicMock()
    fake_model.transcribe.return_value = no_segments

    assert transcribe(fake_model, MagicMock()) == ""
|
||||
|
||||
|
||||
def test_transcribe_strips_whitespace():
    """Leading and trailing whitespace is removed from the transcript."""
    padded_segment = _make_segment(" hello ")
    fake_model = MagicMock()
    fake_model.transcribe.return_value = ([padded_segment], None)

    transcript = transcribe(fake_model, MagicMock())

    assert transcript == "hello"
|
||||
|
||||
|
||||
def test_transcribe_passes_beam_size():
    """The model is called exactly once, with the audio and ``beam_size=5``."""
    fake_audio = MagicMock()
    fake_model = MagicMock()
    fake_model.transcribe.return_value = ([], None)

    transcribe(fake_model, fake_audio)

    fake_model.transcribe.assert_called_once_with(fake_audio, beam_size=5)
|
||||
151
python/tool-speechtotext/tests/test_vad.py
Normal file
151
python/tool-speechtotext/tests/test_vad.py
Normal file
@@ -0,0 +1,151 @@
|
||||
from unittest.mock import patch, MagicMock
|
||||
from sttlib.vad import VADProcessor, FRAME_DURATION_MS, MIN_UTTERANCE_FRAMES
|
||||
|
||||
|
||||
def _make_vad_processor(aggressiveness=3, silence_threshold=0.8):
    """Build a VADProcessor while ``sttlib.vad.webrtcvad.Vad`` is patched.

    During construction, calling the patched ``Vad`` returns a MagicMock.

    Returns:
        A ``(processor, fake_vad)`` tuple: the VADProcessor and that
        MagicMock, so tests can script ``fake_vad.is_speech``.
    """
    fake_vad = MagicMock()
    # Extra keyword arguments to patch() configure the replacement mock.
    with patch("sttlib.vad.webrtcvad.Vad", return_value=fake_vad):
        processor = VADProcessor(aggressiveness, silence_threshold)
    return processor, fake_vad
|
||||
|
||||
|
||||
def _frame(label="x"):
    """Build a fake 30ms frame: ``label`` encoded to bytes, repeated 960 times.

    Different labels give distinguishable frame contents.
    """
    repeat_count = 960  # 480 samples * 2 bytes
    encoded_label = label.encode()
    return encoded_label * repeat_count
|
||||
|
||||
|
||||
def test_no_speech_returns_none():
    """While the VAD reports no speech, 100 frames in a row all return None."""
    processor, fake_vad = _make_vad_processor()
    fake_vad.is_speech.return_value = False

    # The generator stops at the first non-None result, like a failing assert.
    assert all(processor.process_frame(_frame()) is None for _ in range(100))
|
||||
|
||||
|
||||
def test_speech_then_silence_triggers_utterance():
    """Enough speech followed by enough silence yields a non-empty utterance."""
    processor, fake_vad = _make_vad_processor(silence_threshold=0.3)

    # Speech phase: nothing may be emitted while speech is ongoing.
    fake_vad.is_speech.return_value = True
    speech_frames = MIN_UTTERANCE_FRAMES + 5
    for _ in range(speech_frames):
        assert processor.process_frame(_frame("s")) is None  # not done yet

    # Silence phase: feed up to 20 frames, stopping at the first result
    # (0.3s threshold = 10 frames at 30ms).
    fake_vad.is_speech.return_value = False
    silence_outputs = (processor.process_frame(_frame("q")) for _ in range(20))
    utterance = next((out for out in silence_outputs if out is not None), None)

    assert utterance is not None
    assert len(utterance) > 0
|
||||
|
||||
|
||||
def test_short_utterance_filtered():
    """A single speech frame followed by silence never yields an utterance.

    The silence threshold is kept very short (0.09s = 3 frames of silence)
    so that the silence frames do not push the total past
    MIN_UTTERANCE_FRAMES: 0 pre-buffer + 1 speech + 3 silence = 4 total,
    which is below MIN_UTTERANCE_FRAMES (10).
    """
    processor, fake_vad = _make_vad_processor(silence_threshold=0.09)

    # One speech frame is enough to trigger the VAD.
    fake_vad.is_speech.return_value = True
    processor.process_frame(_frame("s"))

    # Go silent straight away; the threshold is reached within 3 frames.
    fake_vad.is_speech.return_value = False
    silence_outputs = (processor.process_frame(_frame("q")) for _ in range(20))
    utterance = next((out for out in silence_outputs if out is not None), None)

    # Too short (only 4 total frames), so it should have been filtered.
    assert utterance is None
|
||||
|
||||
|
||||
def test_pre_buffer_included():
    """Frames fed before speech starts show up in the emitted utterance."""
    processor, fake_vad = _make_vad_processor(silence_threshold=0.3)
    leading_frame = _frame("p")
    spoken_frame = _frame("s")

    # Non-speech frames first, to fill the pre-buffer.
    fake_vad.is_speech.return_value = False
    for _ in range(10):
        processor.process_frame(leading_frame)

    # Then the speech itself.
    fake_vad.is_speech.return_value = True
    for _ in range(MIN_UTTERANCE_FRAMES):
        processor.process_frame(spoken_frame)

    # Finally silence, until the utterance is emitted (at most 20 frames).
    fake_vad.is_speech.return_value = False
    silence_outputs = (processor.process_frame(_frame("q")) for _ in range(20))
    utterance = next((out for out in silence_outputs if out is not None), None)

    assert utterance is not None
    # The pre-buffer frame content must be part of the result.
    assert leading_frame in utterance
|
||||
|
||||
|
||||
def test_reset_after_utterance():
    """After emitting an utterance the processor resets and can emit another."""
    processor, fake_vad = _make_vad_processor(silence_threshold=0.3)
    speech_frames = MIN_UTTERANCE_FRAMES + 5

    # --- first utterance ---
    fake_vad.is_speech.return_value = True
    for _ in range(speech_frames):
        processor.process_frame(_frame("s"))

    fake_vad.is_speech.return_value = False
    first_outputs = (processor.process_frame(_frame("q")) for _ in range(20))
    first_utterance = next((out for out in first_outputs if out is not None), None)
    assert first_utterance is not None

    # State must be cleared so a second utterance can be collected.
    assert not processor.triggered
    assert processor.utterance_frames == []

    # --- second utterance ---
    fake_vad.is_speech.return_value = True
    for _ in range(speech_frames):
        processor.process_frame(_frame("s"))

    fake_vad.is_speech.return_value = False
    second_outputs = (processor.process_frame(_frame("q")) for _ in range(20))
    second_utterance = next((out for out in second_outputs if out is not None), None)
    assert second_utterance is not None
|
||||
|
||||
|
||||
def test_silence_threshold_boundary():
    """The utterance is emitted on the 10th silence frame, not before.

    With a 0.3s threshold and 0.03s per frame, exactly 10 silence frames
    are needed.
    """
    threshold = 0.3
    frames_needed = 10  # 0.3s / 0.03s per frame
    processor, fake_vad = _make_vad_processor(silence_threshold=threshold)

    # Begin with a run of speech.
    fake_vad.is_speech.return_value = True
    for _ in range(MIN_UTTERANCE_FRAMES + 5):
        processor.process_frame(_frame("s"))

    fake_vad.is_speech.return_value = False

    # One frame short of the threshold: nothing may be emitted yet.
    for i in range(frames_needed - 1):
        early = processor.process_frame(_frame("q"))
        assert early is None, f"Triggered too early at frame {i}"

    # The 10th frame reaches the threshold (silence_duration = 0.3 >= 0.3).
    final = processor.process_frame(_frame("q"))
    assert final is not None
|
||||
37
python/tool-speechtotext/tests/test_whisper_loader.py
Normal file
37
python/tool-speechtotext/tests/test_whisper_loader.py
Normal file
@@ -0,0 +1,37 @@
|
||||
from unittest.mock import patch, MagicMock
|
||||
from sttlib.whisper_loader import load_whisper_model
|
||||
|
||||
|
||||
@patch("sttlib.whisper_loader.WhisperModel")
def test_gpu_success(mock_cls):
    """When WhisperModel succeeds first time, it is built once on CUDA/float16."""
    # A mock's default return_value is a stable child mock, so it can be
    # compared by identity without assigning a fresh MagicMock first.
    expected_model = mock_cls.return_value

    loaded = load_whisper_model("base")

    assert loaded is expected_model
    mock_cls.assert_called_once_with("base", device="cuda", compute_type="float16")
|
||||
|
||||
|
||||
@patch("sttlib.whisper_loader.WhisperModel")
def test_gpu_fails_cpu_fallback(mock_cls):
    """If the first construction raises, a second one uses cpu/int8."""
    cpu_model = MagicMock()
    # First call raises, second call returns the model.
    mock_cls.side_effect = [RuntimeError("no CUDA"), cpu_model]

    loaded = load_whisper_model("base")

    assert loaded is cpu_model
    assert mock_cls.call_count == 2
    last_call_kwargs = mock_cls.call_args[1]
    assert last_call_kwargs == dict(device="cpu", compute_type="int8")
|
||||
|
||||
|
||||
@patch("sttlib.whisper_loader.WhisperModel")
def test_both_fail_propagates(mock_cls):
    """If every WhisperModel construction raises, the RuntimeError escapes."""
    mock_cls.side_effect = RuntimeError("no device")

    raised = False
    try:
        load_whisper_model("base")
    except RuntimeError:
        raised = True

    assert raised, "Should have raised"
|
||||
Reference in New Issue
Block a user