Refactor tool-speechtotext: extract sttlib shared library and add tests

Extract duplicated code (Whisper loading, audio recording, transcription,
VAD processing) into reusable sttlib/ package. Rewrite all 3 scripts as
thin wrappers. Add 24 unit tests with mocked hardware. Fix GPU fallback
bug in assistant.py and args.system assignment bug.
This commit is contained in:
local
2026-02-08 00:40:31 +00:00
parent 848681087e
commit 104da381fb
15 changed files with 480 additions and 195 deletions

View File

@@ -0,0 +1,38 @@
import struct
import numpy as np
from sttlib.audio import pcm_bytes_to_float32
def test_known_value():
    """A single int16 sample of 16384 converts to approximately 0.5."""
    half_scale_pcm = struct.pack("<h", 16384)
    converted = pcm_bytes_to_float32(half_scale_pcm)
    error = abs(converted[0] - 0.5)
    assert error < 1e-5
def test_silence():
    """Ten all-zero samples convert to values that are all exactly 0.0."""
    zero_pcm = b"\x00\x00" * 10
    converted = pcm_bytes_to_float32(zero_pcm)
    # Equivalent to "every element equals zero": no element differs from zero.
    assert not np.any(converted != 0.0)
def test_full_scale():
    """The largest int16 value (32767) converts to about 32767/32768."""
    max_int16 = 32767
    converted = pcm_bytes_to_float32(struct.pack("<h", max_int16))
    expected = max_int16 / 32768.0  # just below 1.0
    assert abs(converted[0] - expected) < 1e-5
def test_negative():
    """The smallest int16 value (-32768) converts to exactly -1.0."""
    min_int16 = -32768
    converted = pcm_bytes_to_float32(struct.pack("<h", min_int16))
    assert converted[0] == -1.0
def test_round_trip_shape():
    """100 two-byte samples produce a 1-D float32 result of length 100."""
    sample_count = 100
    converted = pcm_bytes_to_float32(b"\x00\x00" * sample_count)
    assert converted.shape == (sample_count,)
    assert converted.dtype == np.float32

View File

@@ -0,0 +1,78 @@
from unittest.mock import MagicMock
from sttlib.transcription import transcribe, is_hallucination
# --- is_hallucination tests ---
def test_known_hallucinations():
    """Each of the listed stock phrases is reported as a hallucination."""
    stock_phrases = (
        "Thank you",
        "thanks for watching",
        "Subscribe",
        "the end",
    )
    for phrase in stock_phrases:
        assert is_hallucination(phrase)
def test_short_text():
    """Strings of zero, one and two characters are reported as hallucinations."""
    for snippet in ("hi", "", "a"):
        assert is_hallucination(snippet)
def test_normal_text():
    """Ordinary sentences are not reported as hallucinations."""
    ordinary_sentences = (
        "Hello how are you",
        "Please open the terminal",
    )
    for sentence in ordinary_sentences:
        assert not is_hallucination(sentence)
def test_case_insensitivity():
    """Upper-case and title-case variants are still reported as hallucinations."""
    for variant in ("THANK YOU", "Thank You For Watching"):
        assert is_hallucination(variant)
def test_substring_match():
    """A longer sentence containing the word "subscribe" is still flagged."""
    sentence = "I want to subscribe to your channel"
    assert is_hallucination(sentence)
def test_exactly_three_chars():
    """The three-character string "hey" is not reported as a hallucination."""
    three_chars = "hey"
    assert not is_hallucination(three_chars)
# --- transcribe tests ---
def _make_segment(text):
seg = MagicMock()
seg.text = text
return seg
def test_transcribe_joins_segments():
    """Text from consecutive segments is combined into "Hello world"."""
    segments = [_make_segment("Hello "), _make_segment("world")]
    fake_model = MagicMock()
    # The mocked model returns a (segments, info) pair; info is None here.
    fake_model.transcribe.return_value = (segments, None)
    assert transcribe(fake_model, MagicMock()) == "Hello world"
def test_transcribe_empty():
    """With no segments from the model, transcribe() returns an empty string."""
    fake_model = MagicMock()
    no_segments = []
    fake_model.transcribe.return_value = (no_segments, None)
    text = transcribe(fake_model, MagicMock())
    assert text == ""
def test_transcribe_strips_whitespace():
    """Leading and trailing spaces in segment text are absent from the result."""
    padded_segment = _make_segment(" hello ")
    fake_model = MagicMock()
    fake_model.transcribe.return_value = ([padded_segment], None)
    assert transcribe(fake_model, MagicMock()) == "hello"
def test_transcribe_passes_beam_size():
    """transcribe() calls model.transcribe exactly once with beam_size=5."""
    fake_audio = MagicMock()
    fake_model = MagicMock()
    fake_model.transcribe.return_value = ([], None)

    transcribe(fake_model, fake_audio)

    fake_model.transcribe.assert_called_once_with(fake_audio, beam_size=5)

View File

@@ -0,0 +1,151 @@
from unittest.mock import patch, MagicMock
from sttlib.vad import VADProcessor, FRAME_DURATION_MS, MIN_UTTERANCE_FRAMES
def _make_vad_processor(aggressiveness=3, silence_threshold=0.8):
    """Build a VADProcessor while ``sttlib.vad.webrtcvad.Vad`` is mocked.

    Returns:
        A ``(processor, mock_vad)`` tuple, where ``mock_vad`` is the mock
        instance that calling ``Vad(...)`` returned during construction.
        Tests set ``mock_vad.is_speech.return_value`` to script detections.
    """
    fake_vad = MagicMock()
    # The patch only needs to be active while the processor is constructed.
    with patch("sttlib.vad.webrtcvad.Vad", return_value=fake_vad):
        processor = VADProcessor(aggressiveness, silence_threshold)
    return processor, fake_vad
def _frame(label="x"):
"""Return a fake 30ms frame (just needs to be distinct bytes)."""
return label.encode() * 960 # 480 samples * 2 bytes
def test_no_speech_returns_none():
    """With the VAD never reporting speech, every frame yields None."""
    processor, vad = _make_vad_processor()
    vad.is_speech.return_value = False
    outputs = [processor.process_frame(_frame()) for _ in range(100)]
    assert all(output is None for output in outputs)
def test_speech_then_silence_triggers_utterance():
    """Enough speech followed by silence makes process_frame return data."""
    processor, vad = _make_vad_processor(silence_threshold=0.3)

    # Speech phase: nothing may be emitted while the speaker is still talking.
    vad.is_speech.return_value = True
    speech_frames = MIN_UTTERANCE_FRAMES + 5
    for _ in range(speech_frames):
        assert processor.process_frame(_frame("s")) is None  # not done yet

    # Silence phase: feed up to 20 frames, stopping at the first non-None
    # result (0.3s = 10 frames at 30ms).
    vad.is_speech.return_value = False
    utterance = None
    frames_fed = 0
    while utterance is None and frames_fed < 20:
        utterance = processor.process_frame(_frame("q"))
        frames_fed += 1

    assert utterance is not None
    assert len(utterance) > 0
def test_short_utterance_filtered():
    """A single speech frame followed by silence never yields an utterance."""
    # A very short silence threshold keeps the silence frames from pushing the
    # total past MIN_UTTERANCE_FRAMES. With threshold=0.09s (3 frames of
    # silence): 0 pre-buffer + 1 speech + 3 silence = 4 total, which is below
    # MIN_UTTERANCE_FRAMES (10).
    processor, vad = _make_vad_processor(silence_threshold=0.09)

    # One speech frame is enough to trigger the VAD.
    vad.is_speech.return_value = True
    processor.process_frame(_frame("s"))

    # Go silent immediately; the threshold is reached after 3 frames.
    vad.is_speech.return_value = False
    emitted = None
    frames_fed = 0
    while emitted is None and frames_fed < 20:
        emitted = processor.process_frame(_frame("q"))
        frames_fed += 1

    # Too short (only 4 total frames), so nothing should have been emitted.
    assert emitted is None
def test_pre_buffer_included():
    """Frames fed before speech started appear in the emitted utterance."""
    processor, vad = _make_vad_processor(silence_threshold=0.3)
    quiet_lead_in = _frame("p")
    spoken = _frame("s")

    # Ten non-speech frames go in first to fill the pre-buffer.
    vad.is_speech.return_value = False
    for _ in range(10):
        processor.process_frame(quiet_lead_in)

    # Speech begins.
    vad.is_speech.return_value = True
    for _ in range(MIN_UTTERANCE_FRAMES):
        processor.process_frame(spoken)

    # Silence until an utterance is emitted (at most 20 frames).
    vad.is_speech.return_value = False
    utterance = None
    frames_fed = 0
    while utterance is None and frames_fed < 20:
        utterance = processor.process_frame(_frame("q"))
        frames_fed += 1

    assert utterance is not None
    # The lead-in frame fed before speech must be part of the result.
    assert quiet_lead_in in utterance
def test_reset_after_utterance():
    """After emitting an utterance the processor resets and can emit another.

    Checks that ``triggered`` is falsy and ``utterance_frames`` is empty once
    the first utterance has been returned, then repeats the speech/silence
    sequence and expects a second utterance.
    """
    proc, mock_vad = _make_vad_processor(silence_threshold=0.3)

    # First utterance
    mock_vad.is_speech.return_value = True
    for _ in range(MIN_UTTERANCE_FRAMES + 5):
        proc.process_frame(_frame("s"))
    mock_vad.is_speech.return_value = False
    # Bind the name before the loop, matching result2 below and the other
    # tests, so the assertion never relies on the loop body having run.
    result = None
    for _ in range(20):
        result = proc.process_frame(_frame("q"))
        if result is not None:
            break
    assert result is not None

    # After reset, should be able to collect a second utterance
    assert not proc.triggered
    assert proc.utterance_frames == []

    mock_vad.is_speech.return_value = True
    for _ in range(MIN_UTTERANCE_FRAMES + 5):
        proc.process_frame(_frame("s"))
    mock_vad.is_speech.return_value = False
    result2 = None
    for _ in range(20):
        result2 = proc.process_frame(_frame("q"))
        if result2 is not None:
            break
    assert result2 is not None
def test_silence_threshold_boundary():
    """An utterance is emitted on the 10th silent frame and not before."""
    # With a 0.3s threshold, 0.3 / 0.03 = exactly 10 silent frames are needed.
    silence_threshold_s = 0.3
    silent_frames_required = 10  # 0.3s / 0.03s per frame
    processor, vad = _make_vad_processor(silence_threshold=silence_threshold_s)

    # Begin with speech.
    vad.is_speech.return_value = True
    for _ in range(MIN_UTTERANCE_FRAMES + 5):
        processor.process_frame(_frame("s"))

    vad.is_speech.return_value = False

    # One frame fewer than required must NOT trigger.
    for i in range(silent_frames_required - 1):
        early = processor.process_frame(_frame("q"))
        assert early is None, f"Triggered too early at frame {i}"

    # The 10th frame should trigger (silence_duration = 0.3 >= 0.3).
    final = processor.process_frame(_frame("q"))
    assert final is not None

View File

@@ -0,0 +1,37 @@
from unittest.mock import patch, MagicMock
from sttlib.whisper_loader import load_whisper_model
@patch("sttlib.whisper_loader.WhisperModel")
def test_gpu_success(mock_cls):
    """If the first construction succeeds, that CUDA/float16 model is returned."""
    gpu_model = MagicMock()
    mock_cls.return_value = gpu_model

    loaded = load_whisper_model("base")

    assert loaded is gpu_model
    mock_cls.assert_called_once_with(
        "base",
        device="cuda",
        compute_type="float16",
    )
@patch("sttlib.whisper_loader.WhisperModel")
def test_gpu_fails_cpu_fallback(mock_cls):
    """If the first construction raises, a second CPU/int8 attempt is returned."""
    cpu_model = MagicMock()
    # First call raises, second call returns the model.
    mock_cls.side_effect = [RuntimeError("no CUDA"), cpu_model]

    loaded = load_whisper_model("base")

    assert loaded is cpu_model
    assert mock_cls.call_count == 2
    # call_args holds the most recent call; index 1 is its keyword arguments.
    last_call_kwargs = mock_cls.call_args[1]
    assert last_call_kwargs == dict(device="cpu", compute_type="int8")
@patch("sttlib.whisper_loader.WhisperModel")
def test_both_fail_propagates(mock_cls):
    """If every construction attempt raises, the RuntimeError reaches the caller.

    The "did not raise" failure is signalled with an explicit
    ``raise AssertionError`` rather than ``assert False``: assert statements
    are removed under ``python -O``, which would let the test pass silently.
    """
    # Every WhisperModel(...) call raises, so no attempt can succeed.
    mock_cls.side_effect = RuntimeError("no device")
    try:
        load_whisper_model("base")
    except RuntimeError:
        return  # expected: the error propagated
    raise AssertionError("Should have raised")