Refactor tool-speechtotext: extract sttlib shared library and add tests

Extract duplicated code (Whisper loading, audio recording, transcription, VAD processing) into reusable sttlib/ package. Rewrite all 3 scripts as thin wrappers. Add 24 unit tests with mocked hardware. Fix GPU fallback bug in assistant.py and args.system assignment bug.
2026-02-08 00:40:31 +00:00
parent 848681087e
commit 104da381fb
15 changed files with 480 additions and 195 deletions
--- a/python/tool-speechtotext/tests/__init__.py
+++ b/python/tool-speechtotext/tests/__init__.py
--- a/python/tool-speechtotext/tests/test_audio.py
+++ b/python/tool-speechtotext/tests/test_audio.py
@@ -0,0 +1,38 @@
+import struct
+import numpy as np
+from sttlib.audio import pcm_bytes_to_float32
+
+
+def test_known_value():
+    # 16384 in int16 -> 0.5 in float32
+    pcm = struct.pack("<h", 16384)
+    result = pcm_bytes_to_float32(pcm)
+    assert abs(result[0] - 0.5) < 1e-5
+
+
+def test_silence():
+    pcm = b"\x00\x00" * 10
+    result = pcm_bytes_to_float32(pcm)
+    assert np.all(result == 0.0)
+
+
+def test_full_scale():
+    # max int16 = 32767 -> ~1.0
+    pcm = struct.pack("<h", 32767)
+    result = pcm_bytes_to_float32(pcm)
+    assert abs(result[0] - (32767 / 32768.0)) < 1e-5
+
+
+def test_negative():
+    # min int16 = -32768 -> -1.0
+    pcm = struct.pack("<h", -32768)
+    result = pcm_bytes_to_float32(pcm)
+    assert result[0] == -1.0
+
+
+def test_round_trip_shape():
+    # 100 samples worth of bytes
+    pcm = b"\x00\x00" * 100
+    result = pcm_bytes_to_float32(pcm)
+    assert result.shape == (100,)
+    assert result.dtype == np.float32
--- a/python/tool-speechtotext/tests/test_transcription.py
+++ b/python/tool-speechtotext/tests/test_transcription.py
@@ -0,0 +1,78 @@
+from unittest.mock import MagicMock
+from sttlib.transcription import transcribe, is_hallucination
+
+
+# --- is_hallucination tests ---
+
+def test_known_hallucinations():
+    assert is_hallucination("Thank you")
+    assert is_hallucination("thanks for watching")
+    assert is_hallucination("Subscribe")
+    assert is_hallucination("the end")
+
+
+def test_short_text():
+    assert is_hallucination("hi")
+    assert is_hallucination("")
+    assert is_hallucination("a")
+
+
+def test_normal_text():
+    assert not is_hallucination("Hello how are you")
+    assert not is_hallucination("Please open the terminal")
+
+
+def test_case_insensitivity():
+    assert is_hallucination("THANK YOU")
+    assert is_hallucination("Thank You For Watching")
+
+
+def test_substring_match():  # a trigger word inside a longer sentence is still flagged
+    assert is_hallucination("I want to subscribe to your channel")
+
+
+def test_exactly_three_chars():  # a three-character input must not be flagged
+    assert not is_hallucination("hey")
+
+
+# --- transcribe tests ---
+
+def _make_segment(text):
+    seg = MagicMock()
+    seg.text = text
+    return seg
+
+
+def test_transcribe_joins_segments():
+    model = MagicMock()
+    model.transcribe.return_value = (
+        [_make_segment("Hello "), _make_segment("world")],
+        None,
+    )
+    result = transcribe(model, MagicMock())
+    assert result == "Hello world"
+
+
+def test_transcribe_empty():
+    model = MagicMock()
+    model.transcribe.return_value = ([], None)
+    result = transcribe(model, MagicMock())
+    assert result == ""
+
+
+def test_transcribe_strips_whitespace():
+    model = MagicMock()
+    model.transcribe.return_value = (
+        [_make_segment("  hello  ")],
+        None,
+    )
+    result = transcribe(model, MagicMock())
+    assert result == "hello"
+
+
+def test_transcribe_passes_beam_size():
+    model = MagicMock()
+    model.transcribe.return_value = ([], None)
+    audio = MagicMock()
+    transcribe(model, audio)
+    model.transcribe.assert_called_once_with(audio, beam_size=5)
--- a/python/tool-speechtotext/tests/test_vad.py
+++ b/python/tool-speechtotext/tests/test_vad.py
@@ -0,0 +1,151 @@
+from unittest.mock import patch, MagicMock
+from sttlib.vad import VADProcessor, FRAME_DURATION_MS, MIN_UTTERANCE_FRAMES
+
+
+def _make_vad_processor(aggressiveness=3, silence_threshold=0.8):
+    """Create VADProcessor with a mocked webrtcvad.Vad."""
+    with patch("sttlib.vad.webrtcvad.Vad") as mock_vad_cls:
+        mock_vad = MagicMock()
+        mock_vad_cls.return_value = mock_vad
+        proc = VADProcessor(aggressiveness, silence_threshold)
+    return proc, mock_vad
+
+
+def _frame(label="x"):
+    """Return a fake 30ms frame (just needs to be distinct bytes)."""
+    return label.encode() * 960  # 480 samples * 2 bytes
+
+
+def test_no_speech_returns_none():
+    proc, mock_vad = _make_vad_processor()
+    mock_vad.is_speech.return_value = False
+
+    for _ in range(100):
+        assert proc.process_frame(_frame()) is None
+
+
+def test_speech_then_silence_triggers_utterance():
+    proc, mock_vad = _make_vad_processor(silence_threshold=0.3)
+
+    # Feed enough speech frames
+    speech_count = MIN_UTTERANCE_FRAMES + 5
+    mock_vad.is_speech.return_value = True
+    for _ in range(speech_count):
+        result = proc.process_frame(_frame("s"))
+        assert result is None  # not done yet
+
+    # Feed silence frames until threshold (0.3s = 10 frames at 30ms)
+    mock_vad.is_speech.return_value = False
+    result = None
+    for _ in range(20):
+        result = proc.process_frame(_frame("q"))
+        if result is not None:
+            break
+
+    assert result is not None
+    assert len(result) > 0
+
+
+def test_short_utterance_filtered():
+    # Use very short silence threshold so silence frames don't push total
+    # past MIN_UTTERANCE_FRAMES. With threshold=0.09s (3 frames of silence):
+    # 0 pre-buffer + 1 speech + 3 silence = 4 total < MIN_UTTERANCE_FRAMES (10)
+    proc, mock_vad = _make_vad_processor(silence_threshold=0.09)
+
+    # Single speech frame triggers VAD
+    mock_vad.is_speech.return_value = True
+    proc.process_frame(_frame("s"))
+
+    # Immediately go silent — threshold reached in 3 frames
+    mock_vad.is_speech.return_value = False
+    result = None
+    for _ in range(20):
+        result = proc.process_frame(_frame("q"))
+        if result is not None:
+            break
+
+    # Should be filtered (too short — only 4 total frames)
+    assert result is None
+
+
+def test_pre_buffer_included():
+    proc, mock_vad = _make_vad_processor(silence_threshold=0.3)
+
+    # Fill pre-buffer with non-speech frames
+    mock_vad.is_speech.return_value = False
+    pre_frame = _frame("p")
+    for _ in range(10):
+        proc.process_frame(pre_frame)
+
+    # Speech starts
+    mock_vad.is_speech.return_value = True
+    speech_frame = _frame("s")
+    for _ in range(MIN_UTTERANCE_FRAMES):
+        proc.process_frame(speech_frame)
+
+    # Silence to trigger
+    mock_vad.is_speech.return_value = False
+    result = None
+    for _ in range(20):
+        result = proc.process_frame(_frame("q"))
+        if result is not None:
+            break
+
+    assert result is not None
+    # Result should contain pre-buffer frames
+    assert pre_frame in result
+
+
+def test_reset_after_utterance():
+    proc, mock_vad = _make_vad_processor(silence_threshold=0.3)
+
+    # First utterance
+    mock_vad.is_speech.return_value = True
+    for _ in range(MIN_UTTERANCE_FRAMES + 5):
+        proc.process_frame(_frame("s"))
+
+    mock_vad.is_speech.return_value = False
+    for _ in range(20):
+        result = proc.process_frame(_frame("q"))
+        if result is not None:
+            break
+    assert result is not None
+
+    # After reset, should be able to collect a second utterance
+    assert not proc.triggered
+    assert proc.utterance_frames == []
+
+    mock_vad.is_speech.return_value = True
+    for _ in range(MIN_UTTERANCE_FRAMES + 5):
+        proc.process_frame(_frame("s"))
+
+    mock_vad.is_speech.return_value = False
+    result2 = None
+    for _ in range(20):
+        result2 = proc.process_frame(_frame("q"))
+        if result2 is not None:
+            break
+    assert result2 is not None
+
+
+def test_silence_threshold_boundary():
+    # Use 0.3s threshold: 0.3 / 0.03 = exactly 10 frames needed
+    threshold = 0.3
+    proc, mock_vad = _make_vad_processor(silence_threshold=threshold)
+
+    # Start with speech
+    mock_vad.is_speech.return_value = True
+    for _ in range(MIN_UTTERANCE_FRAMES + 5):
+        proc.process_frame(_frame("s"))
+
+    frames_needed = 10  # 0.3s / 0.03s per frame
+    mock_vad.is_speech.return_value = False
+
+    # Feed one less than needed — should NOT trigger
+    for i in range(frames_needed - 1):
+        result = proc.process_frame(_frame("q"))
+        assert result is None, f"Triggered too early at frame {i}"
+
+    # The 10th frame should trigger (silence_duration = 0.3 >= 0.3)
+    result = proc.process_frame(_frame("q"))
+    assert result is not None
--- a/python/tool-speechtotext/tests/test_whisper_loader.py
+++ b/python/tool-speechtotext/tests/test_whisper_loader.py
@@ -0,0 +1,37 @@
+from unittest.mock import patch, MagicMock
+from sttlib.whisper_loader import load_whisper_model
+
+
+@patch("sttlib.whisper_loader.WhisperModel")
+def test_gpu_success(mock_cls):
+    mock_model = MagicMock()
+    mock_cls.return_value = mock_model
+
+    result = load_whisper_model("base")
+
+    assert result is mock_model
+    mock_cls.assert_called_once_with("base", device="cuda", compute_type="float16")
+
+
+@patch("sttlib.whisper_loader.WhisperModel")
+def test_gpu_fails_cpu_fallback(mock_cls):
+    mock_model = MagicMock()
+    mock_cls.side_effect = [RuntimeError("no CUDA"), mock_model]
+
+    result = load_whisper_model("base")
+
+    assert result is mock_model
+    assert mock_cls.call_count == 2
+    _, kwargs = mock_cls.call_args
+    assert kwargs == {"device": "cpu", "compute_type": "int8"}
+
+
+@patch("sttlib.whisper_loader.WhisperModel")
+def test_both_fail_propagates(mock_cls):
+    mock_cls.side_effect = RuntimeError("no device")
+
+    try:
+        load_whisper_model("base")
+        assert False, "Should have raised"
+    except RuntimeError:
+        pass