Refactor tool-speechtotext: extract sttlib shared library and add tests
Extract duplicated code (Whisper loading, audio recording, transcription, VAD processing) into reusable sttlib/ package. Rewrite all 3 scripts as thin wrappers. Add 24 unit tests with mocked hardware. Fix GPU fallback bug in assistant.py and args.system assignment bug.
This commit is contained in:
7
python/tool-speechtotext/sttlib/__init__.py
Normal file
7
python/tool-speechtotext/sttlib/__init__.py
Normal file
@@ -0,0 +1,7 @@
|
||||
from sttlib.whisper_loader import load_whisper_model
|
||||
from sttlib.audio import record_until_enter, pcm_bytes_to_float32
|
||||
from sttlib.transcription import transcribe, is_hallucination, HALLUCINATION_PATTERNS
|
||||
from sttlib.vad import (
|
||||
VADProcessor, audio_callback, audio_queue,
|
||||
SAMPLE_RATE, CHANNELS, FRAME_DURATION_MS, FRAME_SIZE, MIN_UTTERANCE_FRAMES,
|
||||
)
|
||||
28
python/tool-speechtotext/sttlib/audio.py
Normal file
28
python/tool-speechtotext/sttlib/audio.py
Normal file
@@ -0,0 +1,28 @@
|
||||
import sys
|
||||
import numpy as np
|
||||
import sounddevice as sd
|
||||
|
||||
|
||||
def record_until_enter(sample_rate=16000):
    """Record microphone audio until the user presses Enter.

    Blocks on stdin: the first Enter starts capture, the second stops it.

    Args:
        sample_rate: capture rate in Hz (default 16000, what Whisper expects).

    Returns:
        float32 numpy array of shape (n_frames, 1); an empty (0, 1) array if
        the stream delivered no frames before the user stopped.
    """
    print("\n[READY] Press Enter to START recording...")
    input()
    print("[RECORDING] Press Enter to STOP...")

    recording = []

    def callback(indata, frames, time, status):
        # Surface over/underruns on stderr but keep capturing.
        if status:
            print(status, file=sys.stderr)
        # Copy: sounddevice reuses the indata buffer between callbacks.
        recording.append(indata.copy())

    with sd.InputStream(samplerate=sample_rate, channels=1, callback=callback):
        input()

    # BUGFIX: np.concatenate raises ValueError on an empty list, which
    # happened when Enter was pressed again before any frames arrived.
    if not recording:
        return np.zeros((0, 1), dtype=np.float32)
    return np.concatenate(recording, axis=0)
||||
def pcm_bytes_to_float32(pcm_bytes):
    """Convert raw 16-bit PCM bytes to float32 array normalized to [-1, 1]."""
    # int16 full scale is 32768, so scaling by 2**-15 maps onto [-1, 1).
    samples = np.frombuffer(pcm_bytes, dtype=np.int16).astype(np.float32)
    return samples * np.float32(1.0 / 32768.0)
19
python/tool-speechtotext/sttlib/transcription.py
Normal file
19
python/tool-speechtotext/sttlib/transcription.py
Normal file
@@ -0,0 +1,19 @@
|
||||
# Canned phrases Whisper is known to invent on silence or noise
# (artifacts of its video-heavy training data).
HALLUCINATION_PATTERNS = [
    "thank you", "thanks for watching", "subscribe",
    "bye", "the end", "thank you for watching",
    "please subscribe", "like and subscribe",
]


def transcribe(model, audio_float32):
    """Transcribe audio using Whisper. Returns stripped text."""
    segments, _info = model.transcribe(audio_float32, beam_size=5)
    text_parts = [segment.text for segment in segments]
    return "".join(text_parts).strip()


def is_hallucination(text):
    """Return True if text looks like a Whisper hallucination."""
    candidate = text.lower().strip()
    # Anything under three characters is noise, not a real utterance.
    if len(candidate) < 3:
        return True
    for pattern in HALLUCINATION_PATTERNS:
        if pattern in candidate:
            return True
    return False
58
python/tool-speechtotext/sttlib/vad.py
Normal file
58
python/tool-speechtotext/sttlib/vad.py
Normal file
@@ -0,0 +1,58 @@
|
||||
import sys
|
||||
import queue
|
||||
import collections
|
||||
import webrtcvad
|
||||
|
||||
# Capture / framing parameters shared by all VAD-based scripts.
SAMPLE_RATE = 16000
CHANNELS = 1
FRAME_DURATION_MS = 30
FRAME_SIZE = int(SAMPLE_RATE * FRAME_DURATION_MS / 1000)  # 480 samples
MIN_UTTERANCE_FRAMES = 10  # ~300ms minimum to filter coughs/clicks

# Module-level queue bridging the sounddevice callback thread and the
# consumer loop that feeds frames into a VADProcessor.
audio_queue = queue.Queue()


def audio_callback(indata, frames, time_info, status):
    """sounddevice callback that pushes raw bytes to the audio queue."""
    if status:
        # Report over/underruns on stderr without interrupting capture.
        print(status, file=sys.stderr)
    audio_queue.put(bytes(indata))


class VADProcessor:
    """Accumulates 30ms frames into complete utterances using WebRTC VAD.

    Feed consecutive frames to process_frame(); it returns the raw bytes of
    a finished utterance once `silence_threshold` seconds of continuous
    silence follow detected speech, and None otherwise.
    """

    def __init__(self, aggressiveness, silence_threshold):
        """
        Args:
            aggressiveness: webrtcvad mode 0-3 (3 filters non-speech hardest).
            silence_threshold: seconds of continuous silence that end an
                utterance.
        """
        self.vad = webrtcvad.Vad(aggressiveness)
        self.silence_threshold = silence_threshold
        self.reset()

    def reset(self):
        """Clear all per-utterance state (also called between utterances)."""
        self.triggered = False  # True while inside an utterance
        self.utterance_frames = []  # raw frames collected for the utterance
        self.silence_duration = 0.0  # trailing silence seen so far, seconds
        self.pre_buffer = collections.deque(maxlen=10)  # ~300ms pre-roll

    def process_frame(self, frame_bytes):
        """Process one 30ms frame. Returns utterance bytes when complete, else None."""
        is_speech = self.vad.is_speech(frame_bytes, SAMPLE_RATE)

        if not self.triggered:
            # Idle: keep a rolling pre-roll so the speech onset that
            # triggers detection isn't clipped off.
            self.pre_buffer.append(frame_bytes)
            if is_speech:
                self.triggered = True
                self.silence_duration = 0.0
                # BUGFIX: the triggering frame is already the last element of
                # pre_buffer (appended just above); the old code appended it
                # to utterance_frames a second time, duplicating 30ms of
                # audio at the start of every utterance.
                self.utterance_frames = list(self.pre_buffer)
        else:
            self.utterance_frames.append(frame_bytes)
            if is_speech:
                self.silence_duration = 0.0
            else:
                self.silence_duration += FRAME_DURATION_MS / 1000.0
                if self.silence_duration >= self.silence_threshold:
                    # NOTE(review): the minimum-length check counts silence
                    # and pre-roll frames too, so with large thresholds it
                    # rarely fires — confirm intended semantics.
                    if len(self.utterance_frames) < MIN_UTTERANCE_FRAMES:
                        self.reset()
                        return None
                    result = b"".join(self.utterance_frames)
                    self.reset()
                    return result
        return None
15
python/tool-speechtotext/sttlib/whisper_loader.py
Normal file
15
python/tool-speechtotext/sttlib/whisper_loader.py
Normal file
@@ -0,0 +1,15 @@
|
||||
import os
|
||||
from faster_whisper import WhisperModel
|
||||
|
||||
os.environ["CT2_CUDA_ALLOW_FP16"] = "1"
|
||||
|
||||
|
||||
def load_whisper_model(model_size):
|
||||
"""Load Whisper with GPU (cuda/float16) -> CPU (cpu/int8) fallback."""
|
||||
print(f"Loading Whisper model ({model_size})...")
|
||||
try:
|
||||
return WhisperModel(model_size, device="cuda", compute_type="float16")
|
||||
except Exception as e:
|
||||
print(f"GPU loading failed: {e}")
|
||||
print("Falling back to CPU (int8)")
|
||||
return WhisperModel(model_size, device="cpu", compute_type="int8")
|
||||
Reference in New Issue
Block a user