Refactor tool-speechtotext: extract sttlib shared library and add tests

Extract duplicated code (Whisper loading, audio recording, transcription, VAD processing) into reusable sttlib/ package. Rewrite all 3 scripts as thin wrappers. Add 24 unit tests with mocked hardware. Fix GPU fallback bug in assistant.py and args.system assignment bug.
2026-02-08 00:40:31 +00:00
parent 848681087e
commit 104da381fb
15 changed files with 480 additions and 195 deletions
--- a/python/tool-speechtotext/sttlib/vad.py
+++ b/python/tool-speechtotext/sttlib/vad.py
@@ -0,0 +1,58 @@
+import sys
+import queue
+import collections
+import webrtcvad
+
+SAMPLE_RATE = 16000  # Hz; also the rate handed to the VAD's is_speech() call
+CHANNELS = 1
+FRAME_DURATION_MS = 30
+FRAME_SIZE = int(SAMPLE_RATE * FRAME_DURATION_MS / 1000)  # 480 samples
+MIN_UTTERANCE_FRAMES = 10  # ~300ms minimum to filter coughs/clicks (counts every buffered frame)
+
+audio_queue = queue.Queue()  # filled by audio_callback with one bytes object per captured block
+
+
+def audio_callback(indata, frames, time_info, status):
+    """sounddevice callback: print any status to stderr, then queue indata as raw bytes."""
+    if status:
+        print(status, file=sys.stderr)
+    audio_queue.put(bytes(indata))
+
+
+class VADProcessor:
+    """Group 30ms frames into utterances using webrtcvad's per-frame speech decision."""
+
+    def __init__(self, aggressiveness, silence_threshold):
+        # aggressiveness -> webrtcvad.Vad; silence_threshold = seconds of non-speech ending an utterance
+        self.vad = webrtcvad.Vad(aggressiveness)
+        self.silence_threshold = silence_threshold
+        self.reset()
+
+    def reset(self):
+        """Drop any partial utterance and return to the idle (untriggered) state."""
+        self.triggered = False
+        self.utterance_frames = []
+        self.silence_duration = 0.0
+        self.pre_buffer = collections.deque(maxlen=10)  # ~300ms pre-roll
+
+    def process_frame(self, frame_bytes):
+        """Process one 30ms frame. Returns utterance bytes when complete, else None."""
+        is_speech = self.vad.is_speech(frame_bytes, SAMPLE_RATE)
+        if not self.triggered:
+            self.pre_buffer.append(frame_bytes)
+            if is_speech:
+                # pre_buffer already ends with this frame, so copy it as-is (no second append).
+                self.triggered = True
+                self.silence_duration = 0.0
+                self.utterance_frames = list(self.pre_buffer)
+            return None
+        self.utterance_frames.append(frame_bytes)
+        if is_speech:
+            self.silence_duration = 0.0
+            return None
+        self.silence_duration += FRAME_DURATION_MS / 1000.0
+        if self.silence_duration < self.silence_threshold:
+            return None
+        frames = self.utterance_frames  # keep the list; reset() rebinds the attribute
+        self.reset()
+        return b"".join(frames) if len(frames) >= MIN_UTTERANCE_FRAMES else None  # too short