Refactor tool-speechtotext: extract sttlib shared library and add tests

Extract duplicated code (Whisper loading, audio recording, transcription, VAD processing) into reusable sttlib/ package. Rewrite all 3 scripts as thin wrappers. Add 24 unit tests with mocked hardware. Fix GPU fallback bug in assistant.py and args.system assignment bug.
2026-02-08 00:40:31 +00:00
parent 848681087e
commit 104da381fb
15 changed files with 480 additions and 195 deletions
--- a/python/tool-speechtotext/sttlib/audio.py
+++ b/python/tool-speechtotext/sttlib/audio.py
@@ -0,0 +1,28 @@
+import sys
+import numpy as np
+import sounddevice as sd
+
+
+def record_until_enter(sample_rate=16000):
+    """Record audio until user presses Enter. Returns float32 numpy array."""
+    print("\n[READY] Press Enter to START recording...")
+    input()
+    print("[RECORDING] Press Enter to STOP...")
+
+    recording = []
+
+    def callback(indata, frames, time, status):
+        if status:
+            print(status, file=sys.stderr)
+        recording.append(indata.copy())
+
+    with sd.InputStream(samplerate=sample_rate, channels=1, callback=callback):
+        input()
+
+    return np.concatenate(recording, axis=0)
+
+
+def pcm_bytes_to_float32(pcm_bytes):
+    """Convert raw 16-bit PCM bytes to float32 array normalized to [-1, 1]."""
+    audio_int16 = np.frombuffer(pcm_bytes, dtype=np.int16)
+    return audio_int16.astype(np.float32) / 32768.0