Refactor tool-speechtotext: extract sttlib shared library and add tests

Extract duplicated code (Whisper loading, audio recording, transcription, VAD processing) into a reusable sttlib/ package. Rewrite all 3 scripts as thin wrappers around it. Add 24 unit tests with mocked hardware. Fix the GPU fallback bug in assistant.py and the args.system assignment bug.
2026-02-08 00:40:31 +00:00
parent 848681087e
commit 104da381fb
15 changed files with 480 additions and 195 deletions
--- a/python/tool-speechtotext/voice_to_terminal.py
+++ b/python/tool-speechtotext/voice_to_terminal.py
@@ -1,31 +1,16 @@
-import sounddevice as sd
-import numpy as np
-import pyperclip
-import sys
 import argparse
-import os
 import subprocess
-import ollama
 import json
-from faster_whisper import WhisperModel
+import ollama
+from sttlib import load_whisper_model, record_until_enter, transcribe

 # --- Configuration ---
-os.environ["CT2_CUDA_ALLOW_FP16"] = "1"
-MODEL_SIZE = "medium"
 OLLAMA_MODEL = "qwen2.5-coder:7b"
 CONFIRM_COMMANDS = True  # Set to False to run commands instantly

-# Load Whisper on GPU
-print("Loading Whisper model...")
-try:
-    model = WhisperModel(MODEL_SIZE, device="cuda", compute_type="float16")
-except Exception as e:
-    print(f"Error loading GPU: {e}, falling back to CPU")
-    model = WhisperModel(MODEL_SIZE, device="cpu", compute_type="int8")

 # --- Terminal Tool ---

-
 def run_terminal_command(command: str):
    """
    Executes a bash command in the Linux terminal.
@@ -33,8 +18,7 @@ def run_terminal_command(command: str):
    """
    if CONFIRM_COMMANDS:
        print(f"\n{'='*40}")
-        print(f"⚠️  AI SUGGESTED: \033[1;32m{command}\033[0m")
-        # Allow user to provide feedback if they say 'n'
+        print(f"\u26a0\ufe0f  AI SUGGESTED: \033[1;32m{command}\033[0m")
        choice = input("   Confirm? [Y/n] or provide feedback: ").strip()

        if choice.lower() == 'n':
@@ -57,22 +41,15 @@ def run_terminal_command(command: str):
        return f"Execution Error: {str(e)}"


-def record_audio():
-    fs, recording = 16000, []
-    print("\n[READY] Press Enter to START...")
-    input()
-    print("[RECORDING] Press Enter to STOP...")
-    def cb(indata, f, t, s): recording.append(indata.copy())
-    with sd.InputStream(samplerate=fs, channels=1, callback=cb):
-        input()
-    return np.concatenate(recording, axis=0)
-
-
 def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default=OLLAMA_MODEL)
+    parser.add_argument("--model-size", default="medium",
+                        help="Whisper model size")
    args, _ = parser.parse_known_args()

+    whisper_model = load_whisper_model(args.model_size)
+
    # Initial System Prompt
    messages = [{
        'role': 'system',
@@ -88,9 +65,8 @@ def main():
    while True:
        try:
            # 1. Voice Capture
-            audio_data = record_audio()
-            segments, _ = model.transcribe(audio_data.flatten(), beam_size=5)
-            user_text = "".join([s.text for s in segments]).strip()
+            audio_data = record_until_enter()
+            user_text = transcribe(whisper_model, audio_data.flatten())
            if not user_text:
                continue