Refactor tool-speechtotext: extract sttlib shared library and add tests

Extract duplicated code (Whisper loading, audio recording, transcription,
and VAD processing) into a reusable sttlib/ package. Rewrite all three
scripts as thin wrappers. Add 24 unit tests with mocked hardware. Fix the
GPU fallback bug in assistant.py and the args.system assignment bug.
This commit is contained in:
local
2026-02-08 00:40:31 +00:00
parent 848681087e
commit 104da381fb
15 changed files with 480 additions and 195 deletions

View File

@@ -1,57 +1,17 @@
import sounddevice as sd
import numpy as np
import argparse
import pyperclip
import requests
import sys
import argparse
from faster_whisper import WhisperModel
import os
os.environ["CT2_CUDA_ALLOW_FP16"] = "1"
from sttlib import load_whisper_model, record_until_enter, transcribe
# --- Configuration ---
MODEL_SIZE = "medium" # Options: "base", "small", "medium", "large-v3"
# NOTE(review): OLLAMA_URL is assigned twice below (old/new lines of a diff
# shown together); the second assignment wins at runtime — same value either way.
OLLAMA_URL = "http://localhost:11434/api/generate" # Default is 11434
OLLAMA_URL = "http://localhost:11434/api/generate"
# Fallback Ollama model used when the caller does not pass one explicitly.
DEFAULT_OLLAMA_MODEL = "qwen3:latest"
# Load Whisper on GPU.
# float16 is faster and uses less VRAM on NVIDIA cards.
print("Loading Whisper model...")
try:
    model = WhisperModel(MODEL_SIZE, device="cuda", compute_type="float16")
except Exception as e:
    print(f"Error loading GPU: {e}")
    print("Falling back to CPU (Check your CUDA/cuDNN installation)")
    # Bug fix: the fallback previously requested device="cuda" again, so a
    # failed GPU load could never actually fall back to the CPU — it just
    # raised the same error a second time. Use the CPU device here.
    model = WhisperModel(MODEL_SIZE, device="cpu", compute_type="int16")
def record_audio(samplerate=16000):
    """Record mono microphone audio until the user presses Enter.

    Blocks on stdin twice: once to start recording and once to stop.

    Args:
        samplerate: Capture rate in Hz. Defaults to 16000, which is what
            Whisper expects; parameterized so other callers can reuse this.

    Returns:
        np.ndarray of shape (n_samples, 1) containing the captured audio.
        Returns an empty (0, 1) float32 array if no frames were captured
        (e.g. the user pressed Enter twice immediately).
    """
    print("\n[READY] Press Enter to START recording...")
    input()
    print("[RECORDING] Press Enter to STOP...")
    chunks = []

    def callback(indata, frames, time, status):
        # Surface over-/underflow warnings without aborting the stream.
        if status:
            print(status, file=sys.stderr)
        chunks.append(indata.copy())

    with sd.InputStream(samplerate=samplerate, channels=1, callback=callback):
        input()

    if not chunks:
        # Bug fix: np.concatenate raises ValueError on an empty list; return
        # an empty buffer instead so callers can treat it as "no speech".
        return np.zeros((0, 1), dtype=np.float32)
    return np.concatenate(chunks, axis=0)
def main():
# 1. Setup Parser
print(f"System active. Model: {DEFAULT_OLLAMA_MODEL}")
parser = argparse.ArgumentParser(description="Whisper + Ollama CLI")
# Known Arguments (Hardcoded logic)
parser.add_argument("--nollm", "-n", action='store_true',
help="turn off llm")
parser.add_argument("--system", "-s", default=None,
@@ -65,30 +25,27 @@ def main():
parser.add_argument(
"--temp", default='0.7', help="temperature")
# 2. Capture "Unknown" arguments
# args = known values, unknown = a list like ['--num_ctx', '4096', '--temp', '0.7']
args, unknown = parser.parse_known_args()
# Convert unknown list to a dictionary for the Ollama 'options' field
# This logic pairs ['--key', 'value'] into {key: value}
extra_options = {}
for i in range(0, len(unknown), 2):
key = unknown[i].lstrip('-') # remove the '--'
key = unknown[i].lstrip('-')
val = unknown[i+1]
# Try to convert numbers to actual ints/floats
try:
val = float(val) if '.' in val else int(val)
except ValueError:
pass
extra_options[key] = val
model = load_whisper_model(args.model_size)
while True:
try:
audio_data = record_audio()
audio_data = record_until_enter()
print("[TRANSCRIBING]...")
segments, _ = model.transcribe(audio_data.flatten(), beam_size=5)
text = "".join([segment.text for segment in segments]).strip()
text = transcribe(model, audio_data.flatten())
if not text:
print("No speech detected. Try again.")
@@ -97,8 +54,7 @@ def main():
print(f"You said: {text}")
pyperclip.copy(text)
if (args.nollm == False):
# Send to Ollama
if not args.nollm:
print(f"[OLLAMA] Thinking...")
payload = {
"model": args.ollama_model,
@@ -108,9 +64,9 @@ def main():
}
if args.system:
payload["system"] = args
response = requests.post(OLLAMA_URL, json=payload)
payload["system"] = args.system
response = requests.post(OLLAMA_URL, json=payload)
result = response.json().get("response", "")
print(f"\nLLM Response:\n{result}\n")
else: