Refactor tool-speechtotext: extract sttlib shared library and add tests
Extract duplicated code (Whisper loading, audio recording, transcription, VAD processing) into reusable sttlib/ package. Rewrite all 3 scripts as thin wrappers. Add 24 unit tests with mocked hardware. Fix GPU fallback bug in assistant.py and args.system assignment bug.
This commit is contained in:
@@ -1,31 +1,16 @@
|
||||
import sounddevice as sd
|
||||
import numpy as np
|
||||
import pyperclip
|
||||
import sys
|
||||
import argparse
|
||||
import os
|
||||
import subprocess
|
||||
import ollama
|
||||
import json
|
||||
from faster_whisper import WhisperModel
|
||||
import ollama
|
||||
from sttlib import load_whisper_model, record_until_enter, transcribe
|
||||
|
||||
# --- Configuration ---
# Set before any Whisper model is constructed below.
os.environ["CT2_CUDA_ALLOW_FP16"] = "1"

MODEL_SIZE = "medium"                # Whisper checkpoint size passed to WhisperModel
OLLAMA_MODEL = "qwen2.5-coder:7b"    # default for the --model command-line option
CONFIRM_COMMANDS = True  # Set to False to run commands instantly

# --- Whisper model ---
# Try the GPU first (float16); if construction fails for any reason, report
# the error and build the model on the CPU with int8 instead.
print("Loading Whisper model...")
try:
    model = WhisperModel(MODEL_SIZE, device="cuda", compute_type="float16")
except Exception as gpu_error:
    print(f"Error loading GPU: {gpu_error}, falling back to CPU")
    model = WhisperModel(MODEL_SIZE, device="cpu", compute_type="int8")
|
||||
|
||||
# --- Terminal Tool ---
|
||||
|
||||
|
||||
def run_terminal_command(command: str):
|
||||
"""
|
||||
Executes a bash command in the Linux terminal.
|
||||
@@ -33,8 +18,7 @@ def run_terminal_command(command: str):
|
||||
"""
|
||||
if CONFIRM_COMMANDS:
|
||||
print(f"\n{'='*40}")
|
||||
print(f"⚠️ AI SUGGESTED: \033[1;32m{command}\033[0m")
|
||||
# Allow user to provide feedback if they say 'n'
|
||||
print(f"\u26a0\ufe0f AI SUGGESTED: \033[1;32m{command}\033[0m")
|
||||
choice = input(" Confirm? [Y/n] or provide feedback: ").strip()
|
||||
|
||||
if choice.lower() == 'n':
|
||||
@@ -57,22 +41,15 @@ def run_terminal_command(command: str):
|
||||
return f"Execution Error: {str(e)}"
|
||||
|
||||
|
||||
def record_audio():
    """Record mono microphone audio between two presses of Enter.

    Prints a prompt, waits for Enter to start, then opens a 16 kHz,
    single-channel input stream and collects every block delivered to the
    callback until Enter is pressed again.

    Returns:
        A NumPy array holding all captured blocks joined along axis 0. When
        no block arrived before recording was stopped, an empty array of
        shape ``(0, 1)`` is returned instead of raising ``ValueError``.
    """
    sample_rate = 16000
    chunks = []

    print("\n[READY] Press Enter to START...")
    input()
    print("[RECORDING] Press Enter to STOP...")

    def on_audio(indata, frames, time_info, status):
        # Keep a copy: the block handed to the callback must not be retained
        # past the call (see the sounddevice callback documentation).
        chunks.append(indata.copy())

    # The stream only runs while the `with` block is active; the blocking
    # input() call is what keeps it open until the user presses Enter.
    with sd.InputStream(samplerate=sample_rate, channels=1, callback=on_audio):
        input()

    if not chunks:
        # np.concatenate rejects an empty list, so hand back an empty mono
        # buffer. NOTE(review): float32 assumes sounddevice's default stream
        # dtype - confirm if the dtype is ever overridden.
        return np.zeros((0, 1), dtype=np.float32)

    return np.concatenate(chunks, axis=0)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--model", default=OLLAMA_MODEL)
|
||||
parser.add_argument("--model-size", default="medium",
|
||||
help="Whisper model size")
|
||||
args, _ = parser.parse_known_args()
|
||||
|
||||
whisper_model = load_whisper_model(args.model_size)
|
||||
|
||||
# Initial System Prompt
|
||||
messages = [{
|
||||
'role': 'system',
|
||||
@@ -88,9 +65,8 @@ def main():
|
||||
while True:
|
||||
try:
|
||||
# 1. Voice Capture
|
||||
audio_data = record_audio()
|
||||
segments, _ = model.transcribe(audio_data.flatten(), beam_size=5)
|
||||
user_text = "".join([s.text for s in segments]).strip()
|
||||
audio_data = record_until_enter()
|
||||
user_text = transcribe(whisper_model, audio_data.flatten())
|
||||
if not user_text:
|
||||
continue
|
||||
|
||||
|
||||
Reference in New Issue
Block a user