Command-line app: STT (speech-to-text) piped to a local LLM

This commit is contained in:
dl92
2026-01-14 00:20:56 +00:00
parent f09b390d90
commit 781659a552
2 changed files with 77 additions and 23 deletions

View File

@@ -3,6 +3,7 @@ import numpy as np
import pyperclip
import requests
import sys
import argparse
from faster_whisper import WhisperModel
import os
os.environ["CT2_CUDA_ALLOW_FP16"] = "1"

# --- Configuration ---
MODEL_SIZE = "medium"  # Options: "base", "small", "medium", "large-v3"
OLLAMA_URL = "http://localhost:11434/api/generate"  # Default is 11434
DEFAULT_OLLAMA_MODEL = "qwen3:latest"  # Overridable at runtime via --ollama_model
OLLAMA_MODEL = DEFAULT_OLLAMA_MODEL  # Legacy alias kept for backward compatibility
# Load Whisper on the GPU; float16 is faster and uses less VRAM on NVIDIA cards.
print("Loading Whisper model...")
try:
    model = WhisperModel(MODEL_SIZE, device="cuda", compute_type="float16")
except Exception as e:
    print(f"Could not load model on GPU: {e}")
    print("Falling back to CPU (Check your CUDA/cuDNN installation)")
    # BUG FIX: the fallback previously still requested device="cuda" (with
    # compute_type="int16"), so the except branch hit the exact same GPU
    # failure it was handling. Use the CPU with int8, faster-whisper's
    # recommended CPU compute type.
    model = WhisperModel(MODEL_SIZE, device="cpu", compute_type="int8")
def record_audio():
    """Record mono audio between two Enter key presses.

    Returns:
        A numpy array of shape (n_samples, 1) with the captured samples
        (presumably float32, sounddevice's default dtype -- TODO confirm).
    """
    fs = 16000  # Whisper models expect 16 kHz input
    print("\n[READY] Press Enter to START recording...")
    input()
    print("[RECORDING] Press Enter to STOP...")
    recording = []

    def callback(indata, frames, time, status):
        # Accumulate each incoming buffer; surface any stream problems on stderr.
        if status:
            print(status, file=sys.stderr)
        # NOTE(review): this line was lost to a diff-hunk header in the source
        # view; a copy of each buffer must be kept, since sounddevice reuses it.
        recording.append(indata.copy())

    # Stream runs until the user presses Enter again.
    with sd.InputStream(samplerate=fs, channels=1, callback=callback):
        input()
    return np.concatenate(recording, axis=0)
def main():
    """Loop forever: record speech, transcribe it, copy to clipboard, and
    optionally send the transcript to a local Ollama instance."""
    # 1. Set up the argument parser.
    parser = argparse.ArgumentParser(description="Whisper + Ollama CLI")
    parser.add_argument("--nollm", "-n", action="store_true",
                        help="turn off llm")
    parser.add_argument("--system", "-s", default=None,
                        help="The system prompt for Ollama")
    # NOTE(review): the Whisper model is loaded at import time from
    # MODEL_SIZE, so this flag is currently unused -- TODO wire it in.
    parser.add_argument("--model_size", default="base",
                        help="Whisper model size: base, small, medium")
    parser.add_argument(
        "--ollama_model", default=DEFAULT_OLLAMA_MODEL, help="Ollama model name")
    parser.add_argument(
        "--num_ctx", default='5000', help="context length")
    parser.add_argument(
        "--temp", default='0.7', help="temperature")

    # 2. Capture "unknown" arguments, e.g. ['--top_k', '40', '--seed', '1'],
    # and pair them into a dict for the Ollama 'options' field.
    args, unknown = parser.parse_known_args()
    print(f"System active. Model: {args.ollama_model}")

    extra_options = {}
    # BUG FIX: iterate to len-1 so a trailing unpaired flag no longer
    # raises IndexError on unknown[i + 1].
    for i in range(0, len(unknown) - 1, 2):
        key = unknown[i].lstrip('-')  # remove the '--'
        val = unknown[i + 1]
        # Convert numeric strings to real ints/floats for the JSON payload.
        try:
            val = float(val) if '.' in val else int(val)
        except ValueError:
            pass
        extra_options[key] = val
    # BUG FIX: --num_ctx/--temp are *known* args, so argparse consumes them
    # and they never appear in `unknown`; they were silently dropped. Feed
    # them into the options dict (Ollama's option name for --temp is
    # "temperature"). setdefault keeps any explicit unknown-arg override.
    extra_options.setdefault("num_ctx", int(args.num_ctx))
    extra_options.setdefault("temperature", float(args.temp))

    while True:
        try:
            audio_data = record_audio()
            print("[TRANSCRIBING]...")
            segments, _ = model.transcribe(audio_data.flatten(), beam_size=5)
            text = "".join(segment.text for segment in segments).strip()
            if not text:
                print("No speech detected. Try again.")
                continue
            # print(f"You said: {text}")
            pyperclip.copy(text)
            if not args.nollm:
                # Send the transcript to Ollama.
                print("[OLLAMA] Thinking...")
                payload = {
                    "model": args.ollama_model,
                    "prompt": text,
                    "stream": False,
                    "options": extra_options,
                }
                if args.system:
                    # BUG FIX: previously assigned the whole argparse
                    # Namespace (payload["system"] = args) instead of the
                    # system-prompt string, producing a non-serializable
                    # payload / wrong system prompt.
                    payload["system"] = args.system
                response = requests.post(OLLAMA_URL, json=payload)
                result = response.json().get("response", "")
                print(f"\nLLM Response:\n{result}\n")
            else:
                # LLM disabled: just echo the transcript.
                print(f"\n{text}\n")
        except KeyboardInterrupt:
            print("\nExiting...")
            break
        except Exception as e:
            # Best-effort loop: report and keep recording.
            print(f"An error occurred: {e}")


if __name__ == "__main__":
    main()