Whisper + Ollama assistant: add command-line options and a launcher script.

assistant.py
  * Adds an argparse CLI: --nollm/-n, --system/-s, --model_size,
    --ollama_model, --num_ctx, --temp. Any other "--key value" pairs are
    forwarded to Ollama through the request's "options" field.
  * With --nollm the transcription is printed (and copied to the clipboard)
    without calling Ollama.
  * The CPU fallback now actually loads the Whisper model on the CPU.
talk.sh
  * New launcher that runs assistant.py inside the "whisper-ollama"
    mamba environment and forwards all arguments.

NOTE(review): the removed whitespace-only lines and the old inline-comment
spacing were reconstructed; if a hunk is rejected, retry with
"git apply --ignore-whitespace". The blob hash on the "index" line predates
the edits to the added lines and is informational only.

diff --git a/python/tool-speechtotext/assistant.py b/python/tool-speechtotext/assistant.py
index 79eb1df..bd94819 100644
--- a/python/tool-speechtotext/assistant.py
+++ b/python/tool-speechtotext/assistant.py
@@ -3,6 +3,7 @@ import numpy as np
 import pyperclip
 import requests
 import sys
+import argparse
 from faster_whisper import WhisperModel
 import os
 
@@ -10,12 +11,14 @@ os.environ["CT2_CUDA_ALLOW_FP16"] = "1"
 
 # --- Configuration ---
 MODEL_SIZE = "medium" # Options: "base", "small", "medium", "large-v3"
-OLLAMA_URL = "http://localhost:11434/api/generate" # Default is 11434
-OLLAMA_MODEL = "qwen3:latest"
+OLLAMA_URL = "http://localhost:11434/api/generate"  # Default is 11434
+DEFAULT_OLLAMA_MODEL = "qwen3:latest"
 
-# Load Whisper on GPU 
+# Load Whisper on GPU
 # float16 is faster and uses less VRAM on NVIDIA cards
 print("Loading Whisper model...")
+
+
 try:
     model = WhisperModel(MODEL_SIZE, device="cuda", compute_type="float16")
 except Exception as e:
@@ -23,14 +26,15 @@ except Exception as e:
     print("Falling back to CPU (Check your CUDA/cuDNN installation)")
-    model = WhisperModel(MODEL_SIZE, device="cuda", compute_type="int16")
+    model = WhisperModel(MODEL_SIZE, device="cpu", compute_type="int16")
 
+
 def record_audio():
-    fs = 16000 
+    fs = 16000
     print("\n[READY] Press Enter to START recording...")
     input()
     print("[RECORDING] Press Enter to STOP...")
-    
+
     recording = []
-    
+
     def callback(indata, frames, time, status):
         if status:
             print(status, file=sys.stderr)
@@ -38,42 +42,90 @@ def record_audio():
 
     with sd.InputStream(samplerate=fs, channels=1, callback=callback):
         input()
-    
+
     return np.concatenate(recording, axis=0)
 
+
 def main():
-    print(f"System active. Model: {OLLAMA_MODEL}")
+    # 1. Setup Parser
+    parser = argparse.ArgumentParser(description="Whisper + Ollama CLI")
+
+    # Known Arguments (Hardcoded logic)
+    parser.add_argument("--nollm", "-n", action='store_true',
+                        help="turn off llm")
+    parser.add_argument("--system", "-s", default=None,
+                        help="The system prompt for Ollama")
+    # NOTE: --model_size is parsed but not used yet; Whisper is loaded at
+    # import time from the MODEL_SIZE constant, before arguments are read.
+    parser.add_argument("--model_size", default="base",
+                        help="Whisper model size: base, small, medium")
+    parser.add_argument(
+        "--ollama_model", default=DEFAULT_OLLAMA_MODEL, help="Ollama model name")
+    parser.add_argument(
+        "--num_ctx", type=int, default=5000, help="context length")
+    parser.add_argument(
+        "--temp", type=float, default=0.7, help="temperature")
+
+    # 2. Capture "Unknown" arguments
+    # args = known values, unknown = a list like ['--top_k', '40', '--seed', '7']
+    args, unknown = parser.parse_known_args()
+    print(f"System active. Model: {args.ollama_model}")
+
+    # --num_ctx/--temp are declared above, so argparse consumes them and they
+    # never reach `unknown`; seed the Ollama 'options' field with them here.
+    extra_options = {"num_ctx": args.num_ctx, "temperature": args.temp}
+
+    # Pair the remaining ['--key', 'value'] tokens into {key: value}
+    if len(unknown) % 2:
+        parser.error("extra Ollama options must be '--key value' pairs")
+    for flag, val in zip(unknown[::2], unknown[1::2]):
+        # Try to convert numbers to actual ints/floats
+        try:
+            val = float(val) if '.' in val else int(val)
+        except ValueError:
+            pass
+        extra_options[flag.lstrip('-')] = val
+
     while True:
         try:
             audio_data = record_audio()
-            
+
             print("[TRANSCRIBING]...")
             segments, _ = model.transcribe(audio_data.flatten(), beam_size=5)
             text = "".join([segment.text for segment in segments]).strip()
-            
+
             if not text:
                 print("No speech detected. Try again.")
                 continue
 
-            print(f"You said: {text}")
+            # print(f"You said: {text}")
             pyperclip.copy(text)
 
-            # Send to Ollama
-            print(f"[OLLAMA] Thinking...")
-            response = requests.post(OLLAMA_URL, json={
-                "model": OLLAMA_MODEL,
-                "prompt": text,
-                "stream": False
-            })
-            
-            result = response.json().get("response", "")
-            print(f"\nLLM Response:\n{result}\n")
-            
+            if not args.nollm:
+                # Send to Ollama
+                print("[OLLAMA] Thinking...")
+                payload = {
+                    "model": args.ollama_model,
+                    "prompt": text,
+                    "stream": False,
+                    "options": extra_options,
+                }
+
+                if args.system:
+                    payload["system"] = args.system
+                response = requests.post(OLLAMA_URL, json=payload)
+
+                result = response.json().get("response", "")
+                print(f"\nLLM Response:\n{result}\n")
+            else:
+                print(f"\n{text}\n")
+
         except KeyboardInterrupt:
             print("\nExiting...")
             break
         except Exception as e:
             print(f"An error occurred: {e}")
 
+
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/python/tool-speechtotext/talk.sh b/python/tool-speechtotext/talk.sh
new file mode 100755
index 0000000..bd8b81b
--- /dev/null
+++ b/python/tool-speechtotext/talk.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+export CT2_CUDA_ALLOW_FP16=1
+
+# 'mamba run' executes the command within the context of the environment
+# without needing to source .bashrc or shell hooks manually.
+mamba run -n whisper-ollama python ~/family-repo/Code/python/tool-speechtotext/assistant.py "$@"