"""Voice-to-LLM command-line tool.

Records microphone audio between two Enter presses, transcribes it with
faster-whisper, copies the transcript to the clipboard and, unless disabled
with ``--nollm``, sends it to a local Ollama server and prints the reply.
"""

import argparse
import os
import sys

import numpy as np
import pyperclip
import requests
import sounddevice as sd
from faster_whisper import WhisperModel

os.environ["CT2_CUDA_ALLOW_FP16"] = "1"

# --- Configuration ---
MODEL_SIZE = "medium"  # Options: "base", "small", "medium", "large-v3"
OLLAMA_URL = "http://localhost:11434/api/generate"  # Default port is 11434
DEFAULT_OLLAMA_MODEL = "qwen3:latest"


def _load_whisper_model(model_size):
    """Load a Whisper model on the GPU, falling back to the CPU on failure.

    Args:
        model_size: Whisper model name, e.g. "base", "small", "medium".

    Returns:
        A ready-to-use ``WhisperModel`` instance.
    """
    try:
        # float16 is faster and uses less VRAM on NVIDIA cards
        return WhisperModel(model_size, device="cuda", compute_type="float16")
    except Exception as e:
        print(f"Error loading GPU: {e}")
        print("Falling back to CPU (Check your CUDA/cuDNN installation)")
        # The fallback must really target the CPU; asking for "cuda" again
        # would fail exactly like the first attempt did.
        return WhisperModel(model_size, device="cpu", compute_type="int8")


print("Loading Whisper model...")
model = _load_whisper_model(MODEL_SIZE)


def record_audio():
    """Record mono microphone audio between two presses of Enter.

    Returns:
        A NumPy array with one column holding every captured block
        concatenated along axis 0. The array is empty (shape ``(0, 1)``)
        when the stream delivered no audio before recording was stopped.
    """
    fs = 16000
    print("\n[READY] Press Enter to START recording...")
    input()
    print("[RECORDING] Press Enter to STOP...")
    recording = []

    def callback(indata, frames, time, status):
        if status:
            print(status, file=sys.stderr)
        # Copy the block so it can be kept after the callback returns.
        recording.append(indata.copy())

    with sd.InputStream(samplerate=fs, channels=1, callback=callback):
        input()

    if not recording:
        # np.concatenate raises ValueError on an empty list.
        return np.zeros((0, 1), dtype=np.float32)
    return np.concatenate(recording, axis=0)


def _coerce_value(raw):
    """Convert a numeric-looking string to int/float, else return it as is."""
    try:
        return float(raw) if "." in raw else int(raw)
    except ValueError:
        return raw


def _parse_extra_options(unknown):
    """Turn unrecognised CLI tokens into a dict for Ollama's 'options' field.

    Accepts ``--key value`` and ``--key=value``. A key that has no value
    (last token, or directly followed by another ``--key``) is stored as
    ``True`` instead of raising IndexError.

    Args:
        unknown: The leftover token list from ``parse_known_args``.

    Returns:
        A dict mapping option names (leading dashes removed) to values,
        with numeric strings converted to int or float.
    """
    options = {}
    i = 0
    while i < len(unknown):
        key, has_inline, inline_value = unknown[i].lstrip("-").partition("=")
        i += 1
        if has_inline:
            options[key] = _coerce_value(inline_value)
        elif i < len(unknown) and not unknown[i].startswith("--"):
            # Only "--" marks a new key, so negative numbers stay values.
            options[key] = _coerce_value(unknown[i])
            i += 1
        else:
            options[key] = True
    return options


def _query_ollama(prompt, model_name, options, system=None):
    """Send one non-streaming generate request to Ollama and return the text.

    Raises:
        requests.RequestException: on connection problems or an HTTP error
            status returned by the server.
    """
    payload = {
        "model": model_name,
        "prompt": prompt,
        "stream": False,
        "options": options,
    }
    if system:
        payload["system"] = system
    response = requests.post(OLLAMA_URL, json=payload)
    # Surface HTTP errors instead of silently printing an empty response.
    response.raise_for_status()
    return response.json().get("response", "")


def main():
    """Parse CLI arguments, then loop: record, transcribe, optionally ask Ollama."""
    # 1. Setup Parser
    parser = argparse.ArgumentParser(description="Whisper + Ollama CLI")
    parser.add_argument("--nollm", "-n", action="store_true",
                        help="turn off llm")
    parser.add_argument("--system", "-s", default=None,
                        help="The system prompt for Ollama")
    parser.add_argument("--model_size", default=None,
                        help="Whisper model size: base, small, medium "
                             f"(default: {MODEL_SIZE})")
    parser.add_argument("--ollama_model", default=DEFAULT_OLLAMA_MODEL,
                        help="Ollama model name")
    parser.add_argument("--num_ctx", type=int, default=5000,
                        help="context length")
    parser.add_argument("--temp", type=float, default=0.7,
                        help="temperature")

    # 2. Capture "unknown" arguments, e.g. ['--top_k', '40', '--seed', '1'],
    # and forward them to Ollama as additional options.
    args, unknown = parser.parse_known_args()

    # --num_ctx and --temp are known arguments, so they never show up in
    # `unknown`; add them explicitly. Pass-through options are applied last
    # so an explicit '--temperature 0.2' overrides '--temp'.
    options = {"num_ctx": args.num_ctx, "temperature": args.temp}
    options.update(_parse_extra_options(unknown))

    # Reuse the model loaded at import time unless another size was requested.
    whisper_model = model
    if args.model_size and args.model_size != MODEL_SIZE:
        print("Loading Whisper model...")
        whisper_model = _load_whisper_model(args.model_size)

    print(f"System active. Model: {args.ollama_model}")

    while True:
        try:
            audio_data = record_audio()
            if audio_data.size == 0:
                print("No audio captured. Try again.")
                continue

            print("[TRANSCRIBING]...")
            segments, _ = whisper_model.transcribe(audio_data.flatten(),
                                                   beam_size=5)
            text = "".join(segment.text for segment in segments).strip()
            if not text:
                print("No speech detected. Try again.")
                continue

            print(f"You said: {text}")
            pyperclip.copy(text)

            if args.nollm:
                print(f"\n{text}\n")
                continue

            # Send to Ollama
            print("[OLLAMA] Thinking...")
            result = _query_ollama(text, args.ollama_model, options,
                                   system=args.system)
            print(f"\nLLM Response:\n{result}\n")
        except (KeyboardInterrupt, EOFError):
            # EOFError: stdin was closed; retrying input() would loop forever.
            print("\nExiting...")
            break
        except Exception as e:
            print(f"An error occurred: {e}")


if __name__ == "__main__":
    main()