import numpy as np
import pyperclip
import requests
import sounddevice as sd
from faster_whisper import WhisperModel

# Configuration
MODEL_SIZE = "base"  # "base" is fast, "small" is more accurate
# NOTE(review): Ollama's default port is 11434 — confirm 11435 is intentional.
OLLAMA_URL = "http://localhost:11435/api/generate"
OLLAMA_MODEL = "llama3"

# Load Whisper on GPU once at import time so the loop doesn't re-load it.
model = WhisperModel(MODEL_SIZE, device="cuda", compute_type="default")


def record_audio():
    """Record mono 16 kHz audio between two Enter presses.

    Returns:
        np.ndarray: float32 samples of shape (frames, 1); an empty
        (0, 1) array if nothing was captured before the second Enter.
    """
    fs = 16000  # sample rate — matches what Whisper expects
    print("\n--- Press Enter to START recording ---")
    input()
    print("Recording... Press Enter to STOP.")

    chunks = []

    def callback(indata, frames, time, status):
        # Must copy: sounddevice reuses the buffer between callbacks.
        chunks.append(indata.copy())

    with sd.InputStream(samplerate=fs, channels=1, callback=callback):
        input()

    if not chunks:
        # Guard: np.concatenate raises ValueError on an empty sequence
        # (possible if Enter is pressed before the first callback fires).
        return np.empty((0, 1), dtype=np.float32)
    return np.concatenate(chunks, axis=0)


def main():
    """Loop forever: record speech, transcribe, copy text, query Ollama."""
    while True:
        audio_data = record_audio()
        if audio_data.size == 0:
            print("No audio captured; try again.")
            continue

        # Transcribe; segments is a lazy generator consumed by the join.
        segments, _ = model.transcribe(audio_data.flatten(), beam_size=5)
        text = "".join(segment.text for segment in segments).strip()
        print(f"You said: {text}")
        pyperclip.copy(text)  # Copies to clipboard automatically

        # Send to Ollama. A timeout plus narrow exception handling keeps a
        # down or hung server from blocking/killing the loop forever.
        try:
            response = requests.post(
                OLLAMA_URL,
                json={
                    "model": OLLAMA_MODEL,
                    "prompt": text,
                    "stream": False,
                },
                timeout=120,
            )
            response.raise_for_status()
        except requests.RequestException as exc:
            print(f"Ollama request failed: {exc}")
            continue

        result = response.json().get("response", "")
        print(f"\nLLM Response:\n{result}\n")


if __name__ == "__main__":
    main()