"""Push-to-talk voice assistant.

Records microphone audio on Enter, transcribes it locally with
faster-whisper, copies the transcript to the clipboard, and sends it
to a local Ollama server for a response.
"""

import os
import sys

import sounddevice as sd
import numpy as np
import pyperclip
import requests
from faster_whisper import WhisperModel

# Allow CTranslate2 to use fp16 kernels on CUDA before the model loads.
os.environ["CT2_CUDA_ALLOW_FP16"] = "1"

# --- Configuration ---
MODEL_SIZE = "medium"  # Options: "base", "small", "medium", "large-v3"
OLLAMA_URL = "http://localhost:11434/api/generate"  # Default is 11434
OLLAMA_MODEL = "qwen3:latest"
OLLAMA_TIMEOUT = 120  # seconds; local LLM generation can be slow

# Load Whisper on GPU.
# float16 is faster and uses less VRAM on NVIDIA cards.
print("Loading Whisper model...")
try:
    model = WhisperModel(MODEL_SIZE, device="cuda", compute_type="float16")
except Exception as e:
    print(f"Error loading GPU: {e}")
    print("Falling back to CPU (Check your CUDA/cuDNN installation)")
    # BUG FIX: the fallback previously requested device="cuda" again, so it
    # could never actually fall back. Use the CPU with int8 quantization,
    # the recommended CPU compute type for CTranslate2.
    model = WhisperModel(MODEL_SIZE, device="cpu", compute_type="int8")


def record_audio():
    """Record mono 16 kHz audio between two Enter presses.

    Returns:
        np.ndarray of shape (frames, 1) with the captured samples.
    """
    fs = 16000  # Whisper expects 16 kHz input
    print("\n[READY] Press Enter to START recording...")
    input()
    print("[RECORDING] Press Enter to STOP...")

    recording = []

    def callback(indata, frames, time, status):
        # Report over/underruns but keep capturing.
        if status:
            print(status, file=sys.stderr)
        recording.append(indata.copy())

    # Stream runs in the background until the user presses Enter again.
    with sd.InputStream(samplerate=fs, channels=1, callback=callback):
        input()

    return np.concatenate(recording, axis=0)


def main():
    """Main loop: record -> transcribe -> clipboard -> Ollama -> print."""
    # FIX: this f-string was split across physical lines (a syntax error);
    # rejoined into a single valid statement.
    print(f"System active. Model: {OLLAMA_MODEL}")
    while True:
        try:
            audio_data = record_audio()

            print("[TRANSCRIBING]...")
            segments, _ = model.transcribe(audio_data.flatten(), beam_size=5)
            text = "".join([segment.text for segment in segments]).strip()

            if not text:
                print("No speech detected. Try again.")
                continue

            print(f"You said: {text}")
            pyperclip.copy(text)

            # Send to Ollama. A timeout prevents a hung server from
            # blocking the loop forever; HTTP errors surface via the
            # generic handler below.
            print("[OLLAMA] Thinking...")
            response = requests.post(
                OLLAMA_URL,
                json={
                    "model": OLLAMA_MODEL,
                    "prompt": text,
                    "stream": False,
                },
                timeout=OLLAMA_TIMEOUT,
            )
            response.raise_for_status()

            result = response.json().get("response", "")
            print(f"\nLLM Response:\n{result}\n")

        except KeyboardInterrupt:
            print("\nExiting...")
            break
        except Exception as e:
            # Best-effort loop: report and keep listening.
            print(f"An error occurred: {e}")


if __name__ == "__main__":
    main()