"""Push-to-talk voice assistant.

Records microphone audio on Enter, transcribes it locally with
faster-whisper, copies the transcript to the clipboard, and sends it
to a local Ollama server for a response.
"""

import os
import sys

import sounddevice as sd
import numpy as np
import pyperclip
import requests
from faster_whisper import WhisperModel

# Allow CTranslate2 to use fp16 kernels on CUDA before the model loads.
os.environ["CT2_CUDA_ALLOW_FP16"] = "1"

# --- Configuration ---
MODEL_SIZE = "medium"  # Options: "base", "small", "medium", "large-v3"
OLLAMA_URL = "http://localhost:11434/api/generate"  # Default is 11434
OLLAMA_MODEL = "qwen3:latest"
OLLAMA_TIMEOUT = 120  # seconds; local LLM generation can be slow

# Load Whisper on GPU.
# float16 is faster and uses less VRAM on NVIDIA cards.
print("Loading Whisper model...")
try:
    model = WhisperModel(MODEL_SIZE, device="cuda", compute_type="float16")
except Exception as e:
    print(f"Error loading GPU: {e}")
    print("Falling back to CPU (Check your CUDA/cuDNN installation)")
    # BUG FIX: the fallback previously requested device="cuda" again, so it
    # could never actually fall back. Use the CPU with int8 quantization,
    # the recommended CPU compute type for CTranslate2.
    model = WhisperModel(MODEL_SIZE, device="cpu", compute_type="int8")


def record_audio():
    """Record mono 16 kHz audio between two Enter presses.

    Returns:
        np.ndarray of shape (frames, 1) with the captured samples.
    """
    fs = 16000  # Whisper expects 16 kHz input
    print("\n[READY] Press Enter to START recording...")
    input()
    print("[RECORDING] Press Enter to STOP...")

    recording = []

    def callback(indata, frames, time, status):
        # Report over/underruns but keep capturing.
        if status:
            print(status, file=sys.stderr)
        recording.append(indata.copy())

    # Stream runs in the background until the user presses Enter again.
    with sd.InputStream(samplerate=fs, channels=1, callback=callback):
        input()

    return np.concatenate(recording, axis=0)


def main():
    """Main loop: record -> transcribe -> clipboard -> Ollama -> print."""
    # FIX: this f-string was split across physical lines (a syntax error);
    # rejoined into a single valid statement.
    print(f"System active. Model: {OLLAMA_MODEL}")
    while True:
        try:
            audio_data = record_audio()

            print("[TRANSCRIBING]...")
            segments, _ = model.transcribe(audio_data.flatten(), beam_size=5)
            text = "".join([segment.text for segment in segments]).strip()

            if not text:
                print("No speech detected. Try again.")
                continue

            print(f"You said: {text}")
            pyperclip.copy(text)

            # Send to Ollama. A timeout prevents a hung server from
            # blocking the loop forever; HTTP errors surface via the
            # generic handler below.
            print("[OLLAMA] Thinking...")
            response = requests.post(
                OLLAMA_URL,
                json={
                    "model": OLLAMA_MODEL,
                    "prompt": text,
                    "stream": False,
                },
                timeout=OLLAMA_TIMEOUT,
            )
            response.raise_for_status()

            result = response.json().get("response", "")
            print(f"\nLLM Response:\n{result}\n")

        except KeyboardInterrupt:
            print("\nExiting...")
            break
        except Exception as e:
            # Best-effort loop: report and keep listening.
            print(f"An error occurred: {e}")


if __name__ == "__main__":
    main()