import sounddevice as sd
import numpy as np
import pyperclip
import requests
import sys
from faster_whisper import WhisperModel

import os
# Set the CTranslate2 FP16 opt-in flag before the Whisper model is created
# further down in this file.
# NOTE(review): per the CTranslate2 docs this allows FP16 on GPUs that lack
# efficient FP16 support -- confirm it is still needed for the target card.
os.environ["CT2_CUDA_ALLOW_FP16"] = "1"

# --- Configuration ---
# Whisper checkpoint passed to WhisperModel when the model is loaded.
MODEL_SIZE = "medium"  # Options: "base", "small", "medium", "large-v3"
# Endpoint that receives the transcribed text as a prompt.
OLLAMA_URL = "http://localhost:11434/api/generate"  # 11434 is Ollama's default port
# Model name sent in the "model" field of every Ollama request.
OLLAMA_MODEL = "qwen3:latest"

# Load Whisper, preferring the GPU.
# float16 is faster and uses less VRAM on NVIDIA cards.
print("Loading Whisper model...")
try:
    model = WhisperModel(MODEL_SIZE, device="cuda", compute_type="float16")
except Exception as e:
    print(f"Error loading GPU: {e}")
    print("Falling back to CPU (Check your CUDA/cuDNN installation)")
    # The fallback must really target the CPU: retrying on "cuda" would fail
    # for the same reason as the first attempt. int8 is the quantized compute
    # type recommended for CPU inference.
    model = WhisperModel(MODEL_SIZE, device="cpu", compute_type="int8")

def record_audio():
    """Record mono microphone audio between two presses of Enter.

    Blocks on stdin twice: the first Enter opens the input stream, the second
    closes it. Audio blocks delivered by the stream callback are collected
    and joined into a single array.

    Returns:
        numpy.ndarray: float32 samples captured at 16 kHz, stacked along
        axis 0 (one column, since a single channel is requested). If the
        stream was stopped before any block was delivered, an empty array of
        shape ``(0, 1)`` is returned instead of raising.

    Raises:
        EOFError: If stdin is closed while waiting for Enter.
        sounddevice.PortAudioError: If the input stream cannot be opened.
    """
    fs = 16000
    print("\n[READY] Press Enter to START recording...")
    input()
    print("[RECORDING] Press Enter to STOP...")

    chunks = []

    def callback(indata, frames, time_info, status):
        # Runs on the audio thread; report over/underflows without aborting.
        if status:
            print(status, file=sys.stderr)
        # The stream reuses its buffer, so keep a private copy of each block.
        chunks.append(indata.copy())

    # dtype is spelled out so the sample format does not depend on library
    # defaults.
    with sd.InputStream(samplerate=fs, channels=1, dtype="float32",
                        callback=callback):
        input()

    # np.concatenate rejects an empty list; Enter can be pressed before the
    # first block arrives.
    if not chunks:
        return np.zeros((0, 1), dtype=np.float32)
    return np.concatenate(chunks, axis=0)

def _ask_ollama(prompt, timeout=(5, 600)):
    """Send *prompt* to the Ollama generate endpoint and return the reply text.

    Args:
        prompt: Text forwarded as the "prompt" field of the request.
        timeout: ``(connect, read)`` timeout in seconds passed to requests.
            The read timeout is generous because generation is not streamed.

    Returns:
        The "response" field of the JSON reply, or "" if it is missing.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx HTTP status.
    """
    response = requests.post(
        OLLAMA_URL,
        json={
            "model": OLLAMA_MODEL,
            "prompt": prompt,
            "stream": False,
        },
        timeout=timeout,
    )
    # Surface HTTP errors (e.g. unknown model) instead of printing an empty
    # answer.
    response.raise_for_status()
    return response.json().get("response", "")


def main():
    """Run the interactive record -> transcribe -> ask-LLM loop.

    Each iteration records audio, transcribes it with the module-level
    Whisper ``model``, copies the transcript to the clipboard, sends it to
    Ollama, and prints the reply. Errors inside an iteration are reported and
    the loop continues. Ctrl+C or a closed stdin ends the loop.
    """
    print(f"System active. Model: {OLLAMA_MODEL}")
    while True:
        try:
            audio_data = record_audio()
            if audio_data.size == 0:
                print("No audio captured. Try again.")
                continue

            print("[TRANSCRIBING]...")
            segments, _ = model.transcribe(audio_data.flatten(), beam_size=5)
            text = "".join(segment.text for segment in segments).strip()

            if not text:
                print("No speech detected. Try again.")
                continue

            print(f"You said: {text}")
            pyperclip.copy(text)

            # Send to Ollama
            print("[OLLAMA] Thinking...")
            result = _ask_ollama(text)
            print(f"\nLLM Response:\n{result}\n")

        except (KeyboardInterrupt, EOFError):
            # EOFError means stdin is closed; retrying input() would spin
            # forever, so treat it like Ctrl+C.
            print("\nExiting...")
            break
        except Exception as e:
            # Top-level boundary: report the failure and keep the session
            # alive.
            print(f"An error occurred: {e}")

# Start the interactive loop only when this file is executed as a script.
if __name__ == "__main__":
    main()