initial
gemini v1
This commit is contained in:
53
python/tool-speechtotext/assistant.py
Normal file
53
python/tool-speechtotext/assistant.py
Normal file
@@ -0,0 +1,53 @@
|
||||
import sounddevice as sd
|
||||
import numpy as np
|
||||
import pyperclip
|
||||
import requests
|
||||
from faster_whisper import WhisperModel
|
||||
|
||||
# Configuration
MODEL_SIZE = "base" # "base" is fast, "small" is more accurate
# NOTE(review): 11435 is not Ollama's default port (11434) — presumably a
# deliberate custom mapping; confirm against the local Ollama setup.
OLLAMA_URL = "http://localhost:11435/api/generate"
OLLAMA_MODEL = "llama3"

# Load Whisper on GPU
# Loaded once at import time so every transcription reuses the same model.
# Requires a CUDA-capable GPU; this raises at startup if none is available.
model = WhisperModel(MODEL_SIZE, device="cuda", compute_type="default")
|
||||
|
||||
def record_audio():
    """Record microphone audio between two Enter key presses.

    Blocks on stdin: the first Enter starts a mono 16 kHz input stream,
    the second Enter stops it.

    Returns:
        numpy.ndarray: the captured samples concatenated in arrival order
        (shape (n, 1), sounddevice's default float32). Returns an empty
        (0, 1) array if no audio frames arrived — previously this case
        crashed with ValueError from np.concatenate on an empty list.
    """
    fs = 16000  # Sample rate (16 kHz is what Whisper expects)
    print("\n--- Press Enter to START recording ---")
    input()
    print("Recording... Press Enter to STOP.")

    recording = []

    def callback(indata, frames, time, status):
        # Called from the audio thread; copy because sounddevice reuses
        # the buffer after the callback returns.
        recording.append(indata.copy())

    with sd.InputStream(samplerate=fs, channels=1, callback=callback):
        input()  # block until the user presses Enter again

    if not recording:
        # Guard: np.concatenate raises on an empty sequence (e.g. the user
        # pressed Enter twice before any audio frames were delivered).
        return np.zeros((0, 1), dtype=np.float32)
    return np.concatenate(recording, axis=0)
|
||||
|
||||
def main():
    """Run the push-to-talk loop: record, transcribe, query Ollama, print.

    Each iteration records audio, transcribes it with the module-level
    Whisper model, copies the transcript to the clipboard, and sends it
    as a prompt to the Ollama server. Loops forever (Ctrl+C to exit).
    """
    while True:
        audio_data = record_audio()

        # Transcribe
        segments, _ = model.transcribe(audio_data.flatten(), beam_size=5)
        text = "".join(segment.text for segment in segments).strip()

        print(f"You said: {text}")
        pyperclip.copy(text)  # Copies to clipboard automatically

        # Send to Ollama. Fixes vs. original: a timeout so a stalled
        # server can't hang the assistant forever, an HTTP status check
        # so error pages aren't silently parsed as empty responses, and
        # per-iteration recovery so one failed request doesn't kill the loop.
        try:
            response = requests.post(
                OLLAMA_URL,
                json={
                    "model": OLLAMA_MODEL,
                    "prompt": text,
                    "stream": False,
                },
                timeout=120,
            )
            response.raise_for_status()
            result = response.json().get("response", "")
        except requests.RequestException as exc:
            print(f"\nOllama request failed: {exc}\n")
            continue

        print(f"\nLLM Response:\n{result}\n")
|
||||
|
||||
# Script entry point: only start the interactive loop when run directly,
# not when imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user