gemini v1
This commit is contained in:
dl92
2026-01-13 16:47:04 +00:00
parent f8d7fdda5d
commit 33248895ff
2 changed files with 58 additions and 0 deletions

View File

@@ -0,0 +1,5 @@
{
"python-envs.defaultEnvManager": "ms-python.python:conda",
"python-envs.defaultPackageManager": "ms-python.python:conda",
"python-envs.pythonProjects": []
}

View File

@@ -0,0 +1,53 @@
import sounddevice as sd
import numpy as np
import pyperclip
import requests
from faster_whisper import WhisperModel
# Configuration
MODEL_SIZE = "base" # "base" is fast, "small" is more accurate
# NOTE(review): Ollama's default port is 11434 — confirm 11435 is an intentional
# non-default mapping (e.g. a second instance or a Docker port remap).
OLLAMA_URL = "http://localhost:11435/api/generate"
OLLAMA_MODEL = "llama3"
# Load Whisper on GPU
# Loaded once at import time; requires a CUDA-capable GPU and will raise here
# if none is available. "default" lets faster-whisper pick the compute type.
model = WhisperModel(MODEL_SIZE, device="cuda", compute_type="default")
def record_audio():
    """Record mono microphone audio between two Enter presses.

    Blocks on input() to start, streams 16 kHz mono audio via a
    sounddevice callback, then blocks on input() again to stop.

    Returns:
        np.ndarray of shape (n_samples, 1) — empty (0, 1) float32 array
        if no audio chunks arrived before recording was stopped.
    """
    fs = 16000  # Sample rate expected by Whisper models
    print("\n--- Press Enter to START recording ---")
    input()
    print("Recording... Press Enter to STOP.")
    chunks = []

    def callback(indata, frames, time, status):
        # Copy: sounddevice reuses the indata buffer between callbacks.
        chunks.append(indata.copy())

    with sd.InputStream(samplerate=fs, channels=1, callback=callback):
        input()

    # Guard: np.concatenate raises ValueError on an empty list, which happens
    # if Enter is pressed again before the first audio callback fires.
    if not chunks:
        return np.zeros((0, 1), dtype=np.float32)
    return np.concatenate(chunks, axis=0)
def main():
    """Loop forever: record speech, transcribe it, copy the text to the
    clipboard, and print the Ollama model's response.

    Raises:
        requests.HTTPError: if Ollama returns a non-2xx status.
        requests.Timeout: if Ollama does not answer within the timeout.
    """
    while True:
        audio_data = record_audio()

        # Transcribe; flatten() converts the (n, 1) stream to the 1-D
        # waveform faster-whisper expects.
        segments, _ = model.transcribe(audio_data.flatten(), beam_size=5)
        text = "".join(segment.text for segment in segments).strip()
        print(f"You said: {text}")

        # Skip the LLM round-trip when nothing was transcribed
        # (e.g. an empty or silent recording).
        if not text:
            continue

        pyperclip.copy(text)  # Copies to clipboard automatically

        # Send to Ollama. A timeout prevents hanging forever if the server
        # stalls; raise_for_status surfaces a clear error on 404/500 instead
        # of a confusing JSON-decode failure.
        response = requests.post(
            OLLAMA_URL,
            json={
                "model": OLLAMA_MODEL,
                "prompt": text,
                "stream": False,
            },
            timeout=120,
        )
        response.raise_for_status()
        result = response.json().get("response", "")
        print(f"\nLLM Response:\n{result}\n")


if __name__ == "__main__":
    main()