diff --git a/python/tool-speechtotext/.vscode/settings.json b/python/tool-speechtotext/.vscode/settings.json
new file mode 100644
index 0000000..a8c2003
--- /dev/null
+++ b/python/tool-speechtotext/.vscode/settings.json
@@ -0,0 +1,5 @@
+{
+    "python-envs.defaultEnvManager": "ms-python.python:conda",
+    "python-envs.defaultPackageManager": "ms-python.python:conda",
+    "python-envs.pythonProjects": []
+}
\ No newline at end of file
diff --git a/python/tool-speechtotext/assistant.py b/python/tool-speechtotext/assistant.py
new file mode 100644
index 0000000..7dd1c39
--- /dev/null
+++ b/python/tool-speechtotext/assistant.py
@@ -0,0 +1,79 @@
+"""Push-to-talk voice assistant: record mic audio, transcribe with
+faster-whisper, copy the text to the clipboard, and send it to Ollama."""
+import sounddevice as sd
+import numpy as np
+import pyperclip
+import requests
+from faster_whisper import WhisperModel
+
+# Configuration
+MODEL_SIZE = "base"  # "base" is fast, "small" is more accurate
+# NOTE(review): Ollama's default port is 11434 -- 11435 looks non-standard; confirm.
+OLLAMA_URL = "http://localhost:11435/api/generate"
+OLLAMA_MODEL = "llama3"
+REQUEST_TIMEOUT = 120  # seconds to wait for the LLM; without this, a stalled server hangs forever
+
+# Load Whisper on GPU, falling back to CPU when CUDA is unavailable
+# so the script still runs on machines without a GPU.
+try:
+    model = WhisperModel(MODEL_SIZE, device="cuda", compute_type="default")
+except Exception:
+    model = WhisperModel(MODEL_SIZE, device="cpu", compute_type="default")
+
+
+def record_audio():
+    """Record mono microphone audio between two Enter presses.
+
+    Returns a float32 NumPy array sampled at 16 kHz; empty if the user
+    stopped before any audio was captured.
+    """
+    fs = 16000  # Sample rate
+    print("\n--- Press Enter to START recording ---")
+    input()
+    print("Recording... Press Enter to STOP.")
+
+    recording = []
+
+    def callback(indata, frames, time, status):
+        # Copy: sounddevice reuses the buffer between callbacks.
+        recording.append(indata.copy())
+
+    with sd.InputStream(samplerate=fs, channels=1, callback=callback):
+        input()
+
+    if not recording:
+        # Enter pressed before the first callback fired; avoid
+        # np.concatenate([]) which raises ValueError.
+        return np.zeros(0, dtype=np.float32)
+    return np.concatenate(recording, axis=0)
+
+
+def main():
+    """Loop forever: record -> transcribe -> clipboard -> query Ollama."""
+    while True:
+        audio_data = record_audio()
+        if audio_data.size == 0:
+            print("No audio captured; try again.")
+            continue
+
+        # Transcribe
+        segments, _ = model.transcribe(audio_data.flatten(), beam_size=5)
+        text = "".join(segment.text for segment in segments).strip()
+
+        print(f"You said: {text}")
+        pyperclip.copy(text)  # Copies to clipboard automatically
+
+        # Send to Ollama; keep the loop alive on network/server errors.
+        try:
+            response = requests.post(OLLAMA_URL, json={
+                "model": OLLAMA_MODEL,
+                "prompt": text,
+                "stream": False
+            }, timeout=REQUEST_TIMEOUT)
+            response.raise_for_status()
+            result = response.json().get("response", "")
+        except requests.RequestException as exc:
+            print(f"Ollama request failed: {exc}")
+            continue
+
+        print(f"\nLLM Response:\n{result}\n")
+
+
+if __name__ == "__main__":
+    main()