From 7f3ea892977daf826df31208dcfed5a795a22891 Mon Sep 17 00:00:00 2001
From: dl92
Date: Wed, 14 Jan 2026 01:46:31 +0000
Subject: [PATCH] voice_to_terminal#1 automate terminal with voice

---
 python/tool-speechtotext/terminal.sh          |   6 +
 python/tool-speechtotext/voice_to_terminal.py | 247 ++++++++++++++++++
 2 files changed, 253 insertions(+)
 create mode 100755 python/tool-speechtotext/terminal.sh
 create mode 100644 python/tool-speechtotext/voice_to_terminal.py

diff --git a/python/tool-speechtotext/terminal.sh b/python/tool-speechtotext/terminal.sh
new file mode 100755
index 0000000..8ecd455
--- /dev/null
+++ b/python/tool-speechtotext/terminal.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+export CT2_CUDA_ALLOW_FP16=1
+
+# 'mamba run' executes the command within the context of the environment
+# without needing to source .bashrc or shell hooks manually.
+mamba run -n whisper-ollama python ~/family-repo/Code/python/tool-speechtotext/voice_to_terminal.py "$@"
diff --git a/python/tool-speechtotext/voice_to_terminal.py b/python/tool-speechtotext/voice_to_terminal.py
new file mode 100644
index 0000000..5d67f2b
--- /dev/null
+++ b/python/tool-speechtotext/voice_to_terminal.py
@@ -0,0 +1,247 @@
+"""Voice-driven terminal assistant.
+
+Records microphone audio, transcribes it with Whisper, and lets an Ollama
+model respond by calling the ``run_terminal_command`` tool.
+"""
+
+import argparse
+import json
+import os
+import subprocess
+import sys
+
+import numpy as np
+import ollama
+import pyperclip
+import sounddevice as sd
+from faster_whisper import WhisperModel
+
+# --- Settings ---
+os.environ["CT2_CUDA_ALLOW_FP16"] = "1"
+MODEL_SIZE = "medium"
+OLLAMA_MODEL = "qwen2.5-coder:7b"
+CONFIRM_COMMANDS = True  # Set to False to run commands instantly
+SAMPLE_RATE = 16000  # Hz used for the microphone stream
+COMMAND_TIMEOUT = 20  # Seconds before a running command is abandoned
+TOOL_NAME = "run_terminal_command"
+# Substrings that are always refused, whatever the user answers.
+BLACKLIST = ("rm -rf /", "mkfs", "dd if=", ":(){ :|:& };:")
+
+# Load Whisper
+print("Loading Whisper model...")
+model = WhisperModel(MODEL_SIZE, device="cuda", compute_type="float16")
+
+
+def run_terminal_command(command: str) -> str:
+    """Executes a bash command in the terminal. Handles file ops, system info, etc.
+
+    Args:
+        command: The exact shell command line to execute.
+
+    Returns:
+        The captured output, or a message saying why the command did not run.
+    """
+    # 1. Safety Blacklist (checked first so a blocked command is never offered)
+    if any(forbidden in command for forbidden in BLACKLIST):
+        return "Error: Command blocked for security reasons."
+
+    # 2. Visual Confirmation Block
+    if CONFIRM_COMMANDS:
+        print(f"\n{'='*40}")
+        print(f"⚠️ AI SUGGESTED COMMAND: \033[1;32m{command}\033[0m")
+        choice = input(" Confirm execution? [Y/n]: ").strip().lower()
+        print(f"{'='*40}\n")
+
+        if choice in ("n", "no"):
+            return "User rejected this command."
+
+    # 3. Execution (shell=True is deliberate: the tool runs whole shell lines)
+    try:
+        result = subprocess.run(command, shell=True, capture_output=True,
+                                text=True, timeout=COMMAND_TIMEOUT)
+    except Exception as e:
+        # Hand failures such as a timeout back to the model as text.
+        return f"Execution Error: {str(e)}"
+
+    # Silence only counts as success when the exit status agrees.
+    silent = not (result.stdout.strip() or result.stderr.strip())
+    if silent and result.returncode == 0:
+        return "Success (No output)."
+    output = f"STDOUT: {result.stdout}\nSTDERR: {result.stderr}"
+    if result.returncode != 0:
+        output = f"EXIT CODE: {result.returncode}\n{output}"
+    return output
+
+
+# Register tool
+available_tools = {TOOL_NAME: run_terminal_command}
+
+
+def record_audio():
+    """Record mono microphone audio between two presses of Enter.
+
+    Returns:
+        The captured float32 samples joined along the frame axis; an empty
+        (0, 1) array when the stream delivered nothing.
+    """
+    chunks = []
+
+    def on_audio(indata, frames, time, status):
+        if status:
+            print(status, file=sys.stderr)
+        chunks.append(indata.copy())
+
+    print("\n[READY] Press Enter to START...")
+    input()
+    print("[RECORDING] Press Enter to STOP...")
+    with sd.InputStream(samplerate=SAMPLE_RATE, channels=1, dtype="float32",
+                        callback=on_audio):
+        input()
+    if not chunks:
+        return np.zeros((0, 1), dtype=np.float32)
+    return np.concatenate(chunks, axis=0)
+
+
+def _recover_tool_calls(content):
+    """Rebuild a tool call from JSON the model wrote as plain text.
+
+    Returns:
+        A one-item list in the dict form ``_parse_tool_call`` accepts, or an
+        empty list when ``content`` holds no usable JSON object.
+    """
+    if not content or f'"{TOOL_NAME}"' not in content:
+        return []
+    start, end = content.find("{"), content.rfind("}") + 1
+    if start == -1 or end <= start:
+        return []
+    try:
+        payload = json.loads(content[start:end])
+    except ValueError:
+        return []
+    arguments = payload.get("arguments", payload)
+    return [{"function": {"name": TOOL_NAME, "arguments": arguments}}]
+
+
+def _parse_tool_call(tool_call):
+    """Return ``(name, arguments)`` for an ollama tool-call object or a dict."""
+    if hasattr(tool_call, "function"):
+        name = tool_call.function.name
+        arguments = tool_call.function.arguments
+    else:
+        name = tool_call["function"]["name"]
+        arguments = tool_call["function"]["arguments"]
+    if isinstance(arguments, str):
+        # The arguments may arrive as a JSON-encoded string.
+        try:
+            arguments = json.loads(arguments)
+        except ValueError:
+            arguments = None
+    return name, (arguments if isinstance(arguments, dict) else {})
+
+
+def _run_tool_call(tool_call):
+    """Execute one tool call and return the text to report to the model."""
+    name, arguments = _parse_tool_call(tool_call)
+    tool = available_tools.get(name)
+    if tool is None:
+        return f"Error: Unknown tool '{name}'."
+    command = arguments.get("command")
+    if not isinstance(command, str) or not command.strip():
+        return "Error: Tool call did not include a 'command' string."
+    return tool(command)
+
+
+def main():
+    """Run the record -> transcribe -> chat -> execute loop until interrupted."""
+    global CONFIRM_COMMANDS
+
+    # 1. Setup Parser for CLI flags
+    parser = argparse.ArgumentParser(
+        description="Whisper + Ollama Terminal Assistant")
+    parser.add_argument("--model", default=OLLAMA_MODEL,
+                        help="Ollama model name")
+    parser.add_argument("--confirm", dest="confirm", action="store_true",
+                        help="Confirm commands")
+    parser.add_argument("--no-confirm", dest="confirm", action="store_false",
+                        help="Run commands without asking")
+    parser.set_defaults(confirm=CONFIRM_COMMANDS)
+    args, _unknown = parser.parse_known_args()
+    # run_terminal_command reads the module-level switch, so apply the flag.
+    CONFIRM_COMMANDS = args.confirm
+
+    # Initialize conversation with a strict System Role
+    # This "nudges" the model to use the tool feature rather than just chatting
+    messages = [{
+        'role': 'system',
+        'content': (
+            'You are a Linux terminal expert. When the user asks for a system task, '
+            'you MUST use the "run_terminal_command" tool. Do not explain your actions '
+            'in text; simply provide the command via the tool.'
+        )
+    }]
+
+    print(f"--- Assistant Active (Model: {args.model}) ---")
+    print(f"Confirmation Mode: {'ON' if args.confirm else 'OFF'}")
+
+    while True:
+        # Messages added from here on belong to the current turn.
+        turn_start = len(messages)
+        try:
+            # A. Record and Transcribe
+            audio_data = record_audio()
+            if audio_data.size == 0:
+                print("[SKIPPED] No audio captured.")
+                continue
+            print("[TRANSCRIBING]...")
+            segments, _ = model.transcribe(audio_data.flatten(), beam_size=5)
+            user_text = "".join(s.text for s in segments).strip()
+
+            if not user_text:
+                continue
+
+            print(f"\nYOU: {user_text}")
+            messages.append({'role': 'user', 'content': user_text})
+
+            # B. Get AI Response (Strict Temperature 0 for reliability)
+            response = ollama.chat(
+                model=args.model,
+                messages=messages,
+                tools=[run_terminal_command],
+                options={'temperature': 0}
+            )
+
+            # C. Detect Tool Calls (Handle both formal calls and raw JSON text)
+            tool_calls = (response.message.tool_calls
+                          or _recover_tool_calls(response.message.content))
+
+            # The assistant turn is stored once, however many calls it holds.
+            messages.append(response.message)
+
+            if not tool_calls:
+                # Normal Chatting
+                print(f"AI: {response.message.content}")
+                continue
+
+            # D. Execute Tools and add each result so the AI can see it
+            for tool_call in tool_calls:
+                messages.append(
+                    {'role': 'tool', 'content': _run_tool_call(tool_call)})
+
+            # Get the final "Human" explanation from AI
+            final_response = ollama.chat(
+                model=args.model, messages=messages)
+            print(f"AI: {final_response.message.content}")
+            messages.append(final_response.message)
+
+        except (KeyboardInterrupt, EOFError):
+            # EOF on stdin can never recover; retrying would spin forever.
+            print("\nExiting Assistant...")
+            break
+        except Exception as e:
+            print(f"System Error: {e}")
+            # Drop the half-finished turn so the history stays well-formed.
+            del messages[turn_start:]
+
+
+if __name__ == "__main__":
+    main()