voice_to_terminal#1

automate terminal with voice
2026-01-14 01:46:31 +00:00
parent 2a5347d1b9
commit 7f3ea89297
2 changed files with 170 additions and 0 deletions
--- a/python/tool-speechtotext/terminal.sh
+++ b/python/tool-speechtotext/terminal.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+# Launcher for the voice-to-terminal assistant.
+
+# Exported for the Python process; the script sets the same variable itself.
+# Presumably consumed by CTranslate2 (faster-whisper's backend) -- confirm.
+export CT2_CUDA_ALLOW_FP16=1
+
+# Environment to run in, and the script to start inside it.
+env_name="whisper-ollama"
+script_path=~/family-repo/Code/python/tool-speechtotext/voice_to_terminal.py
+
+# 'mamba run' executes the command inside the named environment, so there is
+# no need to source .bashrc or activate shell hooks first. All arguments
+# given to this launcher are forwarded unchanged.
+mamba run -n "$env_name" python "$script_path" "$@"
--- a/python/tool-speechtotext/voice_to_terminal.py
+++ b/python/tool-speechtotext/voice_to_terminal.py
@@ -0,0 +1,164 @@
+import argparse
+import sounddevice as sd
+import numpy as np
+import pyperclip
+import sys
+import os
+import subprocess
+import ollama
+from faster_whisper import WhisperModel
+
+# --- Settings ---
+# terminal.sh exports the same variable; setting it here as well covers the
+# case where this file is started directly.
+# NOTE(review): presumably read by CTranslate2 (faster-whisper's backend) to
+# permit FP16 on the GPU. This assignment runs after `faster_whisper` has
+# already been imported -- confirm the variable is read at model creation
+# rather than at import time.
+os.environ["CT2_CUDA_ALLOW_FP16"] = "1"
+# Whisper model size handed to WhisperModel below.
+MODEL_SIZE = "medium"
+# Default for the --model command-line option.
+OLLAMA_MODEL = "qwen2.5-coder:7b"
+# When True, run_terminal_command() asks the user before executing anything.
+CONFIRM_COMMANDS = True  # Set to False to run commands instantly
+
+# Load Whisper
+# Happens once, at import time, and targets a CUDA device with float16
+# weights, so merely importing this module needs a working GPU setup.
+print("Loading Whisper model...")
+model = WhisperModel(MODEL_SIZE, device="cuda", compute_type="float16")
+
+
+def run_terminal_command(command: str):
+    """Executes a bash command in the terminal. Handles file ops, system info, etc.
+
+    Args:
+        command: The shell command line to run.
+
+    Returns:
+        The captured STDOUT and STDERR of the command, or a short status or
+        error message when there is no output or the command was not run.
+    """
+    # NOTE(review): this function is handed to ollama via `tools=[...]`; the
+    # client presumably derives the tool schema from the signature and the
+    # docstring above, so keep both model-facing and put implementation notes
+    # in comments like this one.
+
+    # 1. Safety Blacklist -- checked before prompting so the user is never
+    #    asked to approve something that will be refused anyway. This is a
+    #    plain substring match: crude, and it also blocks harmless commands
+    #    that merely contain one of these fragments (e.g. `rm -rf /tmp/x`).
+    blacklist = ["rm -rf /", "mkfs", "dd if=", ":(){ :|:& };:"]
+    if any(forbidden in command for forbidden in blacklist):
+        return "Error: Command blocked for security reasons."
+
+    # 2. Visual Confirmation Block
+    if CONFIRM_COMMANDS:
+        print(f"\n{'='*40}")
+        print(f"⚠️  AI SUGGESTED COMMAND: \033[1;32m{command}\033[0m")
+        try:
+            choice = input("   Confirm execution? [Y/n]: ").strip().lower()
+        except EOFError:
+            # No interactive stdin: nobody approved, so do not run.
+            choice = 'n'
+        print(f"{'='*40}\n")
+
+        # Only an empty answer (the [Y] default) or an explicit yes runs the
+        # command; "no", typos and anything else count as a rejection.
+        if choice not in ('', 'y', 'yes'):
+            return "User rejected this command."
+
+    # 3. Execution
+    # SECURITY: shell=True on model-generated text is intentional here (the
+    # tool's purpose is to run arbitrary shell commands); the confirmation
+    # prompt above is the real safeguard.
+    try:
+        result = subprocess.run(command, shell=True,
+                                capture_output=True, text=True, timeout=20)
+    except Exception as e:
+        # Includes subprocess.TimeoutExpired; reported back to the model
+        # instead of crashing the assistant loop.
+        return f"Execution Error: {str(e)}"
+
+    # Test the raw streams: the formatted string below always contains the
+    # "STDOUT:"/"STDERR:" labels and therefore is never empty.
+    if not (result.stdout.strip() or result.stderr.strip()):
+        if result.returncode == 0:
+            return "Success (No output)."
+        return f"Failed with exit code {result.returncode} (No output)."
+    return f"STDOUT: {result.stdout}\nSTDERR: {result.stderr}"
+
+
+# Register tool
+# Lookup table from tool name to the Python callable that implements it.
+available_tools = {'run_terminal_command': run_terminal_command}
+
+
+def record_audio():
+    """Record one mono clip at 16 kHz, started and stopped with the Enter key.
+
+    Blocks on stdin twice: the first Enter opens the input stream, the second
+    closes it. No device is specified, so sounddevice's default input is used.
+
+    Returns:
+        A NumPy array of shape (frames, 1) holding every chunk delivered by
+        the stream callback, in order. If no chunk arrived (Enter pressed
+        twice in quick succession) an empty (0, 1) float32 array is returned
+        instead of raising. float32 is assumed to match the stream's dtype,
+        i.e. sounddevice's default -- confirm if sd.default.dtype is changed.
+    """
+    fs, recording = 16000, []
+    print("\n[READY] Press Enter to START...")
+    input()
+    print("[RECORDING] Press Enter to STOP...")
+
+    def cb(indata, frames, time_info, status):
+        # Surface overflow/underflow flags instead of dropping them silently.
+        if status:
+            print(status, file=sys.stderr)
+        # Copy: the buffer passed to the callback must not be kept around.
+        recording.append(indata.copy())
+
+    with sd.InputStream(samplerate=fs, channels=1, callback=cb):
+        input()
+
+    if not recording:
+        # np.concatenate([]) raises ValueError; hand back an empty clip.
+        return np.zeros((0, 1), dtype=np.float32)
+    return np.concatenate(recording, axis=0)
+
+
+def _parse_inline_tool_call(content):
+    """Recover a tool call the model wrote as JSON text in its reply.
+
+    Returns a one-element list holding a dict-shaped tool call, or None when
+    `content` is empty, does not mention the tool, or has no parsable JSON
+    object between its first '{' and last '}'.
+    """
+    import json  # local import: only this fallback path needs it
+
+    if not content or '"run_terminal_command"' not in content:
+        return None
+
+    # Extract JSON block from text
+    start, end = content.find('{'), content.rfind('}') + 1
+    if start < 0 or end <= start:
+        return None
+    try:
+        raw_json = json.loads(content[start:end])
+    except ValueError:  # json.JSONDecodeError is a ValueError
+        return None
+    if not isinstance(raw_json, dict):
+        return None
+
+    # Reconstruct as a tool call format
+    return [{'function': {
+        'name': 'run_terminal_command',
+        'arguments': raw_json.get('arguments', raw_json)
+    }}]
+
+
+def _unpack_tool_call(tool_call):
+    """Return (name, arguments) for an ollama tool-call object or a plain dict."""
+    if hasattr(tool_call, 'function'):
+        function = tool_call.function
+        return function.name, function.arguments
+    function = tool_call['function']
+    return function.get('name'), function.get('arguments')
+
+
+def main():
+    """Run the voice assistant loop: record, transcribe, ask the model, execute.
+
+    Command-line options:
+        --model NAME   Ollama model to use (default: OLLAMA_MODEL).
+        --confirm      Ask before running each command.
+        --no-confirm   Run commands without asking.
+    Unrecognised options are ignored. The loop ends on Ctrl-C; any other
+    error is printed and the loop continues with the next recording.
+    """
+    # run_terminal_command() reads this module-level flag, so the CLI choice
+    # has to be written back to it to take effect.
+    global CONFIRM_COMMANDS
+
+    # 1. Setup Parser for CLI flags
+    parser = argparse.ArgumentParser(
+        description="Whisper + Ollama Terminal Assistant")
+    parser.add_argument("--model", default=OLLAMA_MODEL,
+                        help="Ollama model name")
+    parser.add_argument("--confirm", dest="confirm", action='store_true',
+                        default=CONFIRM_COMMANDS, help="Confirm commands")
+    # Without an opposite switch, --confirm (store_true, default True) could
+    # never be turned off from the command line.
+    parser.add_argument("--no-confirm", dest="confirm", action='store_false',
+                        help="Run commands without asking")
+    args, _unknown = parser.parse_known_args()
+    CONFIRM_COMMANDS = args.confirm
+
+    # Initialize conversation with a strict System Role
+    # This "nudges" the model to use the tool feature rather than just chatting
+    messages = [{
+        'role': 'system',
+        'content': (
+            'You are a Linux terminal expert. When the user asks for a system task, '
+            'you MUST use the "run_terminal_command" tool. Do not explain your actions '
+            'in text; simply provide the command via the tool.'
+        )
+    }]
+
+    print(f"--- Assistant Active (Model: {args.model}) ---")
+    print(f"Confirmation Mode: {'ON' if args.confirm else 'OFF'}")
+
+    while True:
+        try:
+            # A. Record and Transcribe
+            audio_data = record_audio()
+            if audio_data.size == 0:
+                # Nothing was captured; wait for the next recording.
+                continue
+            print("[TRANSCRIBING]...")
+            segments, _ = model.transcribe(audio_data.flatten(), beam_size=5)
+            user_text = "".join(s.text for s in segments).strip()
+
+            if not user_text:
+                continue
+
+            print(f"\nYOU: {user_text}")
+            messages.append({'role': 'user', 'content': user_text})
+
+            # B. Get AI Response (Strict Temperature 0 for reliability)
+            response = ollama.chat(
+                model=args.model,
+                messages=messages,
+                tools=[run_terminal_command],
+                options={'temperature': 0}
+            )
+
+            # C. Detect Tool Calls (Handle both formal calls and raw JSON text)
+            tool_calls = response.message.tool_calls
+
+            # REPAIR LOGIC: If AI "talks" in JSON instead of using the tool field
+            if not tool_calls:
+                tool_calls = _parse_inline_tool_call(response.message.content)
+
+            # D. Execute Tools if found
+            if tool_calls:
+                # The assistant turn goes into history once, ahead of all of
+                # its tool results (not once per tool call).
+                messages.append(response.message)
+
+                for tool_call in tool_calls:
+                    name, func_args = _unpack_tool_call(tool_call)
+                    tool = available_tools.get(name)
+
+                    if tool is None:
+                        result = f"Error: Unknown tool '{name}'."
+                    else:
+                        try:
+                            command = func_args['command']
+                        except (KeyError, TypeError):
+                            # Tell the model instead of aborting the turn.
+                            result = "Error: Tool call is missing the 'command' argument."
+                        else:
+                            # Run the terminal command locally
+                            result = tool(command)
+
+                    # Add result back to history so AI can see it
+                    messages.append({'role': 'tool', 'content': result})
+
+                # Get the final "Human" explanation from AI
+                final_response = ollama.chat(
+                    model=args.model, messages=messages)
+                print(f"AI: {final_response.message.content}")
+                messages.append(final_response.message)
+            else:
+                # Normal Chatting
+                print(f"AI: {response.message.content}")
+                messages.append(response.message)
+
+        except KeyboardInterrupt:
+            print("\nExiting Assistant...")
+            break
+        except Exception as e:
+            # Top-level boundary: report and keep the assistant alive.
+            print(f"System Error: {e}")
+
+if __name__ == "__main__":
+    main()