# Code/python/tool-speechtotext/voice_to_terminal.py

import argparse
import sounddevice as sd
import numpy as np
import pyperclip
import sys
import os
import subprocess
import ollama
from faster_whisper import WhisperModel

# --- Settings ---
# Set before the Whisper model is constructed below.
# NOTE(review): presumably a CTranslate2 switch that permits FP16 compute on
# the CUDA device -- confirm against the CTranslate2 documentation.
os.environ["CT2_CUDA_ALLOW_FP16"] = "1"
MODEL_SIZE = "medium"  # Whisper model size handed to WhisperModel below
OLLAMA_MODEL = "qwen2.5-coder:7b"  # Default value of the --model CLI flag
CONFIRM_COMMANDS = True  # Set to False to run commands instantly

# Load Whisper
# This runs at import time, so importing the module already loads the model.
# The model is requested on the "cuda" device with float16 compute.
print("Loading Whisper model...")
model = WhisperModel(MODEL_SIZE, device="cuda", compute_type="float16")


def run_terminal_command(command: str):
    """Executes a bash command in the terminal. Handles file ops, system info, etc.

    Args:
        command: The complete shell command line to execute.

    Returns:
        A text report: the command's STDOUT/STDERR (plus the exit code when
        it is non-zero), "Success (No output)." for a silent success, or a
        message explaining why the command was rejected, blocked or failed.
    """
    # NOTE(review): this function is handed to ollama.chat(tools=[...]), so the
    # docstring above is presumably turned into the tool description the model
    # sees -- keep it short and model-oriented; confirm before rewording it.

    # 0. Nothing to run: do not spawn a shell for an empty command line.
    if not command or not command.strip():
        return "Error: Empty command."

    # 1. Safety Blacklist, checked before prompting so the user is never asked
    #    to approve a command that would be refused anyway. This is a plain
    #    substring match and therefore only a coarse last line of defense.
    blacklist = ("rm -rf /", "mkfs", "dd if=", ":(){ :|:& };:")
    if any(forbidden in command for forbidden in blacklist):
        return "Error: Command blocked for security reasons."

    # 2. Visual Confirmation Block
    if CONFIRM_COMMANDS:
        print(f"\n{'='*40}")
        print(f"⚠️  AI SUGGESTED COMMAND: \033[1;32m{command}\033[0m")
        choice = input("   Confirm execution? [Y/n]: ").strip().lower()
        print(f"{'='*40}\n")

        # Only an explicit yes (or Enter, the advertised default) proceeds;
        # answers such as "no" or a typo must not execute the command.
        if choice not in ("", "y", "yes"):
            return "User rejected this command."

    # 3. Execution
    # SECURITY: the command text originates from a language model and is run
    # through the shell on purpose (pipes, globs, redirects); the checks above
    # are the only safeguards.
    try:
        result = subprocess.run(command, shell=True,
                                capture_output=True, text=True, timeout=20)
    except subprocess.TimeoutExpired:
        return "Execution Error: Command timed out after 20 seconds."
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        # ValueError also covers undecodable output (UnicodeDecodeError).
        return f"Execution Error: {e}"

    stdout, stderr = result.stdout, result.stderr
    if not stdout.strip() and not stderr.strip():
        # A silent command is only a success when it also exited cleanly.
        if result.returncode == 0:
            return "Success (No output)."
        return f"Command failed with exit code {result.returncode} (No output)."

    output = f"STDOUT: {stdout}\nSTDERR: {stderr}"
    if result.returncode != 0:
        output += f"\nEXIT CODE: {result.returncode}"
    return output


# Register tool
# Maps each tool name to the Python callable that implements it.
available_tools = {'run_terminal_command': run_terminal_command}


def record_audio():
    """Record mono microphone audio between two presses of Enter.

    Blocks on stdin twice: the first Enter starts the capture, the second
    one stops it.

    Returns:
        A float32 NumPy array with every captured block concatenated along
        axis 0 (one column, sampled at 16 kHz). If no block arrived before
        the recording was stopped, an empty array of shape (0, 1) is
        returned instead of raising.
    """
    fs, recording = 16000, []
    print("\n[READY] Press Enter to START...")
    input()
    print("[RECORDING] Press Enter to STOP...")

    def cb(indata, frames, time_info, status):
        # Invoked by the input stream for each block of audio.
        if status:
            # Surface stream problems instead of dropping them silently.
            print(status, file=sys.stderr)
        # Keep a copy: the stream's buffer should not be relied on once the
        # callback has returned.
        recording.append(indata.copy())

    # dtype is spelled out so the empty fallback below matches real recordings.
    with sd.InputStream(samplerate=fs, channels=1, dtype='float32',
                        callback=cb):
        input()

    if not recording:
        # np.concatenate raises ValueError on an empty list.
        return np.zeros((0, 1), dtype=np.float32)
    return np.concatenate(recording, axis=0)


def _recover_tool_calls(content):
    """Rebuild a tool-call list from a reply that embeds the call as JSON text.

    Args:
        content: The assistant's plain-text reply.

    Returns:
        A one-element list in the dictionary tool-call format, or None when
        the text holds no usable "run_terminal_command" JSON object.
    """
    import json  # local import: the file's import block does not provide json

    if '"run_terminal_command"' not in content:
        return None

    # Take the outermost {...} span of the reply as the candidate JSON block.
    start, end = content.find('{'), content.rfind('}') + 1
    if start == -1 or end <= start:
        return None

    try:
        payload = json.loads(content[start:end])
    except json.JSONDecodeError:
        return None

    # The arguments may sit under "arguments" or be the object itself.
    arguments = payload.get('arguments', payload)
    if not isinstance(arguments, dict):
        return None
    return [{'function': {
        'name': 'run_terminal_command',
        'arguments': arguments
    }}]


def _tool_call_parts(tool_call):
    """Return (name, arguments) for a tool-call object or a repaired dict."""
    if hasattr(tool_call, 'function'):
        return tool_call.function.name, tool_call.function.arguments
    function = tool_call['function']
    return function['name'], function['arguments']


def main():
    """Run the interactive voice assistant loop until interrupted.

    Each iteration records audio, transcribes it with Whisper, sends the text
    to the Ollama model, runs any requested terminal command and prints the
    model's answer. The whole conversation is kept in one message history.

    CLI flags:
        --model NAME   Ollama model name (default: OLLAMA_MODEL).
        --confirm      Ask before running a command (default: CONFIRM_COMMANDS).
        --no-confirm   Run commands without asking.

    Unrecognised flags are ignored. Ctrl+C or a closed stdin ends the loop.
    """
    # Declared first because the flag value is written back to the module
    # setting that run_terminal_command reads.
    global CONFIRM_COMMANDS

    # 1. Setup Parser for CLI flags
    parser = argparse.ArgumentParser(
        description="Whisper + Ollama Terminal Assistant")
    parser.add_argument("--model", default=OLLAMA_MODEL,
                        help="Ollama model name")
    parser.add_argument("--confirm", dest="confirm", action='store_true',
                        default=CONFIRM_COMMANDS, help="Confirm commands")
    # Without a negative flag, confirmation could never be switched off.
    parser.add_argument("--no-confirm", dest="confirm", action='store_false',
                        help="Run commands without asking for confirmation")
    args, _ = parser.parse_known_args()

    # Make the chosen mode effective, not merely displayed.
    CONFIRM_COMMANDS = args.confirm

    # Initialize conversation with a strict System Role
    # This "nudges" the model to use the tool feature rather than just chatting
    messages = [{
        'role': 'system',
        'content': (
            'You are a Linux terminal expert. When the user asks for a system task, '
            'you MUST use the "run_terminal_command" tool. Do not explain your actions '
            'in text; simply provide the command via the tool.'
        )
    }]

    print(f"--- Assistant Active (Model: {args.model}) ---")
    print(f"Confirmation Mode: {'ON' if args.confirm else 'OFF'}")

    while True:
        try:
            # A. Record and Transcribe
            audio_data = record_audio()
            if audio_data.size == 0:
                print("[SKIPPED] No audio captured.")
                continue

            print("[TRANSCRIBING]...")
            segments, _info = model.transcribe(
                audio_data.flatten(), beam_size=5)
            user_text = "".join(s.text for s in segments).strip()

            if not user_text:
                continue

            print(f"\nYOU: {user_text}")
            messages.append({'role': 'user', 'content': user_text})

            # B. Get AI Response (Strict Temperature 0 for reliability)
            response = ollama.chat(
                model=args.model,
                messages=messages,
                tools=[run_terminal_command],
                options={'temperature': 0}
            )

            # C. Detect Tool Calls (Handle both formal calls and raw JSON text)
            tool_calls = response.message.tool_calls
            if not tool_calls:
                # REPAIR LOGIC: the AI may "talk" in JSON instead of using the
                # tool field; content can be None, hence the fallback to "".
                tool_calls = _recover_tool_calls(
                    response.message.content or "")

            # Record the assistant turn exactly once, however many tool calls
            # it carries.
            messages.append(response.message)

            if not tool_calls:
                # Normal Chatting
                print(f"AI: {response.message.content}")
                continue

            # D. Execute Tools
            for tool_call in tool_calls:
                name, func_args = _tool_call_parts(tool_call)
                tool = available_tools.get(name)

                if tool is None:
                    result = f"Error: Unknown tool '{name}'."
                else:
                    try:
                        command = func_args['command']
                    except (KeyError, TypeError):
                        result = ("Error: Tool call is missing the "
                                  "'command' argument.")
                    else:
                        # Run the terminal command locally
                        result = tool(command)

                # Add result back to history so AI can see it
                messages.append({'role': 'tool', 'content': result})

            # Get the final "Human" explanation from AI
            final_response = ollama.chat(
                model=args.model, messages=messages)
            print(f"AI: {final_response.message.content}")
            messages.append(final_response.message)

        except (KeyboardInterrupt, EOFError):
            # EOFError: stdin was closed; retrying input() would spin forever.
            print("\nExiting Assistant...")
            break
        except Exception as e:
            # Top-level boundary: report the failure and keep the session alive.
            print(f"System Error: {e}")

# Start the assistant loop only when the file is executed as a script.
if __name__ == "__main__":
    main()