# Code/python/tool-speechtotext/voice_to_terminal.py

import sounddevice as sd
import numpy as np
import pyperclip
import sys
import argparse
import os
import subprocess
import ollama
import json
from faster_whisper import WhisperModel

# --- Configuration ---
# NOTE(review): presumably tells CTranslate2 (the faster-whisper backend) to
# permit FP16 on the GPU — confirm against the CTranslate2 docs. It is set
# here, before the WhisperModel below is constructed.
os.environ["CT2_CUDA_ALLOW_FP16"] = "1"
# Model size handed to WhisperModel below.
MODEL_SIZE = "medium"
# Default value of the --model command-line option read in main().
OLLAMA_MODEL = "qwen2.5-coder:7b"
CONFIRM_COMMANDS = True  # Set to False to run commands instantly

# Load Whisper on GPU
# This runs at import time. If constructing the CUDA/float16 model raises for
# any reason, the error is printed and a CPU/int8 model is created instead.
print("Loading Whisper model...")
try:
    model = WhisperModel(MODEL_SIZE, device="cuda", compute_type="float16")
except Exception as e:
    print(f"Error loading GPU: {e}, falling back to CPU")
    model = WhisperModel(MODEL_SIZE, device="cpu", compute_type="int8")

# --- Terminal Tool ---


def run_terminal_command(command: str):
    """
    Executes a bash command in the Linux terminal.
    Used for file management, system info, and terminal tasks.

    Args:
        command: The complete shell command line to execute.

    Returns:
        A string meant to be handed back to the model. It is one of:
        a block message when the command matches the blacklist; a
        USER_REJECTION / USER_FEEDBACK message when the user declines at the
        confirmation prompt; "Success (No output)." when the command exits
        with status 0 and prints nothing; the captured STDOUT/STDERR text
        (plus the exit code when it is non-zero); or an "Execution Error"
        message when the command could not be run or timed out.
    """
    # Safety guardrail. Checked before prompting so the user is never asked
    # to approve a command that would be refused anyway.
    # NOTE: this is plain substring matching and is easy to bypass; it is a
    # last-resort tripwire, not a sandbox.
    blacklist = ("rm -rf /", "mkfs", "dd if=")
    if any(forbidden in command for forbidden in blacklist):
        return "Error: Command blocked for security reasons."

    if CONFIRM_COMMANDS:
        print(f"\n{'='*40}")
        print(f"⚠️  AI SUGGESTED: \033[1;32m{command}\033[0m")
        # Anything other than empty input, 'y' or 'n' is treated as free-form
        # feedback explaining why the command was rejected.
        choice = input("   Confirm? [Y/n] or provide feedback: ").strip()

        if choice.lower() == 'n':
            return "USER_REJECTION: The user did not approve this command. Please suggest an alternative."
        if choice and choice.lower() != 'y':
            return f"USER_FEEDBACK: The user rejected the command with this reason: '{choice}'. Please adjust."
        print(f"{'='*40}\n")

    try:
        # SECURITY: shell=True runs model-generated text through the shell.
        # That is the purpose of this tool; the confirmation prompt above is
        # the real safeguard, so think twice before disabling it.
        result = subprocess.run(command, shell=True,
                                capture_output=True, text=True, timeout=20)
    except Exception as e:
        # Tool boundary: every failure (timeout, OS error, bad argument) is
        # reported back to the model as text instead of crashing the loop.
        return f"Execution Error: {str(e)}"

    stdout, stderr = result.stdout, result.stderr
    # Test the captured streams themselves: the formatted string below always
    # contains the "STDOUT:"/"STDERR:" labels and therefore is never empty.
    if result.returncode == 0 and not stdout.strip() and not stderr.strip():
        return "Success (No output)."

    output = f"STDOUT: {stdout}\nSTDERR: {stderr}"
    if result.returncode != 0:
        # Make failures visible even when the command printed nothing.
        output += f"\nEXIT CODE: {result.returncode}"
    return output


def record_audio():
    """Record microphone audio between two presses of the Enter key.

    Blocks on the first Enter to start, opens a mono 16 kHz input stream,
    and stops on the second Enter.

    Returns:
        A NumPy array with every captured block concatenated along axis 0.
        When the stream delivered no blocks (Enter pressed again right away),
        an empty array of shape ``(0, 1)`` is returned instead of raising.
    """
    fs = 16000
    recording = []
    print("\n[READY] Press Enter to START...")
    input()
    print("[RECORDING] Press Enter to STOP...")

    def cb(indata, frames, time_info, status):
        # Store a copy so the saved block does not alias the buffer object
        # passed into the callback.
        recording.append(indata.copy())

    with sd.InputStream(samplerate=fs, channels=1, callback=cb):
        input()

    if not recording:
        # np.concatenate raises ValueError on an empty list; hand back an
        # empty mono buffer so callers can simply detect "nothing recorded".
        # float32 is assumed to match sounddevice's default sample dtype.
        return np.zeros((0, 1), dtype=np.float32)
    return np.concatenate(recording, axis=0)


def _recover_tool_calls(content):
    """Rebuild a tool call from raw JSON the model printed as plain text.

    Args:
        content: Text of the assistant message; may be ``None`` or empty.

    Returns:
        A one-element list shaped like
        ``[{'function': {'name': ..., 'arguments': ...}}]``, or ``None`` when
        the text contains no parseable ``run_terminal_command`` JSON object.
    """
    if not content or '"run_terminal_command"' not in content:
        return None

    # Take the outermost {...} span; the model may wrap it in prose.
    start, end = content.find('{'), content.rfind('}') + 1
    if start < 0 or end <= start:
        return None

    try:
        payload = json.loads(content[start:end])
    except json.JSONDecodeError:
        return None
    if not isinstance(payload, dict):
        return None

    # Accept both {"name": ..., "arguments": {...}} and a bare arguments dict.
    return [{'function': {
        'name': 'run_terminal_command',
        'arguments': payload.get('arguments', payload),
    }}]


def main():
    """Run the interactive voice-to-terminal assistant loop.

    Each turn records audio, transcribes it with the module-level Whisper
    model, and sends the text to the Ollama chat model. If the model requests
    ``run_terminal_command``, the command is executed (subject to user
    confirmation) and the model is asked to explain the result. A rejected
    command is fed back to the model for up to three attempts per request.

    The loop ends on Ctrl+C or when standard input is closed. Any other
    exception is printed and the loop moves on to the next turn.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default=OLLAMA_MODEL)
    # parse_known_args: unrecognised command-line options are ignored.
    args, _ = parser.parse_known_args()

    # Initial System Prompt
    messages = [{
        'role': 'system',
        'content': (
            'You are a Linux expert assistant. When asked for a system task, '
            'use the "run_terminal_command" tool. If the user rejects a command, '
            'analyze their feedback and suggest a corrected alternative.'
        )
    }]

    print(f"--- Assistant Active (Model: {args.model}) ---")

    max_attempts = 3

    while True:
        try:
            # 1. Voice Capture
            audio_data = record_audio()
            if audio_data.size == 0:
                # Nothing was captured; skip transcription entirely.
                continue
            segments, _ = model.transcribe(audio_data.flatten(), beam_size=5)
            user_text = "".join(s.text for s in segments).strip()
            if not user_text:
                continue

            print(f"\nYOU: {user_text}")
            messages.append({'role': 'user', 'content': user_text})

            # 2. AI Interaction Loop (bounded retries when a command is rejected)
            for _attempt in range(max_attempts):
                response = ollama.chat(
                    model=args.model,
                    messages=messages,
                    tools=[run_terminal_command],
                    options={'temperature': 0}
                )

                # Fallback Repair: some models print the call as raw JSON
                # text instead of returning a structured tool call.
                tool_calls = (response.message.tool_calls
                              or _recover_tool_calls(response.message.content))

                if not tool_calls:
                    # Normal Chat
                    print(f"AI: {response.message.content}")
                    messages.append(response.message)
                    break

                # 3. Execution Logic (only the first requested call is run)
                call = tool_calls[0]
                # Structured calls expose attributes; recovered ones are dicts.
                f_args = call.function.arguments if hasattr(
                    call, 'function') else call['function']['arguments']
                try:
                    command = f_args['command']
                except (KeyError, TypeError):
                    command = None

                messages.append(response.message)

                if not command:
                    # Malformed call: tell the model instead of crashing the turn.
                    messages.append({
                        'role': 'tool',
                        'content': "Error: the tool call did not include a 'command' argument.",
                    })
                    continue

                result = run_terminal_command(command)
                messages.append({'role': 'tool', 'content': result})

                # Rejections are recognised by prefix so that command output
                # which merely contains these words is not misclassified.
                if result.startswith(("USER_REJECTION", "USER_FEEDBACK")):
                    print("[RETHINKING] AI is adjusting the command...")
                    continue

                # Success: Let AI explain the result
                final_res = ollama.chat(model=args.model, messages=messages)
                print(f"AI: {final_res.message.content}")
                messages.append(final_res.message)
                break
            else:
                # Every attempt was rejected or malformed.
                print(f"[STOPPED] No command was run after {max_attempts} attempts.")

        except (KeyboardInterrupt, EOFError):
            # EOFError: stdin was closed, so input() can never succeed again;
            # leaving the loop avoids an endless stream of error messages.
            print("\nExiting...")
            break
        except Exception as e:
            # Top-level boundary: report the failure and keep the session alive.
            print(f"System Error: {e}")


# Start the assistant loop only when this file is executed as a script.
if __name__ == "__main__":
    main()