diff --git a/python/tool-speechtotext/voice_to_terminal.py b/python/tool-speechtotext/voice_to_terminal.py index 5d67f2b..fc1af78 100644 --- a/python/tool-speechtotext/voice_to_terminal.py +++ b/python/tool-speechtotext/voice_to_terminal.py @@ -1,43 +1,53 @@ -import argparse import sounddevice as sd import numpy as np import pyperclip import sys +import argparse import os import subprocess import ollama +import json from faster_whisper import WhisperModel -# --- Settings --- +# --- Configuration --- os.environ["CT2_CUDA_ALLOW_FP16"] = "1" MODEL_SIZE = "medium" OLLAMA_MODEL = "qwen2.5-coder:7b" CONFIRM_COMMANDS = True # Set to False to run commands instantly -# Load Whisper +# Load Whisper on GPU print("Loading Whisper model...") -model = WhisperModel(MODEL_SIZE, device="cuda", compute_type="float16") +try: + model = WhisperModel(MODEL_SIZE, device="cuda", compute_type="float16") +except Exception as e: + print(f"Error loading GPU: {e}, falling back to CPU") + model = WhisperModel(MODEL_SIZE, device="cpu", compute_type="int8") + +# --- Terminal Tool --- def run_terminal_command(command: str): - """Executes a bash command in the terminal. Handles file ops, system info, etc.""" - - # 1. Visual Confirmation Block + """ + Executes a bash command in the Linux terminal. + Used for file management, system info, and terminal tasks. + """ if CONFIRM_COMMANDS: print(f"\n{'='*40}") - print(f"⚠️ AI SUGGESTED COMMAND: \033[1;32m{command}\033[0m") - choice = input(" Confirm execution? [Y/n]: ").strip().lower() + print(f"⚠️ AI SUGGESTED: \033[1;32m{command}\033[0m") + # Allow user to provide feedback if they say 'n' + choice = input(" Confirm? [Y/n] or provide feedback: ").strip() + + if choice.lower() == 'n': + return "USER_REJECTION: The user did not approve this command. Please suggest an alternative." + elif choice and choice.lower() != 'y': + return f"USER_FEEDBACK: The user rejected the command with this reason: '{choice}'. Please adjust." print(f"{'='*40}\n") - if choice == 'n': - return "User rejected this command." - - # 2. Safety Blacklist (Last line of defense) - blacklist = ["rm -rf /", "mkfs", "dd if=", ":(){ :|:& };:"] + # Safety Guardrail + blacklist = ["rm -rf /", "mkfs", "dd if="] if any(forbidden in command for forbidden in blacklist): return "Error: Command blocked for security reasons." - # 3. Execution try: result = subprocess.run(command, shell=True, capture_output=True, text=True, timeout=20) @@ -47,10 +57,6 @@ def run_terminal_command(command: str): return f"Execution Error: {str(e)}" -# Register tool -available_tools = {'run_terminal_command': run_terminal_command} - - def record_audio(): fs, recording = 16000, [] print("\n[READY] Press Enter to START...") @@ -63,98 +69,90 @@ def record_audio(): def main(): - # 1. Setup Parser for CLI flags - parser = argparse.ArgumentParser( - description="Whisper + Ollama Terminal Assistant") - parser.add_argument("--model", default=OLLAMA_MODEL, - help="Ollama model name") - parser.add_argument("--confirm", action='store_true', - default=CONFIRM_COMMANDS, help="Confirm commands") - args, unknown = parser.parse_known_args() + parser = argparse.ArgumentParser() + parser.add_argument("--model", default=OLLAMA_MODEL) + args, _ = parser.parse_known_args() - # Initialize conversation with a strict System Role - # This "nudges" the model to use the tool feature rather than just chatting + # Initial System Prompt messages = [{ 'role': 'system', 'content': ( - 'You are a Linux terminal expert. When the user asks for a system task, ' - 'you MUST use the "run_terminal_command" tool. Do not explain your actions ' - 'in text; simply provide the command via the tool.' + 'You are a Linux expert assistant. When asked for a system task, ' + 'use the "run_terminal_command" tool. If the user rejects a command, ' + 'analyze their feedback and suggest a corrected alternative.' ) }] print(f"--- Assistant Active (Model: {args.model}) ---") - print(f"Confirmation Mode: {'ON' if args.confirm else 'OFF'}") while True: try: - # A. Record and Transcribe + # 1. Voice Capture audio_data = record_audio() - print("[TRANSCRIBING]...") segments, _ = model.transcribe(audio_data.flatten(), beam_size=5) user_text = "".join([s.text for s in segments]).strip() - if not user_text: continue print(f"\nYOU: {user_text}") messages.append({'role': 'user', 'content': user_text}) - # B. Get AI Response (Strict Temperature 0 for reliability) - response = ollama.chat( - model=args.model, - messages=messages, - tools=[run_terminal_command], - options={'temperature': 0} - ) + # 2. AI Interaction Loop (Supports up to 3 retries if rejected) + for attempt in range(3): + response = ollama.chat( + model=args.model, + messages=messages, + tools=[run_terminal_command], + options={'temperature': 0} + ) - # C. Detect Tool Calls (Handle both formal calls and raw JSON text) - tool_calls = response.message.tool_calls + tool_calls = response.message.tool_calls - # REPAIR LOGIC: If AI "talks" in JSON instead of using the tool field - if not tool_calls and '"run_terminal_command"' in response.message.content: - import json - try: - content = response.message.content - # Extract JSON block from text - start, end = content.find('{'), content.rfind('}') + 1 - raw_json = json.loads(content[start:end]) - # Reconstruct as a tool call format - tool_calls = [{'function': { - 'name': 'run_terminal_command', - 'arguments': raw_json.get('arguments', raw_json) - }}] - except: - pass + # Fallback Repair: Catch raw JSON output + if not tool_calls and '"run_terminal_command"' in response.message.content: + try: + c = response.message.content + start, end = c.find('{'), c.rfind('}') + 1 + raw_json = json.loads(c[start:end]) + tool_calls = [{'function': { + 'name': 'run_terminal_command', + 'arguments': raw_json.get('arguments', raw_json) + }}] + except: + pass - # D. Execute Tools if found - if tool_calls: - for tool_call in tool_calls: - # Parse arguments based on format (official object vs dictionary) - if hasattr(tool_call, 'function'): - func_args = tool_call.function.arguments - else: - func_args = tool_call['function']['arguments'] + # 3. Execution Logic + if tool_calls: + call = tool_calls[0] + # Normalize arguments format + f_args = call.function.arguments if hasattr( + call, 'function') else call['function']['arguments'] - # Run the terminal command locally - result = run_terminal_command(func_args['command']) + result = run_terminal_command(f_args['command']) - # Add result back to history so AI can see it + # Update History messages.append(response.message) messages.append({'role': 'tool', 'content': result}) - # Get the final "Human" explanation from AI - final_response = ollama.chat( - model=args.model, messages=messages) - print(f"AI: {final_response.message.content}") - messages.append(final_response.message) - else: - # Normal Chatting - print(f"AI: {response.message.content}") - messages.append(response.message) + # If REJECTED, the loop continues and the AI sees the feedback + if "USER_REJECTION" in result or "USER_FEEDBACK" in result: + print("[RETHINKING] AI is adjusting the command...") + continue + else: + # Success: Let AI explain the result + final_res = ollama.chat( + model=args.model, messages=messages) + print(f"AI: {final_res.message.content}") + messages.append(final_res.message) + break + else: + # Normal Chat + print(f"AI: {response.message.content}") + messages.append(response.message) + break except KeyboardInterrupt: - print("\nExiting Assistant...") + print("\nExiting...") break except Exception as e: print(f"System Error: {e}")