Files
Code/python/tool-speechtotext/voice_to_xdotool.py
local 104da381fb Refactor tool-speechtotext: extract sttlib shared library and add tests
Extract duplicated code (Whisper loading, audio recording, transcription,
VAD processing) into reusable sttlib/ package. Rewrite all 3 scripts as
thin wrappers. Add 24 unit tests with mocked hardware. Fix GPU fallback
bug in assistant.py and args.system assignment bug.
2026-02-08 00:40:31 +00:00

170 lines
5.7 KiB
Python

import argparse
import subprocess
import threading
import queue
import time
import sounddevice as sd
from sttlib import (
load_whisper_model, transcribe, is_hallucination, pcm_bytes_to_float32,
VADProcessor, audio_callback, audio_queue,
SAMPLE_RATE, CHANNELS, FRAME_SIZE,
)
# --- Typer Interface (xdotool) ---
class Typer:
    """Types transcribed text into the currently focused X11 window via xdotool."""

    def __init__(self, submit=False):
        # When True, every typed utterance is followed by a Return keypress.
        self.submit = submit

    def start(self):
        """Verify xdotool is runnable; exit with an install hint otherwise.

        Catches CalledProcessError as well as FileNotFoundError: a present
        but broken xdotool binary (non-zero exit from `xdotool version`)
        previously escaped as an unhandled traceback instead of showing
        the guidance message.
        """
        try:
            subprocess.run(["xdotool", "version"], capture_output=True, check=True)
        except (FileNotFoundError, subprocess.CalledProcessError):
            print("ERROR: xdotool not found. Install it:")
            print(" sudo apt-get install xdotool")
            import sys
            sys.exit(1)

    def type_text(self, text, submit_now=False):
        """Type `text` into the focused window; optionally press Return.

        xdotool failures are reported inline (not raised) so one failed
        keystroke burst cannot kill the listening loop.
        """
        try:
            subprocess.run(
                ["xdotool", "type", "--clearmodifiers", "--delay", "0", "--", text],
                check=True,
            )
            if self.submit or submit_now:
                # Brief pause so the target app consumes the typed text
                # before the Return key arrives.
                time.sleep(0.1)
                subprocess.run(
                    ["xdotool", "key", "--clearmodifiers", "Return"],
                    check=True,
                )
        except subprocess.CalledProcessError as e:
            print(f"\n [xdotool error: {e}]", end="", flush=True)

    def stop(self):
        """No teardown required; kept for interface symmetry with start()."""
        pass
# --- CLI ---
def parse_args(argv=None):
    """Parse command-line options for the voice-to-type tool.

    Args:
        argv: Optional list of argument strings to parse. The default of
            None makes argparse read sys.argv, preserving the original
            zero-argument call signature for existing callers while
            allowing programmatic/testing use.

    Returns:
        argparse.Namespace with submit, silence_threshold, submit_word,
        model_size, vad_aggressiveness, and device attributes.
    """
    parser = argparse.ArgumentParser(
        description="Voice-to-type: speak and type into any focused window via xdotool"
    )
    parser.add_argument(
        "--submit", action="store_true",
        help="Auto-press Enter after typing (default: off)"
    )
    parser.add_argument(
        "--silence-threshold", type=float, default=0.8,
        help="Seconds of silence to end an utterance (default: 0.8)"
    )
    parser.add_argument(
        "--submit-word", type=str, default="full stop",
        help="Magic word at end of utterance to auto-submit (default: 'full stop')"
    )
    parser.add_argument(
        "--model-size", type=str, default="medium",
        choices=["tiny", "base", "small", "medium", "large-v3"],
        help="Whisper model size (default: medium)"
    )
    parser.add_argument(
        "--vad-aggressiveness", type=int, default=3, choices=[0, 1, 2, 3],
        help="webrtcvad aggressiveness 0-3, higher filters more noise (default: 3)"
    )
    parser.add_argument(
        "--device", type=int, default=None,
        help="Audio input device index (use 'python -m sounddevice' to list)"
    )
    return parser.parse_args(argv)
# --- Main ---
def main():
    """Entry point: loop over listening sessions, typing each transcription.

    Flow per session: block on Enter, open a sounddevice input stream that
    feeds the shared `audio_queue` via `audio_callback`, then drain the
    queue through the VAD until the user presses Enter again (watched by a
    daemon thread). Each completed utterance is transcribed with Whisper
    and, unless flagged as a hallucination, typed into the focused window.
    Ctrl+C exits the program cleanly.
    """
    args = parse_args()
    # Heavy setup up front: Whisper weights are loaded once per process.
    whisper_model = load_whisper_model(args.model_size)
    vad = VADProcessor(args.vad_aggressiveness, args.silence_threshold)
    typer = Typer(submit=args.submit)
    typer.start()  # exits immediately if xdotool is unavailable
    print("=== Voice-to-Type (xdotool) ===")
    print(f" Model: {args.model_size}")
    print(f" Silence threshold: {args.silence_threshold}s")
    submit_info = "ON (always)" if args.submit else f'OFF (say "{args.submit_word}" to submit)'
    print(f" Submit mode: {submit_info}")
    print(f" VAD aggressiveness: {args.vad_aggressiveness}")
    try:
        while True:
            print("\n[SESSION] Press Enter to start listening (Ctrl+C to quit)...")
            input()
            print("[LISTENING] Speak now. Press Enter to stop session.")
            print(" Waiting for speech...", end="", flush=True)
            # A daemon thread blocks on stdin so the audio loop below can
            # poll the queue; setting the event ends the session.
            stop_event = threading.Event()
            def wait_for_enter():
                input()
                stop_event.set()
            enter_thread = threading.Thread(target=wait_for_enter, daemon=True)
            enter_thread.start()
            try:
                stream = sd.InputStream(
                    samplerate=SAMPLE_RATE,
                    channels=CHANNELS,
                    dtype="int16",
                    blocksize=FRAME_SIZE,
                    callback=audio_callback,
                    device=args.device,
                )
            except sd.PortAudioError as e:
                # Bad/missing device: list what is available and return to
                # the session prompt instead of crashing.
                print(f"\nAudio device error: {e}")
                print("Available devices:")
                print(sd.query_devices())
                continue
            stream.start()
            try:
                while not stop_event.is_set():
                    try:
                        # Short timeout keeps the loop responsive to stop_event.
                        frame_bytes = audio_queue.get(timeout=0.1)
                    except queue.Empty:
                        continue
                    # The VAD buffers frames and returns a complete utterance
                    # (raw PCM bytes) once enough trailing silence is seen;
                    # returns None while an utterance is still in progress.
                    utterance_bytes = vad.process_frame(frame_bytes)
                    if utterance_bytes is not None:
                        audio_float32 = pcm_bytes_to_float32(utterance_bytes)
                        text = transcribe(whisper_model, audio_float32)
                        if text and not is_hallucination(text):
                            submit_now = False
                            # A spoken magic word (default "full stop") at the
                            # end of an utterance triggers an Enter press; the
                            # word and trailing punctuation are stripped first.
                            if text.lower().rstrip(".,!? ").endswith(args.submit_word):
                                text = text[:text.lower().rfind(args.submit_word)].rstrip(" ,.")
                                submit_now = True
                            if text:
                                marker = " [SUBMIT]" if submit_now else ""
                                print(f"\n >> \"{text}\"{marker}")
                                typer.type_text(text, submit_now=submit_now)
            finally:
                # Always release the audio device and drop any queued frames
                # so stale audio cannot bleed into the next session.
                stream.stop()
                stream.close()
                while not audio_queue.empty():
                    try:
                        audio_queue.get_nowait()
                    except queue.Empty:
                        break
                vad.reset()
                print("\n[SESSION ENDED]")
    except KeyboardInterrupt:
        print("\nShutting down...")
    finally:
        typer.stop()
        print("Goodbye.")


if __name__ == "__main__":
    main()