Files
Code/python/tool-speechtotext/voice_to_xdotool.py
local 848681087e Add voice-to-xdotool: hands-free speech typing via VAD + Whisper + xdotool
New tool that uses webrtcvad for voice activity detection, faster-whisper
for transcription, and xdotool to type into any focused window. Supports
session-based listening, configurable silence threshold, and a "full stop"
magic word to auto-submit.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-06 23:37:14 +00:00

264 lines
8.7 KiB
Python

import sounddevice as sd
import numpy as np
import webrtcvad
import subprocess
import sys
import os
import argparse
import threading
import queue
import collections
import time
from faster_whisper import WhisperModel
# Presumably consumed by CTranslate2 (the "CT2_" prefix) to permit FP16 on CUDA.
# NOTE(review): this is set after `faster_whisper` has been imported — confirm
# the backend reads the variable lazily rather than at import time.
os.environ["CT2_CUDA_ALLOW_FP16"] = "1"

# --- Constants ---
SAMPLE_RATE = 16000        # Hz, passed to both the input stream and the VAD
CHANNELS = 1               # mono capture
FRAME_DURATION_MS = 30     # length of one VAD frame
FRAME_SIZE = SAMPLE_RATE * FRAME_DURATION_MS // 1000  # 480 samples per frame
MIN_UTTERANCE_FRAMES = 10  # ~300ms minimum to filter coughs/clicks

# Lowercase phrases that is_hallucination() matches against lowercased
# transcripts to decide whether an utterance should be discarded.
HALLUCINATION_PATTERNS = [
    "thank you",
    "thanks for watching",
    "subscribe",
    "bye",
    "the end",
    "thank you for watching",
    "please subscribe",
    "like and subscribe",
]

# --- Thread-safe audio queue ---
# Filled by audio_callback() and drained by the listening loop in main().
audio_queue = queue.Queue()
def audio_callback(indata, frames, time_info, status):
    """Stream callback: forward one captured block to ``audio_queue``.

    Registered as the ``callback=`` of the ``sd.InputStream`` created in
    ``main``. ``frames`` and ``time_info`` are accepted to satisfy the
    callback signature but are not used.

    Args:
        indata: The captured audio block; converted with ``bytes()`` before
            queueing (presumably an int16 buffer, per the stream's
            ``dtype="int16"`` — confirm against the sounddevice docs).
        frames: Unused.
        time_info: Unused.
        status: Truthy when the stream reports a problem; echoed to stderr.
    """
    if status:
        print(status, file=sys.stderr)
    pcm_chunk = bytes(indata)
    audio_queue.put(pcm_chunk)
# --- Whisper model loading (reused pattern from assistant.py) ---
def load_whisper_model(model_size):
    """Create a ``WhisperModel``, preferring CUDA and falling back to CPU.

    The first attempt uses ``device="cuda"`` with ``compute_type="float16"``.
    If that raises for any reason, the error is printed and the model is
    built on the CPU with ``compute_type="int8"`` instead. An exception from
    the CPU attempt is not caught here.

    Args:
        model_size: Model identifier handed straight to ``WhisperModel``.

    Returns:
        The constructed ``WhisperModel`` instance.
    """
    print(f"Loading Whisper model ({model_size})...")
    try:
        model = WhisperModel(model_size, device="cuda", compute_type="float16")
    except Exception as gpu_error:
        # Deliberately broad: any GPU-side failure should trigger the fallback.
        print(f"GPU loading failed: {gpu_error}")
        print("Falling back to CPU (int8)")
        model = WhisperModel(model_size, device="cpu", compute_type="int8")
    return model
# --- VAD State Machine ---
class VADProcessor:
    """Group fixed-size PCM frames into utterances using webrtcvad.

    The processor is a two-state machine:

    * idle (``triggered`` is False): frames are kept in a short rolling
      pre-roll buffer until the VAD reports speech;
    * triggered: every frame is appended to the current utterance until the
      accumulated trailing silence reaches ``silence_threshold`` seconds.
    """

    def __init__(self, aggressiveness, silence_threshold):
        """Build the VAD and start in the idle state.

        Args:
            aggressiveness: Value passed to ``webrtcvad.Vad``.
            silence_threshold: Seconds of continuous non-speech that end an
                utterance.
        """
        self.vad = webrtcvad.Vad(aggressiveness)
        self.silence_threshold = silence_threshold
        self.reset()

    def reset(self):
        """Return to the idle state, discarding buffered audio and pre-roll."""
        self.triggered = False
        self.utterance_frames = []
        self.silence_duration = 0.0
        self.pre_buffer = collections.deque(maxlen=10)  # ~300ms pre-roll

    def process_frame(self, frame_bytes):
        """Process one 30ms frame. Returns utterance bytes when complete, else None.

        Args:
            frame_bytes: Raw PCM for a single frame, passed to
                ``vad.is_speech`` together with ``SAMPLE_RATE``.

        Returns:
            The concatenated bytes of the finished utterance (pre-roll,
            speech and trailing silence), or ``None`` while the utterance is
            still in progress or when it was discarded as too short.
        """
        is_speech = self.vad.is_speech(frame_bytes, SAMPLE_RATE)

        if not self.triggered:
            self.pre_buffer.append(frame_bytes)
            if is_speech:
                self.triggered = True
                self.silence_duration = 0.0
                # The pre-roll buffer already ends with the current frame, so
                # copying it is enough; appending frame_bytes again would
                # duplicate 30 ms of audio at the start of the utterance.
                self.utterance_frames = list(self.pre_buffer)
            return None

        self.utterance_frames.append(frame_bytes)
        if is_speech:
            self.silence_duration = 0.0
            return None

        self.silence_duration += FRAME_DURATION_MS / 1000.0
        if self.silence_duration < self.silence_threshold:
            return None

        # Enough trailing silence: the utterance is over.
        frames = self.utterance_frames
        self.reset()
        # NOTE(review): the frame count includes the pre-roll and the
        # trailing-silence frames, so this guard only trips when
        # silence_threshold is small; it does not measure speech length.
        if len(frames) < MIN_UTTERANCE_FRAMES:
            return None
        return b"".join(frames)
# --- Typer Interface (xdotool) ---
class Typer:
    """Types text into the focused window by shelling out to ``xdotool``."""

    def __init__(self, submit=False):
        """Remember whether Return should follow every typed text.

        Args:
            submit: When true, ``type_text`` always presses Return afterwards.
        """
        self.submit = submit

    def start(self):
        """Check that ``xdotool`` can be executed; exit with status 1 if it is missing."""
        probe_cmd = ["xdotool", "version"]
        try:
            subprocess.run(probe_cmd, capture_output=True, check=True)
        except FileNotFoundError:
            print("ERROR: xdotool not found. Install it:")
            print(" sudo apt-get install xdotool")
            sys.exit(1)

    def type_text(self, text, submit_now=False):
        """Type ``text`` and optionally press Return.

        Return is sent when either the instance-wide ``submit`` flag or
        ``submit_now`` is set. A non-zero exit status from ``xdotool`` is
        reported on stdout instead of being raised.

        Args:
            text: The string to type; passed after ``--`` so it is never
                parsed as an option.
            submit_now: Press Return after this particular text.
        """
        type_cmd = ["xdotool", "type", "--clearmodifiers", "--delay", "0", "--", text]
        return_cmd = ["xdotool", "key", "--clearmodifiers", "Return"]
        press_return = self.submit or submit_now
        try:
            subprocess.run(type_cmd, check=True)
            if not press_return:
                return
            # Short pause between typing and the Return key press.
            time.sleep(0.1)
            subprocess.run(return_cmd, check=True)
        except subprocess.CalledProcessError as err:
            print(f"\n [xdotool error: {err}]", end="", flush=True)

    def stop(self):
        """Nothing to release; present so callers can pair it with ``start``."""
        return None
# --- Helpers ---
def pcm_bytes_to_float32(pcm_bytes):
    """Convert raw 16-bit PCM bytes into a float32 array scaled by 1/32768.

    Args:
        pcm_bytes: Buffer interpreted as native-endian ``int16`` samples.

    Returns:
        A new ``float32`` NumPy array with one element per input sample; the
        int16 range maps onto [-1.0, 1.0).
    """
    samples = np.frombuffer(pcm_bytes, dtype=np.int16)
    scaled = samples.astype(np.float32)  # astype copies, so in-place division is safe
    scaled /= 32768.0
    return scaled
def transcribe(model, audio_float32):
    """Run ``model.transcribe`` on the audio and return the joined text.

    Args:
        model: Object exposing ``transcribe(audio, beam_size=...)`` that
            returns a ``(segments, info)`` pair.
        audio_float32: Audio samples handed to the model unchanged.

    Returns:
        The ``text`` of every segment concatenated in order, with leading
        and trailing whitespace removed.
    """
    segments, _info = model.transcribe(audio_float32, beam_size=5)
    pieces = [segment.text for segment in segments]
    return "".join(pieces).strip()
def is_hallucination(text, patterns=None):
    """Decide whether a transcript looks like a filler hallucination.

    A transcript is rejected when it is shorter than three characters, or
    when it consists *only* of known filler phrases. The text is split on
    sentence punctuation and every non-empty fragment must equal one of the
    phrases, so "Thank you. Bye!" is rejected while real dictation that
    merely contains such words ("see you at the end of the week",
    "goodbye") is kept. Plain substring matching would drop those silently.

    Args:
        text: Transcript to inspect.
        patterns: Lowercase phrases to treat as hallucinations. Defaults to
            the module-level ``HALLUCINATION_PATTERNS``.

    Returns:
        True if the transcript should be discarded, False otherwise.
    """
    if patterns is None:
        patterns = HALLUCINATION_PATTERNS

    lowered = text.lower().strip()
    if len(lowered) < 3:
        return True

    # Normalise every sentence separator to "." so a single split suffices.
    separators = str.maketrans(",!?;:\n", "......")
    fragments = [
        piece.strip(" \t'\"-")
        for piece in lowered.translate(separators).split(".")
    ]
    fragments = [piece for piece in fragments if piece]

    # Punctuation-only text has no fragments and is not treated as filler.
    return bool(fragments) and all(piece in patterns for piece in fragments)
# --- CLI ---
def _silence_seconds(raw):
    """argparse ``type`` for --silence-threshold: a finite, non-negative float."""
    try:
        value = float(raw)
    except ValueError:
        raise argparse.ArgumentTypeError(f"not a number: {raw!r}") from None
    # The chained comparison is False for nan and inf as well as negatives;
    # nan/inf would make the end-of-utterance comparison never succeed.
    if not 0 <= value < float("inf"):
        raise argparse.ArgumentTypeError(
            f"must be a finite, non-negative number of seconds: {raw!r}"
        )
    return value


def parse_args(argv=None):
    """Parse the command-line options of the voice-to-type tool.

    Args:
        argv: Optional list of argument strings. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``, as before.

    Returns:
        An ``argparse.Namespace`` with ``submit``, ``silence_threshold``,
        ``submit_word``, ``model_size``, ``vad_aggressiveness`` and
        ``device``.
    """
    parser = argparse.ArgumentParser(
        description="Voice-to-type: speak and type into any focused window via xdotool"
    )
    parser.add_argument(
        "--submit", action="store_true",
        help="Auto-press Enter after typing (default: off)",
    )
    parser.add_argument(
        "--silence-threshold", type=_silence_seconds, default=0.8,
        help="Seconds of silence to end an utterance (default: 0.8)",
    )
    parser.add_argument(
        "--submit-word", type=str, default="full stop",
        help="Magic word at end of utterance to auto-submit (default: 'full stop')",
    )
    parser.add_argument(
        "--model-size", type=str, default="medium",
        choices=["tiny", "base", "small", "medium", "large-v3"],
        help="Whisper model size (default: medium)",
    )
    parser.add_argument(
        "--vad-aggressiveness", type=int, default=3, choices=[0, 1, 2, 3],
        help="webrtcvad aggressiveness 0-3, higher filters more noise (default: 3)",
    )
    parser.add_argument(
        "--device", type=int, default=None,
        help="Audio input device index (use 'python -m sounddevice' to list)",
    )
    return parser.parse_args(argv)
# --- Main ---
def _split_submit_word(text, submit_word):
    """Detect and remove a trailing magic word; return ``(text, submit_now)``.

    ``submit_word`` must already be lowercased. The match ignores case and
    trailing ``.,!?`` / spaces, and must start at a word boundary so that a
    magic word such as "stop" does not fire on "nonstop". An empty
    ``submit_word`` disables the feature.
    """
    if not submit_word:
        return text, False
    spoken = text.lower().rstrip(".,!? ")
    if not spoken.endswith(submit_word):
        return text, False
    cut = len(spoken) - len(submit_word)
    if cut > 0 and spoken[cut - 1].isalnum():
        return text, False
    return text[:cut].rstrip(" ,."), True


def _drain_audio_queue():
    """Discard every frame still waiting in ``audio_queue``."""
    while True:
        try:
            audio_queue.get_nowait()
        except queue.Empty:
            return


def _run_session(args, whisper_model, vad, typer):
    """Capture, transcribe and type utterances until Enter is pressed."""
    # Open and start the stream BEFORE spawning the Enter watcher. If the
    # device fails we return to the session prompt, and a watcher thread
    # left blocked on input() would swallow the user's next Enter.
    stream = None
    try:
        stream = sd.InputStream(
            samplerate=SAMPLE_RATE,
            channels=CHANNELS,
            dtype="int16",
            blocksize=FRAME_SIZE,
            callback=audio_callback,
            device=args.device,
        )
        stream.start()
    except sd.PortAudioError as e:
        if stream is not None:
            stream.close()
        print(f"\nAudio device error: {e}")
        print("Available devices:")
        print(sd.query_devices())
        return

    # The transcript is lowercased before comparison, so the magic word
    # must be lowercased too or a capitalised --submit-word never matches.
    submit_word = args.submit_word.strip().lower()

    try:
        print("[LISTENING] Speak now. Press Enter to stop session.")
        print(" Waiting for speech...", end="", flush=True)

        stop_event = threading.Event()

        def wait_for_enter():
            input()
            stop_event.set()

        threading.Thread(target=wait_for_enter, daemon=True).start()

        while not stop_event.is_set():
            try:
                frame_bytes = audio_queue.get(timeout=0.1)
            except queue.Empty:
                continue

            utterance_bytes = vad.process_frame(frame_bytes)
            if utterance_bytes is None:
                continue

            audio_float32 = pcm_bytes_to_float32(utterance_bytes)
            text = transcribe(whisper_model, audio_float32)
            if not text or is_hallucination(text):
                continue

            text, submit_now = _split_submit_word(text, submit_word)
            if not text:
                # Only the magic word was spoken: nothing is typed and
                # nothing is submitted.
                continue

            marker = " [SUBMIT]" if submit_now else ""
            print(f"\n >> \"{text}\"{marker}")
            typer.type_text(text, submit_now=submit_now)
    finally:
        stream.stop()
        stream.close()
        # Drop frames captured after the session ended so they do not leak
        # into the next one, and return the VAD to its idle state.
        _drain_audio_queue()
        vad.reset()

    print("\n[SESSION ENDED]")


def main():
    """Entry point: load the model, then run listening sessions until Ctrl+C.

    Each session starts when the user presses Enter and ends when Enter is
    pressed again. Completed utterances are transcribed, filtered through
    ``is_hallucination`` and typed via ``Typer``.
    """
    args = parse_args()
    whisper_model = load_whisper_model(args.model_size)
    vad = VADProcessor(args.vad_aggressiveness, args.silence_threshold)
    typer = Typer(submit=args.submit)
    typer.start()

    print("=== Voice-to-Type (xdotool) ===")
    print(f" Model: {args.model_size}")
    print(f" Silence threshold: {args.silence_threshold}s")
    submit_info = "ON (always)" if args.submit else f'OFF (say "{args.submit_word}" to submit)'
    print(f" Submit mode: {submit_info}")
    print(f" VAD aggressiveness: {args.vad_aggressiveness}")

    try:
        while True:
            print("\n[SESSION] Press Enter to start listening (Ctrl+C to quit)...")
            input()
            _run_session(args, whisper_model, vad, typer)
    except KeyboardInterrupt:
        print("\nShutting down...")
    finally:
        typer.stop()
        print("Goodbye.")


if __name__ == "__main__":
    main()