Files
Code/python/tool-speechtotext/voice_to_xdotool.py
local 104da381fb Refactor tool-speechtotext: extract sttlib shared library and add tests
Extract duplicated code (Whisper loading, audio recording, transcription,
VAD processing) into reusable sttlib/ package. Rewrite all 3 scripts as
thin wrappers. Add 24 unit tests with mocked hardware. Fix GPU fallback
bug in assistant.py and args.system assignment bug.
2026-02-08 00:40:31 +00:00

170 lines
5.7 KiB
Python

import argparse
import subprocess
import threading
import queue
import time
import sounddevice as sd
from sttlib import (
load_whisper_model, transcribe, is_hallucination, pcm_bytes_to_float32,
VADProcessor, audio_callback, audio_queue,
SAMPLE_RATE, CHANNELS, FRAME_SIZE,
)
# --- Typer Interface (xdotool) ---
class Typer:
    """Types transcribed text into the currently focused X11 window via xdotool."""

    def __init__(self, submit=False):
        # When True, every typed utterance is followed by a Return keypress.
        self.submit = submit

    def start(self):
        """Verify xdotool is runnable; exit with an install hint otherwise.

        Catches CalledProcessError as well as FileNotFoundError: a present
        but broken xdotool binary (non-zero exit from `xdotool version`)
        previously escaped as an unhandled traceback instead of showing
        the guidance message.
        """
        try:
            subprocess.run(["xdotool", "version"], capture_output=True, check=True)
        except (FileNotFoundError, subprocess.CalledProcessError):
            print("ERROR: xdotool not found. Install it:")
            print(" sudo apt-get install xdotool")
            import sys
            sys.exit(1)

    def type_text(self, text, submit_now=False):
        """Type `text` into the focused window; optionally press Return.

        xdotool failures are reported inline (not raised) so one failed
        keystroke burst cannot kill the listening loop.
        """
        try:
            subprocess.run(
                ["xdotool", "type", "--clearmodifiers", "--delay", "0", "--", text],
                check=True,
            )
            if self.submit or submit_now:
                # Brief pause so the target app consumes the typed text
                # before the Return key arrives.
                time.sleep(0.1)
                subprocess.run(
                    ["xdotool", "key", "--clearmodifiers", "Return"],
                    check=True,
                )
        except subprocess.CalledProcessError as e:
            print(f"\n [xdotool error: {e}]", end="", flush=True)

    def stop(self):
        """No teardown required; kept for interface symmetry with start()."""
        pass
# --- CLI ---
def parse_args(argv=None):
    """Parse command-line options for the voice-to-type tool.

    Args:
        argv: Optional list of argument strings to parse. The default of
            None makes argparse read sys.argv, preserving the original
            zero-argument call signature for existing callers while
            allowing programmatic/testing use.

    Returns:
        argparse.Namespace with submit, silence_threshold, submit_word,
        model_size, vad_aggressiveness, and device attributes.
    """
    parser = argparse.ArgumentParser(
        description="Voice-to-type: speak and type into any focused window via xdotool"
    )
    parser.add_argument(
        "--submit", action="store_true",
        help="Auto-press Enter after typing (default: off)"
    )
    parser.add_argument(
        "--silence-threshold", type=float, default=0.8,
        help="Seconds of silence to end an utterance (default: 0.8)"
    )
    parser.add_argument(
        "--submit-word", type=str, default="full stop",
        help="Magic word at end of utterance to auto-submit (default: 'full stop')"
    )
    parser.add_argument(
        "--model-size", type=str, default="medium",
        choices=["tiny", "base", "small", "medium", "large-v3"],
        help="Whisper model size (default: medium)"
    )
    parser.add_argument(
        "--vad-aggressiveness", type=int, default=3, choices=[0, 1, 2, 3],
        help="webrtcvad aggressiveness 0-3, higher filters more noise (default: 3)"
    )
    parser.add_argument(
        "--device", type=int, default=None,
        help="Audio input device index (use 'python -m sounddevice' to list)"
    )
    return parser.parse_args(argv)
# --- Main ---
def main():
    """Entry point: loop over listening sessions, typing each transcription.

    Flow per session: block on Enter, open a sounddevice input stream that
    feeds the shared `audio_queue` via `audio_callback`, then drain the
    queue through the VAD until the user presses Enter again (watched by a
    daemon thread). Each completed utterance is transcribed with Whisper
    and, unless flagged as a hallucination, typed into the focused window.
    Ctrl+C exits the program cleanly.
    """
    args = parse_args()
    # Heavy setup up front: Whisper weights are loaded once per process.
    whisper_model = load_whisper_model(args.model_size)
    vad = VADProcessor(args.vad_aggressiveness, args.silence_threshold)
    typer = Typer(submit=args.submit)
    typer.start()  # exits immediately if xdotool is unavailable
    print("=== Voice-to-Type (xdotool) ===")
    print(f" Model: {args.model_size}")
    print(f" Silence threshold: {args.silence_threshold}s")
    submit_info = "ON (always)" if args.submit else f'OFF (say "{args.submit_word}" to submit)'
    print(f" Submit mode: {submit_info}")
    print(f" VAD aggressiveness: {args.vad_aggressiveness}")
    try:
        while True:
            print("\n[SESSION] Press Enter to start listening (Ctrl+C to quit)...")
            input()
            print("[LISTENING] Speak now. Press Enter to stop session.")
            print(" Waiting for speech...", end="", flush=True)
            # A daemon thread blocks on stdin so the audio loop below can
            # poll the queue; setting the event ends the session.
            stop_event = threading.Event()
            def wait_for_enter():
                input()
                stop_event.set()
            enter_thread = threading.Thread(target=wait_for_enter, daemon=True)
            enter_thread.start()
            try:
                stream = sd.InputStream(
                    samplerate=SAMPLE_RATE,
                    channels=CHANNELS,
                    dtype="int16",
                    blocksize=FRAME_SIZE,
                    callback=audio_callback,
                    device=args.device,
                )
            except sd.PortAudioError as e:
                # Bad/missing device: list what is available and return to
                # the session prompt instead of crashing.
                print(f"\nAudio device error: {e}")
                print("Available devices:")
                print(sd.query_devices())
                continue
            stream.start()
            try:
                while not stop_event.is_set():
                    try:
                        # Short timeout keeps the loop responsive to stop_event.
                        frame_bytes = audio_queue.get(timeout=0.1)
                    except queue.Empty:
                        continue
                    # The VAD buffers frames and returns a complete utterance
                    # (raw PCM bytes) once enough trailing silence is seen;
                    # returns None while an utterance is still in progress.
                    utterance_bytes = vad.process_frame(frame_bytes)
                    if utterance_bytes is not None:
                        audio_float32 = pcm_bytes_to_float32(utterance_bytes)
                        text = transcribe(whisper_model, audio_float32)
                        if text and not is_hallucination(text):
                            submit_now = False
                            # A spoken magic word (default "full stop") at the
                            # end of an utterance triggers an Enter press; the
                            # word and trailing punctuation are stripped first.
                            if text.lower().rstrip(".,!? ").endswith(args.submit_word):
                                text = text[:text.lower().rfind(args.submit_word)].rstrip(" ,.")
                                submit_now = True
                            if text:
                                marker = " [SUBMIT]" if submit_now else ""
                                print(f"\n >> \"{text}\"{marker}")
                                typer.type_text(text, submit_now=submit_now)
            finally:
                # Always release the audio device and drop any queued frames
                # so stale audio cannot bleed into the next session.
                stream.stop()
                stream.close()
                while not audio_queue.empty():
                    try:
                        audio_queue.get_nowait()
                    except queue.Empty:
                        break
                vad.reset()
                print("\n[SESSION ENDED]")
    except KeyboardInterrupt:
        print("\nShutting down...")
    finally:
        typer.stop()
        print("Goodbye.")


if __name__ == "__main__":
    main()