"""Voice-to-type: transcribe microphone speech and type it into the focused window.

Audio frames arrive on ``audio_queue`` (filled by ``audio_callback`` from
``sttlib``), are grouped into utterances by ``VADProcessor``, transcribed with
the loaded Whisper model, and typed with ``xdotool``.
"""

import argparse
import queue
import subprocess
import sys
import threading
import time

import sounddevice as sd

from sttlib import (
    load_whisper_model, transcribe, is_hallucination, pcm_bytes_to_float32,
    VADProcessor, audio_callback, audio_queue,
    SAMPLE_RATE, CHANNELS, FRAME_SIZE,
)

# Characters ignored at the very end of a transcript when looking for the
# submit word (the transcriber may append punctuation to it).
_TRAILING_PUNCTUATION = ".,!? "


# --- Typer Interface (xdotool) ---

class Typer:
    """Types text into the currently focused window by invoking ``xdotool``."""

    def __init__(self, submit=False):
        """Create a typer.

        Args:
            submit: When true, press Return after every typed utterance.
        """
        self.submit = submit

    def start(self):
        """Verify that ``xdotool`` is installed and runnable; exit(1) otherwise."""
        try:
            subprocess.run(["xdotool", "version"], capture_output=True, check=True)
        except FileNotFoundError:
            print("ERROR: xdotool not found. Install it:")
            print(" sudo apt-get install xdotool")
            sys.exit(1)
        except subprocess.CalledProcessError as e:
            # check=True raises on a non-zero exit; report it instead of
            # dying with a traceback.
            print(f"ERROR: xdotool is installed but failed to run: {e}")
            sys.exit(1)

    def type_text(self, text, submit_now=False):
        """Type ``text`` and optionally press Return.

        Args:
            text: The text to type.
            submit_now: Press Return after typing even if ``self.submit`` is off.

        A failing xdotool call is reported on stdout and otherwise ignored so
        that one bad utterance does not end the session.
        """
        try:
            subprocess.run(
                # "--" stops option parsing so text starting with "-" is typed.
                ["xdotool", "type", "--clearmodifiers", "--delay", "0", "--", text],
                check=True,
            )
            if self.submit or submit_now:
                # Short pause between typing and sending Return.
                time.sleep(0.1)
                subprocess.run(
                    ["xdotool", "key", "--clearmodifiers", "Return"],
                    check=True,
                )
        except subprocess.CalledProcessError as e:
            print(f"\n [xdotool error: {e}]", end="", flush=True)

    def stop(self):
        """Release resources. Nothing to do for the xdotool backend."""
        pass


# --- CLI ---

def parse_args():
    """Parse command-line options and return the ``argparse.Namespace``."""
    parser = argparse.ArgumentParser(
        description="Voice-to-type: speak and type into any focused window via xdotool"
    )
    parser.add_argument(
        "--submit", action="store_true",
        help="Auto-press Enter after typing (default: off)"
    )
    parser.add_argument(
        "--silence-threshold", type=float, default=0.8,
        help="Seconds of silence to end an utterance (default: 0.8)"
    )
    parser.add_argument(
        "--submit-word", type=str, default="full stop",
        help="Magic word at end of utterance to auto-submit (default: 'full stop')"
    )
    parser.add_argument(
        "--model-size", type=str, default="medium",
        choices=["tiny", "base", "small", "medium", "large-v3"],
        help="Whisper model size (default: medium)"
    )
    parser.add_argument(
        "--vad-aggressiveness", type=int, default=3, choices=[0, 1, 2, 3],
        help="webrtcvad aggressiveness 0-3, higher filters more noise (default: 3)"
    )
    parser.add_argument(
        "--device", type=int, default=None,
        help="Audio input device index (use 'python -m sounddevice' to list)"
    )
    return parser.parse_args()


# --- Helpers ---

def _split_submit_word(text, submit_word):
    """Detect and remove a trailing submit word.

    The match is case-insensitive, ignores trailing punctuation, and requires
    the submit word to start on a word boundary, so "beautiful stop" does not
    trigger the default "full stop".

    Args:
        text: The transcript of one utterance.
        submit_word: The configured magic word; blank disables detection.

    Returns:
        ``(text, submit_now)``. When the submit word is found, ``text`` is the
        transcript with the word and surrounding " ,." removed and
        ``submit_now`` is True; otherwise ``text`` is returned unchanged.
    """
    word = submit_word.lower().rstrip(_TRAILING_PUNCTUATION).strip()
    if not word:
        # An empty word would match every utterance.
        return text, False

    stripped = text.rstrip(_TRAILING_PUNCTUATION)
    cut = len(stripped) - len(word)
    # Slice the original string rather than indexing via text.lower(), whose
    # length can differ from text for some characters.
    if cut < 0 or stripped[cut:].lower() != word:
        return text, False
    if cut > 0 and stripped[cut - 1].isalnum():
        # The match begins in the middle of a longer word.
        return text, False
    return stripped[:cut].rstrip(" ,."), True


def _wait_for_enter(stop_event):
    """Block until Enter (or end of input) on stdin, then set ``stop_event``."""
    try:
        input()
    except EOFError:
        pass
    finally:
        # Always signal, otherwise the session loop would never terminate.
        stop_event.set()


def _drain_audio_queue():
    """Discard any frames still waiting in ``audio_queue``."""
    while True:
        try:
            audio_queue.get_nowait()
        except queue.Empty:
            return


def _handle_utterance(utterance_bytes, whisper_model, typer, submit_word):
    """Transcribe one utterance and type the result, if any."""
    audio_float32 = pcm_bytes_to_float32(utterance_bytes)
    text = transcribe(whisper_model, audio_float32)
    if not text or is_hallucination(text):
        return

    text, submit_now = _split_submit_word(text, submit_word)
    if not text:
        # NOTE: an utterance consisting only of the submit word types nothing
        # and does not press Return (unchanged from the previous behavior).
        return

    marker = " [SUBMIT]" if submit_now else ""
    print(f"\n >> \"{text}\"{marker}")
    typer.type_text(text, submit_now=submit_now)


# --- Main ---

def main():
    """Run the interactive loop: Enter starts a session, Enter ends it, Ctrl+C quits."""
    args = parse_args()

    # Check for xdotool before loading the model so a missing binary is
    # reported immediately.
    typer = Typer(submit=args.submit)
    typer.start()

    whisper_model = load_whisper_model(args.model_size)
    vad = VADProcessor(args.vad_aggressiveness, args.silence_threshold)

    print("=== Voice-to-Type (xdotool) ===")
    print(f" Model: {args.model_size}")
    print(f" Silence threshold: {args.silence_threshold}s")
    submit_info = "ON (always)" if args.submit else f'OFF (say "{args.submit_word}" to submit)'
    print(f" Submit mode: {submit_info}")
    print(f" VAD aggressiveness: {args.vad_aggressiveness}")

    try:
        while True:
            print("\n[SESSION] Press Enter to start listening (Ctrl+C to quit)...")
            input()
            print("[LISTENING] Speak now. Press Enter to stop session.")
            print(" Waiting for speech...", end="", flush=True)

            # Open and start the stream BEFORE spawning the Enter watcher: if
            # the device fails we go back to the prompt, and a watcher thread
            # left blocked on stdin would steal the next Enter.
            stream = None
            try:
                stream = sd.InputStream(
                    samplerate=SAMPLE_RATE,
                    channels=CHANNELS,
                    dtype="int16",
                    blocksize=FRAME_SIZE,
                    callback=audio_callback,
                    device=args.device,
                )
                stream.start()
            except sd.PortAudioError as e:
                if stream is not None:
                    stream.close()
                print(f"\nAudio device error: {e}")
                print("Available devices:")
                print(sd.query_devices())
                continue

            stop_event = threading.Event()
            enter_thread = threading.Thread(
                target=_wait_for_enter, args=(stop_event,), daemon=True
            )
            enter_thread.start()

            try:
                while not stop_event.is_set():
                    try:
                        # Short timeout so the stop event is polled regularly.
                        frame_bytes = audio_queue.get(timeout=0.1)
                    except queue.Empty:
                        continue

                    # process_frame yields a value only once an utterance is
                    # complete; None means keep feeding frames.
                    utterance_bytes = vad.process_frame(frame_bytes)
                    if utterance_bytes is not None:
                        _handle_utterance(
                            utterance_bytes, whisper_model, typer, args.submit_word
                        )
            finally:
                stream.stop()
                stream.close()
                # Drop leftover frames and VAD state so the next session
                # starts clean.
                _drain_audio_queue()
                vad.reset()

            print("\n[SESSION ENDED]")

    except (KeyboardInterrupt, EOFError):
        # EOFError: stdin was closed at the session prompt; treat it as quit.
        print("\nShutting down...")
    finally:
        typer.stop()
        print("Goodbye.")


if __name__ == "__main__":
    main()