"""Voice-to-type: dictate into whichever window has keyboard focus.

Audio is captured with sounddevice, split into utterances with webrtcvad,
transcribed with faster-whisper and typed with xdotool.
"""

import argparse
import collections
import os
import queue
import subprocess
import sys
import threading
import time

import numpy as np
import sounddevice as sd
import webrtcvad
from faster_whisper import WhisperModel

os.environ["CT2_CUDA_ALLOW_FP16"] = "1"

# --- Constants ---
SAMPLE_RATE = 16000
CHANNELS = 1
FRAME_DURATION_MS = 30
FRAME_SIZE = int(SAMPLE_RATE * FRAME_DURATION_MS / 1000)  # 480 samples
MIN_UTTERANCE_FRAMES = 10  # ~300ms minimum to filter coughs/clicks
HALLUCINATION_PATTERNS = [
    "thank you",
    "thanks for watching",
    "subscribe",
    "bye",
    "the end",
    "thank you for watching",
    "please subscribe",
    "like and subscribe",
]

# --- Thread-safe audio queue ---
audio_queue = queue.Queue()


def audio_callback(indata, frames, time_info, status):
    """Stream callback: queue each captured block as raw bytes.

    Any non-empty ``status`` reported by the stream is printed to stderr.
    """
    if status:
        print(status, file=sys.stderr)
    audio_queue.put(bytes(indata))


# --- Whisper model loading (reused pattern from assistant.py) ---
def load_whisper_model(model_size):
    """Load a Whisper model on CUDA (float16), falling back to CPU (int8).

    Args:
        model_size: model name passed straight to ``WhisperModel``.

    Returns:
        The loaded ``WhisperModel``.
    """
    print(f"Loading Whisper model ({model_size})...")
    try:
        return WhisperModel(model_size, device="cuda", compute_type="float16")
    except Exception as e:
        # Any GPU-side failure is treated as "no usable GPU".
        print(f"GPU loading failed: {e}")
        print("Falling back to CPU (int8)")
        return WhisperModel(model_size, device="cpu", compute_type="int8")


# --- VAD State Machine ---
class VADProcessor:
    """Groups fixed-size audio frames into utterances using webrtcvad.

    While idle, the most recent frames are kept in a short pre-roll buffer.
    The first speech frame starts an utterance seeded with that pre-roll;
    the utterance ends once ``silence_threshold`` seconds of consecutive
    non-speech frames have been seen.
    """

    def __init__(self, aggressiveness, silence_threshold):
        """
        Args:
            aggressiveness: value passed to ``webrtcvad.Vad``.
            silence_threshold: seconds of silence that end an utterance.
        """
        self.vad = webrtcvad.Vad(aggressiveness)
        self.silence_threshold = silence_threshold
        self.reset()

    def reset(self):
        """Return to the idle state and discard all buffered audio."""
        self.triggered = False
        self.utterance_frames = []
        self.silence_duration = 0.0
        self.pre_buffer = collections.deque(maxlen=10)  # ~300ms pre-roll

    def process_frame(self, frame_bytes):
        """Process one 30ms frame.

        Args:
            frame_bytes: raw PCM bytes for a single frame.

        Returns:
            The joined utterance bytes when an utterance completes, else None.
        """
        is_speech = self.vad.is_speech(frame_bytes, SAMPLE_RATE)

        if not self.triggered:
            self.pre_buffer.append(frame_bytes)
            if is_speech:
                self.triggered = True
                self.silence_duration = 0.0
                # pre_buffer already ends with the current frame, so copying
                # it is enough; appending the frame again would duplicate
                # 30 ms of audio at the start of the utterance.
                self.utterance_frames = list(self.pre_buffer)
                # Stay silent here until transcription confirms speech.
            return None

        self.utterance_frames.append(frame_bytes)
        if is_speech:
            self.silence_duration = 0.0
            return None

        self.silence_duration += FRAME_DURATION_MS / 1000.0
        if self.silence_duration < self.silence_threshold:
            return None

        # NOTE(review): utterance_frames includes the pre-roll and the
        # trailing silence frames, so with the default 0.8 s threshold
        # (roughly 27 frames) this length check cannot fail. Confirm whether
        # it was meant to count speech frames only.
        if len(self.utterance_frames) < MIN_UTTERANCE_FRAMES:
            self.reset()
            return None

        result = b"".join(self.utterance_frames)
        self.reset()
        return result


# --- Typer Interface (xdotool) ---
class Typer:
    """Types text into the focused window by invoking xdotool."""

    def __init__(self, submit=False):
        """
        Args:
            submit: when true, press Return after every typed text.
        """
        self.submit = submit

    def start(self):
        """Check that xdotool can be run; exit with status 1 if not found."""
        try:
            subprocess.run(["xdotool", "version"], capture_output=True, check=True)
        except FileNotFoundError:
            print("ERROR: xdotool not found. Install it:")
            print(" sudo apt-get install xdotool")
            sys.exit(1)

    def type_text(self, text, submit_now=False):
        """Type ``text`` and optionally press Return.

        Args:
            text: text to type.
            submit_now: press Return for this call even if ``submit`` is off.

        A failing xdotool command is reported on stdout, not raised.
        """
        try:
            subprocess.run(
                ["xdotool", "type", "--clearmodifiers", "--delay", "0", "--", text],
                check=True,
            )
            if self.submit or submit_now:
                # Short pause between typing and the Return key press.
                time.sleep(0.1)
                subprocess.run(
                    ["xdotool", "key", "--clearmodifiers", "Return"],
                    check=True,
                )
        except subprocess.CalledProcessError as e:
            print(f"\n [xdotool error: {e}]", end="", flush=True)

    def stop(self):
        """No-op; nothing to clean up."""
        pass


# --- Helpers ---
def pcm_bytes_to_float32(pcm_bytes):
    """Convert int16 PCM bytes to a float32 array scaled by 1/32768."""
    audio_int16 = np.frombuffer(pcm_bytes, dtype=np.int16)
    return audio_int16.astype(np.float32) / 32768.0


def transcribe(model, audio_float32):
    """Transcribe audio and return the concatenated, stripped segment text."""
    segments, _ = model.transcribe(audio_float32, beam_size=5)
    return "".join(segment.text for segment in segments).strip()


def is_hallucination(text):
    """Return True if ``text`` should be discarded as a likely hallucination.

    Text shorter than 3 characters is always rejected. Otherwise the text is
    rejected when any entry of HALLUCINATION_PATTERNS occurs in it as a
    substring, so genuine sentences containing one of those phrases are
    dropped as well.
    """
    lowered = text.lower().strip()
    if len(lowered) < 3:
        return True
    return any(p in lowered for p in HALLUCINATION_PATTERNS)


def _strip_submit_word(text, submit_word):
    """Split a trailing submit word off ``text``.

    Args:
        text: transcribed utterance.
        submit_word: magic word, already lowercased; empty disables matching.

    Returns:
        ``(text, submit_now)``: the text with the trailing submit word and
        adjacent " ,." removed, and whether the word was present.
    """
    if not submit_word:
        # endswith("") is always true; without this guard every utterance
        # would be submitted.
        return text, False
    lowered = text.lower()
    if not lowered.rstrip(".,!? ").endswith(submit_word):
        return text, False
    return text[:lowered.rfind(submit_word)].rstrip(" ,."), True


def _open_input_stream(device):
    """Open and start the microphone stream.

    Returns:
        The started ``sd.InputStream``, or None after reporting the error and
        listing the available devices.
    """
    stream = None
    try:
        stream = sd.InputStream(
            samplerate=SAMPLE_RATE,
            channels=CHANNELS,
            dtype="int16",
            blocksize=FRAME_SIZE,
            callback=audio_callback,
            device=device,
        )
        stream.start()
    except sd.PortAudioError as e:
        if stream is not None:
            # Opened but failed to start: release the device.
            stream.close()
        print(f"\nAudio device error: {e}")
        print("Available devices:")
        print(sd.query_devices())
        return None
    return stream


def _wait_for_enter(stop_event):
    """Block until Enter (or end of input) on stdin, then set ``stop_event``."""
    try:
        input()
    except EOFError:
        # stdin closed: end the session instead of waiting forever.
        pass
    stop_event.set()


def _drain_audio_queue():
    """Discard every frame still waiting in ``audio_queue``."""
    while True:
        try:
            audio_queue.get_nowait()
        except queue.Empty:
            return


def _handle_utterance(utterance_bytes, whisper_model, typer, submit_word):
    """Transcribe one utterance and type it unless it is empty or filtered."""
    text = transcribe(whisper_model, pcm_bytes_to_float32(utterance_bytes))
    if not text or is_hallucination(text):
        return
    text, submit_now = _strip_submit_word(text, submit_word)
    if not text:
        return
    marker = " [SUBMIT]" if submit_now else ""
    print(f"\n >> \"{text}\"{marker}")
    typer.type_text(text, submit_now=submit_now)


# --- CLI ---
def parse_args():
    """Parse the command-line options and return the argparse namespace."""
    parser = argparse.ArgumentParser(
        description="Voice-to-type: speak and type into any focused window via xdotool"
    )
    parser.add_argument(
        "--submit",
        action="store_true",
        help="Auto-press Enter after typing (default: off)"
    )
    parser.add_argument(
        "--silence-threshold",
        type=float,
        default=0.8,
        help="Seconds of silence to end an utterance (default: 0.8)"
    )
    parser.add_argument(
        "--submit-word",
        type=str,
        default="full stop",
        help="Magic word at end of utterance to auto-submit (default: 'full stop')"
    )
    parser.add_argument(
        "--model-size",
        type=str,
        default="medium",
        choices=["tiny", "base", "small", "medium", "large-v3"],
        help="Whisper model size (default: medium)"
    )
    parser.add_argument(
        "--vad-aggressiveness",
        type=int,
        default=3,
        choices=[0, 1, 2, 3],
        help="webrtcvad aggressiveness 0-3, higher filters more noise (default: 3)"
    )
    parser.add_argument(
        "--device",
        type=int,
        default=None,
        help="Audio input device index (use 'python -m sounddevice' to list)"
    )
    return parser.parse_args()


# --- Main ---
def main():
    """Run listening sessions until interrupted with Ctrl+C.

    Each session starts on Enter, types every recognised utterance, and ends
    on the next Enter.
    """
    args = parse_args()
    whisper_model = load_whisper_model(args.model_size)
    vad = VADProcessor(args.vad_aggressiveness, args.silence_threshold)
    typer = Typer(submit=args.submit)
    typer.start()

    # Transcripts are lowercased before matching, so the word must be too.
    submit_word = args.submit_word.strip().lower()

    print("=== Voice-to-Type (xdotool) ===")
    print(f" Model: {args.model_size}")
    print(f" Silence threshold: {args.silence_threshold}s")
    submit_info = "ON (always)" if args.submit else f'OFF (say "{args.submit_word}" to submit)'
    print(f" Submit mode: {submit_info}")
    print(f" VAD aggressiveness: {args.vad_aggressiveness}")

    try:
        while True:
            print("\n[SESSION] Press Enter to start listening (Ctrl+C to quit)...")
            input()
            print("[LISTENING] Speak now. Press Enter to stop session.")
            print(" Waiting for speech...", end="", flush=True)

            # Open the stream before starting the Enter listener; otherwise a
            # device error would leave that thread blocked on stdin, where it
            # would swallow the next "start" Enter.
            stream = _open_input_stream(args.device)
            if stream is None:
                continue

            try:
                stop_event = threading.Event()
                enter_thread = threading.Thread(
                    target=_wait_for_enter, args=(stop_event,), daemon=True
                )
                enter_thread.start()

                while not stop_event.is_set():
                    try:
                        frame_bytes = audio_queue.get(timeout=0.1)
                    except queue.Empty:
                        continue
                    utterance_bytes = vad.process_frame(frame_bytes)
                    if utterance_bytes is not None:
                        _handle_utterance(
                            utterance_bytes, whisper_model, typer, submit_word
                        )
            finally:
                stream.stop()
                stream.close()
                # Leftover frames and VAD state must not leak into the next
                # session.
                _drain_audio_queue()
                vad.reset()
            print("\n[SESSION ENDED]")
    except KeyboardInterrupt:
        print("\nShutting down...")
    finally:
        typer.stop()
        print("Goodbye.")


if __name__ == "__main__":
    main()