# Voice-to-type script (170 lines, 5.7 KiB, Python).
#
# Duplicated code (Whisper loading, audio recording, transcription, VAD
# processing) was extracted into the reusable sttlib/ package; all three
# scripts are now thin wrappers around it. 24 unit tests with mocked
# hardware were added, and the GPU fallback bug in assistant.py and the
# args.system assignment bug were fixed.
import argparse
import queue
import subprocess
import sys
import threading
import time

import sounddevice as sd

from sttlib import (
    load_whisper_model, transcribe, is_hallucination, pcm_bytes_to_float32,
    VADProcessor, audio_callback, audio_queue,
    SAMPLE_RATE, CHANNELS, FRAME_SIZE,
)
|
|
|
|
|
|
# --- Typer Interface (xdotool) ---
|
|
class Typer:
    """Types transcribed text into the currently focused X11 window via xdotool.

    Lifecycle: call start() once to verify xdotool is available, then
    type_text() per utterance, and stop() on shutdown.
    """

    def __init__(self, submit=False):
        # When True, every typed utterance is followed by a Return key press.
        self.submit = submit

    def start(self):
        """Verify xdotool is usable; print install hint and exit(1) if not.

        Catches CalledProcessError as well as FileNotFoundError: a broken
        xdotool install (binary present but `xdotool version` failing under
        check=True) previously escaped as an unhandled traceback.
        """
        try:
            subprocess.run(["xdotool", "version"], capture_output=True, check=True)
        except (FileNotFoundError, subprocess.CalledProcessError):
            print("ERROR: xdotool not found. Install it:")
            print("  sudo apt-get install xdotool")
            import sys
            sys.exit(1)

    def type_text(self, text, submit_now=False):
        """Type `text` into the focused window; optionally press Return.

        Return is pressed when either the instance-wide `submit` flag or the
        per-call `submit_now` flag is set. xdotool failures are reported
        inline but never raised — typing is best-effort.
        """
        try:
            # `--` guards against text that begins with a dash being parsed
            # as an xdotool option; `--delay 0` types at full speed.
            subprocess.run(
                ["xdotool", "type", "--clearmodifiers", "--delay", "0", "--", text],
                check=True,
            )
            if self.submit or submit_now:
                # Brief pause so the target app registers the text before Enter.
                time.sleep(0.1)
                subprocess.run(
                    ["xdotool", "key", "--clearmodifiers", "Return"],
                    check=True,
                )
        except subprocess.CalledProcessError as e:
            print(f"\n [xdotool error: {e}]", end="", flush=True)

    def stop(self):
        """No teardown needed; present for interface symmetry with start()."""
        pass
|
|
|
|
|
|
# --- CLI ---
|
|
def parse_args():
    """Build and evaluate the command-line interface for this wrapper.

    Returns an argparse.Namespace with: submit (bool), silence_threshold
    (float, seconds), submit_word (str), model_size (str), vad_aggressiveness
    (int 0-3), and device (int index or None).
    """
    cli = argparse.ArgumentParser(
        description="Voice-to-type: speak and type into any focused window via xdotool"
    )
    add = cli.add_argument
    add("--submit", action="store_true",
        help="Auto-press Enter after typing (default: off)")
    add("--silence-threshold", type=float, default=0.8,
        help="Seconds of silence to end an utterance (default: 0.8)")
    add("--submit-word", type=str, default="full stop",
        help="Magic word at end of utterance to auto-submit (default: 'full stop')")
    add("--model-size", type=str, default="medium",
        choices=["tiny", "base", "small", "medium", "large-v3"],
        help="Whisper model size (default: medium)")
    add("--vad-aggressiveness", type=int, default=3, choices=[0, 1, 2, 3],
        help="webrtcvad aggressiveness 0-3, higher filters more noise (default: 3)")
    add("--device", type=int, default=None,
        help="Audio input device index (use 'python -m sounddevice' to list)")
    return cli.parse_args()
|
|
|
|
|
|
# --- Main ---
|
|
def main():
    """Interactive listen → transcribe → type loop.

    Each "session" is started with Enter, streams microphone audio through
    the sttlib VAD, transcribes finished utterances with Whisper, and types
    the text into the focused window via xdotool. Ctrl+C exits.
    """
    args = parse_args()

    # One-time heavyweight setup: Whisper model, VAD state machine, typer.
    whisper_model = load_whisper_model(args.model_size)
    vad = VADProcessor(args.vad_aggressiveness, args.silence_threshold)
    typer = Typer(submit=args.submit)
    typer.start()

    print("=== Voice-to-Type (xdotool) ===")
    print(f" Model: {args.model_size}")
    print(f" Silence threshold: {args.silence_threshold}s")
    submit_info = "ON (always)" if args.submit else f'OFF (say "{args.submit_word}" to submit)'
    print(f" Submit mode: {submit_info}")
    print(f" VAD aggressiveness: {args.vad_aggressiveness}")

    try:
        # Outer loop: one iteration per listening session.
        while True:
            print("\n[SESSION] Press Enter to start listening (Ctrl+C to quit)...")
            input()
            print("[LISTENING] Speak now. Press Enter to stop session.")
            print(" Waiting for speech...", end="", flush=True)

            # A daemon thread blocks on stdin so Enter can end the session
            # without interrupting the audio-processing loop below.
            stop_event = threading.Event()

            def wait_for_enter():
                input()
                stop_event.set()

            enter_thread = threading.Thread(target=wait_for_enter, daemon=True)
            enter_thread.start()

            try:
                # int16 mono stream; audio_callback pushes frames onto the
                # shared sttlib audio_queue consumed below.
                stream = sd.InputStream(
                    samplerate=SAMPLE_RATE,
                    channels=CHANNELS,
                    dtype="int16",
                    blocksize=FRAME_SIZE,
                    callback=audio_callback,
                    device=args.device,
                )
            except sd.PortAudioError as e:
                # Bad/unavailable device: list alternatives and start a new
                # session rather than aborting the program.
                print(f"\nAudio device error: {e}")
                print("Available devices:")
                print(sd.query_devices())
                continue

            stream.start()

            try:
                while not stop_event.is_set():
                    # Short timeout keeps the loop responsive to stop_event
                    # even when no audio frames are arriving.
                    try:
                        frame_bytes = audio_queue.get(timeout=0.1)
                    except queue.Empty:
                        continue

                    # VAD returns a complete utterance (or None while still
                    # accumulating speech / silence).
                    utterance_bytes = vad.process_frame(frame_bytes)
                    if utterance_bytes is not None:
                        audio_float32 = pcm_bytes_to_float32(utterance_bytes)
                        text = transcribe(whisper_model, audio_float32)

                        if text and not is_hallucination(text):
                            submit_now = False
                            # Magic submit word at the end of the utterance
                            # (case-insensitive, trailing punctuation ignored)
                            # triggers a Return press; the word itself is
                            # stripped from the typed text.
                            if text.lower().rstrip(".,!? ").endswith(args.submit_word):
                                text = text[:text.lower().rfind(args.submit_word)].rstrip(" ,.")
                                submit_now = True
                            if text:
                                marker = " [SUBMIT]" if submit_now else ""
                                print(f"\n >> \"{text}\"{marker}")
                                typer.type_text(text, submit_now=submit_now)
            finally:
                # Always release the audio device, drain stale frames so the
                # next session starts clean, and reset VAD state.
                stream.stop()
                stream.close()
                while not audio_queue.empty():
                    try:
                        audio_queue.get_nowait()
                    except queue.Empty:
                        break
                vad.reset()
                print("\n[SESSION ENDED]")

    except KeyboardInterrupt:
        print("\nShutting down...")
    finally:
        typer.stop()
        print("Goodbye.")
|
|
|
|
|
|
# Entry point: run the interactive loop only when executed as a script,
# not when imported.
if __name__ == "__main__":
    main()
|