diff --git a/python/tool-speechtotext/CLAUDE.md b/python/tool-speechtotext/CLAUDE.md new file mode 100644 index 0000000..fa79c5e --- /dev/null +++ b/python/tool-speechtotext/CLAUDE.md @@ -0,0 +1,39 @@ +# Project: speech-to-text tools + +Speech-to-text command line utilities leveraging local models (faster-whisper, Ollama). + +## Environment +- Debian Bookworm, kernel 6.1, X11 +- Conda env: `whisper-ollama` (Python 3.10, CUDA 12.2) +- mamba must be initialized before use — run: `eval "$(micromamba shell hook -s bash)"` +- GPU: NVIDIA (float16 capable) +- xdotool installed for keyboard simulation (X11 only) + +## Tools +- `assistant.py` / `talk.sh` — transcribe speech, copy to clipboard, optionally send to Ollama +- `voice_to_terminal.py` / `terminal.sh` — voice-controlled terminal via Ollama tool calling +- `voice_to_xdotool.py` / `xdotool.sh` — hands-free voice typing into any focused window (VAD + xdotool) + +## Testing +- To test scripts: `mamba run -n whisper-ollama python <script>.py --model-size base` +- Use `--model-size base` for faster iteration during development +- Audio device is available — live mic testing is possible +- Test xdotool output by focusing a text editor window + +## Dependencies +- Conda: faster-whisper, sounddevice, numpy, pyperclip, requests, ollama +- Pip (in conda env): webrtcvad +- System: libportaudio2, xdotool + +## Conventions +- Shell wrappers go in .sh files using `mamba run -n whisper-ollama` +- All scripts set `CT2_CUDA_ALLOW_FP16=1` +- Whisper model loading always has GPU (cuda/float16) -> CPU (cpu/int8) fallback +- Keep scripts self-contained (no shared module) +- Don't print output for non-actionable events + +## Preferences +- Prefer packages available via apt over building from source +- Check availability before recommending a dependency +- Prefer snappy/responsive defaults over cautious ones +- Avoid over-engineering — keep scripts simple and focused diff --git a/python/tool-speechtotext/README.md b/python/tool-speechtotext/README.md 
index 12bfa59..194d245 100644 --- a/python/tool-speechtotext/README.md +++ b/python/tool-speechtotext/README.md @@ -1,6 +1,14 @@ # Purpose -speech to text command line utility by leveraging off ollama a local speech-to-text model +Speech-to-text command line utilities leveraging local models (faster-whisper, Ollama). + +## Tools + +| Script | Wrapper | Description | +|--------|---------|-------------| +| `assistant.py` | `talk.sh` | Transcribe speech, copy to clipboard, optionally send to Ollama LLM | +| `voice_to_terminal.py` | `terminal.sh` | Voice-controlled terminal — AI suggests and executes bash commands | +| `voice_to_xdotool.py` | `xdotool.sh` | Hands-free voice typing into any focused window via xdotool (VAD-based) | ## Setup @@ -15,5 +23,44 @@ mamba activate whisper-ollama # Note: portaudio is required for sounddevice to work on Linux sudo apt-get update && sudo apt-get install libportaudio2 -y -pip install faster-whisper sounddevice numpy pyperclip requests +pip install faster-whisper sounddevice numpy pyperclip requests webrtcvad ``` + +## xdotool setup (required for voice_to_xdotool.py) + +xdotool simulates keyboard input via X11. Already installed on most Linux desktops. + +```bash +# Install if not already present +sudo apt-get install xdotool +``` + +Note: xdotool is X11-only. For Wayland, swap to ydotool (`sudo apt install ydotool`). + +## Usage: voice_to_xdotool.py + +Hands-free speech input — uses VAD to auto-detect when you start/stop speaking, transcribes with Whisper, and types the text into the focused window via xdotool. 
"""Hands-free voice typing: microphone -> VAD -> Whisper -> xdotool.

Audio is captured in 30 ms frames, webrtcvad decides where an utterance starts
and ends, faster-whisper transcribes it, and xdotool types the result into
whichever X11 window currently has keyboard focus.

Session flow: press Enter to start listening, speak, pause to have the text
typed, press Enter again to end the session, Ctrl+C to quit.
"""

import argparse
import collections
import os
import queue
import re
import subprocess
import sys
import threading
import time

import numpy as np
import sounddevice as sd
import webrtcvad
from faster_whisper import WhisperModel

os.environ["CT2_CUDA_ALLOW_FP16"] = "1"

# --- Constants ---
SAMPLE_RATE = 16000
CHANNELS = 1
FRAME_DURATION_MS = 30  # webrtcvad only accepts 10, 20 or 30 ms frames
FRAME_SIZE = int(SAMPLE_RATE * FRAME_DURATION_MS / 1000)  # 480 samples
MIN_UTTERANCE_FRAMES = 10  # ~300ms minimum to filter coughs/clicks
PRE_ROLL_FRAMES = 10  # ~300ms of audio kept from before speech was detected

HALLUCINATION_PATTERNS = [
    "thank you", "thanks for watching", "subscribe",
    "bye", "the end", "thank you for watching",
    "please subscribe", "like and subscribe",
]

# Punctuation that separates the phrases of a transcript.
_FRAGMENT_SEPARATORS = re.compile(r"[.!?,;:]+")

# --- Thread-safe audio queue ---
audio_queue = queue.Queue()


def audio_callback(indata, frames, time_info, status):
    """sounddevice input callback: hand each captured block to the main thread.

    The stream is opened with ``blocksize=FRAME_SIZE`` and ``dtype="int16"``,
    so every block is exactly one VAD frame of 16-bit PCM.
    """
    if status:
        print(status, file=sys.stderr)
    audio_queue.put(bytes(indata))


# --- Whisper model loading (reused pattern from assistant.py) ---
def load_whisper_model(model_size):
    """Load a Whisper model on the GPU (float16), falling back to CPU (int8).

    Args:
        model_size: Whisper model name, e.g. "base" or "medium".

    Returns:
        A ready-to-use ``WhisperModel``.
    """
    print(f"Loading Whisper model ({model_size})...")
    try:
        return WhisperModel(model_size, device="cuda", compute_type="float16")
    except Exception as e:  # any GPU/driver failure should trigger the fallback
        print(f"GPU loading failed: {e}")
        print("Falling back to CPU (int8)")
        return WhisperModel(model_size, device="cpu", compute_type="int8")


# --- VAD State Machine ---
class VADProcessor:
    """Group a stream of 30 ms PCM frames into utterances using webrtcvad.

    The processor idles until a frame is classified as speech, then collects
    frames until ``silence_threshold`` seconds of continuous non-speech.
    """

    def __init__(self, aggressiveness, silence_threshold):
        """
        Args:
            aggressiveness: webrtcvad mode, 0 (least) to 3 (most aggressive).
            silence_threshold: Seconds of silence that end an utterance.
        """
        self.vad = webrtcvad.Vad(aggressiveness)
        self.silence_threshold = silence_threshold
        self.reset()

    def reset(self):
        """Discard any partial utterance and return to the idle state."""
        self.triggered = False
        self.utterance_frames = []
        self.silence_duration = 0.0
        self.pre_buffer = collections.deque(maxlen=PRE_ROLL_FRAMES)

    def process_frame(self, frame_bytes):
        """Process one 30ms frame. Returns utterance bytes when complete, else None."""
        is_speech = self.vad.is_speech(frame_bytes, SAMPLE_RATE)

        if not self.triggered:
            self.pre_buffer.append(frame_bytes)
            if is_speech:
                self.triggered = True
                self.silence_duration = 0.0
                # The pre-roll already ends with the current frame, so it must
                # not be appended a second time.
                self.utterance_frames = list(self.pre_buffer)
            return None

        self.utterance_frames.append(frame_bytes)
        if is_speech:
            self.silence_duration = 0.0
            return None

        self.silence_duration += FRAME_DURATION_MS / 1000.0
        if self.silence_duration < self.silence_threshold:
            return None

        # NOTE(review): the buffer also holds the pre-roll and every trailing
        # silent frame, so with the default 0.8 s threshold (~27 frames) this
        # length check rarely rejects anything. Counting speech frames instead
        # would make the filter effective, but could drop very short words.
        too_short = len(self.utterance_frames) < MIN_UTTERANCE_FRAMES
        utterance = None if too_short else b"".join(self.utterance_frames)
        self.reset()
        return utterance


# --- Typer Interface (xdotool) ---
class Typer:
    """Types text into the focused X11 window by shelling out to xdotool."""

    def __init__(self, submit=False):
        """
        Args:
            submit: When True, press Return after every typed utterance.
        """
        self.submit = submit

    def start(self):
        """Verify that xdotool is usable; exit the program if it is not."""
        try:
            subprocess.run(["xdotool", "version"], capture_output=True, check=True)
        except FileNotFoundError:
            print("ERROR: xdotool not found. Install it:")
            print(" sudo apt-get install xdotool")
            sys.exit(1)
        except subprocess.CalledProcessError as e:
            print(f"ERROR: xdotool is installed but not working: {e}")
            sys.exit(1)

    def type_text(self, text, submit_now=False):
        """Type ``text``; press Return too if auto-submit or ``submit_now`` is set.

        xdotool failures are reported inline and never raised, so a single
        failed keystroke does not end the listening session.
        """
        try:
            subprocess.run(
                # "--" keeps text that starts with "-" from being read as an option
                ["xdotool", "type", "--clearmodifiers", "--delay", "0", "--", text],
                check=True,
            )
            if self.submit or submit_now:
                time.sleep(0.1)  # brief pause between typing and the Return key
                subprocess.run(
                    ["xdotool", "key", "--clearmodifiers", "Return"],
                    check=True,
                )
        except subprocess.CalledProcessError as e:
            print(f"\n [xdotool error: {e}]", end="", flush=True)

    def stop(self):
        """Nothing to release; present so main() has a symmetric start/stop pair."""


# --- Helpers ---
def pcm_bytes_to_float32(pcm_bytes):
    """Convert raw 16-bit PCM bytes to a float32 array scaled into [-1.0, 1.0)."""
    audio_int16 = np.frombuffer(pcm_bytes, dtype=np.int16)
    return audio_int16.astype(np.float32) / 32768.0


def transcribe(model, audio_float32):
    """Transcribe one utterance and return the stripped, concatenated text."""
    segments, _ = model.transcribe(audio_float32, beam_size=5)
    return "".join(segment.text for segment in segments).strip()


def is_hallucination(text):
    """Return True when ``text`` looks like filler rather than real speech.

    A transcript is rejected when it is shorter than three characters, or when
    every punctuation-separated phrase in it is one of HALLUCINATION_PATTERNS
    ("Thank you.", "Thank you. Bye!"). Phrases are compared whole, so ordinary
    dictation that merely contains such words ("goodbye everyone", "jump to
    the end of the file") is kept.
    """
    lowered = text.lower().strip()
    if len(lowered) < 3:
        return True
    fragments = (part.strip() for part in _FRAGMENT_SEPARATORS.split(lowered))
    # all() over no fragments is True: punctuation-only output is noise too.
    return all(part in HALLUCINATION_PATTERNS for part in fragments if part)


def _split_submit_word(text, submit_word):
    """Strip a trailing submit word from ``text``.

    Args:
        text: Transcribed utterance.
        submit_word: Lowercased magic word; an empty string disables the check.

    Returns:
        ``(remaining_text, submit_now)``.
    """
    if not submit_word:
        return text, False
    lowered = text.lower()
    if not lowered.rstrip(".,!? ").endswith(submit_word):
        return text, False
    return text[:lowered.rfind(submit_word)].rstrip(" ,."), True


def _wait_for_enter(stop_event):
    """Block until the user presses Enter, then signal the session to stop."""
    input()
    stop_event.set()


def _drain_audio_queue():
    """Throw away frames left in the queue so they cannot leak into the next session."""
    while True:
        try:
            audio_queue.get_nowait()
        except queue.Empty:
            return


def _handle_utterance(utterance_bytes, whisper_model, typer, submit_word):
    """Transcribe one utterance and type it unless it is empty or filler."""
    text = transcribe(whisper_model, pcm_bytes_to_float32(utterance_bytes))
    if not text or is_hallucination(text):
        return
    text, submit_now = _split_submit_word(text, submit_word)
    if not text:
        return
    marker = " [SUBMIT]" if submit_now else ""
    print(f"\n >> \"{text}\"{marker}")
    typer.type_text(text, submit_now=submit_now)


def _run_session(args, whisper_model, vad, typer, submit_word):
    """Run one listening session; returns when the user presses Enter."""
    # Open and start the microphone before spawning the Enter watcher, so that
    # a device error cannot leave a stray thread blocked on stdin.
    stream = None
    try:
        stream = sd.InputStream(
            samplerate=SAMPLE_RATE,
            channels=CHANNELS,
            dtype="int16",
            blocksize=FRAME_SIZE,
            callback=audio_callback,
            device=args.device,
        )
        stream.start()
    except sd.PortAudioError as e:
        if stream is not None:
            stream.close()
        print(f"\nAudio device error: {e}")
        print("Available devices:")
        print(sd.query_devices())
        return

    print("[LISTENING] Speak now. Press Enter to stop session.")
    print(" Waiting for speech...", end="", flush=True)

    stop_event = threading.Event()
    threading.Thread(
        target=_wait_for_enter, args=(stop_event,), daemon=True
    ).start()

    try:
        while not stop_event.is_set():
            try:
                # Short timeout so the stop event is polled regularly.
                frame_bytes = audio_queue.get(timeout=0.1)
            except queue.Empty:
                continue
            utterance_bytes = vad.process_frame(frame_bytes)
            if utterance_bytes is not None:
                _handle_utterance(utterance_bytes, whisper_model, typer, submit_word)
    finally:
        stream.stop()
        stream.close()
        _drain_audio_queue()
        vad.reset()
        print("\n[SESSION ENDED]")


# --- CLI ---
def parse_args():
    """Parse the command line options."""
    parser = argparse.ArgumentParser(
        description="Voice-to-type: speak and type into any focused window via xdotool"
    )
    parser.add_argument(
        "--submit", action="store_true",
        help="Auto-press Enter after typing (default: off)"
    )
    parser.add_argument(
        "--silence-threshold", type=float, default=0.8,
        help="Seconds of silence to end an utterance (default: 0.8)"
    )
    parser.add_argument(
        "--submit-word", type=str, default="full stop",
        help="Magic word at end of utterance to auto-submit (default: 'full stop')"
    )
    parser.add_argument(
        "--model-size", type=str, default="medium",
        choices=["tiny", "base", "small", "medium", "large-v3"],
        help="Whisper model size (default: medium)"
    )
    parser.add_argument(
        "--vad-aggressiveness", type=int, default=3, choices=[0, 1, 2, 3],
        help="webrtcvad aggressiveness 0-3, higher filters more noise (default: 3)"
    )
    parser.add_argument(
        "--device", type=int, default=None,
        help="Audio input device index (use 'python -m sounddevice' to list)"
    )
    return parser.parse_args()


# --- Main ---
def main():
    """Entry point: set everything up, then run sessions until Ctrl+C."""
    args = parse_args()
    # Transcripts are lowercased before matching, so the magic word must be too.
    submit_word = args.submit_word.strip().lower()

    # Check for xdotool first: it is instant, while loading the model is slow.
    typer = Typer(submit=args.submit)
    typer.start()
    whisper_model = load_whisper_model(args.model_size)
    vad = VADProcessor(args.vad_aggressiveness, args.silence_threshold)

    print("=== Voice-to-Type (xdotool) ===")
    print(f" Model: {args.model_size}")
    print(f" Silence threshold: {args.silence_threshold}s")
    submit_info = "ON (always)" if args.submit else f'OFF (say "{args.submit_word}" to submit)'
    print(f" Submit mode: {submit_info}")
    print(f" VAD aggressiveness: {args.vad_aggressiveness}")

    try:
        while True:
            print("\n[SESSION] Press Enter to start listening (Ctrl+C to quit)...")
            input()
            _run_session(args, whisper_model, vad, typer, submit_word)
    except KeyboardInterrupt:
        print("\nShutting down...")
    finally:
        typer.stop()
        print("Goodbye.")


if __name__ == "__main__":
    main()