diff --git a/python/tool-speechtotext/.vscode/settings.json b/python/tool-speechtotext/.vscode/settings.json index a8c2003..3bbdf1b 100644 --- a/python/tool-speechtotext/.vscode/settings.json +++ b/python/tool-speechtotext/.vscode/settings.json @@ -1,5 +1,11 @@ { "python-envs.defaultEnvManager": "ms-python.python:conda", "python-envs.defaultPackageManager": "ms-python.python:conda", - "python-envs.pythonProjects": [] + "python-envs.pythonProjects": [], + "python.testing.pytestEnabled": true, + "python.testing.unittestEnabled": false, + "python.testing.pytestArgs": [ + "tests", + "-v" + ] } \ No newline at end of file diff --git a/python/tool-speechtotext/CLAUDE.md b/python/tool-speechtotext/CLAUDE.md index fa79c5e..00d4e38 100644 --- a/python/tool-speechtotext/CLAUDE.md +++ b/python/tool-speechtotext/CLAUDE.md @@ -12,11 +12,20 @@ Speech-to-text command line utilities leveraging local models (faster-whisper, O ## Tools - `assistant.py` / `talk.sh` — transcribe speech, copy to clipboard, optionally send to Ollama - `voice_to_terminal.py` / `terminal.sh` — voice-controlled terminal via Ollama tool calling -- `voice_to_xdotool.py` / `dotool.sh` — hands-free voice typing into any focused window (VAD + xdotool) +- `voice_to_xdotool.py` / `xdotool.sh` — hands-free voice typing into any focused window (VAD + xdotool) + +## Shared Library +- `sttlib/` — shared package used by all scripts and importable by other projects + - `whisper_loader.py` — model loading with GPU→CPU fallback + - `audio.py` — press-enter recording, PCM conversion + - `transcription.py` — Whisper transcribe wrapper, hallucination filter + - `vad.py` — VADProcessor, audio callback, constants +- Other projects import via: `sys.path.insert(0, "/path/to/tool-speechtotext")` ## Testing -- To test scripts: `mamba run -n whisper-ollama python --model-size base` +- Run tests: `mamba run -n whisper-ollama python -m pytest tests/` - Use `--model-size base` for faster iteration during development +- Tests mock hardware (Whisper model, VAD, mic) — no GPU/mic needed to run them - Audio device is available — live mic testing is possible - Test xdotool output by focusing a text editor window @@ -24,12 +33,13 @@ Speech-to-text command line utilities leveraging local models (faster-whisper, O - Conda: faster-whisper, sounddevice, numpy, pyperclip, requests, ollama - Pip (in conda env): webrtcvad - System: libportaudio2, xdotool +- Dev: pytest ## Conventions - Shell wrappers go in .sh files using `mamba run -n whisper-ollama` -- All scripts set `CT2_CUDA_ALLOW_FP16=1` +- Shared code lives in `sttlib/` — scripts are thin entry points that import from it - Whisper model loading always has GPU (cuda/float16) -> CPU (cpu/int8) fallback -- Keep scripts self-contained (no shared module) +- `CT2_CUDA_ALLOW_FP16=1` is set by `sttlib.whisper_loader` at import time - Don't print output for non-actionable events ## Preferences diff --git a/python/tool-speechtotext/assistant.py b/python/tool-speechtotext/assistant.py index f8eca7b..41eb80d 100644 --- a/python/tool-speechtotext/assistant.py +++ b/python/tool-speechtotext/assistant.py @@ -1,57 +1,17 @@ -import sounddevice as sd -import numpy as np +import argparse import pyperclip import requests -import sys -import argparse -from faster_whisper import WhisperModel - -import os -os.environ["CT2_CUDA_ALLOW_FP16"] = "1" +from sttlib import load_whisper_model, record_until_enter, transcribe # --- Configuration --- -MODEL_SIZE = "medium" # Options: "base", "small", "medium", "large-v3" -OLLAMA_URL = "http://localhost:11434/api/generate" # Default is 11434 +OLLAMA_URL = "http://localhost:11434/api/generate" DEFAULT_OLLAMA_MODEL = "qwen3:latest" -# Load Whisper on GPU -# float16 is faster and uses less VRAM on NVIDIA cards -print("Loading Whisper model...") - - -try: - model = WhisperModel(MODEL_SIZE, device="cuda", compute_type="float16") -except Exception as e: - print(f"Error loading GPU: {e}") - print("Falling back to CPU (Check your CUDA/cuDNN installation)") - model = WhisperModel(MODEL_SIZE, device="cuda", compute_type="int16") - - -def record_audio(): - fs = 16000 - print("\n[READY] Press Enter to START recording...") - input() - print("[RECORDING] Press Enter to STOP...") - - recording = [] - - def callback(indata, frames, time, status): - if status: - print(status, file=sys.stderr) - recording.append(indata.copy()) - - with sd.InputStream(samplerate=fs, channels=1, callback=callback): - input() - - return np.concatenate(recording, axis=0) - def main(): - # 1. Setup Parser print(f"System active. Model: {DEFAULT_OLLAMA_MODEL}") parser = argparse.ArgumentParser(description="Whisper + Ollama CLI") - # Known Arguments (Hardcoded logic) parser.add_argument("--nollm", "-n", action='store_true', help="turn off llm") parser.add_argument("--system", "-s", default=None, @@ -65,30 +25,27 @@ def main(): parser.add_argument( "--temp", default='0.7', help="temperature") - # 2. Capture "Unknown" arguments - # args = known values, unknown = a list like ['--num_ctx', '4096', '--temp', '0.7'] args, unknown = parser.parse_known_args() # Convert unknown list to a dictionary for the Ollama 'options' field - # This logic pairs ['--key', 'value'] into {key: value} extra_options = {} for i in range(0, len(unknown), 2): - key = unknown[i].lstrip('-') # remove the '--' + key = unknown[i].lstrip('-') val = unknown[i+1] - # Try to convert numbers to actual ints/floats try: val = float(val) if '.' in val else int(val) except ValueError: pass extra_options[key] = val + model = load_whisper_model(args.model_size) + while True: try: - audio_data = record_audio() + audio_data = record_until_enter() print("[TRANSCRIBING]...") - segments, _ = model.transcribe(audio_data.flatten(), beam_size=5) - text = "".join([segment.text for segment in segments]).strip() + text = transcribe(model, audio_data.flatten()) if not text: print("No speech detected. Try again.") @@ -97,8 +54,7 @@ def main(): print(f"You said: {text}") pyperclip.copy(text) - if (args.nollm == False): - # Send to Ollama + if not args.nollm: print(f"[OLLAMA] Thinking...") payload = { "model": args.ollama_model, @@ -108,9 +64,9 @@ def main(): } if args.system: - payload["system"] = args - response = requests.post(OLLAMA_URL, json=payload) + payload["system"] = args.system + response = requests.post(OLLAMA_URL, json=payload) result = response.json().get("response", "") print(f"\nLLM Response:\n{result}\n") else: diff --git a/python/tool-speechtotext/sttlib/__init__.py b/python/tool-speechtotext/sttlib/__init__.py new file mode 100644 index 0000000..89c8ced --- /dev/null +++ b/python/tool-speechtotext/sttlib/__init__.py @@ -0,0 +1,7 @@ +from sttlib.whisper_loader import load_whisper_model +from sttlib.audio import record_until_enter, pcm_bytes_to_float32 +from sttlib.transcription import transcribe, is_hallucination, HALLUCINATION_PATTERNS +from sttlib.vad import ( + VADProcessor, audio_callback, audio_queue, + SAMPLE_RATE, CHANNELS, FRAME_DURATION_MS, FRAME_SIZE, MIN_UTTERANCE_FRAMES, +) diff --git a/python/tool-speechtotext/sttlib/audio.py b/python/tool-speechtotext/sttlib/audio.py new file mode 100644 index 0000000..799c30d --- /dev/null +++ b/python/tool-speechtotext/sttlib/audio.py @@ -0,0 +1,28 @@ +import sys +import numpy as np +import sounddevice as sd + + +def record_until_enter(sample_rate=16000): + """Record audio until user presses Enter. Returns float32 numpy array.""" + print("\n[READY] Press Enter to START recording...") + input() + print("[RECORDING] Press Enter to STOP...") + + recording = [] + + def callback(indata, frames, time, status): + if status: + print(status, file=sys.stderr) + recording.append(indata.copy()) + + with sd.InputStream(samplerate=sample_rate, channels=1, callback=callback): + input() + + return np.concatenate(recording, axis=0) + + +def pcm_bytes_to_float32(pcm_bytes): + """Convert raw 16-bit PCM bytes to float32 array normalized to [-1, 1].""" + audio_int16 = np.frombuffer(pcm_bytes, dtype=np.int16) + return audio_int16.astype(np.float32) / 32768.0 diff --git a/python/tool-speechtotext/sttlib/transcription.py b/python/tool-speechtotext/sttlib/transcription.py new file mode 100644 index 0000000..96f730c --- /dev/null +++ b/python/tool-speechtotext/sttlib/transcription.py @@ -0,0 +1,19 @@ +HALLUCINATION_PATTERNS = [ + "thank you", "thanks for watching", "subscribe", + "bye", "the end", "thank you for watching", + "please subscribe", "like and subscribe", +] + + +def transcribe(model, audio_float32): + """Transcribe audio using Whisper. Returns stripped text.""" + segments, _ = model.transcribe(audio_float32, beam_size=5) + return "".join(segment.text for segment in segments).strip() + + +def is_hallucination(text): + """Return True if text looks like a Whisper hallucination.""" + lowered = text.lower().strip() + if len(lowered) < 3: + return True + return any(p in lowered for p in HALLUCINATION_PATTERNS) diff --git a/python/tool-speechtotext/sttlib/vad.py b/python/tool-speechtotext/sttlib/vad.py new file mode 100644 index 0000000..1cdc5b6 --- /dev/null +++ b/python/tool-speechtotext/sttlib/vad.py @@ -0,0 +1,58 @@ +import sys +import queue +import collections +import webrtcvad + +SAMPLE_RATE = 16000 +CHANNELS = 1 +FRAME_DURATION_MS = 30 +FRAME_SIZE = int(SAMPLE_RATE * FRAME_DURATION_MS / 1000) # 480 samples +MIN_UTTERANCE_FRAMES = 10 # ~300ms minimum to filter coughs/clicks + +audio_queue = queue.Queue() + + +def audio_callback(indata, frames, time_info, status): + """sounddevice callback that pushes raw bytes to the audio queue.""" + if status: + print(status, file=sys.stderr) + audio_queue.put(bytes(indata)) + + +class VADProcessor: + def __init__(self, aggressiveness, silence_threshold): + self.vad = webrtcvad.Vad(aggressiveness) + self.silence_threshold = silence_threshold + self.reset() + + def reset(self): + self.triggered = False + self.utterance_frames = [] + self.silence_duration = 0.0 + self.pre_buffer = collections.deque(maxlen=10) # ~300ms pre-roll + + def process_frame(self, frame_bytes): + """Process one 30ms frame. Returns utterance bytes when complete, else None.""" + is_speech = self.vad.is_speech(frame_bytes, SAMPLE_RATE) + + if not self.triggered: + self.pre_buffer.append(frame_bytes) + if is_speech: + self.triggered = True + self.silence_duration = 0.0 + self.utterance_frames = list(self.pre_buffer) + self.utterance_frames.append(frame_bytes) + else: + self.utterance_frames.append(frame_bytes) + if is_speech: + self.silence_duration = 0.0 + else: + self.silence_duration += FRAME_DURATION_MS / 1000.0 + if self.silence_duration >= self.silence_threshold: + if len(self.utterance_frames) < MIN_UTTERANCE_FRAMES: + self.reset() + return None + result = b"".join(self.utterance_frames) + self.reset() + return result + return None diff --git a/python/tool-speechtotext/sttlib/whisper_loader.py b/python/tool-speechtotext/sttlib/whisper_loader.py new file mode 100644 index 0000000..7df7a86 --- /dev/null +++ b/python/tool-speechtotext/sttlib/whisper_loader.py @@ -0,0 +1,15 @@ +import os +from faster_whisper import WhisperModel + +os.environ["CT2_CUDA_ALLOW_FP16"] = "1" + + +def load_whisper_model(model_size): + """Load Whisper with GPU (cuda/float16) -> CPU (cpu/int8) fallback.""" + print(f"Loading Whisper model ({model_size})...") + try: + return WhisperModel(model_size, device="cuda", compute_type="float16") + except Exception as e: + print(f"GPU loading failed: {e}") + print("Falling back to CPU (int8)") + return WhisperModel(model_size, device="cpu", compute_type="int8") diff --git a/python/tool-speechtotext/tests/__init__.py b/python/tool-speechtotext/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/python/tool-speechtotext/tests/test_audio.py b/python/tool-speechtotext/tests/test_audio.py new file mode 100644 index 0000000..29988f2 --- /dev/null +++ b/python/tool-speechtotext/tests/test_audio.py @@ -0,0 +1,38 @@ +import struct +import numpy as np +from sttlib.audio import pcm_bytes_to_float32 + + +def test_known_value(): + # 16384 in int16 -> 0.5 in float32 + pcm = struct.pack(" ~1.0 + pcm = struct.pack(" -1.0 + pcm = struct.pack(" 0 + + +def test_short_utterance_filtered(): + # Use very short silence threshold so silence frames don't push total + # past MIN_UTTERANCE_FRAMES. With threshold=0.09s (3 frames of silence): + # 0 pre-buffer + 1 speech + 3 silence = 4 total < MIN_UTTERANCE_FRAMES (10) + proc, mock_vad = _make_vad_processor(silence_threshold=0.09) + + # Single speech frame triggers VAD + mock_vad.is_speech.return_value = True + proc.process_frame(_frame("s")) + + # Immediately go silent — threshold reached in 3 frames + mock_vad.is_speech.return_value = False + result = None + for _ in range(20): + result = proc.process_frame(_frame("q")) + if result is not None: + break + + # Should be filtered (too short — only 4 total frames) + assert result is None + + +def test_pre_buffer_included(): + proc, mock_vad = _make_vad_processor(silence_threshold=0.3) + + # Fill pre-buffer with non-speech frames + mock_vad.is_speech.return_value = False + pre_frame = _frame("p") + for _ in range(10): + proc.process_frame(pre_frame) + + # Speech starts + mock_vad.is_speech.return_value = True + speech_frame = _frame("s") + for _ in range(MIN_UTTERANCE_FRAMES): + proc.process_frame(speech_frame) + + # Silence to trigger + mock_vad.is_speech.return_value = False + result = None + for _ in range(20): + result = proc.process_frame(_frame("q")) + if result is not None: + break + + assert result is not None + # Result should contain pre-buffer frames + assert pre_frame in result + + +def test_reset_after_utterance(): + proc, mock_vad = _make_vad_processor(silence_threshold=0.3) + + # First utterance + mock_vad.is_speech.return_value = True + for _ in range(MIN_UTTERANCE_FRAMES + 5): + proc.process_frame(_frame("s")) + + mock_vad.is_speech.return_value = False + for _ in range(20): + result = proc.process_frame(_frame("q")) + if result is not None: + break + assert result is not None + + # After reset, should be able to collect a second utterance + assert not proc.triggered + assert proc.utterance_frames == [] + + mock_vad.is_speech.return_value = True + for _ in range(MIN_UTTERANCE_FRAMES + 5): + proc.process_frame(_frame("s")) + + mock_vad.is_speech.return_value = False + result2 = None + for _ in range(20): + result2 = proc.process_frame(_frame("q")) + if result2 is not None: + break + assert result2 is not None + + +def test_silence_threshold_boundary(): + # Use 0.3s threshold: 0.3 / 0.03 = exactly 10 frames needed + threshold = 0.3 + proc, mock_vad = _make_vad_processor(silence_threshold=threshold) + + # Start with speech + mock_vad.is_speech.return_value = True + for _ in range(MIN_UTTERANCE_FRAMES + 5): + proc.process_frame(_frame("s")) + + frames_needed = 10 # 0.3s / 0.03s per frame + mock_vad.is_speech.return_value = False + + # Feed one less than needed — should NOT trigger + for i in range(frames_needed - 1): + result = proc.process_frame(_frame("q")) + assert result is None, f"Triggered too early at frame {i}" + + # The 10th frame should trigger (silence_duration = 0.3 >= 0.3) + result = proc.process_frame(_frame("q")) + assert result is not None diff --git a/python/tool-speechtotext/tests/test_whisper_loader.py b/python/tool-speechtotext/tests/test_whisper_loader.py new file mode 100644 index 0000000..c7a0897 --- /dev/null +++ b/python/tool-speechtotext/tests/test_whisper_loader.py @@ -0,0 +1,37 @@ +from unittest.mock import patch, MagicMock +from sttlib.whisper_loader import load_whisper_model + + +@patch("sttlib.whisper_loader.WhisperModel") +def test_gpu_success(mock_cls): + mock_model = MagicMock() + mock_cls.return_value = mock_model + + result = load_whisper_model("base") + + assert result is mock_model + mock_cls.assert_called_once_with("base", device="cuda", compute_type="float16") + + +@patch("sttlib.whisper_loader.WhisperModel") +def test_gpu_fails_cpu_fallback(mock_cls): + mock_model = MagicMock() + mock_cls.side_effect = [RuntimeError("no CUDA"), mock_model] + + result = load_whisper_model("base") + + assert result is mock_model + assert mock_cls.call_count == 2 + _, kwargs = mock_cls.call_args + assert kwargs == {"device": "cpu", "compute_type": "int8"} + + +@patch("sttlib.whisper_loader.WhisperModel") +def test_both_fail_propagates(mock_cls): + mock_cls.side_effect = RuntimeError("no device") + + try: + load_whisper_model("base") + assert False, "Should have raised" + except RuntimeError: + pass diff --git a/python/tool-speechtotext/voice_to_terminal.py b/python/tool-speechtotext/voice_to_terminal.py index fc1af78..217247d 100644 --- a/python/tool-speechtotext/voice_to_terminal.py +++ b/python/tool-speechtotext/voice_to_terminal.py @@ -1,31 +1,16 @@ -import sounddevice as sd -import numpy as np -import pyperclip -import sys import argparse -import os import subprocess -import ollama import json -from faster_whisper import WhisperModel +import ollama +from sttlib import load_whisper_model, record_until_enter, transcribe # --- Configuration --- -os.environ["CT2_CUDA_ALLOW_FP16"] = "1" -MODEL_SIZE = "medium" OLLAMA_MODEL = "qwen2.5-coder:7b" CONFIRM_COMMANDS = True # Set to False to run commands instantly -# Load Whisper on GPU -print("Loading Whisper model...") -try: - model = WhisperModel(MODEL_SIZE, device="cuda", compute_type="float16") -except Exception as e: - print(f"Error loading GPU: {e}, falling back to CPU") - model = WhisperModel(MODEL_SIZE, device="cpu", compute_type="int8") # --- Terminal Tool --- - def run_terminal_command(command: str): """ Executes a bash command in the Linux terminal. @@ -33,8 +18,7 @@ def run_terminal_command(command: str): """ if CONFIRM_COMMANDS: print(f"\n{'='*40}") - print(f"⚠️ AI SUGGESTED: \033[1;32m{command}\033[0m") - # Allow user to provide feedback if they say 'n' + print(f"\u26a0\ufe0f AI SUGGESTED: \033[1;32m{command}\033[0m") choice = input(" Confirm? [Y/n] or provide feedback: ").strip() if choice.lower() == 'n': @@ -57,22 +41,15 @@ def run_terminal_command(command: str): return f"Execution Error: {str(e)}" -def record_audio(): - fs, recording = 16000, [] - print("\n[READY] Press Enter to START...") - input() - print("[RECORDING] Press Enter to STOP...") - def cb(indata, f, t, s): recording.append(indata.copy()) - with sd.InputStream(samplerate=fs, channels=1, callback=cb): - input() - return np.concatenate(recording, axis=0) - - def main(): parser = argparse.ArgumentParser() parser.add_argument("--model", default=OLLAMA_MODEL) + parser.add_argument("--model-size", default="medium", + help="Whisper model size") args, _ = parser.parse_known_args() + whisper_model = load_whisper_model(args.model_size) + # Initial System Prompt messages = [{ 'role': 'system', @@ -88,9 +65,8 @@ def main(): while True: try: # 1. Voice Capture - audio_data = record_audio() - segments, _ = model.transcribe(audio_data.flatten(), beam_size=5) - user_text = "".join([s.text for s in segments]).strip() + audio_data = record_until_enter() + user_text = transcribe(whisper_model, audio_data.flatten()) if not user_text: continue diff --git a/python/tool-speechtotext/voice_to_xdotool.py b/python/tool-speechtotext/voice_to_xdotool.py index 4421fda..f4776d9 100644 --- a/python/tool-speechtotext/voice_to_xdotool.py +++ b/python/tool-speechtotext/voice_to_xdotool.py @@ -1,91 +1,14 @@ -import sounddevice as sd -import numpy as np -import webrtcvad -import subprocess -import sys -import os import argparse +import subprocess import threading import queue -import collections import time -from faster_whisper import WhisperModel - -os.environ["CT2_CUDA_ALLOW_FP16"] = "1" - -# --- Constants --- -SAMPLE_RATE = 16000 -CHANNELS = 1 -FRAME_DURATION_MS = 30 -FRAME_SIZE = int(SAMPLE_RATE * FRAME_DURATION_MS / 1000) # 480 samples -MIN_UTTERANCE_FRAMES = 10 # ~300ms minimum to filter coughs/clicks - -HALLUCINATION_PATTERNS = [ - "thank you", "thanks for watching", "subscribe", - "bye", "the end", "thank you for watching", - "please subscribe", "like and subscribe", -] - -# --- Thread-safe audio queue --- -audio_queue = queue.Queue() - - -def audio_callback(indata, frames, time_info, status): - if status: - print(status, file=sys.stderr) - audio_queue.put(bytes(indata)) - - -# --- Whisper model loading (reused pattern from assistant.py) --- -def load_whisper_model(model_size): - print(f"Loading Whisper model ({model_size})...") - try: - return WhisperModel(model_size, device="cuda", compute_type="float16") - except Exception as e: - print(f"GPU loading failed: {e}") - print("Falling back to CPU (int8)") - return WhisperModel(model_size, device="cpu", compute_type="int8") - - -# --- VAD State Machine --- -class VADProcessor: - def __init__(self, aggressiveness, silence_threshold): - self.vad = webrtcvad.Vad(aggressiveness) - self.silence_threshold = silence_threshold - self.reset() - - def reset(self): - self.triggered = False - self.utterance_frames = [] - self.silence_duration = 0.0 - self.pre_buffer = collections.deque(maxlen=10) # ~300ms pre-roll - - def process_frame(self, frame_bytes): - """Process one 30ms frame. Returns utterance bytes when complete, else None.""" - is_speech = self.vad.is_speech(frame_bytes, SAMPLE_RATE) - - if not self.triggered: - self.pre_buffer.append(frame_bytes) - if is_speech: - self.triggered = True - self.silence_duration = 0.0 - self.utterance_frames = list(self.pre_buffer) - self.utterance_frames.append(frame_bytes) - pass # silent until transcription confirms speech - else: - self.utterance_frames.append(frame_bytes) - if is_speech: - self.silence_duration = 0.0 - else: - self.silence_duration += FRAME_DURATION_MS / 1000.0 - if self.silence_duration >= self.silence_threshold: - if len(self.utterance_frames) < MIN_UTTERANCE_FRAMES: - self.reset() - return None - result = b"".join(self.utterance_frames) - self.reset() - return result - return None +import sounddevice as sd +from sttlib import ( + load_whisper_model, transcribe, is_hallucination, pcm_bytes_to_float32, + VADProcessor, audio_callback, audio_queue, + SAMPLE_RATE, CHANNELS, FRAME_SIZE, +) # --- Typer Interface (xdotool) --- @@ -99,6 +22,7 @@ class Typer: except FileNotFoundError: print("ERROR: xdotool not found. Install it:") print(" sudo apt-get install xdotool") + import sys sys.exit(1) def type_text(self, text, submit_now=False): @@ -120,24 +44,6 @@ class Typer: pass -# --- Helpers --- -def pcm_bytes_to_float32(pcm_bytes): - audio_int16 = np.frombuffer(pcm_bytes, dtype=np.int16) - return audio_int16.astype(np.float32) / 32768.0 - - -def transcribe(model, audio_float32): - segments, _ = model.transcribe(audio_float32, beam_size=5) - return "".join(segment.text for segment in segments).strip() - - -def is_hallucination(text): - lowered = text.lower().strip() - if len(lowered) < 3: - return True - return any(p in lowered for p in HALLUCINATION_PATTERNS) - - # --- CLI --- def parse_args(): parser = argparse.ArgumentParser(