Refactor tool-speechtotext: extract sttlib shared library and add tests

Extract duplicated code (Whisper loading, audio recording, transcription,
VAD processing) into reusable sttlib/ package. Rewrite all 3 scripts as
thin wrappers. Add 24 unit tests with mocked hardware. Fix GPU fallback
bug in assistant.py and args.system assignment bug.
This commit is contained in:
local
2026-02-08 00:40:31 +00:00
parent 848681087e
commit 104da381fb
15 changed files with 480 additions and 195 deletions

View File

@@ -0,0 +1,7 @@
from sttlib.whisper_loader import load_whisper_model
from sttlib.audio import record_until_enter, pcm_bytes_to_float32
from sttlib.transcription import transcribe, is_hallucination, HALLUCINATION_PATTERNS
from sttlib.vad import (
VADProcessor, audio_callback, audio_queue,
SAMPLE_RATE, CHANNELS, FRAME_DURATION_MS, FRAME_SIZE, MIN_UTTERANCE_FRAMES,
)

View File

@@ -0,0 +1,28 @@
import sys
import numpy as np
import sounddevice as sd
def record_until_enter(sample_rate=16000):
    """Record mono audio from the default input device until the user presses Enter.

    Blocks twice on stdin: once to start recording, once to stop.

    Args:
        sample_rate: Capture rate in Hz (Whisper expects 16 kHz).

    Returns:
        float32 numpy array of shape (n_samples, 1); empty if no audio arrived.
    """
    print("\n[READY] Press Enter to START recording...")
    input()
    print("[RECORDING] Press Enter to STOP...")
    chunks = []

    def callback(indata, frames, time, status):
        if status:
            print(status, file=sys.stderr)
        chunks.append(indata.copy())

    with sd.InputStream(samplerate=sample_rate, channels=1, callback=callback):
        input()
    if not chunks:
        # The callback may never fire if Enter is pressed immediately; avoid
        # the cryptic ValueError np.concatenate raises on an empty list.
        return np.zeros((0, 1), dtype=np.float32)
    return np.concatenate(chunks, axis=0)
def pcm_bytes_to_float32(pcm_bytes):
    """Decode raw 16-bit PCM bytes into a float32 array scaled to [-1, 1]."""
    samples = np.frombuffer(pcm_bytes, dtype=np.int16).astype(np.float32)
    # int16 full scale is 32768, so -32768 maps exactly to -1.0.
    return samples / 32768.0

View File

@@ -0,0 +1,19 @@
import re

# Phrases Whisper commonly emits when fed silence or noise ("hallucinations").
HALLUCINATION_PATTERNS = [
    "thank you", "thanks for watching", "subscribe",
    "bye", "the end", "thank you for watching",
    "please subscribe", "like and subscribe",
]

# Match patterns only on word boundaries so ordinary words are not flagged
# just because they contain a pattern as a substring (the previous substring
# check rejected e.g. "maybe" because it contains "bye", and "the ending"
# because it contains "the end"). Compiled once at import time.
_HALLUCINATION_RE = re.compile(
    "|".join(rf"\b{re.escape(p)}\b" for p in HALLUCINATION_PATTERNS)
)


def transcribe(model, audio_float32):
    """Transcribe audio using Whisper. Returns stripped text.

    Args:
        model: A loaded faster-whisper WhisperModel.
        audio_float32: float32 sample array normalized to [-1, 1].
    """
    segments, _ = model.transcribe(audio_float32, beam_size=5)
    return "".join(segment.text for segment in segments).strip()


def is_hallucination(text):
    """Return True if text looks like a Whisper hallucination.

    Very short strings (under 3 characters after stripping) are also
    treated as hallucinations.
    """
    lowered = text.lower().strip()
    if len(lowered) < 3:
        return True
    return _HALLUCINATION_RE.search(lowered) is not None

View File

@@ -0,0 +1,58 @@
import sys
import queue
import collections
import webrtcvad
# Audio capture parameters shared by the VAD pipeline and its callers.
SAMPLE_RATE = 16000  # Hz — presumably chosen to match Whisper's expected rate; confirm
CHANNELS = 1  # mono capture
FRAME_DURATION_MS = 30  # duration of each frame handed to the VAD
FRAME_SIZE = int(SAMPLE_RATE * FRAME_DURATION_MS / 1000)  # 480 samples
MIN_UTTERANCE_FRAMES = 10  # ~300ms minimum to filter coughs/clicks
audio_queue = queue.Queue()


def audio_callback(indata, frames, time_info, status):
    """sounddevice stream callback: push each captured buffer, as raw
    bytes, onto the module-level audio_queue for later consumption."""
    if status:
        # Report over/underruns and other stream conditions on stderr.
        sys.stderr.write(f"{status}\n")
    audio_queue.put(bytes(indata))
class VADProcessor:
    """Stateful segmenter: feeds 30 ms PCM frames through WebRTC VAD and
    groups consecutive speech (plus ~300 ms of pre-roll) into utterances.
    """

    def __init__(self, aggressiveness, silence_threshold):
        """
        Args:
            aggressiveness: webrtcvad level 0-3 (3 filters non-speech hardest).
            silence_threshold: Seconds of trailing silence that end an utterance.
        """
        self.vad = webrtcvad.Vad(aggressiveness)
        self.silence_threshold = silence_threshold
        self.reset()

    def reset(self):
        """Drop all accumulated state and wait for the next speech onset."""
        self.triggered = False        # True while inside an utterance
        self.utterance_frames = []    # raw frame bytes of the current utterance
        self.silence_duration = 0.0   # seconds of consecutive silence seen
        self.pre_buffer = collections.deque(maxlen=10)  # ~300ms pre-roll

    def process_frame(self, frame_bytes):
        """Process one 30ms frame. Returns utterance bytes when complete, else None."""
        is_speech = self.vad.is_speech(frame_bytes, SAMPLE_RATE)

        if not self.triggered:
            # Idle: keep a rolling pre-roll so the utterance start isn't clipped.
            self.pre_buffer.append(frame_bytes)
            if is_speech:
                self.triggered = True
                self.silence_duration = 0.0
                # The triggering frame is already the newest pre_buffer entry,
                # so copying the buffer includes it exactly once (the previous
                # code appended it a second time, duplicating ~30 ms of audio).
                self.utterance_frames = list(self.pre_buffer)
            return None

        # Inside an utterance: every frame belongs to it, speech or not.
        self.utterance_frames.append(frame_bytes)
        if is_speech:
            self.silence_duration = 0.0
            return None

        self.silence_duration += FRAME_DURATION_MS / 1000.0
        if self.silence_duration < self.silence_threshold:
            return None

        # Enough trailing silence: the utterance has ended.
        if len(self.utterance_frames) < MIN_UTTERANCE_FRAMES:
            # Too short to be real speech (cough/click); discard silently.
            self.reset()
            return None

        utterance = b"".join(self.utterance_frames)
        self.reset()
        return utterance

View File

@@ -0,0 +1,15 @@
import os
from faster_whisper import WhisperModel
# NOTE(review): presumably lets CTranslate2 run float16 CUDA kernels even on
# GPUs it would otherwise refuse; set before any model is constructed — confirm.
os.environ["CT2_CUDA_ALLOW_FP16"] = "1"
def load_whisper_model(model_size):
    """Load a faster-whisper model, preferring CUDA/float16 and dropping
    back to CPU/int8 when GPU initialization fails for any reason.

    Args:
        model_size: faster-whisper model identifier (e.g. "base", "small").

    Returns:
        A ready WhisperModel instance.
    """
    print(f"Loading Whisper model ({model_size})...")
    try:
        model = WhisperModel(model_size, device="cuda", compute_type="float16")
    except Exception as e:
        # Broad catch is deliberate: any CUDA/driver problem should degrade
        # to CPU rather than crash the caller.
        print(f"GPU loading failed: {e}")
        print("Falling back to CPU (int8)")
        model = WhisperModel(model_size, device="cpu", compute_type="int8")
    return model