Refactor tool-speechtotext: extract sttlib shared library and add tests
Extract duplicated code (Whisper loading, audio recording, transcription, VAD processing) into reusable sttlib/ package. Rewrite all 3 scripts as thin wrappers. Add 24 unit tests with mocked hardware. Fix GPU fallback bug in assistant.py and args.system assignment bug.
This commit is contained in:
7
python/tool-speechtotext/sttlib/__init__.py
Normal file
7
python/tool-speechtotext/sttlib/__init__.py
Normal file
@@ -0,0 +1,7 @@
|
||||
from sttlib.whisper_loader import load_whisper_model
|
||||
from sttlib.audio import record_until_enter, pcm_bytes_to_float32
|
||||
from sttlib.transcription import transcribe, is_hallucination, HALLUCINATION_PATTERNS
|
||||
from sttlib.vad import (
|
||||
VADProcessor, audio_callback, audio_queue,
|
||||
SAMPLE_RATE, CHANNELS, FRAME_DURATION_MS, FRAME_SIZE, MIN_UTTERANCE_FRAMES,
|
||||
)
|
||||
28
python/tool-speechtotext/sttlib/audio.py
Normal file
28
python/tool-speechtotext/sttlib/audio.py
Normal file
@@ -0,0 +1,28 @@
|
||||
import sys
|
||||
import numpy as np
|
||||
import sounddevice as sd
|
||||
|
||||
|
||||
def record_until_enter(sample_rate=16000):
    """Record microphone audio until the user presses Enter.

    Blocks on stdin: the first Enter starts capture, the second stops it.

    Args:
        sample_rate: capture rate in Hz (default 16000, what Whisper expects).

    Returns:
        float32 numpy array of shape (n_frames, 1); an empty (0, 1) array if
        the stream delivered no frames before the user stopped.
    """
    print("\n[READY] Press Enter to START recording...")
    input()
    print("[RECORDING] Press Enter to STOP...")

    recording = []

    def callback(indata, frames, time, status):
        # Surface over/underruns on stderr but keep capturing.
        if status:
            print(status, file=sys.stderr)
        # Copy: sounddevice reuses the indata buffer between callbacks.
        recording.append(indata.copy())

    with sd.InputStream(samplerate=sample_rate, channels=1, callback=callback):
        input()

    # BUGFIX: np.concatenate raises ValueError on an empty list, which
    # happened when Enter was pressed again before any frames arrived.
    if not recording:
        return np.zeros((0, 1), dtype=np.float32)
    return np.concatenate(recording, axis=0)
||||
def pcm_bytes_to_float32(pcm_bytes):
    """Convert raw 16-bit PCM bytes to float32 array normalized to [-1, 1]."""
    # int16 full scale is 32768, so scaling by 2**-15 maps onto [-1, 1).
    samples = np.frombuffer(pcm_bytes, dtype=np.int16).astype(np.float32)
    return samples * np.float32(1.0 / 32768.0)
19
python/tool-speechtotext/sttlib/transcription.py
Normal file
19
python/tool-speechtotext/sttlib/transcription.py
Normal file
@@ -0,0 +1,19 @@
|
||||
# Canned phrases Whisper is known to invent on silence or noise
# (artifacts of its video-heavy training data).
HALLUCINATION_PATTERNS = [
    "thank you", "thanks for watching", "subscribe",
    "bye", "the end", "thank you for watching",
    "please subscribe", "like and subscribe",
]


def transcribe(model, audio_float32):
    """Transcribe audio using Whisper. Returns stripped text."""
    segments, _info = model.transcribe(audio_float32, beam_size=5)
    text_parts = [segment.text for segment in segments]
    return "".join(text_parts).strip()


def is_hallucination(text):
    """Return True if text looks like a Whisper hallucination."""
    candidate = text.lower().strip()
    # Anything under three characters is noise, not a real utterance.
    if len(candidate) < 3:
        return True
    for pattern in HALLUCINATION_PATTERNS:
        if pattern in candidate:
            return True
    return False
58
python/tool-speechtotext/sttlib/vad.py
Normal file
58
python/tool-speechtotext/sttlib/vad.py
Normal file
@@ -0,0 +1,58 @@
|
||||
import sys
|
||||
import queue
|
||||
import collections
|
||||
import webrtcvad
|
||||
|
||||
# Capture / framing parameters shared by all VAD-based scripts.
SAMPLE_RATE = 16000
CHANNELS = 1
FRAME_DURATION_MS = 30
FRAME_SIZE = int(SAMPLE_RATE * FRAME_DURATION_MS / 1000)  # 480 samples
MIN_UTTERANCE_FRAMES = 10  # ~300ms minimum to filter coughs/clicks

# Module-level queue bridging the sounddevice callback thread and the
# consumer loop that feeds frames into a VADProcessor.
audio_queue = queue.Queue()


def audio_callback(indata, frames, time_info, status):
    """sounddevice callback that pushes raw bytes to the audio queue."""
    if status:
        # Report over/underruns on stderr without interrupting capture.
        print(status, file=sys.stderr)
    audio_queue.put(bytes(indata))


class VADProcessor:
    """Accumulates 30ms frames into complete utterances using WebRTC VAD.

    Feed consecutive frames to process_frame(); it returns the raw bytes of
    a finished utterance once `silence_threshold` seconds of continuous
    silence follow detected speech, and None otherwise.
    """

    def __init__(self, aggressiveness, silence_threshold):
        """
        Args:
            aggressiveness: webrtcvad mode 0-3 (3 filters non-speech hardest).
            silence_threshold: seconds of continuous silence that end an
                utterance.
        """
        self.vad = webrtcvad.Vad(aggressiveness)
        self.silence_threshold = silence_threshold
        self.reset()

    def reset(self):
        """Clear all per-utterance state (also called between utterances)."""
        self.triggered = False  # True while inside an utterance
        self.utterance_frames = []  # raw frames collected for the utterance
        self.silence_duration = 0.0  # trailing silence seen so far, seconds
        self.pre_buffer = collections.deque(maxlen=10)  # ~300ms pre-roll

    def process_frame(self, frame_bytes):
        """Process one 30ms frame. Returns utterance bytes when complete, else None."""
        is_speech = self.vad.is_speech(frame_bytes, SAMPLE_RATE)

        if not self.triggered:
            # Idle: keep a rolling pre-roll so the speech onset that
            # triggers detection isn't clipped off.
            self.pre_buffer.append(frame_bytes)
            if is_speech:
                self.triggered = True
                self.silence_duration = 0.0
                # BUGFIX: the triggering frame is already the last element of
                # pre_buffer (appended just above); the old code appended it
                # to utterance_frames a second time, duplicating 30ms of
                # audio at the start of every utterance.
                self.utterance_frames = list(self.pre_buffer)
        else:
            self.utterance_frames.append(frame_bytes)
            if is_speech:
                self.silence_duration = 0.0
            else:
                self.silence_duration += FRAME_DURATION_MS / 1000.0
                if self.silence_duration >= self.silence_threshold:
                    # NOTE(review): the minimum-length check counts silence
                    # and pre-roll frames too, so with large thresholds it
                    # rarely fires — confirm intended semantics.
                    if len(self.utterance_frames) < MIN_UTTERANCE_FRAMES:
                        self.reset()
                        return None
                    result = b"".join(self.utterance_frames)
                    self.reset()
                    return result
        return None
15
python/tool-speechtotext/sttlib/whisper_loader.py
Normal file
15
python/tool-speechtotext/sttlib/whisper_loader.py
Normal file
@@ -0,0 +1,15 @@
|
||||
import os
|
||||
from faster_whisper import WhisperModel
|
||||
|
||||
os.environ["CT2_CUDA_ALLOW_FP16"] = "1"
|
||||
|
||||
|
||||
def load_whisper_model(model_size):
|
||||
"""Load Whisper with GPU (cuda/float16) -> CPU (cpu/int8) fallback."""
|
||||
print(f"Loading Whisper model ({model_size})...")
|
||||
try:
|
||||
return WhisperModel(model_size, device="cuda", compute_type="float16")
|
||||
except Exception as e:
|
||||
print(f"GPU loading failed: {e}")
|
||||
print("Falling back to CPU (int8)")
|
||||
return WhisperModel(model_size, device="cpu", compute_type="int8")
|
||||
Reference in New Issue
Block a user