Extract duplicated code (Whisper loading, audio recording, transcription, VAD processing) into reusable sttlib/ package. Rewrite all 3 scripts as thin wrappers. Add 24 unit tests with mocked hardware. Fix GPU fallback bug in assistant.py and args.system assignment bug.
59 lines
1.9 KiB
Python
59 lines
1.9 KiB
Python
import sys
|
|
import queue
|
|
import collections
|
|
import webrtcvad
|
|
|
|
# Audio format constants.
SAMPLE_RATE = 16_000  # given to webrtcvad's is_speech() as the sample rate
CHANNELS = 1
FRAME_DURATION_MS = 30
# Samples per frame: 16000 * 30 / 1000 = 480.
FRAME_SIZE = SAMPLE_RATE * FRAME_DURATION_MS // 1000
# Utterances shorter than this many frames (~300 ms) are discarded so that
# coughs and clicks are filtered out.
MIN_UTTERANCE_FRAMES = 10

# Raw byte chunks pushed by audio_callback.
audio_queue = queue.Queue()
|
|
|
|
|
|
def audio_callback(indata, frames, time_info, status):
    """Queue one captured audio chunk as raw bytes (sounddevice callback).

    Args:
        indata: Buffer with the captured samples; copied with ``bytes()``.
        frames: Not used here.
        time_info: Not used here.
        status: Status reported for this chunk; printed to stderr when truthy.
    """
    if status:
        # Report the status but still enqueue the chunk.
        print(status, file=sys.stderr)
    raw_chunk = bytes(indata)
    audio_queue.put(raw_chunk)
|
|
|
|
|
|
class VADProcessor:
    """Split a stream of audio frames into utterances using ``webrtcvad``.

    Frames are fed one at a time to :meth:`process_frame`. While idle, the
    most recent frames are kept in a small pre-roll buffer; the first frame
    classified as speech starts an utterance that begins with that pre-roll.
    The utterance ends once the accumulated run of non-speech frames reaches
    ``silence_threshold`` seconds.

    Attributes:
        vad: The underlying ``webrtcvad.Vad`` instance.
        silence_threshold: Seconds of continuous non-speech that end an
            utterance.
        triggered: True while an utterance is being collected.
        utterance_frames: Frames collected for the current utterance.
        silence_duration: Seconds of non-speech since the last speech frame.
        pre_buffer: Bounded deque of the latest frames seen while idle.
    """

    def __init__(self, aggressiveness, silence_threshold):
        """Create the detector and start in the idle state.

        Args:
            aggressiveness: Passed unchanged to ``webrtcvad.Vad``
                (webrtcvad documents modes 0-3).
            silence_threshold: Seconds of trailing non-speech that close an
                utterance.
        """
        self.vad = webrtcvad.Vad(aggressiveness)
        self.silence_threshold = silence_threshold
        self.reset()

    def reset(self):
        """Discard any partial utterance and return to the idle state."""
        self.triggered = False
        self.utterance_frames = []
        self.silence_duration = 0.0
        self.pre_buffer = collections.deque(maxlen=10)  # ~300ms pre-roll

    def process_frame(self, frame_bytes):
        """Process one 30ms frame.

        Args:
            frame_bytes: Raw audio for a single frame, in whatever format
                ``webrtcvad.Vad.is_speech`` accepts at ``SAMPLE_RATE``.

        Returns:
            The concatenated bytes of the finished utterance (pre-roll,
            speech and trailing silence frames) when this frame completes
            one; otherwise ``None``. ``None`` is also returned when a
            finished utterance has fewer than ``MIN_UTTERANCE_FRAMES`` frames
            and is dropped.
        """
        is_speech = self.vad.is_speech(frame_bytes, SAMPLE_RATE)

        if not self.triggered:
            self.pre_buffer.append(frame_bytes)
            if is_speech:
                self.triggered = True
                self.silence_duration = 0.0
                # pre_buffer already ends with the current frame, so copying
                # it is enough; appending frame_bytes again would duplicate
                # the first speech frame in the utterance.
                self.utterance_frames = list(self.pre_buffer)
            return None

        self.utterance_frames.append(frame_bytes)
        if is_speech:
            self.silence_duration = 0.0
            return None

        self.silence_duration += FRAME_DURATION_MS / 1000.0
        if self.silence_duration < self.silence_threshold:
            return None

        # NOTE(review): this length includes pre-roll and trailing silence
        # frames, not only speech frames -- confirm that is the intended
        # measure for filtering very short sounds.
        if len(self.utterance_frames) < MIN_UTTERANCE_FRAMES:
            self.reset()
            return None

        result = b"".join(self.utterance_frames)
        self.reset()
        return result
|