import sys
import queue
import collections

import webrtcvad

# Audio format: 16 kHz mono, 16-bit PCM — the only rates/widths webrtcvad accepts.
SAMPLE_RATE = 16000
CHANNELS = 1
FRAME_DURATION_MS = 30
FRAME_SIZE = int(SAMPLE_RATE * FRAME_DURATION_MS / 1000)  # 480 samples per frame
MIN_UTTERANCE_FRAMES = 10  # ~300ms minimum to filter coughs/clicks

audio_queue = queue.Queue()


def audio_callback(indata, frames, time_info, status):
    """sounddevice callback that pushes raw bytes to the audio queue.

    Runs on the audio driver thread, so it must stay fast: it only copies
    the buffer into the queue. Any stream status (overflow etc.) is
    reported to stderr rather than raised.
    """
    if status:
        print(status, file=sys.stderr)
    audio_queue.put(bytes(indata))


class VADProcessor:
    """Segment a stream of 30ms PCM frames into utterances using WebRTC VAD.

    Feed frames to :meth:`process_frame`; it returns the complete utterance
    bytes (including ~300ms of pre-roll and the trailing silence) once
    ``silence_threshold`` seconds of non-speech follow detected speech.
    """

    def __init__(self, aggressiveness, silence_threshold):
        """
        Args:
            aggressiveness: webrtcvad mode 0-3; higher = more aggressive
                at filtering out non-speech.
            silence_threshold: seconds of continuous silence that ends
                an utterance.
        """
        self.vad = webrtcvad.Vad(aggressiveness)
        self.silence_threshold = silence_threshold
        self.reset()

    def reset(self):
        """Clear all per-utterance state, ready for the next utterance."""
        self.triggered = False          # True once speech has been detected
        self.utterance_frames = []      # accumulated frames of the current utterance
        self.silence_duration = 0.0     # seconds of consecutive non-speech
        self.pre_buffer = collections.deque(maxlen=10)  # ~300ms pre-roll

    def process_frame(self, frame_bytes):
        """Process one 30ms frame. Returns utterance bytes when complete, else None.

        Args:
            frame_bytes: exactly one frame (FRAME_SIZE samples) of 16-bit
                mono PCM at SAMPLE_RATE.

        Returns:
            The joined utterance bytes when an utterance just ended and met
            the minimum-length filter; otherwise None.
        """
        is_speech = self.vad.is_speech(frame_bytes, SAMPLE_RATE)

        if not self.triggered:
            self.pre_buffer.append(frame_bytes)
            if is_speech:
                self.triggered = True
                self.silence_duration = 0.0
                # pre_buffer already ends with the current frame, so seeding
                # from it is enough. (Bug fix: the previous version appended
                # frame_bytes again here, duplicating the first 30ms of
                # every utterance.)
                self.utterance_frames = list(self.pre_buffer)
            return None

        self.utterance_frames.append(frame_bytes)
        if is_speech:
            self.silence_duration = 0.0
            return None

        self.silence_duration += FRAME_DURATION_MS / 1000.0
        if self.silence_duration >= self.silence_threshold:
            # Utterance ended. Discard it if too short (coughs/clicks).
            if len(self.utterance_frames) < MIN_UTTERANCE_FRAMES:
                self.reset()
                return None
            result = b"".join(self.utterance_frames)
            self.reset()
            return result
        return None