Refactor tool-speechtotext: extract sttlib shared library and add tests
Extract duplicated code (Whisper loading, audio recording, transcription, VAD processing) into reusable sttlib/ package. Rewrite all 3 scripts as thin wrappers. Add 24 unit tests with mocked hardware. Fix GPU fallback bug in assistant.py and args.system assignment bug.
This commit is contained in:
28
python/tool-speechtotext/sttlib/audio.py
Normal file
28
python/tool-speechtotext/sttlib/audio.py
Normal file
@@ -0,0 +1,28 @@
|
||||
import sys
|
||||
import numpy as np
|
||||
import sounddevice as sd
|
||||
|
||||
|
||||
def record_until_enter(sample_rate=16000):
    """Record mono audio between two Enter key presses.

    Blocks on stdin twice: the first Enter starts the capture and the
    second Enter stops it. Stream status flags reported while recording
    are printed to stderr.

    Args:
        sample_rate: Sampling rate in Hz handed to ``sd.InputStream``.

    Returns:
        A numpy array holding every captured chunk concatenated along
        axis 0 (float32 when sounddevice's default dtype is in effect).
        When the stream delivered no chunk before the second Enter, an
        empty float32 array of shape ``(0, 1)`` is returned instead of
        raising.
    """
    print("\n[READY] Press Enter to START recording...")
    input()
    print("[RECORDING] Press Enter to STOP...")

    recording = []

    def callback(indata, frames, time, status):
        # Called by the input stream for each captured block of audio.
        if status:
            print(status, file=sys.stderr)
        # Store a copy: the stream may reuse the buffer after the callback
        # returns (see the sounddevice callback documentation).
        recording.append(indata.copy())

    with sd.InputStream(samplerate=sample_rate, channels=1, callback=callback):
        input()

    # np.concatenate raises ValueError on an empty sequence, which happens
    # when Enter is pressed before the first block arrives.
    if not recording:
        return np.zeros((0, 1), dtype=np.float32)
    return np.concatenate(recording, axis=0)
|
||||
|
||||
|
||||
def pcm_bytes_to_float32(pcm_bytes):
    """Decode raw signed 16-bit PCM bytes into float32 samples.

    Args:
        pcm_bytes: Bytes-like object of native-endian int16 samples.

    Returns:
        A 1-D float32 numpy array with each sample divided by 32768, so
        the values fall within [-1, 1].
    """
    # Magnitude of the most negative int16 value; dividing by it maps
    # -32768 to exactly -1.0.
    full_scale = 32768.0
    samples = np.frombuffer(pcm_bytes, dtype=np.int16).astype(np.float32)
    return samples / full_scale
|
||||
Reference in New Issue
Block a user