# Code Borrowed From:
# 1. https://github.com/wiseman/py-webrtcvad/blob/3b39545dbb026d998bf407f1cb86e0ed6192a5a6/example.py
# 2. https://github.com/mozilla/DeepSpeech-examples/blob/r0.7/vad_transcriber/wavTranscriber.py
import collections
import logging
import webrtcvad
logger = logging.getLogger(__name__)
class Frame:
    """Represents a "frame" of audio data.

    Attributes:
        bytes: Raw 16-bit PCM audio data for this frame.
        timestamp: Start time of the frame within the stream, in seconds.
        duration: Length of the frame, in seconds.
    """

    def __init__(self, bytes, timestamp, duration):
        # NOTE: the parameter name `bytes` shadows the builtin, but it is
        # kept unchanged for backward compatibility with existing callers.
        self.bytes = bytes
        self.timestamp = timestamp
        self.duration = duration
def frame_generator(frame_duration_ms, audio, sample_rate):
    """Generate audio frames from PCM audio data.

    Args:
        frame_duration_ms: Desired frame duration in milliseconds.
        audio: Raw PCM data (assumed 16-bit mono — 2 bytes per sample;
            TODO confirm against the caller's `read_wave`).
        sample_rate: The audio sample rate, in Hz.

    Yields:
        Frame: frames of the requested duration.
    """
    # Bytes per frame: samples-per-frame * 2 bytes per 16-bit sample.
    n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
    # Seconds per frame (divide by 2 to convert bytes back to samples).
    duration = (float(n) / sample_rate) / 2.0
    offset = 0
    timestamp = 0.0
    # NOTE(review): the strict "<" drops any trailing partial frame, and
    # also a final full frame ending exactly at len(audio); this mirrors
    # the upstream py-webrtcvad example and is kept as-is.
    while offset + n < len(audio):
        yield Frame(audio[offset : offset + n], timestamp, duration)
        timestamp += duration
        offset += n
def vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, vad, frames):
    """Filter out non-voiced audio frames.

    Given a webrtcvad.Vad and a source of audio frames, yields only the
    voiced audio, using a padded, sliding window algorithm over the frames.

    When more than 90% of the frames in the window are voiced (as reported
    by the VAD), the collector triggers and begins collecting audio frames.
    It then waits until more than 90% of the frames in the window are
    unvoiced to detrigger and emit the collected segment. The window is
    padded at the front and back to keep a small amount of silence (or the
    beginnings/endings of speech) around the voiced frames.

    Args:
        sample_rate: The audio sample rate, in Hz.
        frame_duration_ms: The frame duration in milliseconds.
        padding_duration_ms: The amount to pad the window, in milliseconds.
        vad: An instance of webrtcvad.Vad (anything with an ``is_speech``
            method works).
        frames: A source of audio frames (sequence or generator) — each
            frame needs a ``bytes`` attribute.

    Yields:
        bytes: PCM audio data, one segment of contiguous voiced audio
        per yield.
    """
    num_padding_frames = int(padding_duration_ms / frame_duration_ms)
    # Sliding window / ring buffer of (frame, is_speech) pairs.
    ring_buffer = collections.deque(maxlen=num_padding_frames)
    # Two states: TRIGGERED (inside speech) and NOTTRIGGERED (in silence);
    # we start in the NOTTRIGGERED state.
    triggered = False
    voiced_frames = []
    for frame in frames:
        is_speech = vad.is_speech(frame.bytes, sample_rate)
        if not triggered:
            ring_buffer.append((frame, is_speech))
            num_voiced = sum(1 for _, speech in ring_buffer if speech)
            # More than 90% of the window voiced -> enter TRIGGERED.
            if num_voiced > 0.9 * ring_buffer.maxlen:
                triggered = True
                # Start the segment with the audio already buffered, so the
                # beginning of speech is not clipped.
                for f, _ in ring_buffer:
                    voiced_frames.append(f)
                ring_buffer.clear()
        else:
            # TRIGGERED: collect audio and keep the window sliding.
            voiced_frames.append(frame)
            ring_buffer.append((frame, is_speech))
            num_unvoiced = sum(1 for _, speech in ring_buffer if not speech)
            # More than 90% of the window unvoiced -> detrigger and emit
            # the segment collected so far.
            if num_unvoiced > 0.9 * ring_buffer.maxlen:
                triggered = False
                yield b"".join([f.bytes for f in voiced_frames])
                ring_buffer.clear()
                voiced_frames = []
    # Emit any trailing voiced audio left when the input runs out.
    if voiced_frames:
        yield b"".join([f.bytes for f in voiced_frames])
def vad_segment_generator(wavFile, aggressiveness, desired_sample_rate=None):
    """Generate VAD segments. Filters out non-voiced audio frames.

    Args:
        wavFile (str): Path to input wav file to run VAD on.
        aggressiveness (int): VAD aggressiveness, 0 (least) to 3 (most);
            passed straight to ``webrtcvad.Vad``.
        desired_sample_rate (int, optional): Target sample rate to resample
            to, forwarded to ``read_wave``.

    Returns:
        tuple:
            ``segments``: generator yielding bytes of voiced audio
            (the longer audio split into multiple smaller segments);
            ``sample_rate``: sample rate of the input audio file;
            ``audio_length``: duration of the input audio file.

    Raises:
        AssertionError: If the (possibly resampled) audio is not at a
            sample rate supported by the WebRTC VAD.
    """
    # Local import avoids a circular dependency with transcribe_main.
    from .transcribe_main import read_wave

    # Use the module logger (was logging.debug, which hits the root logger).
    logger.debug("Caught the wav file @: %s", wavFile)
    audio, sample_rate, audio_length = read_wave(wavFile, desired_sample_rate)
    # The WebRTC VAD only supports these sample rates.
    if sample_rate not in (8000, 16000, 32000, 48000):
        raise AssertionError("The WebRTC VAD only accepts 16-bit mono PCM audio, sampled at 8000, 16000, 32000 or 48000 Hz.")
    vad = webrtcvad.Vad(int(aggressiveness))
    # 30 ms frames, with 300 ms of padding around detected speech.
    frames = list(frame_generator(30, audio, sample_rate))
    segments = vad_collector(sample_rate, 30, 300, vad, frames)
    return segments, sample_rate, audio_length