Source code for lecture2notes.end_to_end.transcribe.webrtcvad_utils

# Code Borrowed From:
# 1.
# 2.

import collections
import logging

import webrtcvad

logger = logging.getLogger(__name__)

class Frame:
    """One fixed-size slice of PCM audio together with its position.

    Attributes:
        bytes: The raw audio data belonging to this frame.
        timestamp: Offset of the frame from the start of the audio
            (seconds, as computed by ``frame_generator``).
        duration: Length of the frame (seconds, as computed by
            ``frame_generator``).
    """

    # NOTE: the ``bytes`` parameter shadows the builtin inside ``__init__``;
    # the name is kept because callers may pass it by keyword.
    def __init__(self, bytes, timestamp, duration):
        self.bytes, self.timestamp, self.duration = bytes, timestamp, duration
def frame_generator(frame_duration_ms, audio, sample_rate):
    """Split PCM audio data into consecutive frames of equal length.

    The frame size in bytes is derived assuming two bytes per sample
    (16-bit mono PCM). Trailing data that does not fill a whole frame is
    discarded.

    Args:
        frame_duration_ms: Desired frame duration in milliseconds.
        audio: The PCM audio data (a sliceable bytes-like object).
        sample_rate: The audio sample rate, in Hz.

    Yields:
        Frame: Successive frames carrying ``bytes``, the start ``timestamp``
        in seconds and the frame ``duration`` in seconds.

    Raises:
        ValueError: If the requested duration and sample rate give a frame
            size of zero (or fewer) bytes, which could never advance through
            the audio.
    """
    bytes_per_frame = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
    if bytes_per_frame <= 0:
        # Without this guard the loop below would never advance ``offset``.
        raise ValueError(
            "frame_duration_ms and sample_rate must produce a frame of at "
            "least one byte"
        )

    duration = (float(bytes_per_frame) / sample_rate) / 2.0
    timestamp = 0.0
    offset = 0
    # ``<=`` (not ``<``) so that the final complete frame is still emitted
    # when the audio length is an exact multiple of the frame size.
    while offset + bytes_per_frame <= len(audio):
        yield Frame(audio[offset : offset + bytes_per_frame], timestamp, duration)
        timestamp += duration
        offset += bytes_per_frame
def vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, vad, frames):
    """Group voiced audio frames into segments, dropping non-voiced audio.

    A sliding window holding the most recent frames (and the VAD verdict for
    each) drives a two-state machine:

    * While idle, the collector starts a segment once more than 90% of the
      window capacity is voiced. The frames already in the window are kept
      as the start of the segment, so a little leading audio is preserved.
    * While collecting, every frame is added to the segment. Once more than
      90% of the window capacity is unvoiced, the segment is emitted and the
      collector goes idle again, so a little trailing audio is preserved.

    Args:
        sample_rate: The audio sample rate, in Hz.
        frame_duration_ms: The frame duration in milliseconds.
        padding_duration_ms: The amount to pad the window, in milliseconds.
        vad: An instance of webrtcvad.Vad (anything exposing
            ``is_speech(data, sample_rate)``).
        frames: A source of audio frames (sequence or generator); each frame
            must expose a ``bytes`` attribute.

    Returns:
        [generator]: A generator that yields PCM audio data, one ``bytes``
        object per voiced segment.
    """
    window_capacity = int(padding_duration_ms / frame_duration_ms)
    # A bounded deque acts as the ring buffer: old entries fall off the left.
    window = collections.deque(maxlen=window_capacity)
    threshold = 0.9 * window.maxlen

    collecting = False
    segment = []

    for frame in frames:
        voiced = vad.is_speech(frame.bytes, sample_rate)
        # Both states record the newest frame and its verdict in the window.
        window.append((frame, voiced))

        if collecting:
            segment.append(frame)
            unvoiced_count = sum(1 for _, flag in window if not flag)
            if unvoiced_count > threshold:
                # Mostly silence now: close the segment and go idle.
                collecting = False
                yield b"".join(item.bytes for item in segment)
                window.clear()
                segment = []
            continue

        voiced_count = sum(1 for _, flag in window if flag)
        if voiced_count > threshold:
            # Mostly speech now: start a segment with the buffered frames.
            collecting = True
            segment.extend(item for item, _ in window)
            window.clear()

    # Input exhausted: flush any segment that is still open.
    if segment:
        yield b"".join(item.bytes for item in segment)
def vad_segment_generator(wavFile, aggressiveness, desired_sample_rate=None):
    """Generate VAD segments from a wav file, filtering out non-voiced audio.

    The audio is read with ``read_wave``, cut into 30 ms frames and passed
    through ``vad_collector`` with a 300 ms padding window.

    Args:
        wavFile (str): Path to input wav file to run VAD on.
        aggressiveness: Aggressiveness setting handed to ``webrtcvad.Vad``
            after conversion with ``int()``.
        desired_sample_rate (optional): Forwarded unchanged to ``read_wave``.
            Defaults to None.

    Returns:
        [tuple]: ``segments``: a generator yielding one ``bytes`` object of
        PCM audio per voiced segment (the longer audio split into multiple
        smaller ones). ``sample_rate``: Sample rate of the audio as returned
        by ``read_wave``. ``audio_length``: Duration of the audio as returned
        by ``read_wave``.

    Raises:
        AssertionError: If the sample rate is not one of 8000, 16000, 32000
            or 48000 Hz.
    """
    # Imported inside the function, as in the original code.
    # NOTE(review): presumably this avoids a circular import - confirm.
    from .transcribe_main import read_wave

    frame_duration_ms = 30
    padding_duration_ms = 300
    supported_sample_rates = (8000, 16000, 32000, 48000)

    # Use the module-level logger so records carry this module's name.
    logger.debug("Caught the wav file @: %s", wavFile)
    audio, sample_rate, audio_length = read_wave(wavFile, desired_sample_rate)

    if sample_rate not in supported_sample_rates:
        raise AssertionError(
            "The WebRTC VAD only accepts 16-bit mono PCM audio, sampled at 8000, 16000, 32000 or 48000 Hz."
        )

    vad = webrtcvad.Vad(int(aggressiveness))
    # The frames are consumed exactly once by vad_collector, so the lazy
    # generator is passed straight through instead of being copied to a list.
    frames = frame_generator(frame_duration_ms, audio, sample_rate)
    segments = vad_collector(
        sample_rate, frame_duration_ms, padding_duration_ms, vad, frames
    )

    return segments, sample_rate, audio_length