谷歌為WebRTC專案開發的VAD是目前最優秀、最先進和免費的產品之一。webrtcvad是WebRTC語音活動檢測器(VAD)的python介面。相容python2和python3。功能是將一段音訊資料分為靜音與非靜音。它對於電話和語音識別很有用。
1、安裝pip
yum -y install epel-release
yum -y install python-pip
2、安裝webrtcvad
yum -y install python-devel
pip install webrtcvad
3、webrtcvad測試指令碼(test_webrtcvad.py)
"""Split a mono 16-bit PCM WAV file into voiced segments with WebRTC VAD.

Usage: python <this script> <aggressiveness> <path to wav file>

While running, the script prints one '1' (speech) or '0' (non-speech) per
frame, '+(t)' where a voiced segment starts, '-(t)' where it ends, and
'--end' after each detected segment.
"""
import collections
import contextlib
import sys
import wave

# Sample rates (Hz) accepted by read_wave.
SUPPORTED_SAMPLE_RATES = (8000, 16000, 32000)


def read_wave(path):
    """Read a WAV file and return ``(pcm_data, sample_rate)``.

    Args:
        path: Path of the WAV file to read.

    Returns:
        A tuple of the raw PCM bytes and the sample rate in Hz.

    Raises:
        ValueError: If the file is not mono, not 16-bit, or its sample rate
            is not one of SUPPORTED_SAMPLE_RATES.
    """
    with contextlib.closing(wave.open(path, 'rb')) as wf:
        # Explicit raises instead of asserts: asserts vanish under ``-O``.
        num_channels = wf.getnchannels()
        if num_channels != 1:
            raise ValueError(
                'expected mono audio, got %d channels' % (num_channels,))
        sample_width = wf.getsampwidth()
        if sample_width != 2:
            raise ValueError(
                'expected 16-bit samples, got %d-byte samples'
                % (sample_width,))
        sample_rate = wf.getframerate()
        if sample_rate not in SUPPORTED_SAMPLE_RATES:
            raise ValueError(
                'unsupported sample rate %d Hz, expected one of %s'
                % (sample_rate, SUPPORTED_SAMPLE_RATES))
        pcm_data = wf.readframes(wf.getnframes())
    return pcm_data, sample_rate


def write_wave(path, audio, sample_rate):
    """Write raw PCM bytes to ``path`` as a mono 16-bit WAV file.

    Args:
        path: Destination file path.
        audio: Raw PCM bytes to write.
        sample_rate: Sample rate in Hz stored in the WAV header.
    """
    with contextlib.closing(wave.open(path, 'wb')) as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)
        wf.setframerate(sample_rate)
        wf.writeframes(audio)


class Frame(object):
    """A fixed-length slice of PCM audio.

    Attributes are the raw bytes of the slice, its start time in seconds
    and its length in seconds.
    """

    def __init__(self, bytes, timestamp, duration):
        self.bytes = bytes
        self.timestamp = timestamp
        self.duration = duration


def frame_generator(frame_duration_ms, audio, sample_rate):
    """Yield consecutive Frames of ``frame_duration_ms`` from PCM bytes.

    Args:
        frame_duration_ms: Length of every frame in milliseconds.
        audio: Raw PCM bytes, two bytes per sample.
        sample_rate: Sample rate of ``audio`` in Hz.

    Yields:
        Frame objects covering ``audio`` from the start. A trailing chunk
        shorter than one full frame is discarded.

    Raises:
        ValueError: If the arguments give a frame size of zero or less.
    """
    # Two bytes per sample, hence the factor 2.
    n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
    if n <= 0:
        # A zero-length frame would never advance ``offset``.
        raise ValueError(
            'frame_duration_ms and sample_rate must give a positive '
            'frame size')
    offset = 0
    timestamp = 0.0
    duration = (float(n) / sample_rate) / 2.0
    # ``<=`` so that the final frame is kept when the audio ends exactly on
    # a frame boundary.
    while offset + n <= len(audio):
        yield Frame(audio[offset:offset + n], timestamp, duration)
        timestamp += duration
        offset += n


def vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, vad,
                  frames):
    """Group frames into voiced segments using a sliding padding window.

    The collector starts a segment once more than 90% of the frames in the
    window are speech, and ends it once more than 90% of the frames in the
    window are non-speech. A progress trace is written to stdout.

    Args:
        sample_rate: Sample rate of the frames in Hz.
        frame_duration_ms: Length of each frame in milliseconds.
        padding_duration_ms: Length of the sliding window in milliseconds.
        vad: Object exposing ``is_speech(frame_bytes, sample_rate)``.
        frames: Iterable of Frame objects.

    Yields:
        The concatenated bytes of each voiced segment.
    """
    num_padding_frames = int(padding_duration_ms / frame_duration_ms)
    # The window stores (frame, is_speech) pairs so that every frame is
    # classified exactly once instead of on every window evaluation.
    ring_buffer = collections.deque(maxlen=num_padding_frames)
    threshold = 0.9 * num_padding_frames
    triggered = False
    voiced_frames = []

    for frame in frames:
        is_speech = vad.is_speech(frame.bytes, sample_rate)
        sys.stdout.write('1' if is_speech else '0')
        ring_buffer.append((frame, is_speech))

        if not triggered:
            num_voiced = sum(1 for _, speech in ring_buffer if speech)
            if num_voiced > threshold:
                sys.stdout.write('+(%s)' % (ring_buffer[0][0].timestamp,))
                triggered = True
                # The window content is the start of the segment.
                voiced_frames.extend(f for f, _ in ring_buffer)
                ring_buffer.clear()
        else:
            voiced_frames.append(frame)
            num_unvoiced = sum(1 for _, speech in ring_buffer if not speech)
            if num_unvoiced > threshold:
                sys.stdout.write(
                    '-(%s)' % (frame.timestamp + frame.duration))
                triggered = False
                yield b''.join([f.bytes for f in voiced_frames])
                ring_buffer.clear()
                voiced_frames = []

    # ``triggered`` can only be True if the loop ran, so ``frame`` is bound.
    if triggered:
        sys.stdout.write('-(%s)' % (frame.timestamp + frame.duration))
    sys.stdout.write('\n')
    # Flush a segment that was still open when the audio ended.
    if voiced_frames:
        yield b''.join([f.bytes for f in voiced_frames])


def main(args):
    """Run voice activity detection on a WAV file and print the trace.

    Args:
        args: ``[aggressiveness, wav_path]``; aggressiveness is passed to
            ``webrtcvad.Vad`` as an integer.
    """
    if len(args) != 2:
        sys.stderr.write(
            'Usage: example.py <aggressiveness> <path to wav file>\n')
        sys.exit(1)
    # Imported here so the helpers above stay importable without the
    # native webrtcvad extension installed.
    import webrtcvad

    audio, sample_rate = read_wave(args[1])
    vad = webrtcvad.Vad(int(args[0]))
    frames = frame_generator(30, audio, sample_rate)
    segments = vad_collector(sample_rate, 30, 300, vad, frames)
    # Draining the generator is what drives detection and the trace output.
    # To save each segment, enumerate ``segments`` and call
    # write_wave('chunk-%002d.wav' % (i,), segment, sample_rate).
    for _ in segments:
        print('--end')


if __name__ == '__main__':
    main(sys.argv[1:])
4、執行命令(其中,第一個引數為激進程度(aggressiveness),取值0-3,數值越大表示過濾非語音越激進:0最寬鬆,3最嚴格,細微或含糊的聲音更容易被判為靜音;第二個引數為wav檔案存放路徑,檔案須為單聲道、16位元PCM,目前僅支援8K,16K,32K的取樣率,示例wav檔案下載:73.wav 連結:https://pan.baidu.com/s/19YJB9u0zvCFGBLDRisK1KQ 密碼:fgkf)
[root@host-10-0-251-159 ~]# python test_webrtcvad.py 2 /home/73.wav 00000000000000000000000000000000000000000000000000000000000000000000000000111111+(2.1)11111111111111111111111100000000-(3.36)--end 00000000000111111+(3.57)1111111111111111111111111111111111111001111111111111111111111111111111111111000000111111111111111111111111111111111111111111111111101111111111110000011110000000111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111000111111111111111111111111111100111111111111111111111111111111111111111111111100000000000011100000000-(14.43)--end 000000000000000000000000000000000000011+(15.3)111100000001110000-(16.14)--end 00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001111111+(21.21)11111111111111111111111110000000-(22.47)--end 00000000000111111+(22.68)111111111111111111111111111111111111111111100000000000-(24.6)--end 000000111111+(24.66)111111111111111111111111111111111111111111111111111110000000-(26.76)--end 1111111111+(26.76)1111111111111110000000000-(27.81)--end 000000001111+(27.87)11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111100000-(31.38)--end 0001111111+(31.38)11111111111111111110111111111111111000000-(32.91)--end 00000001111111111111+(33.21)111000111111111111111111111111111111111110000000000-(35.04)--end 000000000000000000000000000111111+(35.73)111111111111111111111111111111111111111111111111111000011111111111111111111111111000000011111111111111111111111111111111111111111111111111111111111111111111111111111000011100000000-(41.43)--end 000000000000000000000000000000000000000000000111111+(42.66)1111111111111111111110000000-(43.8)--end 
000000001111111+(43.95)1111111111111111111111110011111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111010000000-(51.03)--end 00000000111111+(51.15)1111111111111111111111111111111111111001111111111111111111111111111111111000000-(53.82)--end 0111111111+(53.82)11111111111111111111111111111111111111111111000011111111111111111111111111001111111111111111111111111111111111111111111111111111000111111111111111111111111111111111111111111111111111110000000-(59.85)--end 00000000000000000000000000111111+(60.51)11111111111111111111111111111111111000000111100111111111111111111111111111111111111111111111111111111111111111111111110011100000000-(64.74)--end 0000111000000000000000000001111111+(65.46)11111111111111111111111111111110011100000000000000-(67.26)--end 00000000111000000000111111+(67.74)111111111111111111111111111111111000100000000-(69.39)--end 00001111111+(69.42)11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111100001111111111110001111111111111111111111111111111110000000-(74.55)--end 1111111111+(74.55)111111111111111111111111111111111111100000011111011111111111111111111111111111111111111111111111111111111111111111111111111111111111100111111111111111111111111111111111111111111111111111111111111111111111000000000-(81.24)--end 0011111000000111111+(81.51)111111111111111111111111111111111111111111111111111111111111111111111100000000111111111111111111111111111111111111111111111111111111111111111001111111111111111111111111111111111000000001100000000-(87.66)--end 000000000001111111+(87.9)1111111111111111111111111111110111111000001100000000-(89.76)--end 000000000000000000000000000000000000000000000000111111+(91.08)1111111111111100000000-(92.04)--end 
0000000000000111111+(92.31)11111111111111110111011111111111111111111111110001111111111111111111111111111111111111111000001111111111111111111111111111111111111111100000000-(96.9)--end 000000000000000111111+(97.23)11111111111111111111111111111111111111100111111001111111111111111111111111111111111111111111111111001111111111111111111111111111111111111111100000000000000000-(102.27)--end 000111000000111111+(102.51)111111111111111111111111111111111111111111111110000000-(104.43)--end 0000111111+(104.43)111111111111111111111111111111110000000-(105.9)--end 11100111100000000011111111+(106.38)111000000011111111111111111111111111111100000000-(108.12)--end 00001111000000000011110111111+(108.69)111111111111111111111111111111110000000-(110.16)--end 000000000000000000000000011100111000111111+(111.12)111111111111111111100001111111111111111111111111110000000-(113.13)--end 0001111111+(113.13)111111111111111111111111111111111110000010000000-(114.87)--end 0111011111+(114.87)1111111111111111111111111111111111100000011111111111111111111111111111111111111111111110110000000-(118.08)--end 000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000