用python實現語音端點檢測(Voice Activity Detection,VAD)
發布日期:2022/10/11 7:19:28 瀏覽量:
1.準備環境
https://github.com/marsbroshok/VAD-python
里面的vad.py文件
2.具體代碼
import wave

import numpy as np

from vad import VoiceActivityDetector
def concat_segments(samples, segments):
    """Join the slices of *samples* selected by *segments* into one int16 array.

    Args:
        samples: Sequence of audio samples (the detector's ``data``).
        segments: Iterable of ``(start, end)`` pairs. Each bound is truncated
            with ``int()`` and used directly as a slice index into *samples*,
            so the pairs are treated as sample positions, not seconds.

    Returns:
        A NumPy ``int16`` array holding the selected slices back to back;
        an empty ``int16`` array when *segments* is empty.
    """
    pieces = [samples[int(start):int(end)] for start, end in segments]
    if not pieces:
        # np.concatenate rejects an empty list, so handle "no speech" here.
        return np.zeros(0, dtype=np.int16)
    return np.concatenate(pieces).astype(np.int16)


if __name__ == "__main__":
    load_file = "test.wav"
    save_file = "process.wav"

    # Ask the detector for the VAD split points.
    v = VoiceActivityDetector(load_file)
    raw_detection = v.detect_speech()
    speech_labels, point_labels = v.convert_windows_to_readible_labels(raw_detection)

    # len() rather than truthiness: point_labels may be an array-like whose
    # boolean value is ambiguous.
    if len(point_labels) != 0:
        # Cut the audio at the split points and join the pieces.
        wavdata = concat_segments(v.data, point_labels)

        # Save the result; the context manager closes the file even if a
        # write fails.
        with wave.open(save_file, "wb") as f:
            # channels, sample width in bytes, sample rate, frame count,
            # compression type, compression name
            f.setparams((1, 2, v.rate, len(wavdata), "NONE", "NONE"))
            f.writeframes(wavdata.tobytes())
vad.py文件
class VoiceActivityDetector:
    """Use signal energy to detect voice activity in wav file."""

    def __init__(self, wave_input_filename):
        """Load *wave_input_filename* and set the detection parameters.

        Args:
            wave_input_filename: Path handed to ``_read_wav``.
        """
        # Load first, then downmix: _convert_to_mono is invoked on whatever
        # _read_wav returns.
        loaded = self._read_wav(wave_input_filename)
        loaded._convert_to_mono()

        # A 20 ms window is moved along the audio data, with 10 ms overlap.
        self.sample_window = 0.02
        self.sample_overlap = 0.01

        # A median filter of length 0.5 s smooths the detected speech regions.
        self.speech_window = 0.5

        # Ratio of speech-band energy to the window's total energy; a window
        # whose ratio exceeds this threshold (60%) is marked as speech.
        self.speech_energy_threshold = 0.6

        # Band limits in Hz; the original note says filtering keeps
        # 2000-8000 Hz.
        self.speech_start_band, self.speech_end_band = 2000, 8000

        # Starts empty; nothing in this excerpt fills it.
        self.data_speech = []

馬上咨詢: 如果您有業務方面的問題或者需求,歡迎您咨詢!我們帶來的不僅僅是技術,還有行業經驗積累。
QQ: 39764417/308460098 Phone: 13 9800 1 9844 / 135 6887 9550 聯系人:石先生/雷先生