# -*- coding: utf-8 -*-
"""Waveform-similarity overlap-add (WSOLA) duration modification."""

import numpy as np
from scipy.signal import correlate2d

try:
    from skimage.util import view_as_windows
except ImportError:  # scikit-image missing: use an equivalent NumPy fallback

    def view_as_windows(arr_in, window_shape):
        """Return every length-``window_shape`` window of a 1-D array.

        Stand-in for the 1-D, step-1 use of
        ``skimage.util.view_as_windows`` made by this module. Row ``i`` of
        the result holds ``arr_in[i:i + window_shape]``. Unlike the
        scikit-image function, the result is a copy rather than a view.
        """
        arr_in = np.asarray(arr_in)
        starts = np.arange(len(arr_in) - window_shape + 1)
        return arr_in[starts[:, np.newaxis] + np.arange(window_shape)]


class WSOLA(object):
    """Change the duration of a waveform by overlap-adding half frames.

    Attributes set by the constructor:
        fs: sampling frequency passed by the caller.
        speech_rate: rate factor; the output holds
            ``int(len(x) / speech_rate)`` samples.
        shiftms: synthesis hop (Hs) in milliseconds.
        sl: synthesis hop (Hs) in samples.
        fl: frame length in samples, twice ``sl``.
        epstep: analysis hop (Ha) in samples, ``int(sl * speech_rate)``.
        win: Hanning window of length ``fl``.
    """

    def __init__(self, fs, speech_rate, shiftms=10):
        """Derive hop sizes and the analysis window.

        Args:
            fs: sampling frequency in Hz.
            speech_rate: rate factor; must be large enough that the
                analysis hop ``int(sl * speech_rate)`` is at least one
                sample.
            shiftms: synthesis hop in milliseconds.

        Raises:
            ValueError: if the resulting analysis hop is smaller than one
                sample (the main loop could never advance).
        """
        self.fs = fs
        self.speech_rate = speech_rate
        self.shiftms = shiftms  # synthesis hop (Hs) in milliseconds
        self.sl = int(self.fs * self.shiftms / 1000)  # Hs in samples
        self.fl = self.sl * 2  # frame length: twice the synthesis hop
        self.epstep = int(self.sl * self.speech_rate)  # analysis hop (Ha)
        if self.epstep < 1:
            # With a zero hop the search position never moves forward and
            # duration_modification would loop forever.
            raise ValueError(
                "speech_rate={!r} with fs={!r} and shiftms={!r} gives an "
                "analysis hop of {} samples; it must be at least 1".format(
                    speech_rate, fs, shiftms, self.epstep))
        self.win = np.hanning(self.fl)  # window function

    def duration_modification(self, x):
        """Return a duration-modified copy of waveform ``x``.

        Args:
            x: 1-D waveform (any array-like accepted by ``np.asarray``).

        Returns:
            A float64 array of ``int(len(x) / speech_rate)`` samples.
            Samples past the last overlap-added frame are left at zero.
        """
        x = np.asarray(x)
        wlen = len(x)  # input length; output length is wlen / speech_rate
        wsolaed = np.zeros(int(wlen / self.speech_rate), dtype='d')

        sp = self.sl * 2          # centre of the frame being faded out
        rp = sp + self.sl         # centre of its natural continuation
        ep = sp + self.epstep     # centre of the next search region
        outp = self.sl

        # Copy the first hop unchanged; clamp so that inputs or outputs
        # shorter than one hop do not raise a shape-mismatch error.
        head = min(outp, wlen, len(wsolaed))
        wsolaed[:head] = x[:head]

        while wlen > ep + self.fl:
            ref = x[rp - self.sl:rp + self.sl]    # natural continuation
            buff = x[ep - self.fl:ep + self.fl]   # search region

            # Offset (relative to ep) of the frame in the search region
            # that best matches the natural continuation.
            delta = self._search_minimum_distance(ref, buff)
            epd = ep + delta

            # Overlap-add: right half of the current frame fades out while
            # the left half of the matched frame fades in.
            spdata = x[sp:sp + self.sl] * self.win[self.sl:]
            epdata = x[epd - self.sl:epd] * self.win[:self.sl]

            # The output buffer may end inside (or before) this hop, so
            # write only as many samples as still fit.
            room = len(wsolaed[outp:outp + self.sl])
            wsolaed[outp:outp + room] = spdata[:room] + epdata[:room]
            outp += self.sl

            # Positions for the next frame.
            sp = epd
            rp = sp + self.sl
            ep += self.epstep

        return wsolaed

    def _search_minimum_distance(self, ref, buff):
        """Locate the frame of ``buff`` most similar to ``ref``.

        Despite the name, similarity is measured by maximising the
        cross-correlation between the windowed reference and every
        windowed length-``fl`` frame of ``buff``.

        Args:
            ref: reference frame; zero-padded to ``fl`` samples if shorter.
            buff: search region containing the candidate frames.

        Returns:
            Index of the best-matching frame minus ``sl``.
        """
        if len(ref) < self.fl:
            ref = np.r_[ref, np.zeros(self.fl - len(ref))]

        # One row per candidate frame (shifted one sample at a time),
        # each multiplied by the analysis window.
        buffmat = view_as_windows(buff, self.fl) * self.win
        refwin = np.array(ref * self.win).reshape(1, self.fl)
        corr = correlate2d(buffmat, refwin, mode='valid')

        return np.argmax(corr) - self.sl