|
|
|
|
|
import numpy as np |
|
from scipy.signal import correlate2d |
|
from skimage.util import view_as_windows |
|
class WSOLA(object):
    """Change the duration of a waveform by similarity-guided overlap-add.

    The input is consumed in steps of ``epstep`` samples while the output is
    produced in steps of ``sl`` samples.  For each output step, the input
    position is nudged by up to ``sl`` samples in either direction so that the
    frame around it correlates best with the frame that naturally follows the
    previously copied segment.

    Parameters
    ----------
    fs : int
        Sampling frequency [Hz].
    speech_rate : float
        Relative speech rate.  The output of `duration_modification` has
        ``int(len(x) / speech_rate)`` samples, so values above 1 shorten the
        signal and values below 1 lengthen it.
    shiftms : int, optional
        Shift length [ms].

    Attributes
    ----------
    sl : int
        Number of samples in one shift.
    fl : int
        Number of samples in one frame (``2 * sl``).
    epstep : int
        Number of input samples advanced per output shift
        (``int(sl * speech_rate)``).
    win : ndarray
        Hanning window of length ``fl``.

    Raises
    ------
    ValueError
        If the parameters yield ``epstep < 1``; the analysis position could
        then never advance.
    """

    def __init__(self, fs, speech_rate, shiftms=10):
        self.fs = fs
        self.speech_rate = speech_rate

        self.shiftms = shiftms  # shift length [ms]
        self.sl = int(self.fs * self.shiftms / 1000)  # samples per shift
        self.fl = self.sl * 2  # samples per frame
        self.epstep = int(self.sl * self.speech_rate)  # input hop size
        if self.epstep < 1:
            # With a zero step the main loop of duration_modification would
            # never terminate, so reject the configuration up front.
            raise ValueError(
                'fs={!r}, shiftms={!r}, speech_rate={!r} give an analysis '
                'step of {} samples; it must be at least 1'.format(
                    fs, shiftms, speech_rate, self.epstep))
        self.win = np.hanning(self.fl)  # window applied to every frame

    def duration_modification(self, x):
        """Return a duration-modified copy of waveform `x`.

        Parameters
        ----------
        x : array_like
            Input waveform.  It is passed through ``np.asarray`` so plain
            sequences are accepted as well.

        Returns
        -------
        wsolaed : ndarray, dtype float64
            Array of ``int(len(x) / speech_rate)`` samples.  Samples that are
            never reached by the overlap-add loop stay zero.
        """
        x = np.asarray(x)
        wlen = len(x)
        wsolaed = np.zeros(int(wlen / self.speech_rate), dtype='d')
        out_len = len(wsolaed)

        # Initial positions: sp = start of the segment to fade out,
        # rp = centre of the reference frame, ep = nominal centre of the
        # search region, outp = write position in the output.
        sp = self.sl * 2
        rp = sp + self.sl
        ep = sp + self.epstep
        outp = self.sl

        # Copy the first shift unchanged.  Clamp to both buffers so inputs
        # (or outputs) shorter than one shift do not cause a shape mismatch.
        head = min(outp, out_len, wlen)
        wsolaed[:head] = x[:head]

        # Stop once the input is exhausted or the output is full; further
        # iterations could only write empty slices.
        while wlen > ep + self.fl and outp < out_len:
            ref = x[rp - self.sl:rp + self.sl]
            buff = x[ep - self.fl:ep + self.fl]

            # Offset (within +/- sl) of the best matching frame around ep.
            delta = self._search_minimum_distance(ref, buff)
            epd = ep + delta

            # Cross-fade: falling half-window on the segment after sp,
            # rising half-window on the segment that ends at epd.
            spdata = x[sp:sp + self.sl] * self.win[self.sl:]
            epdata = x[epd - self.sl:epd] * self.win[:self.sl]
            n = min(self.sl, out_len - outp)  # last segment may be truncated
            wsolaed[outp:outp + n] = spdata[:n] + epdata[:n]

            outp += self.sl

            # Move on to the next frame.
            sp = epd
            rp = sp + self.sl
            ep += self.epstep

        return wsolaed

    def _search_minimum_distance(self, ref, buff):
        """Find the offset in `buff` whose frame best matches `ref`.

        Parameters
        ----------
        ref : ndarray
            Reference frame; zero-padded at the end if shorter than ``fl``.
        buff : ndarray
            Search region from which every length-``fl`` window is compared
            against `ref`.

        Returns
        -------
        int
            Index of the window with the largest correlation, minus ``sl``.
        """
        if len(ref) < self.fl:
            ref = np.r_[ref, np.zeros(self.fl - len(ref))]

        # One row per candidate window, each weighted by the frame window.
        buffmat = view_as_windows(buff, self.fl) * self.win
        refwin = np.array(ref * self.win).reshape(1, self.fl)
        # 'valid' mode against a single row yields one score per candidate.
        corr = correlate2d(buffmat, refwin, mode='valid')

        return np.argmax(corr) - self.sl