File size: 955 Bytes
574ab7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
from pathlib import Path

from espnet2.bin.asr_inference import Speech2Text
import resampy
from espnet_model_zoo.downloader import ModelDownloader

# Model tag -> file name of the packed model archive. The file name is joined
# onto the caller-supplied model directory when a recogniser is built.
TAGS_TO_MODELS = {
    "phones": "asr_tts-phn_en.zip",
    "STT": "asr_stt_en.zip",
    "TTS": "asr_tts_en.zip",
}


class DemoASR:
    """Speech recogniser for the demo, wrapping an ESPnet ``Speech2Text`` model.

    The model archive is chosen by tag through ``TAGS_TO_MODELS`` and unpacked
    with ``ModelDownloader`` when the instance is created.
    """

    # Sample rate (Hz) that every input is resampled to before decoding.
    TARGET_SAMPLE_RATE = 16000

    def __init__(self, model_path, model_tag: str, device) -> None:
        """Unpack the archive selected by *model_tag* and build the recogniser.

        Args:
            model_path: Directory containing the model archives. May be a
                ``pathlib.Path`` or anything ``Path()`` accepts (e.g. ``str``).
            model_tag: Key of ``TAGS_TO_MODELS`` naming the model to load.
            device: Device to run on; forwarded to ``Speech2Text`` as
                ``str(device)``.

        Raises:
            KeyError: If *model_tag* is not a key of ``TAGS_TO_MODELS``.
        """
        # Validate first so a bad tag fails before any downloader work. The
        # exception type stays KeyError, which is what the bare dict lookup
        # raised, but the message now names the valid tags.
        if model_tag not in TAGS_TO_MODELS:
            raise KeyError(
                "unknown model tag {!r}; expected one of {}".format(
                    model_tag, sorted(TAGS_TO_MODELS)
                )
            )
        self.model_tag = model_tag

        # Path() makes the `/` join work for plain strings as well as Paths.
        archive = Path(model_path) / TAGS_TO_MODELS[model_tag]
        downloader = ModelDownloader()

        # Decoding settings are fixed for the demo; only the model files and
        # the device vary between instances.
        self.speech2text = Speech2Text(
            **downloader.download_and_unpack(str(archive)),
            device=str(device),
            minlenratio=0.0,
            maxlenratio=0.0,
            ctc_weight=0.4,
            beam_size=15,
            batch_size=1,
            nbest=1,
        )

    def recognize_speech(self, audio, sr) -> str:
        """Transcribe *audio* and return the text of the best hypothesis.

        Args:
            audio: Array of audio samples. A 2-D input is reduced to its
                first column (assumes a ``(samples, channels)`` layout —
                TODO confirm with callers).
            sr: Sample rate of *audio* in Hz.

        Returns:
            The text of the first n-best entry, or ``""`` when the recogniser
            returns no hypothesis at all.
        """
        if len(audio.shape) == 2:
            # Keep only the first channel.
            audio = audio.T[0]
        speech = resampy.resample(audio, sr, self.TARGET_SAMPLE_RATE)
        nbests = self.speech2text(speech)
        if len(nbests) == 0:
            # Indexing an empty n-best list would raise IndexError; report an
            # empty transcript instead.
            return ""
        text, *_ = nbests[0]
        return text