Spaces:
Runtime error
Runtime error
Commit
·
5d0c495
1
Parent(s):
cb79139
Create SpeechIntent.py
Browse files- SpeechIntent.py +75 -0
SpeechIntent.py
ADDED
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Import NeMo and its ASR, NLP and TTS collections
|
2 |
+
import nemo
|
3 |
+
# Import Speech Recognition collection
|
4 |
+
import nemo.collections.asr as nemo_asr
|
5 |
+
# Import Natural Language Processing collection
|
6 |
+
import nemo.collections.nlp as nemo_nlp
|
7 |
+
# Import Speech Synthesis collection
|
8 |
+
import nemo.collections.tts as nemo_tts
|
9 |
+
from nemo.collections.nlp.models.dialogue.dialogue_zero_shot_intent_model import DialogueZeroShotIntentModel
|
10 |
+
import whisper
|
11 |
+
from .utils import measure_time
|
12 |
+
|
13 |
+
|
14 |
+
class SpeechTranslate():
    """Speech pipeline: transcribe audio, translate, classify intent, synthesize speech.

    The pipeline transcribes an audio file with Whisper, normalizes the
    transcript to English (German input is translated, English input is used
    as-is) and then either

    * translates the English text to German (no intent labels), or
    * runs zero-shot intent classification on the English text and builds a
      German routing message from the top label (intent labels given).

    The resulting text is finally converted to audio with FastPitch + HiFi-GAN.

    Attributes written by the methods:
        intent_label: candidate intent labels, or ``None`` to skip
            intent classification.
        text: list of sentences produced by the last ``translate`` /
            ``process`` call; element 0 is the sentence that gets classified
            and synthesized.
    """

    @measure_time
    def __init__(self, intents=None):
        """Load all pretrained models.

        Args:
            intents: candidate intent labels handed to the zero-shot intent
                model, or ``None`` to disable intent classification.
        """
        self.intent_label = intents
        # Zero-shot intent classifier (English BERT base, uncased).
        self.intent_model = DialogueZeroShotIntentModel.from_pretrained(
            "zeroshotintent_en_bert_base_uncased"
        ).eval()
        # Speech recognition: Whisper "base" checkpoint.
        self.transcription = whisper.load_model("base")
        # Neural machine translation models: German -> English and English -> German.
        self.nmt_model = nemo_nlp.models.MTEncDecModel.from_pretrained(
            model_name='nmt_de_en_transformer24x6'
        ).eval()
        self.nmt_model_de = nemo_nlp.models.MTEncDecModel.from_pretrained(
            model_name='nmt_en_de_transformer24x6'
        ).eval()
        # Spectrogram generator: takes text as input and produces a spectrogram.
        self.spectrogram_generator = nemo_tts.models.FastPitchModel.from_pretrained(
            model_name="tts_de_fastpitch_singlespeaker"
        ).eval()
        # Vocoder: takes a spectrogram and produces the actual audio.
        self.vocoder = nemo_tts.models.HifiGanModel.from_pretrained(
            model_name="tts_de_slr_hifigan_ft_fastpitch_singlespeaker"
        ).eval()

    @measure_time
    def translate(self, speechfile):
        """Transcribe ``speechfile`` and store the text to process in ``self.text``.

        Args:
            speechfile: audio input passed straight to Whisper's ``transcribe``.

        Raises:
            NotImplementedError: if the detected language is neither German
                ("de") nor English ("en").
        """
        result = self.transcription.transcribe(speechfile)
        language = result["language"]

        # Always keep the English text as a *list* of sentences: the NMT
        # models take a list, and downstream code reads ``self.text[0]`` as
        # the whole sentence (with a bare string that is only its first
        # character).
        if language == "de":
            english_text = self.nmt_model.translate([result["text"]])
        elif language == "en":
            english_text = [result["text"]]
        else:
            raise NotImplementedError(f"Language: {language} currently not supported")

        if self.intent_label is None:
            # No intent classification requested: produce German text.
            self.text = self.nmt_model_de.translate(english_text)
        else:
            # The intent model is English-only, so keep the English text.
            self.text = english_text

    @measure_time
    def get_intent(self):
        """Classify ``self.text[0]`` against ``self.intent_label``.

        Returns:
            A tuple ``(message, label)`` where ``message`` is the German
            translation (as returned by the English -> German model) of the
            routing sentence and ``label`` is the top predicted intent label.
        """
        predictions = self.intent_model.predict([self.text[0]], self.intent_label)
        top_label = predictions[0]['labels'][0]
        routing_message = [
            f"This is a {top_label}, I will route you to the corresponding department"
        ]
        print(predictions)
        translated_message = self.nmt_model_de.translate(routing_message)
        return translated_message, top_label

    @measure_time
    def text_to_audio(self):
        """Synthesize ``self.text[0]`` and return the audio as a NumPy array."""
        tokens = self.spectrogram_generator.parse(self.text[0])
        spectrogram = self.spectrogram_generator.generate_spectrogram(tokens=tokens)
        audio = self.vocoder.convert_spectrogram_to_audio(spec=spectrogram)
        return audio.to('cpu').detach().numpy()

    @measure_time
    def process(self, speechfile, intents):
        """Run the full pipeline on one audio file.

        Args:
            speechfile: audio input passed to ``translate``.
            intents: comma-separated candidate intent labels, or ``None`` to
                skip intent classification. This replaces any labels given to
                the constructor.

        Returns:
            A tuple ``(audio, intent)``; ``intent`` is the top predicted label,
            or ``None`` when no intent labels were supplied.
        """
        self.intent_label = intents.split(",") if intents is not None else None
        self.translate(speechfile)

        # Bind ``intent`` up front so the no-intent path returns None instead
        # of raising UnboundLocalError at the return statement.
        intent = None
        if self.intent_label is not None:
            self.text, intent = self.get_intent()
        return self.text_to_audio(), intent
|
72 |
+
|
73 |
+
|
74 |
+
|
75 |
+
|