fathyshalab committed
Commit 5d0c495 · 1 Parent(s): cb79139

Create SpeechIntent.py

Files changed (1)
  1. SpeechIntent.py +75 -0
SpeechIntent.py ADDED
@@ -0,0 +1,75 @@
+ # Import NeMo and its ASR, NLP and TTS collections
+ import nemo
+ # Import Speech Recognition collection
+ import nemo.collections.asr as nemo_asr
+ # Import Natural Language Processing collection
+ import nemo.collections.nlp as nemo_nlp
+ # Import Speech Synthesis collection
+ import nemo.collections.tts as nemo_tts
+ from nemo.collections.nlp.models.dialogue.dialogue_zero_shot_intent_model import DialogueZeroShotIntentModel
+ import whisper
+ from .utils import measure_time
+
+
+ class SpeechTranslate:
+     @measure_time
+     def __init__(self, intents=None):
+         # Instantiate all the necessary models directly from NVIDIA NGC
+         self.intent_label = intents
+         # Zero-shot intent classification model (BERT base, uncased)
+         self.intent_model = DialogueZeroShotIntentModel.from_pretrained("zeroshotintent_en_bert_base_uncased").eval()
+         # Speech Recognition model - OpenAI Whisper "base"
+         self.transcription = whisper.load_model("base")
+         # Neural Machine Translation models (German -> English and English -> German)
+         self.nmt_model = nemo_nlp.models.MTEncDecModel.from_pretrained(model_name='nmt_de_en_transformer24x6').eval()
+         self.nmt_model_de = nemo_nlp.models.MTEncDecModel.from_pretrained(model_name='nmt_en_de_transformer24x6').eval()
+         # Spectrogram generator which takes text as an input and produces a spectrogram
+         self.spectrogram_generator = nemo_tts.models.FastPitchModel.from_pretrained(model_name="tts_de_fastpitch_singlespeaker").eval()
+         # Vocoder model which takes a spectrogram and produces the actual audio
+         self.vocoder = nemo_tts.models.HifiGanModel.from_pretrained(model_name="tts_de_slr_hifigan_ft_fastpitch_singlespeaker").eval()
+
+     @measure_time
+     def translate(self, speechfile):
+         # Transcribe an audio file; Whisper loads the file and resamples it
+         # to mono 16 kHz internally, and also detects the spoken language
+         text = self.transcription.transcribe(speechfile)
+         if text["language"] == "de":
+             # German speech: translate the transcript to English
+             english_text = self.nmt_model.translate([text["text"]])
+         elif text["language"] == "en":
+             # Wrap in a list so downstream code can index self.text[0]
+             english_text = [text["text"]]
+         else:
+             raise NotImplementedError(f"Language: {text['language']} currently not supported")
+         if self.intent_label is None:
+             # No intents requested: translate back to German for the German TTS models
+             self.text = self.nmt_model_de.translate(english_text)
+         else:
+             # Keep the English text for zero-shot intent classification
+             self.text = english_text
+
+     @measure_time
+     def get_intent(self):
+         intents = self.intent_model.predict([self.text[0]], self.intent_label)
+         intent = [f"This is a {intents[0]['labels'][0]}, I will route you to the corresponding department"]
+         print(intents)
+         # Translate the routing message to German for synthesis
+         intent_de = self.nmt_model_de.translate(intent)
+         return intent_de, intents[0]['labels'][0]
+
+     @measure_time
+     def text_to_audio(self):
+         # Combine FastPitch and HiFiGAN to go directly from text to audio
+         parsed = self.spectrogram_generator.parse(self.text[0])
+         spectrogram = self.spectrogram_generator.generate_spectrogram(tokens=parsed)
+         audio = self.vocoder.convert_spectrogram_to_audio(spec=spectrogram)
+         return audio.to('cpu').detach().numpy()
+
+     @measure_time
+     def process(self, speechfile, intents):
+         # intents is a comma-separated string of candidate labels, or None
+         self.intent_label = intents.split(",") if intents is not None else None
+         self.translate(speechfile)
+         intent = None
+         if self.intent_label is not None:
+             self.text, intent = self.get_intent()
+         return self.text_to_audio(), intent
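
For reference, a minimal usage sketch of the class added above. This is a sketch, not part of this commit: the import path depends on the package layout that provides the `.utils.measure_time` decorator the module imports, and the audio file name and intent list below are hypothetical placeholders.

# hypothetical driver script, assuming SpeechIntent.py sits in an importable package
from SpeechIntent import SpeechTranslate

pipeline = SpeechTranslate()
# the comma-separated intent string supplies candidate labels for routing;
# pass None instead to get a plain speech-to-speech translation
audio, intent = pipeline.process("caller_recording_de.wav", "booking,complaint,cancellation")
# audio: numpy waveform from the HiFi-GAN vocoder
# intent: top zero-shot label (None when no intents were given)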