Spaces:

fathyshalab
/

nemointent

Runtime error

App Files Files Community

fathyshalab commited on Mar 23, 2023

Commit

5d0c495

1 Parent(s): cb79139

Create SpeechIntent.py

Browse files

Files changed (1) hide show

SpeechIntent.py +75 -0

SpeechIntent.py ADDED Viewed

	@@ -0,0 +1,75 @@

+# Import NeMo and its ASR, NLP and TTS collections
+import nemo
+# Import Speech Recognition collection
+import nemo.collections.asr as nemo_asr
+# Import Natural Language Processing collection
+import nemo.collections.nlp as nemo_nlp
+# Import Speech Synthesis collection
+import nemo.collections.tts as nemo_tts
+from nemo.collections.nlp.models.dialogue.dialogue_zero_shot_intent_model import DialogueZeroShotIntentModel
+import whisper
+from .utils import measure_time
+class SpeechTranslate():
+  @measure_time
+  def __init__(self,intents=None):
+    # Next, we instantiate all the necessary models directly from NVIDIA NGC
+    # Speech Recognition model - QuartzNet trained on Russian part of MCV 6.0
+    self.intent_label= intents
+    self.intent_model=DialogueZeroShotIntentModel.from_pretrained("zeroshotintent_en_bert_base_uncased").eval()
+    self.transcription= whisper.load_model("base")
+    # Neural Machine Translation model
+    self.nmt_model = nemo_nlp.models.MTEncDecModel.from_pretrained(model_name='nmt_de_en_transformer24x6').eval()
+    self.nmt_model_de = nemo_nlp.models.MTEncDecModel.from_pretrained(model_name='nmt_en_de_transformer24x6').eval()
+    # Spectrogram generator which takes text as an input and produces spectrogram
+    self.spectrogram_generator = nemo_tts.models.FastPitchModel.from_pretrained(model_name="tts_de_fastpitch_singlespeaker").eval()
+    # Vocoder model which takes spectrogram and produces actual audio
+    self.vocoder = nemo_tts.models.HifiGanModel.from_pretrained(model_name="tts_de_slr_hifigan_ft_fastpitch_singlespeaker").eval()
+  @measure_time
+  def translate(self,speechfile):
+    # Transcribe an audio file
+    # IMPORTANT: The audio must be mono with 16Khz sampling rate
+    text = self.transcription.transcribe(speechfile)
+    # You should see russian text here. Let's translate it to English
+    if text["language"]=="de":
+      english_text = self.nmt_model.translate([text["text"]])
+    elif text["language"]=="en":
+      english_text=text["text"]
+    else:
+      raise NotImplementedError(f"Language: {text['language']} currently not supported")
+    if self.intent_label is  None:
+      self.text = self.nmt_model_de.translate(english_text)
+    else:
+      self.text=english_text
+  # After this you should see English translation
+  # Let's convert it into audio
+  # A helper function which combines FastPitch and HiFiGAN to go directly from
+  # text to audio
+  @measure_time
+  def get_intent(self):
+    intents = self.intent_model.predict([self.text[0]],self.intent_label)
+    intent = [f"This is a {intents[0]['labels'][0]}, I will route you to the corresponding department"]
+    print(intents)
+    intenti = self.nmt_model_de.translate(intent)
+    return intenti,intents[0]['labels'][0]
+  @measure_time
+  def text_to_audio(self):
+    parsed = self.spectrogram_generator.parse(self.text[0])
+    spectrogram = self.spectrogram_generator.generate_spectrogram(tokens=parsed)
+    audio = self.vocoder.convert_spectrogram_to_audio(spec=spectrogram)
+    return audio.to('cpu').detach().numpy()
+  @measure_time
+  def process(self,speechfile,intents):
+    self.intent_label = intents.split(",") if intents is not None else None
+    self.translate(speechfile)
+    if self.intent_label is not None:
+      self.text,intent = self.get_intent()
+    return self.text_to_audio(),intent