HanaeRateau committed
Commit e9c2890 • 1 Parent(s): da0a5ea
first commit

Files changed:
- TTS_models.py +109 -0
- app.py +188 -0
- requirements.txt +11 -0
TTS_models.py
ADDED
@@ -0,0 +1,109 @@
+from abc import ABC, abstractmethod
+import io
+
+import numpy as np
+
+import torch
+from transformers import pipeline
+from datasets import load_dataset
+
+class TTSModel(ABC):  # common interface for the TTS back-ends below
+    def __init__(self, model_name):
+        self.hf_name = model_name
+        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+    @abstractmethod
+    def synthesize(self, text):
+        pass
+
+#####
+
+####################################################
+class SpeechT5(TTSModel):
+    def __init__(self, name="microsoft/speecht5_tts"):
+        super(SpeechT5, self).__init__(name)
+        self.synthesiser = pipeline("text-to-speech", model=self.hf_name, device=self.device)
+        self.embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+        self.speaker_embedding = torch.tensor(self.embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+
+
+    def synthesize(self, text):
+        speech = self.synthesiser(text, forward_params={"speaker_embeddings": self.speaker_embedding})
+        print("[SpeechT5 - synthesize]", speech)
+        return (np.array(speech["audio"]) * 32767).astype(np.int16)  # return a numpy array of int16 samples to play
+
+####################################################
+# PENDING: NOT WORKING FROM HF
+# from MeloTTS.melo.api import TTS as meloTTS
+# import nltk
+
+# class MeloTTS(TTSModel):
+#     def __init__(self, name="myshell-ai/MeloTTS-English"):
+#         super(MeloTTS, self).__init__(name)
+#         nltk.download('averaged_perceptron_tagger_eng')
+#         self.synthesiser = meloTTS(language='EN', device=self.device)
+#         self.speaker_ids = self.synthesiser.hps.data.spk2id
+
+#     def synthesize(self, text):
+#         speech = self.synthesiser.tts_to_file(text, self.speaker_ids['EN-Default'])
+#         print("[MeloTTS - synthesize]", speech)
+#         return speech
+
+####################################################
+class Bark(TTSModel):
+    def __init__(self, name="suno/bark"):
+        super(Bark, self).__init__(name)
+        self.synthesiser = pipeline("text-to-speech", model=self.hf_name, device=self.device)
+
+    def synthesize(self, text):
+        speech = self.synthesiser(text)
+        print("[Bark - synthesize]", speech)
+        return speech
+
+####################################################
+# pip install git+https://github.com/huggingface/parler-tts.git
+
+from parler_tts import ParlerTTSForConditionalGeneration
+from transformers import AutoTokenizer
+
+class ParlerTTS(TTSModel):
+    def __init__(self, name="parler-tts/parler-tts-large-v1"):
+        super(ParlerTTS, self).__init__(name)
+        self.description = "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
+        self.model = ParlerTTSForConditionalGeneration.from_pretrained(self.hf_name).to(self.device)
+        self.tokenizer = AutoTokenizer.from_pretrained(self.hf_name)
+        # self.synthesiser = pipeline("text-to-speech", model=self.model, tokenizer=self.tokenizer, device=self.device)
+
+    def synthesize(self, text):
+        input_ids = self.tokenizer(self.description, return_tensors="pt").input_ids.to(self.device)
+        prompt_input_ids = self.tokenizer(text, return_tensors="pt").input_ids.to(self.device)
+
+        generation = self.model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
+        speech = generation.cpu().numpy().squeeze()
+        print("[ParlerTTS - synthesize]", speech)
+        return speech
+
+####################################################
+# PENDING: NOT WORKING FROM HF
+# pip install coqui-tts
+# https://github.com/idiap/coqui-ai-TTS
+
+from TTS.api import TTS
+class XTTS(TTSModel):
+    def __init__(self, name="tts_models/en/ljspeech/glow-tts"):
+        super(XTTS, self).__init__(name)
+        self.synthesiser = TTS(model_name=name, progress_bar=False).to(self.device)
+        # self.model = AutoModelForSequenceClassification.from_pretrained(self.hf_name).to(self.device)
+        # self.tokenizer = AutoTokenizer.from_pretrained(self.hf_name)
+        # self.synthesiser = pipeline("text-to-speech", model=self.model, tokenizer=self.tokenizer, device=self.device)
+
+        # self.synthesiser = pipeline("text-to-speech", model=self.hf_name, device=self.device)
+
+
+    def synthesize(self, text):
+        # input_ids = self.tokenizer(self.description, return_tensors="pt").input_ids.to(self.device)
+        # prompt_input_ids = self.tokenizer(text, return_tensors="pt").input_ids.to(self.device)
+        # print("synthesizing ", text)
+        speech = self.synthesiser.tts(text=text)
+        print("[XTTS - synthesize]", len(speech), text)
+        return speech
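
A minimal usage sketch for these wrappers, assuming the checkpoints download successfully and that the soundfile package is available for writing WAV files (it is not listed in requirements.txt); the file names and test sentence are arbitrary:

    import numpy as np
    import soundfile as sf  # assumption: installed separately, only used here to save the output

    from TTS_models import SpeechT5, XTTS

    text = "Hello, this is a quick synthesis test."

    # SpeechT5.synthesize() already returns int16 samples; SpeechT5 generates 16 kHz audio
    sf.write("speecht5_demo.wav", SpeechT5().synthesize(text), 16000)

    # XTTS (Coqui) returns float samples; scale to int16 the same way app.py does
    coqui_audio = XTTS("tts_models/en/ljspeech/tacotron2-DDC_ph").synthesize(text)
    sf.write("coqui_demo.wav", (np.array(coqui_audio) * 32767).astype(np.int16), 22050)  # 22050 Hz matches the rate app.py uses
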
app.py
ADDED
@@ -0,0 +1,188 @@
+import os
+import gradio as gr
+import numpy as np
+import torch
+import ollama
+import emoji
+from datasets import load_dataset
+from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline
+from huggingface_hub import login
+
+from TTS_models import *
+
+login(token=os.getenv('HF_TOKEN'))
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+# load speech translation checkpoint
+STT_model_id = "openai/whisper-tiny"
+
+# load llm
+llm_model_id = "gemma2:2b"
+
+# init TTS model
+TTS_model_id = "tts_models/en/ljspeech/tacotron2-DDC_ph"
+
+client = ollama.Client()
+llmpipe = pipeline(  # fallback text-generation pipeline used when ollama is unavailable
+    "text-generation",
+    model="google/gemma-2-2b-it",
+    model_kwargs={"torch_dtype": torch.bfloat16},
+    device=device
+)
+
+def translate(audio):
+    global STT_model_id
+    asr_pipe = pipeline("automatic-speech-recognition", model=STT_model_id, device=device)
+    outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate", "language": "fr"})
+    print(f'Translated {outputs} using {asr_pipe.model}')
+    return outputs["text"]
+
+def transcribe(audio):
+    global STT_model_id
+    asr_pipe = pipeline("automatic-speech-recognition", model=STT_model_id, device=device)
+    outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe"})
+    print(f'[transcribe] Transcribe {outputs}')
+    return outputs["text"]
+
+def chatCompletion(text):
+    global llm_model_id
+    global llmpipe
+    global client
+
+    messages = [
+        {"role": "user", "content": "You are a helpful assistant. Answer in English only in text.\n\n" + text},
+    ]
+
+    try:
+        response: ollama.ListResponse = ollama.list()  # raises if no ollama server is reachable
+
+        response = client.chat(
+            model=llm_model_id,
+            messages=messages,
+            stream=True,
+            options={
+                'num_predict': 256,
+                'temperature': 0.5,
+                'low_vram': True,
+            },
+        )
+
+        buffer = ""
+        for chunk in response:
+            buffer += chunk["message"]["content"]
+
+        print(f'[chatCompletion] {buffer}')
+        return buffer
+    except Exception:
+        outputs = llmpipe(messages, max_new_tokens=256)  # fall back to the local transformers pipeline
+        buffer = outputs[0]["generated_text"][-1]["content"].strip()
+
+        print(f'[chatCompletion] {buffer}')
+        return buffer
+
+
+def synthesise(text):
+    global TTS_model_id
+    text = emoji.replace_emoji(text, replace="!")  # replace emojis with "!" before synthesis
+    synthesiser = XTTS(TTS_model_id)
+    speech = synthesiser.synthesize(text)
+
+    return (np.array(speech) * 32767).astype(np.int16)
+
+def speech_to_speech_translation(audioMic, audioFile):
+    audio = None
+    if audioMic is not None:
+        audio = audioMic
+    elif audioFile is not None:
+        audio = audioFile
+
+    # translate the audio to English text, then synthesize it
+    translated_text = translate(audio)
+    synthesised_speech = synthesise(translated_text)
+    return (22050, synthesised_speech), translated_text
+
+def speech_to_speech(audioMic, audioFile):
+    audio = None
+    if audioMic is not None:
+        audio = audioMic
+    elif audioFile is not None:
+        audio = audioFile
+
+    translated_text = "Sorry no audio was found."
+    answer = translated_text
+
+    if audio is not None:
+        # Transcribe audio
+        translated_text = transcribe(audio)
+
+        # Call LLM
+        answer = chatCompletion(translated_text)
+
+    # Synthesize answer (or the fallback message if no audio was provided)
+    synthesised_speech = synthesise(answer)
+
+    print(f'[speech_to_speech] Transcribed text {translated_text}')
+    print(f'[speech_to_speech] LLM answer {answer}')
+
+    return (22050, synthesised_speech), answer
+
+with gr.Blocks() as demo:
+    options = gr.WaveformOptions(sample_rate=22050)
+
+    with gr.Tab("Instant Translation"):
+        gr.Markdown(
+            """
+            # Translation of audio to audio
+            The aim of this tab is to demonstrate the speech-to-speech translation capabilities of the [whisper-tiny](https://huggingface.co/openai/whisper-tiny) model.
+
+            It uses:
+            - [whisper-tiny](https://huggingface.co/openai/whisper-tiny) to translate the speech to English text,
+            - and a [Coqui TTS](https://github.com/idiap/coqui-ai-TTS) model as a voice synthesizer.
+
+            You can either record yourself or upload an audio file in the tabs below.
+            The audio will be translated to English.
+            """)
+        with gr.Row():
+            with gr.Column(scale=1):
+                with gr.Tab("Record Audio"):
+                    audioMic = gr.Audio(sources="microphone", waveform_options=options, type="filepath")
+                with gr.Tab("Upload Audio"):
+                    audioFile = gr.Audio(sources="upload", waveform_options=options)
+
+                transcribeBtn = gr.Button("Submit", size='lg')
+
+            with gr.Column(scale=1):
+                textOutput = gr.Textbox(label="Transcribed text")
+                audioOutput = gr.Audio(waveform_options=options, type="numpy")
+
+        transcribeBtn.click(fn=speech_to_speech_translation, inputs=[audioMic, audioFile], outputs=[audioOutput, textOutput], api_name="report_generation")
+
+    with gr.Tab("Voice Assistant"):
+        gr.Markdown(
+            """
+            # Voice Assistant
+            This is a demo to show the possibilities for building your own voice assistant.
+            This demo uses:
+            - [whisper-tiny](https://huggingface.co/openai/whisper-tiny) to transcribe,
+            - the [ollama/gemma2:2b](https://ollama.com/library/gemma2:2b) model to generate the assistant's answer,
+            - and a [Coqui TTS](https://github.com/idiap/coqui-ai-TTS) model as a voice synthesizer.
+
+            This means that you need to install ollama on your machine to use this tab.
+
+            You can either record yourself or upload an audio file in the tabs below.
+            """)
+        with gr.Row():
+            with gr.Column(scale=1):
+                with gr.Tab("Record Audio"):
+                    audioMic = gr.Audio(sources="microphone", waveform_options=options, type="filepath")
+                with gr.Tab("Upload Audio"):
+                    audioFile = gr.Audio(sources="upload", waveform_options=options)
+
+                transcribeBtn = gr.Button("Submit", size='lg')
+            with gr.Column(scale=1):
+                textOutput = gr.Textbox(label="Transcribed text")
+                audioOutput = gr.Audio(waveform_options=options, type="numpy")
+
+        transcribeBtn.click(fn=speech_to_speech, inputs=[audioMic, audioFile], outputs=[audioOutput, textOutput], api_name="report_generation")
+
+demo.launch(auth=("FM", "FlandersM4ke"))
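
Because chatCompletion() first tries a local ollama server and only then falls back to the transformers pipeline, a short pre-flight sketch can help verify that side; it assumes the ollama Python client is installed and an ollama daemon is running, and uses the same gemma2:2b model name as llm_model_id in app.py:

    import ollama

    # ollama.list() raises if no daemon is reachable, the same check chatCompletion() relies on
    ollama.list()

    # make sure the model referenced by app.py is present locally
    ollama.pull("gemma2:2b")

    # stream a short completion the way app.py does
    client = ollama.Client()
    buffer = ""
    for chunk in client.chat(model="gemma2:2b",
                             messages=[{"role": "user", "content": "Say hello in one sentence."}],
                             stream=True):
        buffer += chunk["message"]["content"]
    print(buffer)
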
requirements.txt
ADDED
@@ -0,0 +1,11 @@
+torch
+accelerate
+torchaudio
+transformers
+gradio
+pypdf
+langchain-ollama
+emoji
+coqui-tts
+git+https://github.com/huggingface/parler-tts.git
+nltk