HanaeRateau committed
Commit e9c2890 • 1 Parent(s): da0a5ea
first commit

Files changed:
- TTS_models.py +109 -0
- app.py +188 -0
- requirements.txt +11 -0
TTS_models.py
ADDED
@@ -0,0 +1,109 @@
+from abc import ABC, abstractmethod
+import io
+
+import numpy as np
+
+import torch
+from transformers import pipeline
+from datasets import load_dataset
+
+class TTSModel(ABC):  # common interface for the TTS back-ends below
+    def __init__(self, model_name):
+        self.hf_name = model_name
+        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+    @abstractmethod
+    def synthesize(self, text):
+        pass
+
+#####
+
+####################################################
+class SpeechT5(TTSModel):
+    def __init__(self, name="microsoft/speecht5_tts"):
+        super(SpeechT5, self).__init__(name)
+        self.synthesiser = pipeline("text-to-speech", model=self.hf_name, device=self.device)
+        self.embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+        self.speaker_embedding = torch.tensor(self.embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+
+
+    def synthesize(self, text):
+        speech = self.synthesiser(text, forward_params={"speaker_embeddings": self.speaker_embedding})
+        print("[SpeechT5 - synthesize]", speech)
+        return (np.array(speech["audio"]) * 32767).astype(np.int16)  # return a numpy array of int16 samples to play
+
+####################################################
+# PENDING: NOT WORKING FROM HF
+# from MeloTTS.melo.api import TTS as meloTTS
+# import nltk
+
+# class MeloTTS(TTSModel):
+#     def __init__(self, name="myshell-ai/MeloTTS-English"):
+#         super(MeloTTS, self).__init__(name)
+#         nltk.download('averaged_perceptron_tagger_eng')
+#         self.synthesiser = meloTTS(language='EN', device=self.device)
+#         self.speaker_ids = self.synthesiser.hps.data.spk2id
+
+#     def synthesize(self, text):
+#         speech = self.synthesiser.tts_to_file(text, self.speaker_ids['EN-Default'])
+#         print("[MeloTTS - synthesize]", speech)
+#         return speech
+
+####################################################
+class Bark(TTSModel):
+    def __init__(self, name="suno/bark"):
+        super(Bark, self).__init__(name)
+        self.synthesiser = pipeline("text-to-speech", model=self.hf_name, device=self.device)
+
+    def synthesize(self, text):
+        speech = self.synthesiser(text)
+        print("[Bark - synthesize]", speech)
+        return speech
+
+####################################################
+# pip install git+https://github.com/huggingface/parler-tts.git
+
+from parler_tts import ParlerTTSForConditionalGeneration
+from transformers import AutoTokenizer
+
+class ParlerTTS(TTSModel):
+    def __init__(self, name="parler-tts/parler-tts-large-v1"):
+        super(ParlerTTS, self).__init__(name)
+        self.description = "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
+        self.model = ParlerTTSForConditionalGeneration.from_pretrained(self.hf_name).to(self.device)
+        self.tokenizer = AutoTokenizer.from_pretrained(self.hf_name)
+        # self.synthesiser = pipeline("text-to-speech", model=self.model, tokenizer=self.tokenizer, device=self.device)
+
+    def synthesize(self, text):
+        input_ids = self.tokenizer(self.description, return_tensors="pt").input_ids.to(self.device)
+        prompt_input_ids = self.tokenizer(text, return_tensors="pt").input_ids.to(self.device)
+
+        generation = self.model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
+        speech = generation.cpu().numpy().squeeze()
+        print("[ParlerTTS - synthesize]", speech)
+        return speech
+
+####################################################
+# PENDING: NOT WORKING FROM HF
+# pip install coqui-tts
+# https://github.com/idiap/coqui-ai-TTS
+
+from TTS.api import TTS
+class XTTS(TTSModel):
+    def __init__(self, name="tts_models/en/ljspeech/glow-tts"):
+        super(XTTS, self).__init__(name)
+        self.synthesiser = TTS(model_name=name, progress_bar=False).to(self.device)
+        # self.model = AutoModelForSequenceClassification.from_pretrained(self.hf_name).to(self.device)
+        # self.tokenizer = AutoTokenizer.from_pretrained(self.hf_name)
+        # self.synthesiser = pipeline("text-to-speech", model=self.model, tokenizer=self.tokenizer, device=self.device)
+
+        # self.synthesiser = pipeline("text-to-speech", model=self.hf_name, device=self.device)
+
+
+    def synthesize(self, text):
+        # input_ids = self.tokenizer(self.description, return_tensors="pt").input_ids.to(self.device)
+        # prompt_input_ids = self.tokenizer(text, return_tensors="pt").input_ids.to(self.device)
+        # print("synthesizing ", text)
+        speech = self.synthesiser.tts(text=text)
+        print("[XTTS - synthesize]", len(speech), text)
+        return speech
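
A minimal usage sketch for these wrappers, assuming the checkpoints download successfully and that the soundfile package is available for writing WAV files (it is not listed in requirements.txt); the file names and test sentence are arbitrary:

    import numpy as np
    import soundfile as sf  # assumption: installed separately, only used here to save the output

    from TTS_models import SpeechT5, XTTS

    text = "Hello, this is a quick synthesis test."

    # SpeechT5.synthesize() already returns int16 samples; SpeechT5 generates 16 kHz audio
    sf.write("speecht5_demo.wav", SpeechT5().synthesize(text), 16000)

    # XTTS (Coqui) returns float samples; scale to int16 the same way app.py does
    coqui_audio = XTTS("tts_models/en/ljspeech/tacotron2-DDC_ph").synthesize(text)
    sf.write("coqui_demo.wav", (np.array(coqui_audio) * 32767).astype(np.int16), 22050)  # 22050 Hz matches the rate app.py uses
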
app.py
ADDED
@@ -0,0 +1,188 @@
+import os
+import gradio as gr
+import numpy as np
+import torch
+import ollama
+import emoji
+from datasets import load_dataset
+from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline
+from huggingface_hub import login
+
+from TTS_models import *
+
+login(token=os.getenv('HF_TOKEN'))
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+# load speech translation checkpoint
+STT_model_id = "openai/whisper-tiny"
+
+# load llm
+llm_model_id = "gemma2:2b"
+
+# init TTS model
+TTS_model_id = "tts_models/en/ljspeech/tacotron2-DDC_ph"
+
+client = ollama.Client()
+llmpipe = pipeline(  # fallback text-generation pipeline used when ollama is unavailable
+    "text-generation",
+    model="google/gemma-2-2b-it",
+    model_kwargs={"torch_dtype": torch.bfloat16},
+    device=device
+)
+
+def translate(audio):
+    global STT_model_id
+    asr_pipe = pipeline("automatic-speech-recognition", model=STT_model_id, device=device)
+    outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate", "language": "fr"})
+    print(f'Translated {outputs} using {asr_pipe.model}')
+    return outputs["text"]
+
+def transcribe(audio):
+    global STT_model_id
+    asr_pipe = pipeline("automatic-speech-recognition", model=STT_model_id, device=device)
+    outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe"})
+    print(f'[transcribe] Transcribe {outputs}')
+    return outputs["text"]
+
+def chatCompletion(text):
+    global llm_model_id
+    global llmpipe
+    global client
+
+    messages = [
+        {"role": "user", "content": "You are a helpful assistant. Answer in English only in text.\n\n" + text},
+    ]
+
+    try:
+        response: ollama.ListResponse = ollama.list()  # raises if no ollama server is reachable
+
+        response = client.chat(
+            model=llm_model_id,
+            messages=messages,
+            stream=True,
+            options={
+                'num_predict': 256,
+                'temperature': 0.5,
+                'low_vram': True,
+            },
+        )
+
+        buffer = ""
+        for chunk in response:
+            buffer += chunk["message"]["content"]
+
+        print(f'[chatCompletion] {buffer}')
+        return buffer
+    except Exception:
+        outputs = llmpipe(messages, max_new_tokens=256)  # fall back to the local transformers pipeline
+        buffer = outputs[0]["generated_text"][-1]["content"].strip()
+
+        print(f'[chatCompletion] {buffer}')
+        return buffer
+
+
+def synthesise(text):
+    global TTS_model_id
+    text = emoji.replace_emoji(text, replace="!")  # replace emojis with "!" before synthesis
+    synthesiser = XTTS(TTS_model_id)
+    speech = synthesiser.synthesize(text)
+
+    return (np.array(speech) * 32767).astype(np.int16)
+
+def speech_to_speech_translation(audioMic, audioFile):
+    audio = None
+    if audioMic is not None:
+        audio = audioMic
+    elif audioFile is not None:
+        audio = audioFile
+
+    # translate the audio to English text, then synthesize it
+    translated_text = translate(audio)
+    synthesised_speech = synthesise(translated_text)
+    return (22050, synthesised_speech), translated_text
+
+def speech_to_speech(audioMic, audioFile):
+    audio = None
+    if audioMic is not None:
+        audio = audioMic
+    elif audioFile is not None:
+        audio = audioFile
+
+    translated_text = "Sorry no audio was found."
+    answer = translated_text
+
+    if audio is not None:
+        # Transcribe audio
+        translated_text = transcribe(audio)
+
+        # Call LLM
+        answer = chatCompletion(translated_text)
+
+    # Synthesize answer (or the fallback message if no audio was provided)
+    synthesised_speech = synthesise(answer)
+
+    print(f'[speech_to_speech] Transcribed text {translated_text}')
+    print(f'[speech_to_speech] LLM answer {answer}')
+
+    return (22050, synthesised_speech), answer
+
+with gr.Blocks() as demo:
+    options = gr.WaveformOptions(sample_rate=22050)
+
+    with gr.Tab("Instant Translation"):
+        gr.Markdown(
+            """
+            # Translation of audio to audio
+            The aim of this tab is to demonstrate the speech-to-speech translation capabilities of the [whisper-tiny](https://huggingface.co/openai/whisper-tiny) model.
+
+            It uses:
+            - [whisper-tiny](https://huggingface.co/openai/whisper-tiny) to translate the speech to English text,
+            - and a [Coqui TTS](https://github.com/idiap/coqui-ai-TTS) model as a voice synthesizer.
+
+            You can either record yourself or upload an audio file in the tabs below.
+            The audio will be translated to English.
+            """)
+        with gr.Row():
+            with gr.Column(scale=1):
+                with gr.Tab("Record Audio"):
+                    audioMic = gr.Audio(sources="microphone", waveform_options=options, type="filepath")
+                with gr.Tab("Upload Audio"):
+                    audioFile = gr.Audio(sources="upload", waveform_options=options)
+
+                transcribeBtn = gr.Button("Submit", size='lg')
+
+            with gr.Column(scale=1):
+                textOutput = gr.Textbox(label="Transcribed text")
+                audioOutput = gr.Audio(waveform_options=options, type="numpy")
+
+        transcribeBtn.click(fn=speech_to_speech_translation, inputs=[audioMic, audioFile], outputs=[audioOutput, textOutput], api_name="report_generation")
+
+    with gr.Tab("Voice Assistant"):
+        gr.Markdown(
+            """
+            # Voice Assistant
+            This is a demo to show the possibilities for building your own voice assistant.
+            This demo uses:
+            - [whisper-tiny](https://huggingface.co/openai/whisper-tiny) to transcribe,
+            - the [ollama/gemma2:2b](https://ollama.com/library/gemma2:2b) model to generate the assistant's answer,
+            - and a [Coqui TTS](https://github.com/idiap/coqui-ai-TTS) model as a voice synthesizer.
+
+            This means that you need to install ollama on your machine to use this tab.
+
+            You can either record yourself or upload an audio file in the tabs below.
+            """)
+        with gr.Row():
+            with gr.Column(scale=1):
+                with gr.Tab("Record Audio"):
+                    audioMic = gr.Audio(sources="microphone", waveform_options=options, type="filepath")
+                with gr.Tab("Upload Audio"):
+                    audioFile = gr.Audio(sources="upload", waveform_options=options)
+
+                transcribeBtn = gr.Button("Submit", size='lg')
+            with gr.Column(scale=1):
+                textOutput = gr.Textbox(label="Transcribed text")
+                audioOutput = gr.Audio(waveform_options=options, type="numpy")
+
+        transcribeBtn.click(fn=speech_to_speech, inputs=[audioMic, audioFile], outputs=[audioOutput, textOutput], api_name="report_generation")
+
+demo.launch(auth=("FM", "FlandersM4ke"))
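
Because chatCompletion() first tries a local ollama server and only then falls back to the transformers pipeline, a short pre-flight sketch can help verify that side; it assumes the ollama Python client is installed and an ollama daemon is running, and uses the same gemma2:2b model name as llm_model_id in app.py:

    import ollama

    # ollama.list() raises if no daemon is reachable, the same check chatCompletion() relies on
    ollama.list()

    # make sure the model referenced by app.py is present locally
    ollama.pull("gemma2:2b")

    # stream a short completion the way app.py does
    client = ollama.Client()
    buffer = ""
    for chunk in client.chat(model="gemma2:2b",
                             messages=[{"role": "user", "content": "Say hello in one sentence."}],
                             stream=True):
        buffer += chunk["message"]["content"]
    print(buffer)
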
requirements.txt
ADDED
@@ -0,0 +1,11 @@
+torch
+accelerate
+torchaudio
+transformers
+gradio
+pypdf
+langchain-ollama
+emoji
+coqui-tts
+git+https://github.com/huggingface/parler-tts.git
+nltk