HanaeRateau committed on
Commit e9c2890
1 Parent(s): da0a5ea

first commit

Files changed (3)
  1. TTS_models.py +109 -0
  2. app.py +188 -0
  3. requirements.txt +11 -0
TTS_models.py ADDED
@@ -0,0 +1,109 @@
+ from abc import ABC, abstractmethod
+
+ import numpy as np
+ import torch
+ from transformers import pipeline
+ from datasets import load_dataset
+
+ class TTSModel(ABC):
+     def __init__(self, model_name):
+         self.hf_name = model_name
+         self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+     @abstractmethod
+     def synthesize(self, text):
+         pass
+
+ ####################################################
+ class SpeechT5(TTSModel):
+     def __init__(self, name="microsoft/speecht5_tts"):
+         super().__init__(name)
+         self.synthesiser = pipeline("text-to-speech", model=self.hf_name, device=self.device)
+         # speaker x-vector embedding that selects the voice for SpeechT5
+         self.embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+         self.speaker_embedding = torch.tensor(self.embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+
+     def synthesize(self, text):
+         speech = self.synthesiser(text, forward_params={"speaker_embeddings": self.speaker_embedding})
+         print("[SpeechT5 - synthesize]", speech)
+         # scale float audio in [-1, 1] to 16-bit PCM so it can be played directly
+         return (np.array(speech["audio"]) * 32767).astype(np.int16)
+
+ ####################################################
+ # PENDING: NOT WORKING FROM HF
+ # from MeloTTS.melo.api import TTS as meloTTS
+ # import nltk
+
+ # class MeloTTS(TTSModel):
+ #     def __init__(self, name="myshell-ai/MeloTTS-English"):
+ #         super().__init__(name)
+ #         nltk.download('averaged_perceptron_tagger_eng')
+ #         self.synthesiser = meloTTS(language='EN', device=self.device)
+ #         self.speaker_ids = self.synthesiser.hps.data.spk2id
+
+ #     def synthesize(self, text):
+ #         speech = self.synthesiser.tts_to_file(text, self.speaker_ids['EN-Default'])
+ #         print("[MeloTTS - synthesize]", speech)
+ #         return speech
+
+ ####################################################
+ class Bark(TTSModel):
+     def __init__(self, name="suno/bark"):
+         super().__init__(name)
+         self.synthesiser = pipeline("text-to-speech", model=self.hf_name, device=self.device)
+
+     def synthesize(self, text):
+         speech = self.synthesiser(text)
+         print("[Bark - synthesize]", speech)
+         # returns the raw pipeline dict: {"audio": ..., "sampling_rate": ...}
+         return speech
+
+ ####################################################
+ # pip install git+https://github.com/huggingface/parler-tts.git
+
+ from parler_tts import ParlerTTSForConditionalGeneration
+ from transformers import AutoTokenizer
+
+ class ParlerTTS(TTSModel):
+     def __init__(self, name="parler-tts/parler-tts-large-v1"):
+         super().__init__(name)
+         # free-text description that conditions the generated voice
+         self.description = "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
+         self.model = ParlerTTSForConditionalGeneration.from_pretrained(self.hf_name).to(self.device)
+         self.tokenizer = AutoTokenizer.from_pretrained(self.hf_name)
+
+     def synthesize(self, text):
+         input_ids = self.tokenizer(self.description, return_tensors="pt").input_ids.to(self.device)
+         prompt_input_ids = self.tokenizer(text, return_tensors="pt").input_ids.to(self.device)
+
+         generation = self.model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
+         speech = generation.cpu().numpy().squeeze()
+         print("[ParlerTTS - synthesize]", speech)
+         return speech
+
+ ####################################################
+ # PENDING: NOT WORKING FROM HF
+ # pip install coqui-tts
+ # https://github.com/idiap/coqui-ai-TTS
+
+ from TTS.api import TTS
+
+ class XTTS(TTSModel):
+     # despite the class name, any Coqui model id can be passed in
+     def __init__(self, name="tts_models/en/ljspeech/glow-tts"):
+         super().__init__(name)
+         self.synthesiser = TTS(model_name=name, progress_bar=False).to(self.device)
+
+     def synthesize(self, text):
+         # returns a list of float samples in [-1, 1]
+         speech = self.synthesiser.tts(text=text)
+         print("[XTTS - synthesize]", len(speech), text)
+         return speech
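
To sanity-check one of these wrappers outside the Gradio app, a minimal sketch is enough, assuming `soundfile` is installed (the `out.wav` name and the test sentence are placeholders; SpeechT5 generates 16 kHz audio, other wrappers use different rates):

```python
# Hypothetical smoke test for the wrappers above; not part of the commit.
import soundfile as sf

from TTS_models import SpeechT5

model = SpeechT5()                              # fetches microsoft/speecht5_tts on first use
pcm = model.synthesize("Hello from the demo!")  # int16 numpy array, ready for playback

sf.write("out.wav", pcm, 16000)                 # SpeechT5 outputs 16 kHz audio
```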
app.py ADDED
@@ -0,0 +1,188 @@
+ import os
+ import gradio as gr
+ import numpy as np
+ import torch
+ import ollama
+ import emoji
+ from transformers import pipeline
+ from huggingface_hub import login
+
+ from TTS_models import *
+
+ login(token=os.getenv('HF_TOKEN'))
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+ # speech translation / transcription checkpoint
+ STT_model_id = "openai/whisper-tiny"
+
+ # LLM served through ollama
+ llm_model_id = "gemma2:2b"
+
+ # TTS model (Coqui, loaded via the XTTS wrapper)
+ TTS_model_id = "tts_models/en/ljspeech/tacotron2-DDC_ph"
+
+ client = ollama.Client()
+ # transformers fallback, used when the ollama server is unreachable
+ # (generates the whole answer at once; pipelines do not stream)
+ llmpipe = pipeline(
+     "text-generation",
+     model="google/gemma-2-2b-it",
+     model_kwargs={"torch_dtype": torch.bfloat16},
+     device=device,
+ )
+
+ def translate(audio):
+     global STT_model_id
+     asr_pipe = pipeline("automatic-speech-recognition", model=STT_model_id, device=device)
+     # Whisper's "translate" task always outputs English; "language" is the source language
+     outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate", "language": "fr"})
+     print(f'Translated {outputs} using {asr_pipe.model}')
+     return outputs["text"]
+
+ def transcribe(audio):
+     global STT_model_id
+     asr_pipe = pipeline("automatic-speech-recognition", model=STT_model_id, device=device)
+     outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe"})
+     print(f'[transcribe] Transcribe {outputs}')
+     return outputs["text"]
+
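+ # Note: translate() and transcribe() rebuild the whisper-tiny pipeline on every
+ # call; hoisting asr_pipe to module level would avoid reloading the model each time.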
+ def chatCompletion(text):
+     global llm_model_id
+     global llmpipe
+     global client
+
+     messages = [
+         {"role": "user", "content": "You are a helpful assistant. Answer in English only in text.\n\n" + text},
+     ]
+
+     try:
+         # probe the ollama server first; this raises if it is unreachable
+         response: ollama.ListResponse = ollama.list()
+
+         response = client.chat(
+             model=llm_model_id,
+             messages=messages,
+             stream=True,
+             options={
+                 'num_predict': 256,
+                 'temperature': 0.5,
+                 'low_vram': True,
+             },
+         )
+
+         buffer = ""
+         for chunk in response:
+             buffer += chunk["message"]["content"]
+
+         print(f'[chatCompletion] {buffer}')
+         return buffer
+     except Exception:
+         # fall back to the local transformers pipeline
+         outputs = llmpipe(messages, max_new_tokens=256)
+         buffer = outputs[0]["generated_text"][-1]["content"].strip()
+
+         print(f'[chatCompletion] {buffer}')
+         return buffer
+
+
+ def synthesise(text):
+     global TTS_model_id
+     # strip emoji before synthesis; the TTS model cannot pronounce them
+     text = emoji.replace_emoji(text, replace="!")
+     synthesiser = XTTS(TTS_model_id)
+     speech = synthesiser.synthesize(text)
+
+     # scale float audio in [-1, 1] to 16-bit PCM
+     return (np.array(speech) * 32767).astype(np.int16)
+
+ def speech_to_speech_translation(audioMic, audioFile):
+     audio = None
+     if audioMic is not None:
+         audio = audioMic
+     elif audioFile is not None:
+         audio = audioFile
+
+     translated_text = translate(audio)
+     synthesised_speech = synthesise(translated_text)
+     return (22050, synthesised_speech), translated_text
+
+ def speech_to_speech(audioMic, audioFile):
+     audio = None
+     if audioMic is not None:
+         audio = audioMic
+     elif audioFile is not None:
+         audio = audioFile
+
+     transcribed_text = "Sorry no audio was found."
+     answer = transcribed_text
+
+     if audio is not None:
+         # Transcribe audio
+         transcribed_text = transcribe(audio)
+
+         # Call LLM
+         answer = chatCompletion(transcribed_text)
+
+     # Synthesize answer (also voices the no-audio message)
+     synthesised_speech = synthesise(answer)
+
+     print(f'[speech_to_speech] Transcribed text {transcribed_text}')
+     print(f'[speech_to_speech] LLM answer {answer}')
+
+     return (22050, synthesised_speech), answer
+
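+ # Gradio's numpy-typed Audio output accepts a (sample_rate, int16_array) tuple,
+ # which is why both handlers above return (22050, synthesised_speech).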
+ with gr.Blocks() as demo:
+     options = gr.WaveformOptions(sample_rate=22050)
+
+     with gr.Tab("Instant Translation"):
+         gr.Markdown(
+             """
+             # Translation of audio to audio
+             The aim of this tab is to demonstrate the speech-to-speech translation capabilities of the [whisper-tiny](https://huggingface.co/openai/whisper-tiny) model.
+
+             It uses:
+             - [whisper-tiny](https://huggingface.co/openai/whisper-tiny) to translate the speech,
+             - and Coqui's tacotron2-DDC_ph as a voice synthesizer.
+
+             You can either record yourself or upload an audio file in the tabs below.
+             The speech will be translated to English.
+             """)
+         with gr.Row():
+             with gr.Column(scale=1):
+                 with gr.Tab("Record Audio"):
+                     audioMic = gr.Audio(sources="microphone", waveform_options=options, type="filepath")
+                 with gr.Tab("Upload Audio"):
+                     audioFile = gr.Audio(sources="upload", waveform_options=options, type="filepath")
+
+                 transcribeBtn = gr.Button("Submit", size='lg')
+
+             with gr.Column(scale=1):
+                 textOutput = gr.Textbox(label="Translated text")
+                 audioOutput = gr.Audio(waveform_options=options, type="numpy")
+
+         transcribeBtn.click(fn=speech_to_speech_translation, inputs=[audioMic, audioFile], outputs=[audioOutput, textOutput], api_name="speech_to_speech_translation")
+
+     with gr.Tab("Voice Assistant"):
+         gr.Markdown(
+             """
+             # Voice Assistant
+             This demo shows what is possible when building your own voice assistant.
+             It uses:
+             - [whisper-tiny](https://huggingface.co/openai/whisper-tiny) to transcribe,
+             - the [ollama/gemma2:2b](https://ollama.com/library/gemma2:2b) model to generate the assistant's answer,
+             - and Coqui's tacotron2-DDC_ph as a voice synthesizer.
+
+             This means you need ollama installed on your machine to use this tab.
+
+             You can either record yourself or upload an audio file in the tabs below.
+             """)
+         with gr.Row():
+             with gr.Column(scale=1):
+                 with gr.Tab("Record Audio"):
+                     audioMic = gr.Audio(sources="microphone", waveform_options=options, type="filepath")
+                 with gr.Tab("Upload Audio"):
+                     audioFile = gr.Audio(sources="upload", waveform_options=options, type="filepath")
+
+                 transcribeBtn = gr.Button("Submit", size='lg')
+             with gr.Column(scale=1):
+                 textOutput = gr.Textbox(label="Transcribed text")
+                 audioOutput = gr.Audio(waveform_options=options, type="numpy")
+
+         transcribeBtn.click(fn=speech_to_speech, inputs=[audioMic, audioFile], outputs=[audioOutput, textOutput], api_name="speech_to_speech")
+
+ demo.launch(auth=("FM", "FlandersM4ke"))
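
Since both buttons register an `api_name`, the endpoints can also be driven programmatically once the app is running (the Voice Assistant tab additionally needs a local ollama server with `gemma2:2b` pulled). A sketch with `gradio_client`, assuming the endpoint names above, the default local URL, and a `question.wav` test file:

```python
# Hypothetical API client; URL and file name are placeholders.
from gradio_client import Client, handle_file

client = Client("http://127.0.0.1:7860", auth=("FM", "FlandersM4ke"))
audio_out, answer = client.predict(
    None,                         # audioMic: no microphone recording
    handle_file("question.wav"),  # audioFile: pre-recorded question
    api_name="/speech_to_speech",
)
print(answer)
```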
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ torch
+ accelerate
+ torchaudio
+ transformers
+ datasets
+ gradio
+ pypdf
+ langchain-ollama
+ emoji
+ coqui-tts
+ git+https://github.com/huggingface/parler-tts.git
+ nltk