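# Gradio demo Space: speech-to-speech translation and a simple voice assistant.
# Pipeline: Whisper (speech-to-text) -> an LLM (Ollama gemma2:2b, with a
# transformers fallback) -> a Coqui TTS model (text-to-speech).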
import os
import gradio as gr
import numpy as np
import torch
import ollama
import emoji
from datasets import load_dataset
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline
from huggingface_hub import login
from TTS_models import *
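
# Authenticate with the Hugging Face Hub; HF_TOKEN is expected as an
# environment variable (for example, a Space secret).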
login(token=os.getenv('HF_TOKEN'))
device = "cuda:0" if torch.cuda.is_available() else "cpu" | |
# load speech translation checkpoint | |
STT_model_id = "openai/whisper-tiny" | |
# load llm | |
llm_model_id = "gemma2:2b" | |
# init TTS model | |
TTS_model_id = "tts_models/en/ljspeech/tacotron2-DDC_ph" | |
client = ollama.Client() | |
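
# Fallback LLM: if the local Ollama server cannot be reached, chatCompletion()
# below uses this transformers text-generation pipeline instead.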
llmpipe = pipeline(
    "text-generation",
    model="google/gemma-2-2b-it",
    model_kwargs={"torch_dtype": torch.bfloat16},
    device=device,
)
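
# Whisper supports two generation tasks:
#  - "translate": transcribe the input speech directly into English text.
#  - "transcribe": transcribe the speech in its original language.
# The two helpers below wrap an ASR pipeline for each task.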

def translate(audio):
    """Translate spoken audio to English text with Whisper."""
    asr_pipe = pipeline("automatic-speech-recognition", model=STT_model_id, device=device)
    outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate", "language": "fr"})
    print(f'[translate] Translated {outputs} using {STT_model_id}')
    return outputs["text"]

def transcribe(audio):
    """Transcribe spoken audio in its original language with Whisper."""
    asr_pipe = pipeline("automatic-speech-recognition", model=STT_model_id, device=device)
    outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe"})
    print(f'[transcribe] Transcribed {outputs}')
    return outputs["text"]

def chatCompletion(text):
    """Generate an assistant answer, preferring a local Ollama server and
    falling back to the Hugging Face pipeline when Ollama is unavailable."""
    messages = [
        {"role": "user", "content": "You are a helpful assistant. Answer in English only in text.\n\n" + text},
    ]
    try:
        # Probe the local Ollama server; this raises if it is not running.
        ollama.list()
        response = client.chat(
            model=llm_model_id,
            messages=messages,
            stream=True,
            options={
                'num_predict': 256,
                'temperature': 0.5,
                'low_vram': True,
            },
        )
        buffer = ""
        for chunk in response:
            buffer += chunk["message"]["content"]
        print(f'[chatCompletion] {buffer}')
        return buffer
    except Exception:
        # Fall back to the Hugging Face transformers pipeline.
        outputs = llmpipe(messages, max_new_tokens=256)
        buffer = outputs[0]["generated_text"][-1]["content"].strip()
        print(f'[chatCompletion] {buffer}')
        return buffer

def synthesise(text):
    """Synthesise speech from text with the Coqui TTS model."""
    # Emojis are not pronounceable; replace them before synthesis.
    text = emoji.replace_emoji(text, replace="!")
    synthesiser = XTTS(TTS_model_id)
    speech = synthesiser.synthesize(text)
    # Convert the float waveform (expected in [-1, 1]) to 16-bit PCM for Gradio.
    return (np.array(speech) * 32767).astype(np.int16)
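
# The Gradio Audio outputs below use a 22050 Hz sample rate, which matches the
# LJSpeech-based Coqui model selected above.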

def speech_to_speech_translation(audioMic, audioFile):
    """Translate the recorded or uploaded audio to English and read it aloud."""
    audio = audioMic if audioMic is not None else audioFile
    translated_text = "Sorry, no audio was found."
    if audio is not None:
        translated_text = translate(audio)
    synthesised_speech = synthesise(translated_text)
    return (22050, synthesised_speech), translated_text

def speech_to_speech(audioMic, audioFile):
    """Transcribe the recorded or uploaded audio, ask the LLM, and voice the answer."""
    audio = audioMic if audioMic is not None else audioFile
    translated_text = "Sorry, no audio was found."
    if audio is not None:
        # Transcribe audio
        translated_text = transcribe(audio)
    # Call LLM
    answer = chatCompletion(translated_text)
    # Synthesize answer
    synthesised_speech = synthesise(answer)
    print(f'[speech_to_speech] Transcribed text: {translated_text}')
    print(f'[speech_to_speech] LLM answer: {answer}')
    return (22050, synthesised_speech), translated_text, answer
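
# Gradio UI: two tabs, one for instant speech translation and one for the voice
# assistant. Both accept microphone or uploaded audio and return synthesised
# speech plus the intermediate text.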
with gr.Blocks() as demo:
    options = gr.WaveformOptions(sample_rate=22050)
    with gr.Tab("Instant Translation"):
        gr.Markdown(
            """
# Translation of audio to audio
The aim of this tab is to demonstrate the speech-to-speech translation capabilities of the [whisper-tiny](https://huggingface.co/openai/whisper-tiny) model.
It uses:
- [whisper-tiny](https://huggingface.co/openai/whisper-tiny) to translate the speech,
- and a Coqui TTS model (tacotron2-DDC_ph) as a voice synthesizer.

You can either record yourself or upload an audio file in the tabs below.
The output is translated to English.
""")
        with gr.Row():
            with gr.Column(scale=1):
                with gr.Tab("Record Audio"):
                    audioMic = gr.Audio(sources="microphone", waveform_options=options, type="filepath")
                with gr.Tab("Upload Audio"):
                    audioFile = gr.Audio(sources="upload", type="filepath")
                transcribeBtn = gr.Button("Submit", size='lg')
            with gr.Column(scale=1):
                textOutput = gr.Textbox(label="Translated text")
                audioOutput = gr.Audio(waveform_options=options, type="numpy")
        transcribeBtn.click(fn=speech_to_speech_translation, inputs=[audioMic, audioFile], outputs=[audioOutput, textOutput], api_name="speech_to_speech_translation")
with gr.Tab("Voice Assistant"): | |
gr.Markdown( | |
""" | |
# Voice Assistant | |
This is a demo to show what are the possibilities for building your own voice assistant. | |
This demo uses: | |
- [whisper-tiny](https://huggingface.co/openai/whisper-tiny) to transcribe, | |
- [ollama/gemma2:2b](https://ollama.com/library/gemma2:2b) model to generate the answer of the assistant, | |
- and glow-tts as a voice synthesizer. | |
This means that you need to install ollama on your machine to be able to use this. | |
You can either record yourself or upload an audio file in the tabs below. | |
""") | |
with gr.Row(): | |
with gr.Column(scale=1): | |
with gr.Tab("Record Audio"): | |
audioMic = gr.Audio(sources="microphone", waveform_options=options, type="filepath") | |
with gr.Tab("Upload Audio"): | |
audioFile = gr.Audio(sources="upload", type="filepath") | |
translateBtn = gr.Button("Submit", size='lg') | |
with gr.Column(scale=1): | |
textOutput = gr.Textbox(label="Transcribed text") | |
textAnswer = gr.Textbox(label="Assistant's Answer") | |
audioOutput = gr.Audio(waveform_options=options, type="numpy") | |
translateBtn.click(fn=speech_to_speech, inputs=[audioMic, audioFile], outputs=[audioOutput, textOutput, textAnswer], api_name="report_generation") | |
demo.launch() |