import os
import gradio as gr
import numpy as np
import torch
import ollama
import emoji
from datasets import load_dataset
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline
from huggingface_hub import login
from TTS_models import *
login(token = os.getenv('HF_TOKEN'))
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# speech-to-text (Whisper) checkpoint, used for both transcription and translation
STT_model_id = "openai/whisper-tiny"
# LLM served by a local Ollama instance
llm_model_id = "gemma2:2b"
# Coqui TTS voice synthesis model
TTS_model_id = "tts_models/en/ljspeech/tacotron2-DDC_ph"
client = ollama.Client()
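# Fallback LLM: an in-process Transformers pipeline, used when no Ollama server is reachable.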
llmpipe = pipeline(
"text-generation",
model="google/gemma-2-2b-it",
model_kwargs={"torch_dtype": torch.bfloat16},
device=device
)
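# Speech-to-text translation: Whisper's "translate" task turns French speech into English text.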
def translate(audio):
global STT_model_id
asr_pipe = pipeline("automatic-speech-recognition", model=STT_model_id, device=device)
outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate", "language":"fr"})
print(f'Translated {outputs} using {asr_pipe.model}')
return outputs["text"]
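# Speech-to-text transcription in the source language (no translation).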
def transcribe(audio):
global STT_model_id
asr_pipe = pipeline("automatic-speech-recognition", model=STT_model_id, device=device)
outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe"})
print(f'[transcribe] Transcribe {outputs}')
return outputs["text"]
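# Generate the assistant's reply: stream the answer from a local Ollama server if one is
# running, otherwise fall back to the in-process Transformers pipeline.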
def chatCompletion(text):
global llm_model_id
global llmpipe
global client
messages = [
{"role": "user", "content": "You are a helpful assistant. Answer in English only in text.\n\n"+text},
]
    try:  # check that a local Ollama server is reachable (raises if it is not)
        ollama.list()
response = client.chat(
model=llm_model_id,
messages=messages,
stream=True,
options={
'num_predict': 256,
'temperature': 0.5,
'low_vram': True,
},
)
buffer = ""
for chunk in response:
buffer += chunk["message"]["content"]
print(f'[chatCompletion] {buffer}')
return buffer
    except Exception:  # fall back to the Hugging Face pipeline LLM
outputs = llmpipe(messages, max_new_tokens=256)
buffer = outputs[0]["generated_text"][-1]["content"].strip()
print(f'[chatCompletion] {buffer}')
return buffer
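# Text-to-speech: emojis are replaced before synthesis and the float waveform is converted
# to 16-bit PCM for Gradio's Audio component.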
def synthesise(text):
global TTS_model_id
text = emoji.replace_emoji(text, replace="!")
synthesiser = XTTS(TTS_model_id)
speech = synthesiser.synthesize(text)
    return (np.array(speech) * 32767).astype(np.int16)  # float waveform in [-1, 1] -> 16-bit PCM
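# Pipeline for the "Instant Translation" tab: translate the input audio to English and
# synthesise it (22050 Hz is the output rate of the LJSpeech TTS model).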
def speech_to_speech_translation(audioMic, audioFile):
audio = None
if audioMic is not None:
audio = audioMic
elif audioFile is not None:
audio = audioFile
translated_text = translate(audio)
synthesised_speech = synthesise(translated_text)
return (22050, synthesised_speech), translated_text
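# Pipeline for the "Voice Assistant" tab: transcribe the audio, query the LLM, then synthesise the answer.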
def speech_to_speech(audioMic, audioFile):
audio = None
if audioMic is not None:
audio = audioMic
elif audioFile is not None:
audio = audioFile
translated_text = "Sorry no audio was found."
if audio is not None:
# Transcribe audio
translated_text = transcribe(audio)
# Call LLM
answer = chatCompletion(translated_text)
# Synthesize answer
synthesised_speech = synthesise(answer)
print(f'[speech_to_speech] Transcribed text {translated_text}')
print(f'[speech_to_speech] LLM answer {answer}')
return (22050, synthesised_speech), translated_text, answer
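# Gradio UI: two tabs, each accepting either microphone input or an uploaded audio file.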
with gr.Blocks() as demo:
options = gr.WaveformOptions(sample_rate=22050)
with gr.Tab("Instant Translation"):
gr.Markdown(
"""
# Translation of audio to audio
The aim of this tab is to demonstrate speech-to-speech translation built around the [whisper-tiny](https://huggingface.co/openai/whisper-tiny) model.
It uses:
- [whisper-tiny](https://huggingface.co/openai/whisper-tiny) to translate the speech into English text,
- and the Coqui TTS Tacotron2-DDC model as a voice synthesizer.
You can either record yourself or upload an audio file in the tabs below.
The audio will be translated to English.
""")
with gr.Row():
with gr.Column(scale=1):
with gr.Tab("Record Audio"):
audioMic = gr.Audio(sources="microphone", waveform_options=options, type="filepath")
with gr.Tab("Upload Audio"):
audioFile = gr.Audio(sources="upload", type="filepath")
transcribeBtn = gr.Button("Submit", size='lg')
with gr.Column(scale=1):
textOutput = gr.Textbox(label="Transcribed text")
audioOutput = gr.Audio(waveform_options=options, type="numpy")
transcribeBtn.click(fn=speech_to_speech_translation, inputs=[audioMic, audioFile], outputs=[audioOutput, textOutput], api_name="report_generation")
with gr.Tab("Voice Assistant"):
gr.Markdown(
"""
# Voice Assistant
This demo shows what is possible when building your own voice assistant.
This demo uses:
- [whisper-tiny](https://huggingface.co/openai/whisper-tiny) to transcribe,
- [ollama/gemma2:2b](https://ollama.com/library/gemma2:2b) to generate the assistant's answer,
- and the Coqui TTS Tacotron2-DDC model as a voice synthesizer.
Ollama needs to be installed and running on the machine hosting this demo; if it is not, the answer is generated with the [google/gemma-2-2b-it](https://huggingface.co/google/gemma-2-2b-it) Transformers pipeline instead.
You can either record yourself or upload an audio file in the tabs below.
""")
with gr.Row():
with gr.Column(scale=1):
with gr.Tab("Record Audio"):
audioMic = gr.Audio(sources="microphone", waveform_options=options, type="filepath")
with gr.Tab("Upload Audio"):
audioFile = gr.Audio(sources="upload", type="filepath")
translateBtn = gr.Button("Submit", size='lg')
with gr.Column(scale=1):
textOutput = gr.Textbox(label="Transcribed text")
textAnswer = gr.Textbox(label="Assistant's Answer")
audioOutput = gr.Audio(waveform_options=options, type="numpy")
        translateBtn.click(fn=speech_to_speech, inputs=[audioMic, audioFile], outputs=[audioOutput, textOutput, textAnswer], api_name="voice_assistant")
demo.launch()