import os
import gradio as gr
import numpy as np
import torch
import ollama
import emoji
from datasets import load_dataset
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline
from huggingface_hub import login
from TTS_models import *
login(token = os.getenv('HF_TOKEN'))
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# speech-to-text (Whisper) checkpoint, used for both transcription and translation
STT_model_id = "openai/whisper-tiny"
# LLM served by a local Ollama instance
llm_model_id = "gemma2:2b"
# Coqui TTS voice synthesis model
TTS_model_id = "tts_models/en/ljspeech/tacotron2-DDC_ph"
client = ollama.Client()
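# Fallback LLM: an in-process Transformers pipeline, used when no Ollama server is reachable.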
llmpipe = pipeline(
"text-generation",
model="google/gemma-2-2b-it",
model_kwargs={"torch_dtype": torch.bfloat16},
device=device
)
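# Speech-to-text translation: Whisper's "translate" task turns French speech into English text.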
def translate(audio):
global STT_model_id
asr_pipe = pipeline("automatic-speech-recognition", model=STT_model_id, device=device)
outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate", "language":"fr"})
print(f'Translated {outputs} using {asr_pipe.model}')
return outputs["text"]
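# Speech-to-text transcription in the source language (no translation).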
def transcribe(audio):
global STT_model_id
asr_pipe = pipeline("automatic-speech-recognition", model=STT_model_id, device=device)
outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe"})
print(f'[transcribe] Transcribe {outputs}')
return outputs["text"]
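# Generate the assistant's reply: stream the answer from a local Ollama server if one is
# running, otherwise fall back to the in-process Transformers pipeline.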
def chatCompletion(text):
global llm_model_id
global llmpipe
global client
messages = [
{"role": "user", "content": "You are a helpful assistant. Answer in English only in text.\n\n"+text},
]
    try:  # check that a local Ollama server is reachable (raises if it is not)
        ollama.list()
response = client.chat(
model=llm_model_id,
messages=messages,
stream=True,
options={
'num_predict': 256,
'temperature': 0.5,
'low_vram': True,
},
)
buffer = ""
for chunk in response:
buffer += chunk["message"]["content"]
print(f'[chatCompletion] {buffer}')
return buffer
    except Exception:  # fall back to the Hugging Face pipeline LLM
outputs = llmpipe(messages, max_new_tokens=256)
buffer = outputs[0]["generated_text"][-1]["content"].strip()
print(f'[chatCompletion] {buffer}')
return buffer
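# Text-to-speech: emojis are replaced before synthesis and the float waveform is converted
# to 16-bit PCM for Gradio's Audio component.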
def synthesise(text):
global TTS_model_id
text = emoji.replace_emoji(text, replace="!")
synthesiser = XTTS(TTS_model_id)
speech = synthesiser.synthesize(text)
    return (np.array(speech) * 32767).astype(np.int16)  # float waveform in [-1, 1] -> 16-bit PCM
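# Pipeline for the "Instant Translation" tab: translate the input audio to English and
# synthesise it (22050 Hz is the output rate of the LJSpeech TTS model).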
def speech_to_speech_translation(audioMic, audioFile):
audio = None
if audioMic is not None:
audio = audioMic
elif audioFile is not None:
audio = audioFile
translated_text = translate(audio)
synthesised_speech = synthesise(translated_text)
return (22050, synthesised_speech), translated_text
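# Pipeline for the "Voice Assistant" tab: transcribe the audio, query the LLM, then synthesise the answer.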
def speech_to_speech(audioMic, audioFile):
audio = None
if audioMic is not None:
audio = audioMic
elif audioFile is not None:
audio = audioFile
translated_text = "Sorry no audio was found."
if audio is not None:
# Transcribe audio
translated_text = transcribe(audio)
# Call LLM
answer = chatCompletion(translated_text)
# Synthesize answer
synthesised_speech = synthesise(answer)
print(f'[speech_to_speech] Transcribed text {translated_text}')
print(f'[speech_to_speech] LLM answer {answer}')
return (22050, synthesised_speech), translated_text, answer
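# Gradio UI: two tabs, each accepting either microphone input or an uploaded audio file.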
with gr.Blocks() as demo:
options = gr.WaveformOptions(sample_rate=22050)
with gr.Tab("Instant Translation"):
gr.Markdown(
"""
# Translation of audio to audio
The aim of this tab is to demonstrate speech-to-speech translation built around the [whisper-tiny](https://huggingface.co/openai/whisper-tiny) model.
It uses:
- [whisper-tiny](https://huggingface.co/openai/whisper-tiny) to translate the speech into English text,
- and the Coqui TTS Tacotron2-DDC model as a voice synthesizer.
You can either record yourself or upload an audio file in the tabs below.
The audio will be translated to English.
""")
with gr.Row():
with gr.Column(scale=1):
with gr.Tab("Record Audio"):
audioMic = gr.Audio(sources="microphone", waveform_options=options, type="filepath")
with gr.Tab("Upload Audio"):
audioFile = gr.Audio(sources="upload", type="filepath")
transcribeBtn = gr.Button("Submit", size='lg')
with gr.Column(scale=1):
textOutput = gr.Textbox(label="Transcribed text")
audioOutput = gr.Audio(waveform_options=options, type="numpy")
transcribeBtn.click(fn=speech_to_speech_translation, inputs=[audioMic, audioFile], outputs=[audioOutput, textOutput], api_name="report_generation")
with gr.Tab("Voice Assistant"):
gr.Markdown(
"""
# Voice Assistant
This demo shows what is possible when building your own voice assistant.
This demo uses:
- [whisper-tiny](https://huggingface.co/openai/whisper-tiny) to transcribe,
- [ollama/gemma2:2b](https://ollama.com/library/gemma2:2b) to generate the assistant's answer,
- and the Coqui TTS Tacotron2-DDC model as a voice synthesizer.
Ollama needs to be installed and running on the machine hosting this demo; if it is not, the answer is generated with the [google/gemma-2-2b-it](https://huggingface.co/google/gemma-2-2b-it) Transformers pipeline instead.
You can either record yourself or upload an audio file in the tabs below.
""")
with gr.Row():
with gr.Column(scale=1):
with gr.Tab("Record Audio"):
audioMic = gr.Audio(sources="microphone", waveform_options=options, type="filepath")
with gr.Tab("Upload Audio"):
audioFile = gr.Audio(sources="upload", type="filepath")
translateBtn = gr.Button("Submit", size='lg')
with gr.Column(scale=1):
textOutput = gr.Textbox(label="Transcribed text")
textAnswer = gr.Textbox(label="Assistant's Answer")
audioOutput = gr.Audio(waveform_options=options, type="numpy")
        translateBtn.click(fn=speech_to_speech, inputs=[audioMic, audioFile], outputs=[audioOutput, textOutput, textAnswer], api_name="voice_assistant")
demo.launch()