Spaces:
Runtime error
Runtime error
File size: 2,543 Bytes
d347764 e9b38af d347764 33930f0 faa46b9 33930f0 d347764 33930f0 d347764 33930f0 49b592c 524e96e d347764 33930f0 d347764 49b592c d347764 f805e49 c6f1d54 49b592c f805e49 c737803 d347764 226ec3a d347764 f805e49 d347764 c737803 58bdc84 c737803 3946ba6 c737803 49b592c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 |
import gradio as gr
import numpy as np
import torch
from datasets import load_dataset
from transformers import pipeline
# Run on the first CUDA device when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Speech front end: the Whisper Base checkpoint wrapped in a transformers
# automatic-speech-recognition pipeline, placed on `device`.
asr_pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-base",
    device=device,
)

from transformers import VitsModel, VitsTokenizer

# Text-to-speech back end: the facebook/mms-tts-spa VITS checkpoint and its
# tokenizer. Only the model is moved to `device`.
spanish_model = VitsModel.from_pretrained("facebook/mms-tts-spa").to(device)
spanish_tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-spa")

# x-vector of entry 7306 from the validation split, with a leading axis added.
# NOTE(review): nothing in the visible code reads `speaker_embeddings`; it looks
# like a leftover from a different TTS back end -- confirm before removing.
embeddings_dataset = load_dataset(
    "Matthijs/cmu-arctic-xvectors",
    split="validation",
)
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"])
speaker_embeddings = speaker_embeddings.unsqueeze(0)
def translate(audio):
    """Run the speech-recognition pipeline on ``audio`` and return its text.

    Generation is capped at 256 new tokens and Whisper is asked for the
    "transcribe" task with the language fixed to "es".

    NOTE(review): pinning the language to Spanish presumably makes Whisper
    emit Spanish text whatever the spoken language is -- confirm this is the
    intended translation trick.

    Args:
        audio: Input forwarded unchanged to ``asr_pipe``.

    Returns:
        The ``"text"`` entry of the pipeline result.
    """
    generation_options = {"task": "transcribe", "language": "es"}
    result = asr_pipe(
        audio,
        max_new_tokens=256,
        generate_kwargs=generation_options,
    )
    return result["text"]
def synthesise(text):
    """Turn ``text`` into a speech waveform with the Spanish VITS model.

    Args:
        text: The string to be spoken.

    Returns:
        The first waveform of the model output, as a tensor on the CPU.
    """
    inputs = spanish_tokenizer(text, return_tensors="pt")
    # The tokenizer returns CPU tensors, while the model is moved to the
    # selected device at load time. Put the ids on the model's device so the
    # forward pass does not fail with a device mismatch when CUDA is in use.
    input_ids = inputs["input_ids"].to(spanish_model.device)
    with torch.no_grad():
        outputs = spanish_model(input_ids)
    speech = outputs.waveform
    # Bring the result back to the CPU and drop the leading (batch) axis.
    return speech.cpu()[0]
def speech_to_speech_translation(audio):
    """Translate input speech and re-synthesise it as speech.

    Chains ``translate`` (speech to text) and ``synthesise`` (text to
    waveform), then converts the float waveform to 16-bit integer samples.

    Args:
        audio: Input forwarded unchanged to ``translate``.

    Returns:
        A ``(sampling_rate, samples)`` tuple where ``samples`` is an
        ``int16`` NumPy array.
    """
    translated_text = translate(audio)
    waveform = synthesise(translated_text).numpy()
    # Clamp to [-1, 1] before scaling: a sample outside that range would
    # overflow in the int16 cast instead of saturating.
    pcm_samples = (np.clip(waveform, -1.0, 1.0) * 32767).astype(np.int16)
    # Take the rate from the loaded checkpoint rather than hard-coding it, so
    # the value stays correct if the TTS model is swapped.
    return spanish_model.config.sampling_rate, pcm_samples
# Heading and Markdown blurb passed to both Gradio interfaces. The blurb names
# the models this app actually loads (Whisper Base and facebook/mms-tts-spa).
title = "Cascaded STST"
description = """
Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Spanish. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Meta's
[MMS TTS Spanish](https://huggingface.co/facebook/mms-tts-spa) model for text-to-speech:

"""
# Container that will hold the tabbed layout built below.
demo = gr.Blocks()

# Both interfaces use the same callback, output component settings, title and
# description; they differ only in where the input audio comes from.
# NOTE(review): `source=` is the Gradio 3.x keyword; Gradio 4 renamed it to
# `sources=[...]`. Confirm the Gradio version this app is pinned to.
mic_translate = gr.Interface(
    speech_to_speech_translation,
    gr.Audio(source="microphone", type="filepath"),
    gr.Audio(label="Generated Speech", type="numpy"),
    title=title,
    description=description,
)

file_translate = gr.Interface(
    speech_to_speech_translation,
    gr.Audio(source="upload", type="filepath"),
    gr.Audio(label="Generated Speech", type="numpy"),
    examples=[["./example.wav"]],
    title=title,
    description=description,
)

# Place the two interfaces side by side as tabs inside the container.
with demo:
    gr.TabbedInterface(
        [mic_translate, file_translate],
        ["Microphone", "Audio File"],
    )

demo.launch()