File size: 2,543 Bytes
d347764
 
 
 
 
e9b38af
d347764
 
 
 
 
 
 
 
33930f0
 
 
faa46b9
33930f0
 
 
d347764
 
 
 
 
33930f0
d347764
 
 
 
33930f0
 
 
 
49b592c
524e96e
d347764
 
33930f0
d347764
 
 
49b592c
d347764
 
 
f805e49
 
c6f1d54
 
49b592c
f805e49
 
 
c737803
 
 
d347764
226ec3a
d347764
f805e49
 
d347764
c737803
 
 
 
 
58bdc84
c737803
 
 
 
 
3946ba6
c737803
49b592c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import gradio as gr
import numpy as np
import torch
from datasets import load_dataset

from transformers import pipeline


device = "cuda:0" if torch.cuda.is_available() else "cpu"

# load speech translation checkpoint
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)


from transformers import VitsModel, VitsTokenizer

spanish_model = VitsModel.from_pretrained("facebook/mms-tts-spa").to(device)
spanish_tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-spa")



embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)


def translate(audio):
    outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "es"})
    return outputs["text"]


def synthesise(text):
    inputs = spanish_tokenizer(text, return_tensors="pt")
    input_ids = inputs["input_ids"]
    with torch.no_grad():
        outputs = spanish_model(input_ids)
    speech = outputs.waveform
    return speech.cpu()[0]



def speech_to_speech_translation(audio):
    translated_text = translate(audio)
    synthesised_speech = synthesise(translated_text)
    synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
    return 16000, synthesised_speech


title = "Cascaded STST"
description = """
Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in English. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Microsoft's
[SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model for text-to-speech:

![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
"""

demo = gr.Blocks()

mic_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    title=title,
    description=description,
)

file_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="upload", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    examples=[["./example.wav"]],
    title=title,
    description=description,
)

with demo:
    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])

demo.launch()