# File size: 2,472 Bytes
# e2ff519
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import gradio as gr
import torch
import os
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset, Audio
import numpy as np
from speechbrain.inference import EncoderClassifier

# Checkpoint identifiers for the text-to-speech stack and the speaker encoder.
_PROCESSOR_CHECKPOINT = "microsoft/speecht5_tts"
_TTS_CHECKPOINT = "tdnathmlenthusiast/speecht5_finetuned_German_dataset"
_VOCODER_CHECKPOINT = "microsoft/speecht5_hifigan"
_SPEAKER_ENCODER_CHECKPOINT = "speechbrain/spkrec-xvect-voxceleb"

# Text processor, acoustic model and vocoder are loaded once at import time
# and shared by every request.
processor = SpeechT5Processor.from_pretrained(_PROCESSOR_CHECKPOINT)
model = SpeechT5ForTextToSpeech.from_pretrained(_TTS_CHECKPOINT)
vocoder = SpeechT5HifiGan.from_pretrained(_VOCODER_CHECKPOINT)

# The speaker encoder runs on the GPU when one is available, otherwise on CPU.
# Its files are stored under /tmp.
device = "cuda" if torch.cuda.is_available() else "cpu"
speaker_model = EncoderClassifier.from_hparams(
    source=_SPEAKER_ENCODER_CHECKPOINT,
    run_opts={"device": device},
    savedir=os.path.join("/tmp", _SPEAKER_ENCODER_CHECKPOINT),
)

def create_speaker_embedding(waveform):
    """Return a normalized speaker embedding for *waveform* as a NumPy array.

    The waveform is wrapped in a tensor and encoded with the module-level
    ``speaker_model``. The encoder output is L2-normalized along dimension 2,
    stripped of singleton dimensions, moved to the CPU and converted to NumPy.
    """
    audio = torch.tensor(waveform)
    # Inference only: no gradients are needed for the embedding.
    with torch.no_grad():
        encoded = speaker_model.encode_batch(audio)
        unit_length = torch.nn.functional.normalize(encoded, dim=2)
    return unit_length.squeeze().cpu().numpy()

# Build the reference speaker embedding from one recording of the dataset.
# Both the success path and the fallback store a float tensor shaped
# (1, embedding_dim), so whatever consumes `speaker_embedding` always sees
# the same type and layout.
try:
    dataset = load_dataset(
        "Thorsten-Voice/TV-44kHz-Full",
        "TV-2023.09-Hessisch",
        split="train",
        trust_remote_code=True,
    )
    # Resample to 16 kHz, the same rate the app reports for its output audio.
    dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
    sample = dataset[10]
    # create_speaker_embedding returns a squeezed NumPy array; convert it to a
    # tensor and restore the leading batch dimension.
    speaker_embedding = torch.as_tensor(
        create_speaker_embedding(sample["audio"]["array"]),
        dtype=torch.float32,
    ).reshape(1, -1)
except Exception as e:
    # Deliberate best-effort: the app should still start when the dataset
    # cannot be downloaded or decoded, so report the problem and carry on.
    print(f"Error loading dataset: {e}")
    # Use a random speaker embedding as fallback
    speaker_embedding = torch.randn(1, 512)





# Single-character substitutions applied before tokenization: digits are
# spelled out and underscores become spaces.
# NOTE(review): the digit names are English although the model is a German
# fine-tune — confirm whether German number words are intended here.
_TEXT_CLEANUP_TABLE = str.maketrans({
    "0": "zero",
    "1": "one",
    "2": "two",
    "3": "three",
    "4": "four",
    "5": "five",
    "6": "six",
    "7": "seven",
    "8": "eight",
    "9": "nine",
    "_": " ",
})


def text_to_speech(text):
    """Synthesize speech for *text* using the module-level TTS models.

    Args:
        text: The text to speak. ``None`` is treated as an empty string.

    Returns:
        A ``(sampling_rate, waveform)`` tuple where ``sampling_rate`` is
        16000 and ``waveform`` is a NumPy array, the format passed to the
        Gradio "audio" output.
    """
    # One pass over the text instead of one ``str.replace`` per character.
    cleaned = (text or "").translate(_TEXT_CLEANUP_TABLE)

    inputs = processor(text=cleaned, return_tensors="pt")
    # Cap the token sequence at the model's configured text length so that
    # very long inputs do not exceed its position limit.
    input_ids = inputs["input_ids"][..., : model.config.max_text_positions]

    # The global embedding may be a NumPy vector or an already-batched tensor;
    # the model needs a float tensor with a leading batch dimension.
    embedding = torch.as_tensor(speaker_embedding, dtype=torch.float32).reshape(1, -1)

    speech = model.generate_speech(input_ids, embedding, vocoder=vocoder)
    return (16000, speech.numpy())



# Gradio front end: a text box feeding text_to_speech and an audio player for
# its result.
_interface_options = {
    "fn": text_to_speech,
    "inputs": "text",
    "outputs": "audio",
    "title": "German Text-to-Speech Using T5 by Tirtha Debnath ",
    "description": "Enter German text to convert to speech",
}
iface = gr.Interface(**_interface_options)

# Start the web server for the demo.
iface.launch()