tdnathmlenthusiast committed on
Commit
ceb3363
1 Parent(s): e2ff519

fixed syntax error (stray double quote in the load_dataset call)

Browse files
Files changed (1) hide show
  1. app.py +70 -70
app.py CHANGED
@@ -1,71 +1,71 @@
1
- import gradio as gr
2
- import torch
3
- import os
4
- from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
5
- from datasets import load_dataset, Audio
6
- import numpy as np
7
- from speechbrain.inference import EncoderClassifier
8
-
9
- # Load models and processor
10
- processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
11
- model = SpeechT5ForTextToSpeech.from_pretrained("tdnathmlenthusiast/speecht5_finetuned_German_dataset")
12
- vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
13
-
14
- # Load speaker encoder
15
- device = "cuda" if torch.cuda.is_available() else "cpu"
16
- speaker_model = EncoderClassifier.from_hparams(
17
- source="speechbrain/spkrec-xvect-voxceleb",
18
- run_opts={"device": device},
19
- savedir=os.path.join("/tmp", "speechbrain/spkrec-xvect-voxceleb")
20
- )
21
-
22
- def create_speaker_embedding(waveform):
23
- with torch.no_grad():
24
- speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform))
25
- speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
26
- speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
27
- return speaker_embeddings
28
-
29
- # Load a sample from the dataset for speaker embedding
30
- try:
31
- dataset = load_dataset(""Thorsten-Voice/TV-44kHz-Full", "TV-2023.09-Hessisch", split="train", trust_remote_code=True)
32
- dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
33
- sample = dataset[10]
34
- speaker_embedding = create_speaker_embedding(sample['audio']['array'])
35
- except Exception as e:
36
- print(f"Error loading dataset: {e}")
37
- # Use a random speaker embedding as fallback
38
- speaker_embedding = torch.randn(1, 512)
39
-
40
-
41
- def text_to_speech(text):
42
- # Clean up text
43
- replacements = [
44
- ("0", "zero"),
45
- ("1", "one"),
46
- ("2", "two"),
47
- ("3", "three"),
48
- ("4", "four"),
49
- ("5", "five"),
50
- ("6", "six"),
51
- ("7", "seven"),
52
- ("8", "eight"),
53
- ("9", "nine"),
54
- ("_", " ")
55
- ]
56
- for src, dst in replacements:
57
- text = text.replace(src, dst)
58
-
59
- inputs = processor(text=text, return_tensors="pt")
60
- speech = model.generate_speech(inputs["input_ids"], speaker_embedding, vocoder=vocoder)
61
- return (16000, speech.numpy())
62
-
63
- iface = gr.Interface(
64
- fn=text_to_speech,
65
- inputs="text",
66
- outputs="audio",
67
- title="German Text-to-Speech Using T5 by Tirtha Debnath ",
68
- description="Enter German text to convert to speech"
69
- )
70
-
71
  iface.launch()
 
1
+ import gradio as gr
2
+ import torch
3
+ import os
4
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
5
+ from datasets import load_dataset, Audio
6
+ import numpy as np
7
+ from speechbrain.inference import EncoderClassifier
8
+
9
# Text-to-speech components: the processor comes from the base SpeechT5
# checkpoint, the acoustic model is the German fine-tune, and the HiFi-GAN
# vocoder is handed to generate_speech() further down in this file.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained(
    "tdnathmlenthusiast/speecht5_finetuned_German_dataset"
)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Speaker encoder: placed on the GPU when one is available, with its
# savedir under /tmp mirroring the model id.
_SPEAKER_MODEL_ID = "speechbrain/spkrec-xvect-voxceleb"
device = "cuda" if torch.cuda.is_available() else "cpu"
speaker_model = EncoderClassifier.from_hparams(
    source=_SPEAKER_MODEL_ID,
    savedir=os.path.join("/tmp", _SPEAKER_MODEL_ID),
    run_opts={"device": device},
)
21
+
22
def create_speaker_embedding(waveform):
    """Encode a waveform into a normalized speaker embedding.

    The waveform is wrapped in a tensor and passed through the module-level
    ``speaker_model`` with gradient tracking disabled. The result is
    normalized along dim 2, squeezed, moved to the CPU and returned as a
    NumPy array.
    """
    with torch.no_grad():
        raw = speaker_model.encode_batch(torch.tensor(waveform))
        unit = torch.nn.functional.normalize(raw, dim=2)
    return unit.squeeze().cpu().numpy()
28
+
29
# Derive the speaker embedding from one recording of the dataset. If anything
# in that pipeline fails, fall back to a random embedding so the app can
# still start.
try:
    dataset = load_dataset(
        "Thorsten-Voice/TV-44kHz-Full",
        "TV-2023.09-Hessisch",
        split="train",
        trust_remote_code=True,
    )
    dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
    sample = dataset[10]
    # create_speaker_embedding() returns a squeezed NumPy array, whereas
    # generate_speech() needs a batched torch tensor. Convert here so both
    # branches leave `speaker_embedding` as a float tensor of shape (1, dim).
    speaker_embedding = torch.as_tensor(
        create_speaker_embedding(sample["audio"]["array"]),
        dtype=torch.float32,
    ).reshape(1, -1)
except Exception as e:
    # Deliberate best-effort: report the problem and keep the app usable.
    print(f"Error loading dataset: {e}")
    # Use a random speaker embedding as fallback
    speaker_embedding = torch.randn(1, 512)
39
+
40
+
41
# Single-character substitutions applied to the input before tokenization.
# NOTE(review): digits are spelled out in English although the app targets
# German text -- confirm this is intended.
_CHAR_REPLACEMENTS = str.maketrans(
    {
        "0": "zero",
        "1": "one",
        "2": "two",
        "3": "three",
        "4": "four",
        "5": "five",
        "6": "six",
        "7": "seven",
        "8": "eight",
        "9": "nine",
        "_": " ",
    }
)


def text_to_speech(text):
    """Synthesize speech for ``text`` using the module-level models.

    Args:
        text: Input string. Digits and underscores are substituted according
            to ``_CHAR_REPLACEMENTS`` before tokenization.

    Returns:
        A ``(16000, samples)`` tuple where ``samples`` is the generated
        waveform as a NumPy array, the form handed to the Gradio "audio"
        output.
    """
    # One pass over the string; equivalent to the chained replacements
    # because no substituted word contains a digit or an underscore.
    text = text.translate(_CHAR_REPLACEMENTS)

    inputs = processor(text=text, return_tensors="pt")

    # The module-level embedding may be a squeezed NumPy array (dataset path)
    # or a (1, 512) tensor (fallback path). generate_speech() needs a batched
    # float tensor, so coerce it here regardless of where it came from.
    embedding = torch.as_tensor(speaker_embedding, dtype=torch.float32).reshape(1, -1)

    speech = model.generate_speech(inputs["input_ids"], embedding, vocoder=vocoder)
    return (16000, speech.cpu().numpy())
62
+
63
# Gradio front end: text input, audio output, served by text_to_speech().
iface = gr.Interface(
    text_to_speech,
    "text",
    "audio",
    title="German Text-to-Speech Using T5 by Tirtha Debnath ",
    description="Enter German text to convert to speech",
)

# Start the web app.
iface.launch()