emirhanbilgic committed
Commit b40d902 · verified · 1 Parent(s): 4789559

Update app.py

Files changed (1):
  1. app.py +4 -4
app.py CHANGED
@@ -3,6 +3,7 @@ import torch
 from datasets import load_dataset
 from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 import soundfile as sf
+import numpy as np
 
 # Load the fine-tuned model, processor, and vocoder
 model_name = "microsoft/speecht5_tts"
@@ -10,16 +11,15 @@ processor = SpeechT5Processor.from_pretrained(model_name)
 model = SpeechT5ForTextToSpeech.from_pretrained("emirhanbilgic/speecht5_finetuned_emirhan_tr")
 vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
 
-# Load speaker embeddings (using the same as before, as it's not clear if a specific embedding is needed for the Turkish model)
+# Load speaker embeddings
 embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
 speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
 
-
 def text_to_speech(text):
     inputs = processor(text=text, return_tensors="pt")
     speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
-    sf.write("output.wav", speech.numpy(), samplerate=16000)
-    return "output.wav"
+    speech_numpy = speech.numpy()
+    return (16000, speech_numpy)  # Return sample rate and numpy array
 
 # Create Gradio interface
 iface = gr.Interface(
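
For context, below is a minimal sketch of the resulting app.py, assuming the gr.Interface(...) call (which the diff cuts off) wires text_to_speech to a Textbox input and an Audio output. Gradio's Audio component accepts a (sample_rate, numpy_array) tuple directly, which is why the sf.write call and the temporary output.wav file could be dropped. The interface arguments and the launch() call are assumptions, not part of this commit.

import gradio as gr
import torch
import soundfile as sf  # still imported in the file, though no longer used after this change
import numpy as np      # added in this commit
from datasets import load_dataset
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

# Load the fine-tuned model, processor, and vocoder (as in the diff above)
model_name = "microsoft/speecht5_tts"
processor = SpeechT5Processor.from_pretrained(model_name)
model = SpeechT5ForTextToSpeech.from_pretrained("emirhanbilgic/speecht5_finetuned_emirhan_tr")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Load speaker embeddings
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

def text_to_speech(text):
    inputs = processor(text=text, return_tensors="pt")
    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
    # gr.Audio accepts a (sample_rate, numpy_array) tuple, so no temporary WAV file is needed
    return (16000, speech.numpy())

# Create Gradio interface (arguments assumed; the diff stops at gr.Interface()
iface = gr.Interface(
    fn=text_to_speech,
    inputs=gr.Textbox(label="Text"),
    outputs=gr.Audio(label="Generated speech"),
)

if __name__ == "__main__":
    iface.launch()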