amir22010 committed
Commit 8cb91c0
1 Parent(s): 8ede9d9

Update app.py

Files changed (1): app.py +16 -32
app.py CHANGED
@@ -6,43 +6,27 @@ import numpy as np
 import wave
 
 #tts
-from balacoon_tts import TTS
-from threading import Lock
-from huggingface_hub import hf_hub_download, list_repo_files
-import io
 import tempfile
+import torchaudio
+from speechbrain.inference.TTS import FastSpeech2
+from speechbrain.inference.vocoders import HIFIGAN
 
-#tts cpu model
-tts_model_str = "en_us_hifi_jets_cpu.addon"
-
-for name in list_repo_files(repo_id="balacoon/tts"):
-    if name == tts_model_str:
-        if not os.path.isfile(os.path.join(os.getcwd(), name)):
-            hf_hub_download(
-                repo_id="balacoon/tts",
-                filename=name,
-                local_dir=os.getcwd(),
-            )
-
-
-tts = TTS(os.path.join(os.getcwd(), tts_model_str))
+fastspeech2 = FastSpeech2.from_hparams(source="speechbrain/tts-fastspeech2-ljspeech", savedir="pretrained_models/tts-fastspeech2-ljspeech")
+hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech", savedir="pretrained_models/tts-hifigan-ljspeech")
 
 def text_to_speech(text):
     with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_file:
-        if len(text) > 1024:
-            # truncate the text
-            text_str = text[:1024]
-        else:
-            text_str = text
-        with locker:
-            global tts
-            samples = tts.synthesize(text_str, "92")
-            output_file = temp_file.name
-            with wave.open(f"{output_file}", "w") as fp:
-                fp.setparams((1, 2, tts.get_sampling_rate(), len(samples), "NONE", "NONE"))
-                samples = np.ascontiguousarray(samples)
-                fp.writeframes(samples)
-            return output_file
+        mel_output, durations, pitch, energy = fastspeech2.encode_text(
+            [text],
+            pace=1.0,        # scale up/down the speed
+            pitch_rate=1.0,  # scale up/down the pitch
+            energy_rate=1.0, # scale up/down the energy
+        )
+        # Running Vocoder (spectrogram-to-waveform)
+        waveforms = hifi_gan.decode_batch(mel_output)
+        # Save the waveform
+        torchaudio.save(temp_file.name, waveforms.squeeze(1), 22050)
+        return temp_file.name
 
 def combine_audio_files(audio_files):
     data= []
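
For context, a minimal standalone sketch of the SpeechBrain FastSpeech2 + HiFi-GAN pipeline that the updated text_to_speech wraps. The model sources, the encode_text/decode_batch calls, and the 22050 Hz sample rate are taken from the diff above; the example sentence and the output path example.wav are illustrative only, not part of the commit.

    import torchaudio
    from speechbrain.inference.TTS import FastSpeech2
    from speechbrain.inference.vocoders import HIFIGAN

    # Load the pretrained LJSpeech models referenced in the updated app.py
    fastspeech2 = FastSpeech2.from_hparams(
        source="speechbrain/tts-fastspeech2-ljspeech",
        savedir="pretrained_models/tts-fastspeech2-ljspeech",
    )
    hifi_gan = HIFIGAN.from_hparams(
        source="speechbrain/tts-hifigan-ljspeech",
        savedir="pretrained_models/tts-hifigan-ljspeech",
    )

    # Text -> mel spectrogram (FastSpeech2), then mel -> waveform (HiFi-GAN)
    mel_output, durations, pitch, energy = fastspeech2.encode_text(
        ["Hello from the updated text to speech pipeline."],  # illustrative input
        pace=1.0,
        pitch_rate=1.0,
        energy_rate=1.0,
    )
    waveforms = hifi_gan.decode_batch(mel_output)

    # LJSpeech models produce audio at 22050 Hz, matching the rate used in app.py
    torchaudio.save("example.wav", waveforms.squeeze(1), 22050)

Note that, unlike the removed balacoon path, the new text_to_speech no longer truncates input to 1024 characters or serializes synthesis behind a lock, so very long strings may need to be chunked by the caller before being passed to combine_audio_files.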