vitaliy-sharandin committed on
Commit
ef230f3
1 Parent(s): 31bd8c2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -11
app.py CHANGED
@@ -12,7 +12,6 @@ import soundfile as sf
12
  import librosa
13
  from TTS.api import TTS
14
 
15
- os.environ["COQUI_TOS_AGREED"] = "1"
16
  HF_TOKEN = os.environ["HF_TOKEN"]
17
  DEEPL_TOKEN = os.environ["DEEPL_TOKEN"]
18
 
@@ -132,7 +131,7 @@ def translate_transcript(transcript, target_language, deepl_token):
132
  def adjust_voice_pace(sound_array, sample_rate, target_duration):
133
  duration = len(sound_array) / sample_rate
134
  tempo_change = duration / target_duration
135
- sound_array_stretched = pyrb.time_stretch(sound_array, sample_rate, tempo_change)
136
  return sound_array_stretched
137
 
138
 
@@ -186,23 +185,20 @@ def voice_cloning_translation(translated_transcription, speakers_voice_clips, ta
186
  sample_rate = None
187
  audio = None
188
  if 'vits' in selected_model:
189
- audio = tts.tts_with_vc(text=speech_item['text'], speaker_wav=speakers_voice_clips[speech_item['speaker']])
190
- sample_rate = tts.voice_converter.vc_config.audio.output_sample_rate
191
  else:
192
- # TODO remove when https://github.com/coqui-ai/TTS/issues/3224 is resolved
193
- tts = TTS(selected_model).to(device)
194
- audio = tts.tts_with_vc(text=speech_item['text'], speaker_wav=speakers_voice_clips[speech_item['speaker']], language=target_language)
195
- sample_rate = tts.voice_converter.vc_config.audio.output_sample_rate
196
- del tts; import gc; gc.collect(); torch.cuda.empty_cache()
197
 
198
  # Adjust pace to fit the speech timeframe if translated audio is longer than phrase
199
  audio_duration = len(audio) / sample_rate
200
  if speech_item_duration < audio_duration:
201
  audio = adjust_voice_pace(audio, sample_rate, speech_item_duration)
202
-
203
  # Resample to higher rate
204
  new_sample_rate = 44100
205
- audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=new_sample_rate)
206
 
207
  # Transform to AudioArrayClip object
208
  audio = np.expand_dims(audio, axis=1)
 
12
  import librosa
13
  from TTS.api import TTS
14
 
 
15
  HF_TOKEN = os.environ["HF_TOKEN"]
16
  DEEPL_TOKEN = os.environ["DEEPL_TOKEN"]
17
 
 
131
  def adjust_voice_pace(sound_array, sample_rate, target_duration):
132
  duration = len(sound_array) / sample_rate
133
  tempo_change = duration / target_duration
134
+ sound_array_stretched = pyrb.time_stretch(np.array(sound_array), sample_rate, tempo_change)
135
  return sound_array_stretched
136
 
137
 
 
185
  sample_rate = None
186
  audio = None
187
  if 'vits' in selected_model:
188
+ audio = tts.tts(text=speech_item['text'], speaker_wav=speakers_voice_clips[speech_item['speaker']])
189
+ sample_rate = tts.synthesizer.output_sample_rate
190
  else:
191
+ audio = tts.tts(text=speech_item['text'], speaker_wav=speakers_voice_clips[speech_item['speaker']], language=target_language)
192
+ sample_rate = tts.synthesizer.output_sample_rate
 
 
 
193
 
194
  # Adjust pace to fit the speech timeframe if translated audio is longer than phrase
195
  audio_duration = len(audio) / sample_rate
196
  if speech_item_duration < audio_duration:
197
  audio = adjust_voice_pace(audio, sample_rate, speech_item_duration)
198
+
199
  # Resample to higher rate
200
  new_sample_rate = 44100
201
+ audio = librosa.resample(np.array(audio), orig_sr=sample_rate, target_sr=new_sample_rate)
202
 
203
  # Transform to AudioArrayClip object
204
  audio = np.expand_dims(audio, axis=1)