vitaliy-sharandin
committed on
Commit
•
ef230f3
1
Parent(s):
31bd8c2
Update app.py
Browse files
app.py
CHANGED
@@ -12,7 +12,6 @@ import soundfile as sf
|
|
12 |
import librosa
|
13 |
from TTS.api import TTS
|
14 |
|
15 |
-
os.environ["COQUI_TOS_AGREED"] = "1"
|
16 |
HF_TOKEN = os.environ["HF_TOKEN"]
|
17 |
DEEPL_TOKEN = os.environ["DEEPL_TOKEN"]
|
18 |
|
@@ -132,7 +131,7 @@ def translate_transcript(transcript, target_language, deepl_token):
|
|
132 |
def adjust_voice_pace(sound_array, sample_rate, target_duration):
    """Time-stretch audio so that it fits within ``target_duration`` seconds.

    Args:
        sound_array: 1-D sequence of audio samples (list or ``np.ndarray``).
        sample_rate: Sample rate of ``sound_array`` in Hz.
        target_duration: Desired output duration in seconds.

    Returns:
        ``np.ndarray`` of samples stretched by ``duration / target_duration``.
    """
    duration = len(sound_array) / sample_rate
    tempo_change = duration / target_duration
    # pyrubberband's time_stretch expects an ndarray; coerce explicitly so a
    # plain Python list (as returned by some TTS backends) does not crash it.
    sound_array_stretched = pyrb.time_stretch(np.array(sound_array), sample_rate, tempo_change)
    return sound_array_stretched
|
137 |
|
138 |
|
@@ -186,23 +185,20 @@ def voice_cloning_translation(translated_transcription, speakers_voice_clips, ta
|
|
186 |
sample_rate = None
|
187 |
audio = None
|
188 |
if 'vits' in selected_model:
|
189 |
-
audio = tts.
|
190 |
-
sample_rate = tts.
|
191 |
else:
|
192 |
-
|
193 |
-
|
194 |
-
audio = tts.tts_with_vc(text=speech_item['text'], speaker_wav=speakers_voice_clips[speech_item['speaker']], language=target_language)
|
195 |
-
sample_rate = tts.voice_converter.vc_config.audio.output_sample_rate
|
196 |
-
del tts; import gc; gc.collect(); torch.cuda.empty_cache()
|
197 |
|
198 |
# Adjust pace to fit the speech timeframe if translated audio is longer than phrase
|
199 |
audio_duration = len(audio) / sample_rate
|
200 |
if speech_item_duration < audio_duration:
|
201 |
audio = adjust_voice_pace(audio, sample_rate, speech_item_duration)
|
202 |
-
|
203 |
# Resample to higher rate
|
204 |
new_sample_rate = 44100
|
205 |
-
audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=new_sample_rate)
|
206 |
|
207 |
# Transform to AudioArrayClip object
|
208 |
audio = np.expand_dims(audio, axis=1)
|
|
|
12 |
import librosa
|
13 |
from TTS.api import TTS
|
14 |
|
|
|
15 |
HF_TOKEN = os.environ["HF_TOKEN"]
|
16 |
DEEPL_TOKEN = os.environ["DEEPL_TOKEN"]
|
17 |
|
|
|
131 |
def adjust_voice_pace(sound_array, sample_rate, target_duration):
    """Stretch audio in time so its length approaches ``target_duration`` seconds.

    The stretch ratio is the current duration divided by the target duration,
    so audio longer than the target is sped up proportionally.

    Args:
        sound_array: 1-D audio samples (list or ``np.ndarray``).
        sample_rate: Sample rate in Hz.
        target_duration: Target duration in seconds.

    Returns:
        ``np.ndarray`` with the time-stretched samples.
    """
    current_duration = len(sound_array) / sample_rate
    stretch_ratio = current_duration / target_duration
    return pyrb.time_stretch(np.array(sound_array), sample_rate, stretch_ratio)
|
136 |
|
137 |
|
|
|
185 |
sample_rate = None
|
186 |
audio = None
|
187 |
if 'vits' in selected_model:
|
188 |
+
audio = tts.tts(text=speech_item['text'], speaker_wav=speakers_voice_clips[speech_item['speaker']])
|
189 |
+
sample_rate = tts.synthesizer.output_sample_rate
|
190 |
else:
|
191 |
+
audio = tts.tts(text=speech_item['text'], speaker_wav=speakers_voice_clips[speech_item['speaker']], language=target_language)
|
192 |
+
sample_rate = tts.synthesizer.output_sample_rate
|
|
|
|
|
|
|
193 |
|
194 |
# Adjust pace to fit the speech timeframe if translated audio is longer than phrase
|
195 |
audio_duration = len(audio) / sample_rate
|
196 |
if speech_item_duration < audio_duration:
|
197 |
audio = adjust_voice_pace(audio, sample_rate, speech_item_duration)
|
198 |
+
|
199 |
# Resample to higher rate
|
200 |
new_sample_rate = 44100
|
201 |
+
audio = librosa.resample(np.array(audio), orig_sr=sample_rate, target_sr=new_sample_rate)
|
202 |
|
203 |
# Transform to AudioArrayClip object
|
204 |
audio = np.expand_dims(audio, axis=1)
|