Video_translation_with_speaker_diarization_and_voice_cloning_private

Build error

App Files Files Community

vitaliy-sharandin committed on Dec 17, 2023

Commit

2b2125a

•

1 Parent(s): 66a9871

Update app.py

Browse files

Files changed (1) hide show

app.py +64 -30

app.py CHANGED Viewed

@@ -100,18 +100,12 @@ def speaker_voice_clips(transcription, audio_path):
 # Perform text translation
 def translate_transcript(transcript, target_language, deepl_token):
- language_map = {
- 'en':'en-us',
- 'ru':'ru',
- 'uk':'uk',
- 'pl':'pl'}
  translator = deepl.Translator(deepl_token)
  translated_transcript = []
  for segment in transcript:
  text_to_translate = segment['text']
- translated_text = translator.translate_text(text_to_translate, target_lang=language_map[target_language])
  translated_segment = {
  'start': segment['start'],
@@ -136,21 +130,16 @@ def adjust_voice_pace(sound_array, sample_rate, target_duration):
 # Perform voice cloning
-def voice_cloning_translation(translated_transcription, speakers_voice_clips, target_language, speaker_model, audio_path):
  device = "cuda"
- vits_language_map = {
- 'en':'eng',
- 'ru':'rus',
- 'uk':'ukr',
- 'pl':'pol'
- }
  # Select model
  selected_model = None
- if 'vits' in speaker_model.lower() or target_language == 'uk':
- selected_model = f'tts_models/{vits_language_map[target_language]}/fairseq/vits'
  else:
  selected_model = 'tts_models/multilingual/multi-dataset/xtts_v2'
@@ -188,14 +177,14 @@ def voice_cloning_translation(translated_transcription, speakers_voice_clips, ta
  audio = tts.tts(text=speech_item['text'], speaker_wav=speakers_voice_clips[speech_item['speaker']])
  sample_rate = tts.synthesizer.output_sample_rate
  else:
- audio = tts.tts(text=speech_item['text'], speaker_wav=speakers_voice_clips[speech_item['speaker']], language=target_language)
  sample_rate = tts.synthesizer.output_sample_rate
  # Adjust pace to fit the speech timeframe if translated audio is longer than phrase
  audio_duration = len(audio) / sample_rate
  if speech_item_duration < audio_duration:
  audio = adjust_voice_pace(audio, sample_rate, speech_item_duration)
  # Resample to higher rate
  new_sample_rate = 44100
  audio = librosa.resample(np.array(audio), orig_sr=sample_rate, target_sr=new_sample_rate)
@@ -240,25 +229,50 @@ def dub_video(video_path, translated_audio_track, target_language):
 # Perform video translation
-def video_translation(video_path, target_language, speaker_model, hf_token, deepl_token):
  original_audio_path = extract_audio(video_path)
  transcription = speech_diarization(original_audio_path, hf_token)
- translated_transcription = translate_transcript(transcription, target_language, deepl_token)
  speakers_voice_clips = speaker_voice_clips(transcription, original_audio_path)
- translated_audio_track = voice_cloning_translation(translated_transcription, speakers_voice_clips, target_language, speaker_model, original_audio_path)
- video_with_dubbing = dub_video(video_path, translated_audio_track, target_language)
  return video_with_dubbing
 def download_youtube_video(url):
  yt = YouTube(url)
  if yt.age_restricted:
@@ -316,19 +330,33 @@ def translate_video(video_path, youtube_link, target_language, speaker_model):
  if video_path is None:
  gr.Warning("Video input did not process well, try again")
  return translation_limit(), None
- dubbed_video_path = video_translation(video_path, target_language, speaker_model, HF_TOKEN, DEEPL_TOKEN)
  limit_info = translation_limit()
  return limit_info, dubbed_video_path
  except Exception as e:
  print(f"An error occurred: {e}")
  raise e
 initial_usage_info = translation_limit()
-with gr.Blocks(theme=gr.themes.Soft(), css=".column-frame {border: 2px solid #AAA;border-radius: 10px;padding: 10px;margin: 10px;}") as demo:
  gr.Markdown("<h1 style='text-align: center;'>🌐AI Video Translation</h2>")
- gr.Markdown("<h3 style='text-align: center;'>Currently supported languages are: English, Polish, Ukrainian, and Russian</h3>")
  with gr.Row():
  with gr.Column(elem_classes=["column-frame"]):
@@ -337,9 +365,9 @@ with gr.Blocks(theme=gr.themes.Soft(), css=".column-frame {border: 2px solid #AA
  video = gr.Video(label="Upload a video file")
  gr.Markdown("<h3 style='text-align: center;'>OR</h3>")
  youtube_link = gr.Textbox(label="Paste YouTube link")
- gr.Markdown("⚠️If you get a warning that the video is age restricted, manually download it using the following [link](https://en.savefrom.net/) and use file upload, as pytube library doesn't support restricted videos download.")
  gr.Markdown("---")
- target_language = gr.Dropdown(["en", "pl", "uk", "ru"], value="pl", label="Select translation target language")
  speaker_model = gr.Dropdown(["(Recommended) XTTS_V2", "VITs (will be default for Ukrainian)"], value="(Recommended) XTTS_V2", label="Select text-to-speech generation model")
  with gr.Row():
  clear_btn = gr.Button("Clear inputs")
@@ -350,7 +378,13 @@ with gr.Blocks(theme=gr.themes.Soft(), css=".column-frame {border: 2px solid #AA
  with gr.Column():
  gr.Markdown("<h2 style='text-align: center;'>Translated Video</h3>")
  output_video = gr.Video(label="Translated video")
  translate_btn.click(
  fn=translate_video,
  inputs=[video, youtube_link, target_language, speaker_model],

 # Perform text translation
 def translate_transcript(transcript, target_language, deepl_token):
  translator = deepl.Translator(deepl_token)
  translated_transcript = []
  for segment in transcript:
  text_to_translate = segment['text']
+ translated_text = translator.translate_text(text_to_translate, target_lang=target_language)
  translated_segment = {
  'start': segment['start'],
 # Perform voice cloning
+def voice_cloning_translation(translated_transcription, speakers_voice_clips, target_language_codes, speaker_model, audio_path):
  device = "cuda"
+ xtts2_language_code = target_language_codes[0]
+ vits_language_code = target_language_codes[1]
  # Select model
  selected_model = None
+ if 'vits' in speaker_model.lower() or xtts2_language_code == 'uk':
+ selected_model = f'tts_models/{vits_language_code}/fairseq/vits'
  else:
  selected_model = 'tts_models/multilingual/multi-dataset/xtts_v2'
  audio = tts.tts(text=speech_item['text'], speaker_wav=speakers_voice_clips[speech_item['speaker']])
  sample_rate = tts.synthesizer.output_sample_rate
  else:
+ audio = tts.tts(text=speech_item['text'], speaker_wav=speakers_voice_clips[speech_item['speaker']], language=xtts2_language_code)
  sample_rate = tts.synthesizer.output_sample_rate
  # Adjust pace to fit the speech timeframe if translated audio is longer than phrase
  audio_duration = len(audio) / sample_rate
  if speech_item_duration < audio_duration:
  audio = adjust_voice_pace(audio, sample_rate, speech_item_duration)
  # Resample to higher rate
  new_sample_rate = 44100
  audio = librosa.resample(np.array(audio), orig_sr=sample_rate, target_sr=new_sample_rate)
 # Perform video translation
def video_translation(video_path, target_language_codes, speaker_model, hf_token, deepl_token):
    """Run the full dubbing pipeline on one video and return its result.

    Steps, in order: extract the audio track, diarize/transcribe it,
    translate the transcript, collect per-speaker voice clips, synthesize the
    translated speech, and dub it onto the video.

    Args:
        video_path: Path of the video to translate.
        target_language_codes: Indexable codes for the target language;
            index 0 is passed to ``dub_video``, index 2 to
            ``translate_transcript``, and the whole value to
            ``voice_cloning_translation``.
        speaker_model: Text-to-speech model choice, forwarded as-is.
        hf_token: Token forwarded to ``speech_diarization``.
        deepl_token: Token forwarded to ``translate_transcript``.

    Returns:
        Whatever ``dub_video`` returns for the dubbed video.
    """
    source_audio = extract_audio(video_path)
    segments = speech_diarization(source_audio, hf_token)

    # Index 2 holds the code handed to the text-translation step.
    translation_code = target_language_codes[2]
    translated_segments = translate_transcript(segments, translation_code, deepl_token)

    # Voice clips are cut from the original (untranslated) transcription.
    voice_clips = speaker_voice_clips(segments, source_audio)
    dubbed_track = voice_cloning_translation(
        translated_segments,
        voice_clips,
        target_language_codes,
        speaker_model,
        source_audio,
    )

    return dub_video(video_path, dubbed_track, target_language_codes[0])
# Maps the language name shown in the UI to the codes each backend expects,
# in this fixed order: (XTTS v2 code, fairseq VITS code, DeepL target code).
# DeepL rejects the bare "en" and "pt" codes as *target* languages, so both
# use an explicit regional variant ("en-us", "pt-br").
language_codes = {
    "Chinese": ("zh-cn", "zho", "zh"),
    "Czech": ("cs", "ces", "cs"),
    "Dutch": ("nl", "nld", "nl"),
    "English": ("en", "eng", "en-us"),
    "French": ("fr", "fra", "fr"),
    "German": ("de", "deu", "de"),
    "Hungarian": ("hu", "hun", "hu"),
    "Italian": ("it", "ita", "it"),
    "Japanese": ("ja", "jpn", "ja"),
    "Korean": ("ko", "kor", "ko"),
    "Polish": ("pl", "pol", "pl"),
    "Portuguese": ("pt", "por", "pt-br"),
    "Russian": ("ru", "rus", "ru"),
    "Spanish": ("es", "spa", "es"),
    "Turkish": ("tr", "tur", "tr"),
    "Ukrainian": ("uk", "ukr", "uk"),
}
def check_video_duration(video_path, max_duration_seconds=180):
    """Report whether a video is longer than the allowed duration.

    Note the direction of the result: ``True`` means the video is TOO LONG
    (the caller rejects it), ``False`` means it is within the limit.

    Args:
        video_path: Path of the video file to inspect.
        max_duration_seconds: Longest accepted duration in seconds. Defaults
            to 180 (3 minutes), the limit the UI warning refers to.

    Returns:
        True if the clip's duration is strictly greater than
        ``max_duration_seconds``, otherwise False.
    """
    # The context manager releases the clip's file handles once the duration is read.
    with mp.VideoFileClip(video_path) as video:
        duration = video.duration
    return duration > max_duration_seconds
 def download_youtube_video(url):
  yt = YouTube(url)
  if yt.age_restricted:
  if video_path is None:
  gr.Warning("Video input did not process well, try again")
  return translation_limit(), None
+ if check_video_duration(video_path):
+ gr.Warning("Video is longer than 3 minutes, please provide a shorter one")
+ return translation_limit(), None
+ target_language_codes = language_codes[target_language]
+ dubbed_video_path = video_translation(video_path, target_language_codes, speaker_model, HF_TOKEN, DEEPL_TOKEN)
  limit_info = translation_limit()
  return limit_info, dubbed_video_path
  except Exception as e:
  print(f"An error occurred: {e}")
  raise e
+css = """
+.column-frame {
+ border: 2px solid #AAA;
+ border-radius: 10px;
+ padding: 10px;
+ margin: 10px;
+}
+"""
 initial_usage_info = translation_limit()
+with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
  gr.Markdown("<h1 style='text-align: center;'>🌐AI Video Translation</h2>")
  with gr.Row():
  with gr.Column(elem_classes=["column-frame"]):
  video = gr.Video(label="Upload a video file")
  gr.Markdown("<h3 style='text-align: center;'>OR</h3>")
  youtube_link = gr.Textbox(label="Paste YouTube link")
+ gr.Markdown("⚠️If you get a warning that the video is age restricted, manually download it using the following [link](https://downloaderto.com/) and use file upload, as pytube library doesn't support restricted videos download.")
  gr.Markdown("---")
+ target_language = gr.Dropdown(list(language_codes.keys()), value="English", label="Select translation target language")
  speaker_model = gr.Dropdown(["(Recommended) XTTS_V2", "VITs (will be default for Ukrainian)"], value="(Recommended) XTTS_V2", label="Select text-to-speech generation model")
  with gr.Row():
  clear_btn = gr.Button("Clear inputs")
  with gr.Column():
  gr.Markdown("<h2 style='text-align: center;'>Translated Video</h3>")
  output_video = gr.Video(label="Translated video")
+ gr.Examples(
+ [[None, 'https://www.youtube.com/watch?v=q4kkQSkrrtI', 'Japanese', "(Recommended) XTTS_V2"]],
+ [video, youtube_link, target_language, speaker_model],
+ [translation_limit_info, output_video],
+ translate_video,
+ run_on_click=True,
+ )
  translate_btn.click(
  fn=translate_video,
  inputs=[video, youtube_link, target_language, speaker_model],