vitaliy-sharandin committed on
Commit 2b2125a
1 Parent(s): 66a9871

Update app.py

Files changed (1)
  1. app.py +64 -30
app.py CHANGED
@@ -100,18 +100,12 @@ def speaker_voice_clips(transcription, audio_path):
 
 # Perform text translation
 def translate_transcript(transcript, target_language, deepl_token):
-    language_map = {
-        'en':'en-us',
-        'ru':'ru',
-        'uk':'uk',
-        'pl':'pl'}
-
     translator = deepl.Translator(deepl_token)
 
     translated_transcript = []
     for segment in transcript:
         text_to_translate = segment['text']
-        translated_text = translator.translate_text(text_to_translate, target_lang=language_map[target_language])
+        translated_text = translator.translate_text(text_to_translate, target_lang=target_language)
 
         translated_segment = {
             'start': segment['start'],
@@ -136,21 +130,16 @@ def adjust_voice_pace(sound_array, sample_rate, target_duration):
 
 
 # Perform voice cloning
-def voice_cloning_translation(translated_transcription, speakers_voice_clips, target_language, speaker_model, audio_path):
+def voice_cloning_translation(translated_transcription, speakers_voice_clips, target_language_codes, speaker_model, audio_path):
     device = "cuda"
-
-    vits_language_map = {
-        'en':'eng',
-        'ru':'rus',
-        'uk':'ukr',
-        'pl':'pol'
-    }
+    xtts2_language_code = target_language_codes[0]
+    vits_language_code = target_language_codes[1]
 
     # Select model
     selected_model = None
 
-    if 'vits' in speaker_model.lower() or target_language == 'uk':
-        selected_model = f'tts_models/{vits_language_map[target_language]}/fairseq/vits'
+    if 'vits' in speaker_model.lower() or xtts2_language_code == 'uk':
+        selected_model = f'tts_models/{vits_language_code}/fairseq/vits'
     else:
         selected_model = 'tts_models/multilingual/multi-dataset/xtts_v2'
 
@@ -188,14 +177,14 @@ def voice_cloning_translation(translated_transcription, speakers_voice_clips, ta
             audio = tts.tts(text=speech_item['text'], speaker_wav=speakers_voice_clips[speech_item['speaker']])
             sample_rate = tts.synthesizer.output_sample_rate
         else:
-            audio = tts.tts(text=speech_item['text'], speaker_wav=speakers_voice_clips[speech_item['speaker']], language=target_language)
+            audio = tts.tts(text=speech_item['text'], speaker_wav=speakers_voice_clips[speech_item['speaker']], language=xtts2_language_code)
             sample_rate = tts.synthesizer.output_sample_rate
 
         # Adjust pace to fit the speech timeframe if translated audio is longer than phrase
         audio_duration = len(audio) / sample_rate
         if speech_item_duration < audio_duration:
            audio = adjust_voice_pace(audio, sample_rate, speech_item_duration)
-
+
         # Resample to higher rate
         new_sample_rate = 44100
         audio = librosa.resample(np.array(audio), orig_sr=sample_rate, target_sr=new_sample_rate)
@@ -240,25 +229,50 @@ def dub_video(video_path, translated_audio_track, target_language):
 
 
 # Perform video translation
-def video_translation(video_path, target_language, speaker_model, hf_token, deepl_token):
-
+def video_translation(video_path, target_language_codes, speaker_model, hf_token, deepl_token):
+
     original_audio_path = extract_audio(video_path)
 
     transcription = speech_diarization(original_audio_path, hf_token)
 
-    translated_transcription = translate_transcript(transcription, target_language, deepl_token)
+    translated_transcription = translate_transcript(transcription, target_language_codes[2], deepl_token)
 
     speakers_voice_clips = speaker_voice_clips(transcription, original_audio_path)
 
-    translated_audio_track = voice_cloning_translation(translated_transcription, speakers_voice_clips, target_language, speaker_model, original_audio_path)
+    translated_audio_track = voice_cloning_translation(translated_transcription, speakers_voice_clips, target_language_codes, speaker_model, original_audio_path)
 
-    video_with_dubbing = dub_video(video_path, translated_audio_track, target_language)
+    video_with_dubbing = dub_video(video_path, translated_audio_track, target_language_codes[0])
 
     return video_with_dubbing
 
 
 
 
+# Language: xtts2, vits, deepl
+language_codes = {
+    "Chinese": ("zh-cn", "zho", "zh"),
+    "Czech": ("cs", "ces", "cs"),
+    "Dutch": ("nl", "nld", "nl"),
+    "English": ("en", "eng", "en-us"),
+    "French": ("fr", "fra", "fr"),
+    "German": ("de", "deu", "de"),
+    "Hungarian": ("hu", "hun", "hu"),
+    "Italian": ("it", "ita", "it"),
+    "Japanese": ("ja", "jpn", "ja"),
+    "Korean": ("ko", "kor", "ko"),
+    "Polish": ("pl", "pol", "pl"),
+    "Portuguese": ("pt", "por", "pt"),
+    "Russian": ("ru", "rus", "ru"),
+    "Spanish": ("es", "spa", "es"),
+    "Turkish": ("tr", "tur", "tr"),
+    "Ukrainian": ("uk", "ukr", "uk")
+}
+
+def check_video_duration(video_path):
+    with mp.VideoFileClip(video_path) as video:
+        duration = video.duration
+    return duration > 180
+
 def download_youtube_video(url):
     yt = YouTube(url)
     if yt.age_restricted:
@@ -316,19 +330,33 @@ def translate_video(video_path, youtube_link, target_language, speaker_model):
         if video_path is None:
             gr.Warning("Video input did not process well, try again")
             return translation_limit(), None
-        dubbed_video_path = video_translation(video_path, target_language, speaker_model, HF_TOKEN, DEEPL_TOKEN)
+
+        if check_video_duration(video_path):
+            gr.Warning("Video is longer than 3 minutes, please provide a shorter one")
+            return translation_limit(), None
+
+        target_language_codes = language_codes[target_language]
+        dubbed_video_path = video_translation(video_path, target_language_codes, speaker_model, HF_TOKEN, DEEPL_TOKEN)
         limit_info = translation_limit()
         return limit_info, dubbed_video_path
     except Exception as e:
         print(f"An error occurred: {e}")
         raise e
 
+css = """
+.column-frame {
+    border: 2px solid #AAA;
+    border-radius: 10px;
+    padding: 10px;
+    margin: 10px;
+}
+"""
+
 initial_usage_info = translation_limit()
 
-with gr.Blocks(theme=gr.themes.Soft(), css=".column-frame {border: 2px solid #AAA;border-radius: 10px;padding: 10px;margin: 10px;}") as demo:
+with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
 
     gr.Markdown("<h1 style='text-align: center;'>🌐AI Video Translation</h2>")
-    gr.Markdown("<h3 style='text-align: center;'>Currently supported languages are: English, Polish, Ukrainian, and Russian</h3>")
 
     with gr.Row():
         with gr.Column(elem_classes=["column-frame"]):
@@ -337,9 +365,9 @@ with gr.Blocks(theme=gr.themes.Soft(), css=".column-frame {border: 2px solid #AA
             video = gr.Video(label="Upload a video file")
             gr.Markdown("<h3 style='text-align: center;'>OR</h3>")
             youtube_link = gr.Textbox(label="Paste YouTube link")
-            gr.Markdown("⚠️If you get a warning that the video is age restricted, manually download it using the following [link](https://en.savefrom.net/) and use file upload, as pytube library doesn't support restricted videos download.")
+            gr.Markdown("⚠️If you get a warning that the video is age restricted, manually download it using the following [link](https://downloaderto.com/) and use file upload, as pytube library doesn't support restricted videos download.")
             gr.Markdown("---")
-            target_language = gr.Dropdown(["en", "pl", "uk", "ru"], value="pl", label="Select translation target language")
+            target_language = gr.Dropdown(list(language_codes.keys()), value="English", label="Select translation target language")
             speaker_model = gr.Dropdown(["(Recommended) XTTS_V2", "VITs (will be default for Ukrainian)"], value="(Recommended) XTTS_V2", label="Select text-to-speech generation model")
             with gr.Row():
                 clear_btn = gr.Button("Clear inputs")
@@ -350,7 +378,13 @@ with gr.Blocks(theme=gr.themes.Soft(), css=".column-frame {border: 2px solid #AA
         with gr.Column():
             gr.Markdown("<h2 style='text-align: center;'>Translated Video</h3>")
             output_video = gr.Video(label="Translated video")
-
+            gr.Examples(
+                [[None, 'https://www.youtube.com/watch?v=q4kkQSkrrtI', 'Japanese', "(Recommended) XTTS_V2"]],
+                [video, youtube_link, target_language, speaker_model],
+                [translation_limit_info, output_video],
+                translate_video,
+                run_on_click=True,
+            )
     translate_btn.click(
         fn=translate_video,
         inputs=[video, youtube_link, target_language, speaker_model],
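
The substantive change in this commit is the move from hard-coded per-language maps to the language_codes triples: each UI language name now resolves to a (XTTS v2, fairseq/VITS, DeepL) code tuple that is threaded through translation, synthesis, and dubbing. Below is a minimal sketch of how one dropdown selection fans out to the three backends; the dict entries are copied from the diff, while run_pipeline is a hypothetical stand-in for the real video_translation chain and the prints stand in for the actual deepl/TTS calls.

    # Sketch of the new language-code flow. `language_codes` entries are
    # copied from the commit; `run_pipeline` is hypothetical.
    language_codes = {
        "English": ("en", "eng", "en-us"),
        "Japanese": ("ja", "jpn", "ja"),
        "Ukrainian": ("uk", "ukr", "uk"),
    }

    def run_pipeline(target_language, speaker_model):
        xtts2_code, vits_code, deepl_code = language_codes[target_language]

        # DeepL takes its own, sometimes region-qualified code (e.g. 'en-us'),
        # which is why the old en -> en-us language_map could be dropped.
        print(f"deepl target_lang = {deepl_code!r}")

        # XTTS v2 does not cover Ukrainian, hence the forced fallback to the
        # per-language fairseq VITS checkpoints, addressed by ISO 639-3 codes.
        if "vits" in speaker_model.lower() or xtts2_code == "uk":
            print(f"tts model = tts_models/{vits_code}/fairseq/vits")
        else:
            print(f"tts model = tts_models/multilingual/multi-dataset/xtts_v2, language={xtts2_code!r}")

    run_pipeline("Ukrainian", "(Recommended) XTTS_V2")  # forced onto VITS
    run_pipeline("Japanese", "(Recommended) XTTS_V2")   # stays on XTTS v2

Indexing the tuple by position ([0], [1], [2]), as the diff does, works but is easy to misread; a named tuple with xtts2/vits/deepl fields would make the three roles explicit.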