vitaliy-sharandin
committed on
Commit
•
2b2125a
1
Parent(s):
66a9871
Update app.py
Browse files
app.py
CHANGED
@@ -100,18 +100,12 @@ def speaker_voice_clips(transcription, audio_path):
|
|
100 |
|
101 |
# Perform text translation
|
102 |
def translate_transcript(transcript, target_language, deepl_token):
|
103 |
-
language_map = {
|
104 |
-
'en':'en-us',
|
105 |
-
'ru':'ru',
|
106 |
-
'uk':'uk',
|
107 |
-
'pl':'pl'}
|
108 |
-
|
109 |
translator = deepl.Translator(deepl_token)
|
110 |
|
111 |
translated_transcript = []
|
112 |
for segment in transcript:
|
113 |
text_to_translate = segment['text']
|
114 |
-
translated_text = translator.translate_text(text_to_translate, target_lang=
|
115 |
|
116 |
translated_segment = {
|
117 |
'start': segment['start'],
|
@@ -136,21 +130,16 @@ def adjust_voice_pace(sound_array, sample_rate, target_duration):
|
|
136 |
|
137 |
|
138 |
# Perform voice cloning
|
139 |
-
def voice_cloning_translation(translated_transcription, speakers_voice_clips,
|
140 |
device = "cuda"
|
141 |
-
|
142 |
-
|
143 |
-
'en':'eng',
|
144 |
-
'ru':'rus',
|
145 |
-
'uk':'ukr',
|
146 |
-
'pl':'pol'
|
147 |
-
}
|
148 |
|
149 |
# Select model
|
150 |
selected_model = None
|
151 |
|
152 |
-
if 'vits' in speaker_model.lower() or
|
153 |
-
selected_model = f'tts_models/{
|
154 |
else:
|
155 |
selected_model = 'tts_models/multilingual/multi-dataset/xtts_v2'
|
156 |
|
@@ -188,14 +177,14 @@ def voice_cloning_translation(translated_transcription, speakers_voice_clips, ta
|
|
188 |
audio = tts.tts(text=speech_item['text'], speaker_wav=speakers_voice_clips[speech_item['speaker']])
|
189 |
sample_rate = tts.synthesizer.output_sample_rate
|
190 |
else:
|
191 |
-
audio = tts.tts(text=speech_item['text'], speaker_wav=speakers_voice_clips[speech_item['speaker']], language=
|
192 |
sample_rate = tts.synthesizer.output_sample_rate
|
193 |
|
194 |
# Adjust pace to fit the speech timeframe if translated audio is longer than phrase
|
195 |
audio_duration = len(audio) / sample_rate
|
196 |
if speech_item_duration < audio_duration:
|
197 |
audio = adjust_voice_pace(audio, sample_rate, speech_item_duration)
|
198 |
-
|
199 |
# Resample to higher rate
|
200 |
new_sample_rate = 44100
|
201 |
audio = librosa.resample(np.array(audio), orig_sr=sample_rate, target_sr=new_sample_rate)
|
@@ -240,25 +229,50 @@ def dub_video(video_path, translated_audio_track, target_language):
|
|
240 |
|
241 |
|
242 |
# Perform video translation
|
243 |
-
def video_translation(video_path,
|
244 |
-
|
245 |
original_audio_path = extract_audio(video_path)
|
246 |
|
247 |
transcription = speech_diarization(original_audio_path, hf_token)
|
248 |
|
249 |
-
translated_transcription = translate_transcript(transcription,
|
250 |
|
251 |
speakers_voice_clips = speaker_voice_clips(transcription, original_audio_path)
|
252 |
|
253 |
-
translated_audio_track = voice_cloning_translation(translated_transcription, speakers_voice_clips,
|
254 |
|
255 |
-
video_with_dubbing = dub_video(video_path, translated_audio_track,
|
256 |
|
257 |
return video_with_dubbing
|
258 |
|
259 |
|
260 |
|
261 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
262 |
def download_youtube_video(url):
|
263 |
yt = YouTube(url)
|
264 |
if yt.age_restricted:
|
@@ -316,19 +330,33 @@ def translate_video(video_path, youtube_link, target_language, speaker_model):
|
|
316 |
if video_path is None:
|
317 |
gr.Warning("Video input did not process well, try again")
|
318 |
return translation_limit(), None
|
319 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
320 |
limit_info = translation_limit()
|
321 |
return limit_info, dubbed_video_path
|
322 |
except Exception as e:
|
323 |
print(f"An error occurred: {e}")
|
324 |
raise e
|
325 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
326 |
initial_usage_info = translation_limit()
|
327 |
|
328 |
-
with gr.Blocks(theme=gr.themes.Soft(), css=
|
329 |
|
330 |
gr.Markdown("<h1 style='text-align: center;'>🌐AI Video Translation</h2>")
|
331 |
-
gr.Markdown("<h3 style='text-align: center;'>Currently supported languages are: English, Polish, Ukrainian, and Russian</h3>")
|
332 |
|
333 |
with gr.Row():
|
334 |
with gr.Column(elem_classes=["column-frame"]):
|
@@ -337,9 +365,9 @@ with gr.Blocks(theme=gr.themes.Soft(), css=".column-frame {border: 2px solid #AA
|
|
337 |
video = gr.Video(label="Upload a video file")
|
338 |
gr.Markdown("<h3 style='text-align: center;'>OR</h3>")
|
339 |
youtube_link = gr.Textbox(label="Paste YouTube link")
|
340 |
-
gr.Markdown("⚠️If you get a warning that the video is age restricted, manually download it using the following [link](https://
|
341 |
gr.Markdown("---")
|
342 |
-
target_language = gr.Dropdown(
|
343 |
speaker_model = gr.Dropdown(["(Recommended) XTTS_V2", "VITs (will be default for Ukrainian)"], value="(Recommended) XTTS_V2", label="Select text-to-speech generation model")
|
344 |
with gr.Row():
|
345 |
clear_btn = gr.Button("Clear inputs")
|
@@ -350,7 +378,13 @@ with gr.Blocks(theme=gr.themes.Soft(), css=".column-frame {border: 2px solid #AA
|
|
350 |
with gr.Column():
|
351 |
gr.Markdown("<h2 style='text-align: center;'>Translated Video</h3>")
|
352 |
output_video = gr.Video(label="Translated video")
|
353 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
354 |
translate_btn.click(
|
355 |
fn=translate_video,
|
356 |
inputs=[video, youtube_link, target_language, speaker_model],
|
|
|
100 |
|
101 |
# Perform text translation
|
102 |
def translate_transcript(transcript, target_language, deepl_token):
|
|
|
|
|
|
|
|
|
|
|
|
|
103 |
translator = deepl.Translator(deepl_token)
|
104 |
|
105 |
translated_transcript = []
|
106 |
for segment in transcript:
|
107 |
text_to_translate = segment['text']
|
108 |
+
translated_text = translator.translate_text(text_to_translate, target_lang=target_language)
|
109 |
|
110 |
translated_segment = {
|
111 |
'start': segment['start'],
|
|
|
130 |
|
131 |
|
132 |
# Perform voice cloning
|
133 |
+
def voice_cloning_translation(translated_transcription, speakers_voice_clips, target_language_codes, speaker_model, audio_path):
|
134 |
device = "cuda"
|
135 |
+
xtts2_language_code = target_language_codes[0]
|
136 |
+
vits_language_code = target_language_codes[1]
|
|
|
|
|
|
|
|
|
|
|
137 |
|
138 |
# Select model
|
139 |
selected_model = None
|
140 |
|
141 |
+
if 'vits' in speaker_model.lower() or xtts2_language_code == 'uk':
|
142 |
+
selected_model = f'tts_models/{vits_language_code}/fairseq/vits'
|
143 |
else:
|
144 |
selected_model = 'tts_models/multilingual/multi-dataset/xtts_v2'
|
145 |
|
|
|
177 |
audio = tts.tts(text=speech_item['text'], speaker_wav=speakers_voice_clips[speech_item['speaker']])
|
178 |
sample_rate = tts.synthesizer.output_sample_rate
|
179 |
else:
|
180 |
+
audio = tts.tts(text=speech_item['text'], speaker_wav=speakers_voice_clips[speech_item['speaker']], language=xtts2_language_code)
|
181 |
sample_rate = tts.synthesizer.output_sample_rate
|
182 |
|
183 |
# Adjust pace to fit the speech timeframe if translated audio is longer than phrase
|
184 |
audio_duration = len(audio) / sample_rate
|
185 |
if speech_item_duration < audio_duration:
|
186 |
audio = adjust_voice_pace(audio, sample_rate, speech_item_duration)
|
187 |
+
|
188 |
# Resample to higher rate
|
189 |
new_sample_rate = 44100
|
190 |
audio = librosa.resample(np.array(audio), orig_sr=sample_rate, target_sr=new_sample_rate)
|
|
|
229 |
|
230 |
|
231 |
# Perform video translation
|
232 |
+
def video_translation(video_path, target_language_codes, speaker_model, hf_token, deepl_token):
    """Run the full dubbing pipeline on one video and return dub_video's result.

    Args:
        video_path: Video to translate; passed to extract_audio and dub_video.
        target_language_codes: Indexable sequence of language codes. Index 2 is
            handed to translate_transcript, index 0 to dub_video, and the whole
            sequence to voice_cloning_translation.
        speaker_model: Text-to-speech model selection, forwarded unchanged to
            voice_cloning_translation.
        hf_token: Token forwarded to speech_diarization.
        deepl_token: Token forwarded to translate_transcript.

    Returns:
        Whatever dub_video returns for the dubbed video.
    """
    # Extract the audio track and transcribe it.
    source_audio = extract_audio(video_path)
    source_transcript = speech_diarization(source_audio, hf_token)

    # Translate the transcript text into the target language.
    target_transcript = translate_transcript(source_transcript, target_language_codes[2], deepl_token)

    # Voice clips are built from the ORIGINAL transcript and audio.
    voice_samples = speaker_voice_clips(source_transcript, source_audio)

    # Synthesize the translated speech, then put it onto the video.
    dubbed_audio = voice_cloning_translation(
        target_transcript,
        voice_samples,
        target_language_codes,
        speaker_model,
        source_audio,
    )
    return dub_video(video_path, dubbed_audio, target_language_codes[0])
|
247 |
|
248 |
|
249 |
|
250 |
|
251 |
+
# Maps each language name offered in the UI to the codes the backends expect,
# always in this order:
#   [0] XTTS v2 language code
#   [1] language code used in the fairseq VITS model name
#   [2] DeepL target-language code
# DeepL does not accept the bare "en" / "pt" codes as a translation target and
# requires a regional variant, hence "en-us" and "pt-br".
language_codes = {
    "Chinese": ("zh-cn", "zho", "zh"),
    "Czech": ("cs", "ces", "cs"),
    "Dutch": ("nl", "nld", "nl"),
    "English": ("en", "eng", "en-us"),
    "French": ("fr", "fra", "fr"),
    "German": ("de", "deu", "de"),
    "Hungarian": ("hu", "hun", "hu"),
    "Italian": ("it", "ita", "it"),
    "Japanese": ("ja", "jpn", "ja"),
    "Korean": ("ko", "kor", "ko"),
    "Polish": ("pl", "pol", "pl"),
    "Portuguese": ("pt", "por", "pt-br"),
    "Russian": ("ru", "rus", "ru"),
    "Spanish": ("es", "spa", "es"),
    "Turkish": ("tr", "tur", "tr"),
    "Ukrainian": ("uk", "ukr", "uk"),
}
|
270 |
+
|
271 |
+
def check_video_duration(video_path, max_duration_seconds=180):
    """Return True when the video is LONGER than the allowed duration.

    Note the polarity: a True result means the video should be rejected.

    Args:
        video_path: Path of the video file to open with moviepy.
        max_duration_seconds: Longest accepted duration in seconds. Defaults
            to 180 (3 minutes), the limit used before this was a parameter.

    Returns:
        bool: True if the clip's duration exceeds max_duration_seconds.
    """
    # The context manager closes the clip once the duration has been read.
    with mp.VideoFileClip(video_path) as video:
        duration = video.duration
    return duration > max_duration_seconds
|
275 |
+
|
276 |
def download_youtube_video(url):
|
277 |
yt = YouTube(url)
|
278 |
if yt.age_restricted:
|
|
|
330 |
if video_path is None:
|
331 |
gr.Warning("Video input did not process well, try again")
|
332 |
return translation_limit(), None
|
333 |
+
|
334 |
+
if check_video_duration(video_path):
|
335 |
+
gr.Warning("Video is longer than 3 minutes, please provide a shorter one")
|
336 |
+
return translation_limit(), None
|
337 |
+
|
338 |
+
target_language_codes = language_codes[target_language]
|
339 |
+
dubbed_video_path = video_translation(video_path, target_language_codes, speaker_model, HF_TOKEN, DEEPL_TOKEN)
|
340 |
limit_info = translation_limit()
|
341 |
return limit_info, dubbed_video_path
|
342 |
except Exception as e:
|
343 |
print(f"An error occurred: {e}")
|
344 |
raise e
|
345 |
|
346 |
+
# Custom stylesheet handed to gr.Blocks(css=css); it defines the "column-frame"
# class (border, rounded corners, padding, margin) used via elem_classes.
css = """
|
347 |
+
.column-frame {
|
348 |
+
border: 2px solid #AAA;
|
349 |
+
border-radius: 10px;
|
350 |
+
padding: 10px;
|
351 |
+
margin: 10px;
|
352 |
+
}
|
353 |
+
"""
|
354 |
+
|
355 |
initial_usage_info = translation_limit()
|
356 |
|
357 |
+
with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
|
358 |
|
359 |
# Page title; the closing tag must match the opening <h1>.
gr.Markdown("<h1 style='text-align: center;'>🌐AI Video Translation</h1>")
|
|
|
360 |
|
361 |
with gr.Row():
|
362 |
with gr.Column(elem_classes=["column-frame"]):
|
|
|
365 |
video = gr.Video(label="Upload a video file")
|
366 |
gr.Markdown("<h3 style='text-align: center;'>OR</h3>")
|
367 |
youtube_link = gr.Textbox(label="Paste YouTube link")
|
368 |
+
gr.Markdown("⚠️If you get a warning that the video is age restricted, manually download it using the following [link](https://downloaderto.com/) and use file upload, as pytube library doesn't support restricted videos download.")
|
369 |
gr.Markdown("---")
|
370 |
+
target_language = gr.Dropdown(list(language_codes.keys()), value="English", label="Select translation target language")
|
371 |
speaker_model = gr.Dropdown(["(Recommended) XTTS_V2", "VITs (will be default for Ukrainian)"], value="(Recommended) XTTS_V2", label="Select text-to-speech generation model")
|
372 |
with gr.Row():
|
373 |
clear_btn = gr.Button("Clear inputs")
|
|
|
378 |
with gr.Column():
|
379 |
# Output column heading; the closing tag must match the opening <h2>.
gr.Markdown("<h2 style='text-align: center;'>Translated Video</h2>")
|
380 |
output_video = gr.Video(label="Translated video")
|
381 |
+
gr.Examples(
|
382 |
+
[[None, 'https://www.youtube.com/watch?v=q4kkQSkrrtI', 'Japanese', "(Recommended) XTTS_V2"]],
|
383 |
+
[video, youtube_link, target_language, speaker_model],
|
384 |
+
[translation_limit_info, output_video],
|
385 |
+
translate_video,
|
386 |
+
run_on_click=True,
|
387 |
+
)
|
388 |
translate_btn.click(
|
389 |
fn=translate_video,
|
390 |
inputs=[video, youtube_link, target_language, speaker_model],
|