vitaliy-sharandin committed
Commit e0f5494 • 1 Parent(s): b6f9245
Update app.py
app.py CHANGED
@@ -1,37 +1,288 @@
import os
import gradio as gr
import whisperx
import numpy as np
import moviepy.editor as mp
from moviepy.audio.AudioClip import AudioArrayClip
from pytube import YouTube
import deepl
import torch
import pyrubberband as pyrb
import soundfile as sf
import librosa
from TTS.api import TTS

os.environ["COQUI_TOS_AGREED"] = "1"
HF_TOKEN = os.environ["HF_TOKEN"]
DEEPL_TOKEN = os.environ["DEEPL_TOKEN"]

# Download video from YouTube
def download_youtube_video(url):
    yt = YouTube(url)
    stream = yt.streams.filter(file_extension='mp4').first()
    output_path = stream.download()
    return output_path


# Extract audio from video
def extract_audio(video_path):
    clip = mp.VideoFileClip(video_path)
    audio_path = os.path.splitext(video_path)[0] + ".wav"
    clip.audio.write_audiofile(audio_path)
    return audio_path

# Perform speech diarization
def speech_diarization(audio_path, hf_token):
    device = "cuda"
    batch_size = 16
    compute_type = "float16"
    model = whisperx.load_model("large-v2", device, compute_type=compute_type)

    # 1. Transcribe audio
    audio = whisperx.load_audio(audio_path)
    result = model.transcribe(audio, batch_size=batch_size)

    # delete model if low on GPU resources
    import gc; del model; gc.collect(); torch.cuda.empty_cache()

    # 2. Align whisper output
    model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
    result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)

    # delete model if low on GPU resources
    import gc; del model_a; gc.collect(); torch.cuda.empty_cache()

    # 3. Assign speaker labels
    diarize_model = whisperx.DiarizationPipeline(model_name='pyannote/[email protected]', use_auth_token=hf_token, device=device)

    # add min/max number of speakers if known
    diarize_segments = diarize_model(audio)
    # diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)

    result = whisperx.assign_word_speakers(diarize_segments, result)
    print(f'\n[Original transcript]:\n{result["segments"]}\n')

    return result["segments"]

# Create per speaker voice clips for tts voice cloning
def speaker_voice_clips(transcription, audio_path):
    # Create 3 uninterrupted per speaker timecodes
    snippets_timecodes = {}
    for segment in transcription:
        speaker = segment['speaker']

        if speaker not in snippets_timecodes:
            snippets_timecodes[speaker] = []

        if len(snippets_timecodes[speaker]) < 3:
            snippet = {
                'start': segment['start'],
                'end': segment['end']
            }
            snippets_timecodes[speaker].append(snippet)

    # Cut voice clips and stitch them together
    original_audio = mp.AudioFileClip(audio_path)
    audio_file_directory = os.path.dirname(audio_path)

    voice_clips = {}
    for speaker, speaker_snippets in snippets_timecodes.items():
        subclips = []
        for snippet in speaker_snippets:
            start, end = snippet['start'], snippet['end']
            subclip = original_audio.subclip(start, end)
            subclips.append(subclip)

        concatenated_clip = mp.concatenate_audioclips(subclips)

        output_filename = os.path.join(audio_file_directory, f"{speaker}_voice_clips.wav")
        concatenated_clip.write_audiofile(output_filename)
        voice_clips[speaker] = output_filename

    return voice_clips

# Perform text translation
def translate_transcript(transcript, target_language, deepl_token):
    translator = deepl.Translator(deepl_token)

    translated_transcript = []
    for segment in transcript:
        text_to_translate = segment['text']
        translated_text = translator.translate_text(text_to_translate, target_lang=target_language)

        translated_segment = {
            'start': segment['start'],
            'end': segment['end'],
            'text': translated_text.text,
            'speaker': segment['speaker']
        }

        translated_transcript.append(translated_segment)

    print(f'\n[Translated transcript]:\n{translated_transcript}\n')

    return translated_transcript

# Adjust voice pace
def adjust_voice_pace(sound_array, sample_rate, target_duration):
    duration = len(sound_array) / sample_rate
    tempo_change = duration / target_duration
    sound_array_stretched = pyrb.time_stretch(sound_array, sample_rate, tempo_change)
    return sound_array_stretched

# Perform voice cloning
def voice_cloning_translation(translated_transcription, speakers_voice_clips, target_language, speaker_model, audio_path):
    device = "cuda"

    vits_language_map = {
        'en': 'eng',
        'ru': 'rus',
        'uk': 'ukr',
        'pl': 'pol'
    }

    # Select model
    selected_model = None
    if 'vits' in speaker_model.lower() or target_language == 'uk':
        selected_model = f'tts_models/{vits_language_map[target_language]}/fairseq/vits'
    else:
        selected_model = 'tts_models/multilingual/multi-dataset/xtts_v2'

    print(selected_model)

    tts = None
    final_audio_track = None

    try:
        # TODO uncomment when https://github.com/coqui-ai/TTS/issues/3224 is resolved
        # tts = TTS(selected_model).to(device)

        # Generate and concatenate voice clips per speaker
        last_end_time = 0
        clips = []

        # Generate sentences
        for speech_item in translated_transcription:

            speech_item_duration = speech_item['end'] - speech_item['start']

            # Silence
            gap_duration = speech_item['start'] - last_end_time
            if gap_duration > 0:
                silent_audio = np.zeros((int(44100 * gap_duration), 2))
                silent_clip = AudioArrayClip(silent_audio, fps=44100)
                clips.append(silent_clip)
                print(f"\nAdded silence: Start={last_end_time}, Duration={gap_duration}")

            # Generate speech
            print(f"[{speech_item['speaker']}]")
            tts = TTS(selected_model).to(device)
            audio = tts.tts_with_vc(text=speech_item['text'], speaker_wav=speakers_voice_clips[speech_item['speaker']], language=target_language)
            sample_rate = tts.voice_converter.vc_config.audio.output_sample_rate

            # Adjust pace to fit the speech timeframe if translated audio is longer than phrase
            audio_duration = len(audio) / sample_rate
            if speech_item_duration < audio_duration:
                audio = adjust_voice_pace(audio, sample_rate, speech_item_duration)

            # Resample to higher rate
            new_sample_rate = 44100
            audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=new_sample_rate)

            # Transform to AudioArrayClip object
            audio = np.expand_dims(audio, axis=1)
            audio_stereo = np.repeat(audio, 2, axis=1)
            audio_clip = AudioArrayClip(audio_stereo, fps=44100)

            # Cut out possible glitch from AudioArrayClip end
            audio_clip = audio_clip.subclip(0, audio_clip.duration - 0.2)
            clips.append(audio_clip)
            print(f"Added speech: Start={speech_item['start']}, Final duration={audio_clip.duration}, Original duration={speech_item_duration}")

            last_end_time = speech_item['start'] + audio_clip.duration

            del tts; import gc; gc.collect(); torch.cuda.empty_cache()

        # Merge sentences
        final_audio_track = mp.concatenate_audioclips(clips)

        audio_files_directory = os.path.dirname(audio_path)
        final_audio_track.write_audiofile(os.path.join(audio_files_directory, "translated_voice_track.wav"), fps=44100)

    except Exception as e:
        if tts is not None:
            del tts; import gc; gc.collect(); torch.cuda.empty_cache()
        raise e

    return final_audio_track

def dub_video(video_path, translated_audio_track, target_language):
    video = mp.VideoFileClip(video_path)
    video = video.subclip(0, translated_audio_track.duration)
    original_audio = video.audio.volumex(0.2)
    dubbed_audio = mp.CompositeAudioClip([original_audio, translated_audio_track.set_start(0)])
    video_with_dubbing = video.set_audio(dubbed_audio)

    video_with_dubbing_path = os.path.splitext(video_path)[0] + "_" + target_language + ".mp4"
    video_with_dubbing.write_videofile(video_with_dubbing_path)

    return video_with_dubbing_path

# Perform video translation
def video_translation(video_path, target_language, speaker_model, hf_token, deepl_token):

    original_audio_path = extract_audio(video_path)

    transcription = speech_diarization(original_audio_path, hf_token)

    translated_transcription = translate_transcript(transcription, target_language, deepl_token)

    speakers_voice_clips = speaker_voice_clips(transcription, original_audio_path)

    translated_audio_track = voice_cloning_translation(translated_transcription, speakers_voice_clips, target_language, speaker_model, original_audio_path)

    video_with_dubbing = dub_video(video_path, translated_audio_track, target_language)

    return video_with_dubbing

def translate_video(_, video_path, __, youtube_link, ___, target_language, speaker_model):
    try:
        if not video_path and not youtube_link:
            gr.Warning("You should either upload a video or input a YouTube link")
            return None
        if youtube_link:
            video_path = download_youtube_video(youtube_link)
        dubbed_video = video_translation(video_path, target_language, speaker_model, HF_TOKEN, DEEPL_TOKEN)
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
    return gr.components.Video(dubbed_video)

inputs = [
    gr.Markdown("## Currently supported languages are: English, Polish, Ukrainian and Russian"),
    gr.Video(label="Upload a video file"),
    gr.Markdown("**OR**"),
    gr.Textbox(label="Paste YouTube link"),
    gr.Markdown("---"),
    gr.Dropdown(["en", "pl", "uk", "ru"], value="pl", label="Select translation target language"),
    gr.Dropdown(["(Recommended) XTTS_V2", "VITs (will be default for Ukrainian)"], value="(Recommended) XTTS_V2", label="Select text-to-speech generation model")
]

outputs = gr.Video(label="Translated video")

gr.Interface(fn=translate_video,
             inputs=inputs,
             outputs=outputs,
             title="🌐AI Video Translation",
             theme=gr.themes.Base()
             ).launch(show_error=True, debug=True)
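
For reference, a minimal usage sketch of the pipeline introduced in this commit, calling `video_translation` directly instead of going through the Gradio UI. This is not part of the commit: it assumes the functions above are available in the current module, that `HF_TOKEN` and `DEEPL_TOKEN` are set in the environment (as app.py expects), that a CUDA GPU is available, and that `sample_video.mp4` is a hypothetical local input file.

```python
# Hypothetical usage sketch (not part of the commit): run the dubbing pipeline
# end to end on a local file, bypassing the Gradio interface defined above.
import os

hf_token = os.environ["HF_TOKEN"]        # Hugging Face token, assumed to be set
deepl_token = os.environ["DEEPL_TOKEN"]  # DeepL API token, assumed to be set

dubbed_video_path = video_translation(
    "sample_video.mp4",         # hypothetical input video
    "pl",                       # one of the supported target languages: en, pl, uk, ru
    "(Recommended) XTTS_V2",    # speaker_model string, matching the UI dropdown option
    hf_token,
    deepl_token,
)
print(dubbed_video_path)  # e.g. sample_video_pl.mp4
```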