import os
import requests
import gradio as gr
import moviepy.editor as mp
from TTS.api import TTS
import torch
import assemblyai as aai

os.environ["COQUI_TOS_AGREED"] = "1"

# Model weights to download if not already present.
model_files = {
    "wav2lip.pth": "https://github.com/justinjohn0306/Wav2Lip/releases/download/models/wav2lip.pth",
    "wav2lip_gan.pth": "https://github.com/justinjohn0306/Wav2Lip/releases/download/models/wav2lip_gan.pth",
    "resnet50.pth": "https://github.com/justinjohn0306/Wav2Lip/releases/download/models/resnet50.pth",
    "mobilenet.pth": "https://github.com/justinjohn0306/Wav2Lip/releases/download/models/mobilenet.pth",
    "s3fd.pth": "https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth"
}

# Use the GPU when available; fall back to CPU otherwise.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Initialize the XTTS v2 multilingual voice-cloning TTS model.
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)

# Download any missing model weights. The Wav2Lip lip-sync checkpoints go to
# checkpoints/ and the face-detector weights to face_detection/. (The original
# condition checked for "pth" in the filename, which matched every file and
# sent everything to checkpoints/; routing on "wav2lip" restores the intended
# split.)
for filename, url in model_files.items():
    target_dir = "checkpoints" if "wav2lip" in filename else "face_detection"
    os.makedirs(target_dir, exist_ok=True)
    file_path = os.path.join(target_dir, filename)
    if not os.path.exists(file_path):
        print(f"Downloading {filename}...")
        r = requests.get(url)
        r.raise_for_status()
        with open(file_path, "wb") as f:
            f.write(r.content)

# Languages offered in the UI, mapped to their ISO 639-1 codes.
LANGUAGE_CODES = {"English": "en", "German": "de", "Italian": "it", "Spanish": "es"}


# Translation pipeline: extract audio, transcribe, translate, resynthesize,
# then lip-sync the original video to the new audio.
class Translation:
    def __init__(self, video_path, original_language, target_language):
        self.video_path = video_path
        self.original_language = original_language
        self.target_language = target_language

    def org_language_parameters(self, original_language):
        self.lan_code = LANGUAGE_CODES.get(original_language, "")

    def target_language_parameters(self, target_language):
        self.tran_code = LANGUAGE_CODES.get(target_language, "")

    def extract_audio(self):
        # Pull the audio track out of the input video.
        video = mp.VideoFileClip(self.video_path)
        audio_path = "output_audio.wav"
        video.audio.write_audiofile(audio_path)
        return audio_path

    def transcribe_audio(self, audio_path):
        # Transcribe the extracted audio with AssemblyAI.
        aai.settings.api_key = os.getenv("ASSEMBLYAI_API_KEY")
        config = aai.TranscriptionConfig(language_code=self.lan_code)
        transcriber = aai.Transcriber(config=config)
        transcript = transcriber.transcribe(audio_path)
        return transcript.text

    def translate_text(self, transcript_text):
        # Translate the transcript with the Microsoft Translator REST API.
        base_url = "https://api.cognitive.microsofttranslator.com/translate"
        headers = {
            "Ocp-Apim-Subscription-Key": os.getenv("MICROSOFT_TRANSLATOR_API_KEY"),
            "Content-Type": "application/json",
            "Ocp-Apim-Subscription-Region": "southeastasia"
        }
        params = {"api-version": "3.0", "from": self.lan_code, "to": self.tran_code}
        body = [{"text": transcript_text}]
        response = requests.post(base_url, headers=headers, params=params, json=body)
        response.raise_for_status()
        return response.json()[0]["translations"][0]["text"]

    def generate_audio(self, translated_text):
        # Synthesize the translated text in the original speaker's voice,
        # using the extracted audio as the voice-cloning reference.
        tts.tts_to_file(
            text=translated_text,
            speaker_wav="output_audio.wav",
            file_path="output_synth.wav",
            language=self.tran_code,
        )
        return "output_synth.wav"

    def translate_video(self):
        audio_path = self.extract_audio()
        self.org_language_parameters(self.original_language)
        self.target_language_parameters(self.target_language)
        transcript_text = self.transcribe_audio(audio_path)
        translated_text = self.translate_text(transcript_text)
        translated_audio_path = self.generate_audio(translated_text)
        # Run Wav2Lip inference to lip-sync the video to the translated audio.
        # Paths are quoted so filenames with spaces don't break the command.
        inference_script_path = "inference.py"  # Update this to the actual location of inference.py
        os.system(
            f"python {inference_script_path} --checkpoint_path 'checkpoints/wav2lip_gan.pth' "
            f"--face '{self.video_path}' --audio '{translated_audio_path}' "
            f"--outfile 'output_video.mp4'"
        )
        return "output_video.mp4"


# Gradio interface
def app(video_path, original_language, target_language):
    translator = Translation(video_path, original_language, target_language)
    return translator.translate_video()


interface = gr.Interface(
    fn=app,
    inputs=[
        gr.Video(label="Video Path"),
        gr.Dropdown(["English", "German", "Italian", "Spanish"], label="Original Language"),
        gr.Dropdown(["English", "German", "Italian", "Spanish"], label="Target Language"),
    ],
    outputs=gr.Video(label="Translated Video"),
)

interface.launch()
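
# Usage sketch (an assumption, not part of the original script): the app
# expects to run from inside a Wav2Lip checkout so that inference.py is
# resolvable, with both API keys exported first. The filename app.py below
# is hypothetical.
#
#   export ASSEMBLYAI_API_KEY="<your AssemblyAI key>"
#   export MICROSOFT_TRANSLATOR_API_KEY="<your Azure Translator key>"
#   python app.py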
file_path="output_synth.wav", language=self.tran_code) # return "output_synth.wav" # def translate_video(self): # audio_path = self.extract_audio() # self.org_language_parameters(self.original_language) # self.target_language_parameters(self.target_language) # transcript_text = self.transcribe_audio(audio_path) # translated_text = self.translate_text(transcript_text) # translated_audio_path = self.generate_audio(translated_text) # # Run Wav2Lip inference # os.system(f"python inference.py --checkpoint_path 'checkpoints/wav2lip_gan.pth' --face {self.video_path} --audio {translated_audio_path} --outfile 'output_video.mp4'") # return 'output_video.mp4' # # Gradio Interface # def app(video_path, original_language, target_language): # translator = translation(video_path, original_language, target_language) # video_file = translator.translate_video() # return video_file # interface = gr.Interface( # fn=app, # inputs=[ # gr.Video(label="Video Path"), # gr.Dropdown(["English", "German", "Italian", "Spanish"], label="Original Language"), # gr.Dropdown(["English", "German", "Italian", "Spanish"], label="Targeted Language"), # ], # outputs=gr.Video(label="Translated Video") # ) # interface.launch()