import subprocess

import gradio as gr
import whisper
from moviepy.editor import VideoFileClip, AudioFileClip
from transformers import pipeline
from TTS.api import TTS  # Coqui TTS library


# 1. Download Audio and Video from YouTube
def download_audio_video(
    youtube_url,
    audio_path="audio.mp3",
    video_path="input_video.mp4",
):
    """Download the audio (as MP3) and the video (as MP4) of a YouTube URL.

    Runs ``yt-dlp`` twice: once to extract the audio track and once to
    fetch the video.

    Args:
        youtube_url: URL handed to ``yt-dlp``.
        audio_path: Output path given to ``yt-dlp`` for the MP3 audio.
        video_path: Output path given to ``yt-dlp`` for the MP4 video.

    Raises:
        subprocess.CalledProcessError: If either ``yt-dlp`` invocation exits
            with a non-zero status.
    """
    # Download audio as MP3.
    # The bare "--" ends option parsing so that a user-supplied URL starting
    # with "-" cannot be interpreted as a yt-dlp option.
    audio_command = [
        "yt-dlp",
        "--extract-audio",
        "--audio-format", "mp3",
        "--output", audio_path,
        "--",
        youtube_url,
    ]
    # check=True: fail loudly here instead of letting later steps run on a
    # missing (or stale, from a previous run) file.
    subprocess.run(audio_command, check=True)

    # Download video as MP4
    video_command = [
        "yt-dlp",
        "--format", "mp4",
        "--output", video_path,
        "--",
        youtube_url,
    ]
    subprocess.run(video_command, check=True)


# 2. Transcribe Audio Using Whisper
def transcribe_audio(audio_path, model_name="base"):
    """Transcribe an audio file with Whisper and return the recognized text.

    Args:
        audio_path: Path of the audio file passed to ``model.transcribe``.
        model_name: Name of the Whisper model to load.

    Returns:
        The ``'text'`` entry of the Whisper transcription result.
    """
    model = whisper.load_model(model_name)
    result = model.transcribe(audio_path)
    transcription = result['text']
    print(f"Whisper Transcription:\n{transcription}\n")  # Print the transcription
    return transcription


# 3. Split Text for Translation
def split_text(text, max_length=400):
    """Split text into chunks of at most ``max_length`` characters.

    The text is split on whitespace and words are re-joined with single
    spaces; a word is never cut in half. A single word that is longer than
    ``max_length`` therefore becomes a chunk of its own that exceeds the
    limit.

    Args:
        text: The text to split.
        max_length: Maximum number of characters per chunk.

    Returns:
        A list of non-empty chunk strings (empty list for blank input).
    """
    chunks = []
    current_chunk = []
    current_length = 0  # always equals len(" ".join(current_chunk))

    for word in text.split():
        # Joining adds one separating space unless the chunk is still empty.
        candidate_length = current_length + len(word) + (1 if current_chunk else 0)
        # Only flush a non-empty chunk: an over-long first word must not
        # produce an empty "" chunk.
        if current_chunk and candidate_length > max_length:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
            current_length = len(word)
        else:
            current_chunk.append(word)
            current_length = candidate_length

    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks


# 4. Translate Text to Turkish
def translate_to_turkish(
    text,
    model_name="Helsinki-NLP/opus-mt-tc-big-en-tr",
    max_length=400,
):
    """Translate English text to Turkish, chunk by chunk.

    Args:
        text: The text to translate.
        model_name: Model identifier passed to the ``transformers``
            translation pipeline.
        max_length: Used both as the per-chunk character limit for
            ``split_text`` and as the ``max_length`` argument of each
            translator call.

    Returns:
        The translated chunks joined with single spaces (an empty string
        when ``text`` contains no words).
    """
    translator = pipeline("translation", model=model_name)

    # Split the text into manageable chunks
    text_chunks = split_text(text, max_length=max_length)
    translated_chunks = [
        translator(chunk, max_length=max_length)[0]['translation_text']
        for chunk in text_chunks
    ]

    full_translation = " ".join(translated_chunks)
    print(f"Translated Text (English to Turkish):\n{full_translation}\n")  # Print the translated text
    return full_translation


# 5.
# Synthesize Turkish Audio Using Coqui TTS
def synthesize_audio(
    text,
    model_name="tts_models/multilingual/multi-dataset/xtts_v2",
    output_path="output.wav",
    speaker_wav="audio.mp3",
):
    """Synthesize Turkish speech for ``text`` with Coqui TTS.

    Args:
        text: The text to speak.
        model_name: Coqui TTS model name.
        output_path: Path of the audio file to write.
        speaker_wav: Reference speaker recording passed to ``tts_to_file``.
            Defaults to ``"audio.mp3"``, the value that used to be
            hard-coded here.

    Returns:
        ``output_path``.
    """
    # Initialize the TTS model
    tts = TTS(model_name=model_name, gpu=False)  # Use CPU only

    # Generate and save the audio
    tts.tts_to_file(
        text=text,
        file_path=output_path,
        speaker_wav=speaker_wav,
        language="tr",
    )
    return output_path


# 6. Replace Audio in Downloaded Video
def replace_audio(video_path, new_audio_path, output_path="translated_video.mp4"):
    """Write a copy of a video whose audio track is replaced.

    Args:
        video_path: Path of the source video.
        new_audio_path: Path of the audio file to use as the new track.
        output_path: Path of the video file to write (libx264 / aac).

    Returns:
        ``output_path``.
    """
    video = VideoFileClip(video_path)
    try:
        audio = AudioFileClip(new_audio_path)
        try:
            final_video = video.set_audio(audio)
            final_video.write_videofile(output_path, codec="libx264", audio_codec="aac")
        finally:
            # Close the clips even if writing fails, so they are not left open.
            audio.close()
    finally:
        video.close()
    return output_path


# Gradio Interface
def translate_pipeline(youtube_url):
    """Run the full pipeline for one YouTube URL and return the result path.

    Steps: download audio and video, transcribe the audio, translate the
    transcription to Turkish, synthesize Turkish speech, and put that speech
    onto the downloaded video.

    Args:
        youtube_url: The URL entered in the Gradio textbox.

    Returns:
        Path of the translated video file.
    """
    # Define file paths
    audio_path = "audio.mp3"
    video_path = "input_video.mp4"

    # Step 1: Download audio and video
    download_audio_video(youtube_url, audio_path, video_path)

    # Step 2: Transcribe audio
    english_text = transcribe_audio(audio_path)

    # Step 3: Translate to Turkish
    turkish_text = translate_to_turkish(english_text)

    # Step 4: Synthesize Turkish audio, using the downloaded audio as the
    # speaker reference
    synthesized_audio = synthesize_audio(turkish_text, speaker_wav=audio_path)

    # Step 5: Replace audio in the downloaded video
    translated_video = replace_audio(video_path, synthesized_audio, "translated_video.mp4")

    return translated_video


# Define Gradio interface
iface = gr.Interface(
    fn=translate_pipeline,
    inputs=gr.Textbox(label="YouTube URL"),
    outputs=gr.Video(label="Translated Video"),
)

iface.launch(debug=True)