"""Transcribe a local video or a YouTube URL with Whisper, optionally
translate the transcript with M2M100, and export it as SRT, Word, PDF,
or PowerPoint.  A Gradio UI (see ``main``) drives the pipeline.
"""

import os
import subprocess
import time

import arabic_reshaper
import gradio as gr
import whisper
import yt_dlp
from bidi.algorithm import get_display
from docx import Document
from pptx import Presentation
from reportlab.lib.pagesizes import A4
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.pdfgen import canvas
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
from webdriver_manager.chrome import ChromeDriverManager

# Whisper ASR model -- "tiny" trades accuracy for transcription speed.
model = whisper.load_model("tiny")

# M2M100 target-language codes this app offers (source is always English).
SUPPORTED_TARGET_LANGUAGES = {"fa", "es", "fr", "de", "it", "pt"}


def load_translation_model(target_language):
    """Load M2M100 configured to translate English into *target_language*.

    Returns a ``(tokenizer, model)`` pair.
    Raises ValueError if the language is not supported.
    """
    if target_language not in SUPPORTED_TARGET_LANGUAGES:
        raise ValueError(f"Translation model for {target_language} not supported")
    tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
    translation_model = M2M100ForConditionalGeneration.from_pretrained(
        "facebook/m2m100_418M"
    )
    tokenizer.src_lang = "en"
    tokenizer.tgt_lang = target_language
    return tokenizer, translation_model


def translate_text(text, tokenizer, model):
    """Translate *text* with a loaded M2M100 pair; wrap failures in RuntimeError."""
    try:
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        # forced_bos_token_id steers generation toward tokenizer.tgt_lang.
        translated = model.generate(
            **inputs,
            forced_bos_token_id=tokenizer.get_lang_id(tokenizer.tgt_lang),
        )
        return tokenizer.decode(translated[0], skip_special_tokens=True)
    except Exception as e:
        raise RuntimeError(f"Error during translation: {e}") from e


def format_timestamp(seconds):
    """Format a float second offset as an SRT timestamp ``HH:MM:SS,mmm``."""
    milliseconds = int((seconds % 1) * 1000)
    seconds = int(seconds)
    hours = seconds // 3600
    minutes = (seconds % 3600) // 60
    seconds = seconds % 60
    return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"


def write_srt(transcription, output_file, tokenizer=None, translation_model=None):
    """Write Whisper segments to *output_file* in SRT format.

    If *translation_model* is given, each segment is translated first.
    """
    # UTF-8 is required: translated text (e.g. Persian) is non-ASCII.
    with open(output_file, "w", encoding="utf-8") as f:
        for i, segment in enumerate(transcription["segments"]):
            text = segment["text"]
            if translation_model:
                text = translate_text(text, tokenizer, translation_model)
            start_time = format_timestamp(segment["start"])
            end_time = format_timestamp(segment["end"])
            f.write(f"{i + 1}\n")
            f.write(f"{start_time} --> {end_time}\n")
            f.write(f"{text.strip()}\n\n")


def embed_hardsub_in_video(video_file, srt_file, output_video):
    """Burn the subtitles in *srt_file* into *video_file* using ffmpeg.

    Raises RuntimeError on ffmpeg failure or timeout.
    """
    # Argument list (shell=False): no quoting pitfalls for paths with
    # spaces/quotes; -y avoids ffmpeg blocking on an overwrite prompt.
    command = [
        "ffmpeg", "-y",
        "-i", video_file,
        "-vf", f"subtitles={srt_file}",
        "-c:v", "libx264", "-crf", "23", "-preset", "medium",
        output_video,
    ]
    try:
        process = subprocess.run(command, capture_output=True, text=True, timeout=300)
    except subprocess.TimeoutExpired as e:
        raise RuntimeError("ffmpeg process timed out.") from e
    except OSError as e:
        raise RuntimeError(f"Error running ffmpeg: {e}") from e
    if process.returncode != 0:
        raise RuntimeError(f"ffmpeg error: {process.stderr}")


def write_word(transcription, output_file, tokenizer=None, translation_model=None,
               target_language=None):
    """Write (optionally translated) segments to a .docx file.

    Persian output is marked right-to-left at the paragraph level.
    """
    doc = Document()
    rtl = target_language == "fa"
    for i, segment in enumerate(transcription["segments"]):
        text = segment["text"]
        if translation_model:
            text = translate_text(text, tokenizer, translation_model)
        para = doc.add_paragraph(f"{i + 1}. {text.strip()}")
        if rtl:
            para.paragraph_format.right_to_left = True
    doc.save(output_file)


def reverse_text_for_rtl(text):
    """Reverse the characters of each word (legacy naive RTL shaping helper)."""
    return " ".join(word[::-1] for word in text.split())


def _register_pdf_font(font_name, font_path):
    """Register a TrueType font with reportlab, failing loudly if missing."""
    if not os.path.exists(font_path):
        raise FileNotFoundError(
            f"{font_name} font file not found at {font_path}. "
            "Please ensure it is available."
        )
    try:
        pdfmetrics.registerFont(TTFont(font_name, font_path))
    except Exception as e:
        raise RuntimeError(f"Error registering {font_name} font: {e}.") from e


def write_pdf(transcription, output_file, tokenizer=None, translation_model=None):
    """Render (optionally translated) segments into a PDF.

    RTL targets (fa/ar) are reshaped with arabic_reshaper + python-bidi and
    drawn right-aligned in B-Nazanin; everything else is drawn in Arial.
    Returns *output_file*.
    """
    c = canvas.Canvas(output_file, pagesize=A4)
    app_dir = os.path.dirname(os.path.abspath(__file__))
    _register_pdf_font("B-Nazanin", os.path.join(app_dir, "B-NAZANIN.TTF"))
    _register_pdf_font("Arial", os.path.join(app_dir, "Arial.ttf"))

    target_language = tokenizer.tgt_lang if translation_model else None
    is_rtl = target_language in ("fa", "ar")

    y_position = A4[1] - 50
    line_height = 20
    for i, segment in enumerate(transcription["segments"]):
        text = segment["text"]
        if translation_model:
            text = translate_text(text, tokenizer, translation_model)
        line = f"{i + 1}. {text.strip()}"

        # Page-break BEFORE drawing so no line lands below the margin.
        if y_position < 50:
            c.showPage()
            y_position = A4[1] - 50

        if is_rtl:
            # Shape Arabic-script glyphs and apply the bidi algorithm so the
            # text renders connected and right-to-left.
            bidi_text = get_display(arabic_reshaper.reshape(line))
            c.setFont("B-Nazanin", 12)
            c.drawRightString(A4[0] - 50, y_position, bidi_text)
        else:
            c.setFont("Arial", 12)
            c.drawString(50, y_position, line)
        y_position -= line_height
    c.save()
    return output_file


def write_ppt(transcription, output_file, tokenizer=None, translation_model=None):
    """Write (optionally translated) segments to a .pptx, ~400 chars per slide."""
    ppt = Presentation()
    slide = ppt.slides.add_slide(ppt.slide_layouts[5])
    text_buffer = ""
    max_chars_per_slide = 400
    for i, segment in enumerate(transcription["segments"]):
        text = segment["text"]
        if translation_model:
            text = translate_text(text, tokenizer, translation_model)
        line = f"{i + 1}. {text.strip()}\n"
        if len(text_buffer) + len(line) > max_chars_per_slide:
            # Flush the current buffer to the current slide, start a new one.
            slide.shapes.title.text = "Transcription"
            textbox = slide.shapes.add_textbox(
                left=0, top=0, width=ppt.slide_width, height=ppt.slide_height
            )
            textbox.text = text_buffer.strip()
            slide = ppt.slides.add_slide(ppt.slide_layouts[5])
            text_buffer = line
        else:
            text_buffer += line
    if text_buffer:
        slide.shapes.title.text = ""
        textbox = slide.shapes.add_textbox(
            left=0, top=0, width=ppt.slide_width, height=ppt.slide_height
        )
        textbox.text = text_buffer.strip()
    ppt.save(output_file)


def download_from_ssyoutube(modified_url):
    """Legacy browser-driven download via an ss-youtube mirror (Selenium).

    NOTE(review): this saves to the browser's default download directory and
    returns only a status message, not a file path -- unusable as input to
    transcription.  Kept for reference; ``download_youtube_video`` uses
    yt_dlp instead.
    """
    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
    driver.get(modified_url)
    try:
        WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.PARTIAL_LINK_TEXT, "Low quality"))
        ).click()
        WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.PARTIAL_LINK_TEXT, "Download"))
        ).click()
        time.sleep(10)  # crude wait for the browser download to finish
        driver.quit()
        return "Video downloaded successfully!"
    except Exception as e:
        driver.quit()
        raise RuntimeError(f"Failed to download video: {e}") from e


def modify_youtube_url(url):
    """Prefix the youtube host with 'ss' to form an ss-youtube mirror URL."""
    youtube_pos = url.find("youtube")
    if youtube_pos == -1:
        raise ValueError("Invalid YouTube URL.")
    return "https://ss" + url[youtube_pos:]


def download_youtube_video(url):
    """Download *url* with yt_dlp and return the local video file path.

    Fix: the previous Selenium flow returned a status string, which the
    caller then treated as a file path.  yt_dlp gives us the real path.
    """
    try:
        ydl_opts = {
            # Prefer an mp4 container so ffmpeg/whisper can read it directly.
            "format": "mp4/best",
            "outtmpl": "downloaded_video.%(ext)s",
            "quiet": True,
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, download=True)
            return ydl.prepare_filename(info)
    except Exception as e:
        raise RuntimeError(f"Error downloading YouTube video: {e}") from e


def transcribe_video(video_file, video_url, language, target_language, output_format):
    """Run the full pipeline and return the path of the generated file.

    Exactly one of *video_file* / *video_url* must be provided; *video_url*
    wins when both are set.
    """
    if video_url:
        video_file_path = download_youtube_video(video_url)
    else:
        video_file_path = video_file
    if not video_file_path:
        raise ValueError("Provide either a video file or a YouTube URL.")

    # Pass the declared source language as a decoding hint to Whisper.
    transcription = model.transcribe(video_file_path, language=language)

    if target_language != "en":
        tokenizer, translation_model = load_translation_model(target_language)
    else:
        tokenizer, translation_model = None, None

    output_file = None
    if output_format == "SRT":
        output_file = "output.srt"
        write_srt(transcription, output_file, tokenizer, translation_model)
    elif output_format == "Word":
        output_file = "output.docx"
        write_word(transcription, output_file, tokenizer, translation_model,
                   target_language)
    elif output_format == "PDF":
        output_file = "output.pdf"
        write_pdf(transcription, output_file, tokenizer, translation_model)
    elif output_format == "PPT":
        output_file = "output.pptx"
        write_ppt(transcription, output_file, tokenizer, translation_model)
    return output_file


def main():
    """Build and launch the Gradio UI."""
    with gr.Blocks() as app:
        gr.Markdown("# Transcribe, Translate and Format YouTube Video Content")
        video_url_input = gr.Textbox(
            label="YouTube Video URL (or leave blank for video file upload)"
        )
        video_file_input = gr.File(
            label="Upload Video File (leave blank for YouTube URL)"
        )
        language_input = gr.Dropdown(
            choices=["en"], label="Video Language", value="en"
        )
        target_language_input = gr.Dropdown(
            choices=["en", "fa", "es", "fr", "de", "it", "pt"],
            label="Target Language", value="en",
        )
        output_format_input = gr.Dropdown(
            choices=["SRT", "Word", "PDF", "PPT"],
            label="Output Format", value="SRT",
        )
        output_file = gr.File(label="Download Transcription", interactive=False)
        transcribe_button = gr.Button("Transcribe & Translate")

        def transcribe_and_translate(video_file, video_url, language,
                                     target_language, output_format):
            # Gradio hands us a tempfile object for uploads; pass its path.
            return transcribe_video(
                video_file.name if video_file else None,
                video_url, language, target_language, output_format,
            )

        transcribe_button.click(
            transcribe_and_translate,
            inputs=[video_file_input, video_url_input, language_input,
                    target_language_input, output_format_input],
            outputs=output_file,
        )
    app.launch()


if __name__ == "__main__":
    main()