import gradio as gr
from subprocess import run
from faster_whisper import WhisperModel
import json
import os
from zipfile import ZipFile
import stat
import uuid
import subprocess
from googletrans import Translator
ZipFile("ffmpeg.zip").extractall() | |
st = os.stat('ffmpeg') | |
os.chmod('ffmpeg', st.st_mode | stat.S_IEXEC) | |
with open('google_lang_codes.json', 'r') as f:
    google_lang_codes = json.load(f)
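# google_lang_codes.json is assumed to map display names to Google Translate
# language codes, e.g. (hypothetical sample):
#   {"English": "en", "Portuguese": "pt", "Japanese": "ja"}
# The keys feed the dropdown below; the values are passed to googletrans as `dest`.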
translator = Translator()
# faster-whisper large-v2 on GPU with FP16 weights.
whisper_model = WhisperModel("large-v2", device="cuda", compute_type="float16")

print("cwd", os.getcwd())
print(os.listdir())
def process_video(Video, target_language, translate_video):
    current_path = os.getcwd()
    print("Starting process_video")
    common_uuid = uuid.uuid4()
    print("Checking FFmpeg availability...")
    run(["ffmpeg", "-version"])
    # Extract the audio track to WAV for transcription.
    audio_file = f"{common_uuid}.wav"
    run(["ffmpeg", "-i", Video, audio_file])
    transcript_file = f"{current_path}/{common_uuid}.srt"
    # Transcription with Whisper.
    target_language_code = google_lang_codes.get(target_language, "en")
    print("Starting transcription with Whisper")
    segments, _ = whisper_model.transcribe(audio_file, beam_size=5)
    segments = list(segments)  # transcribe() returns a generator; materialize it.
    with open(transcript_file, "w+", encoding="utf-8") as f:
        counter = 1
        for segment in segments:
            # Convert start/end times (float seconds) into SRT's HH:MM:SS,mmm form.
            start_hours = int(segment.start // 3600)
            start_minutes = int((segment.start % 3600) // 60)
            start_seconds = int(segment.start % 60)
            start_milliseconds = int((segment.start - int(segment.start)) * 1000)
            end_hours = int(segment.end // 3600)
            end_minutes = int((segment.end % 3600) // 60)
            end_seconds = int(segment.end % 60)
            end_milliseconds = int((segment.end - int(segment.end)) * 1000)
            formatted_start = f"{start_hours:02d}:{start_minutes:02d}:{start_seconds:02d},{start_milliseconds:03d}"
            formatted_end = f"{end_hours:02d}:{end_minutes:02d}:{end_seconds:02d},{end_milliseconds:03d}"
            f.write(f"{counter}\n")
            f.write(f"{formatted_start} --> {formatted_end}\n")
            f.write(f"{segment.text}\n\n")
            counter += 1
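        # The same timestamp conversion, more compactly via divmod
        # (equivalent sketch, not wired in; t is segment.start or segment.end):
        #   h, rem = divmod(int(t), 3600)
        #   m, s = divmod(rem, 60)
        #   ms = int((t - int(t)) * 1000)
        #   f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"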
        # Check if translation is needed.
        if translate_video:
            # Translate the Whisper SRT line by line with Google Translate,
            # leaving cue numbers and timestamp lines untouched.
            translated_lines = []
            f.seek(0)  # Move the file pointer back to the beginning of the file.
            for line in f:
                if line.strip().isnumeric() or "-->" in line:
                    translated_lines.append(line)
                elif line.strip() != "":
                    translated_text = translator.translate(line.strip(), dest=target_language_code).text
                    translated_lines.append(translated_text + "\n")
                else:
                    translated_lines.append("\n")
            f.seek(0)  # Rewind and truncate before writing the translation back.
            f.truncate()
            f.writelines(translated_lines)  # Write the translated lines back into the original file.
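    # Note: googletrans issues one web request per subtitle line above, so long
    # videos mean many requests, and the unofficial API may throttle. Batching
    # several lines into one translate() call is a possible optimization
    # (not done here).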
    output_video = f"{common_uuid}_output_video.mp4"

    # Debugging: validate the FFmpeg command for subtitle embedding.
    print("Validating FFmpeg command for subtitle embedding...")
    print(f"SRT file to embed: {transcript_file}")
    with open(transcript_file, 'r', encoding='utf-8') as f:
        print(f"First few lines of the SRT: {f.readlines()[:10]}")
    if os.path.exists(transcript_file):
        print(f"{transcript_file} exists.")
    else:
        print(f"{transcript_file} does not exist.")
    try:
        if target_language_code == 'ja':  # 'ja' is the language code for Japanese.
            # Japanese needs a CJK-capable font.
            subtitle_style = "FontName=Noto Sans CJK JP,PrimaryColour=&H00FFFF,OutlineColour=&H000000,BackColour=&H80000000,BorderStyle=3,Outline=2,Shadow=1"
        else:
            subtitle_style = "FontName=Arial Unicode MS,PrimaryColour=&H00FFFF,OutlineColour=&H000000,BackColour=&H80000000,BorderStyle=3,Outline=2,Shadow=1"
        result = subprocess.run(["ffmpeg", "-i", Video, "-vf", f"subtitles={transcript_file}:force_style='{subtitle_style}'", output_video], capture_output=True, text=True)
        if result.returncode == 0:
            print("FFmpeg ran successfully.")
        else:
            print(f"FFmpeg failed with return code {result.returncode}.")
            print("Stdout:", result.stdout)
            print("Stderr:", result.stderr)
    except Exception as e:
        print(f"An exception occurred: {e}")
print("process_video concluído com sucesso") | |
os.unlink(audio_file) | |
os.unlink(transcript_file) | |
print(f"Returning output video path: {output_video}") | |
return output_video | |
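# Local smoke test (sketch; assumes a sample.mp4 next to this script and that
# "Portuguese" is one of the keys in google_lang_codes.json):
#   process_video("sample.mp4", "Portuguese", True)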
iface = gr.Interface(
    fn=process_video,
    inputs=[
        gr.Video(),
        gr.Dropdown(choices=list(google_lang_codes.keys()), label="Target Language for Translation", value="English"),
        gr.Checkbox(label="Translate Video", value=True, info="Check to translate the video to the selected language. Uncheck for transcription only."),
    ],
    outputs=[
        gr.Video(),
    ],
    live=False,
    title="VIDEO TRANSCRIPTION AND TRANSLATION",
    description="""This tool was developed by [@artificialguybr](https://twitter.com/artificialguybr) using entirely open-source tools. Special thanks to Hugging Face for the GPU support. Test the [Video Dubbing](https://huggingface.co/spaces/artificialguybr/video-dubbing) space!""",
    allow_flagging="never",
)
with gr.Blocks() as demo:
    iface.render()
    gr.Markdown("""
    **Note:**
    - The video limit is 15 minutes. The tool transcribes and, optionally, translates the subtitles.
    - Everything runs on open-source models. This is an alpha version.
    """)

demo.queue(concurrency_count=1, max_size=15)
demo.launch()