|
import whisper |
|
import moviepy.editor as mp |
|
import gradio as gr |
|
import torch |
|
import subprocess |
|
|
|
# Prefer GPU inference when CUDA is available; otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Whisper checkpoint size: "tiny" is the smallest/fastest (and least accurate).
model_name = "tiny"

# Loaded once at import time and shared by every request.
whisper_model = whisper.load_model(model_name).to(device)
|
|
|
def _srt_timestamp(seconds):
    """Format a time offset in seconds as an SRT timestamp: HH:MM:SS,mmm.

    Uses truncation (not rounding) for every field, matching SRT convention.
    """
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    secs = int(seconds % 60)
    millis = int((seconds % 1) * 1000)
    return f"{hours:02}:{minutes:02}:{secs:02},{millis:03}"


def generate_srt(transcription_result):
    """Build an SRT subtitle document from a Whisper transcription result.

    Parameters
    ----------
    transcription_result : dict
        Whisper output; must contain a 'segments' list where each segment
        has 'start' and 'end' offsets (seconds) and a 'text' string.

    Returns
    -------
    str
        The full SRT document ("" when there are no segments).
    """
    # Build each numbered cue, then join once — avoids quadratic `+=`
    # string concatenation on long transcripts.
    cues = [
        f"{index}\n"
        f"{_srt_timestamp(segment['start'])} --> {_srt_timestamp(segment['end'])}\n"
        f"{segment['text'].strip()}\n\n"
        for index, segment in enumerate(transcription_result['segments'], start=1)
    ]
    return "".join(cues)
|
|
|
def extract_audio_ffmpeg(video_file, audio_output):
    """Extract the audio track of *video_file* into a 16 kHz PCM WAV file.

    16 kHz mono-ish PCM is the sample rate Whisper expects, so no further
    resampling is needed downstream.

    Parameters
    ----------
    video_file : str
        Path to the input video.
    audio_output : str
        Path of the WAV file to write (overwritten if it exists via ``-y``).

    Raises
    ------
    subprocess.CalledProcessError
        If ffmpeg exits non-zero (e.g. the input has no audio stream).
    """
    # check=True surfaces ffmpeg failures immediately; without it a silent
    # failure leaves a missing/stale WAV that breaks transcription later
    # with a much more confusing error.
    subprocess.run([
        'ffmpeg',
        '-i', video_file,
        '-vn',                     # drop the video stream
        '-acodec', 'pcm_s16le',    # uncompressed 16-bit PCM
        '-ar', '16000',            # Whisper's expected sample rate
        audio_output,
        '-y',                      # overwrite output without prompting
    ], check=True)
|
|
|
def transcribe_and_generate_subtitles(video):
    """Transcribe (or translate to English) a video's audio and burn subtitles in.

    Extracts the audio, runs Whisper with automatic language detection,
    re-runs with ``task="translate"`` when a supported Nigerian language is
    detected, writes an SRT file, and re-encodes the video with the subtitles
    rendered onto the frames.

    Parameters
    ----------
    video : str
        Path to the uploaded video file.

    Returns
    -------
    tuple[str, str]
        (transcript/translation text, path of the subtitled output video).
    """
    audio_path = "temp_audio.wav"
    extract_audio_ffmpeg(video, audio_path)

    # First pass with AUTOMATIC language detection. (The previous code forced
    # language="en" here, so result['language'] was always "en" and the
    # translation branches below could never run.)
    transcription_result = whisper_model.transcribe(audio_path, verbose=False)
    detected_language = transcription_result['language']

    # Whisper reports ISO 639-1 codes ("ha", "yo", "ig"), not the ISO 639-3
    # codes ("hau", "yor", "ibo") the old comparisons used; accept both.
    # For these languages, re-run asking Whisper to translate to English.
    # NOTE(review): Igbo may not be in Whisper's supported-language list —
    # confirm that branch can actually trigger.
    if detected_language in ("ha", "hau", "yo", "yor", "ig", "ibo"):
        transcription_result = whisper_model.transcribe(
            audio_path, task="translate", verbose=False
        )

    srt_content = generate_srt(transcription_result)
    srt_file = "output_subtitles.srt"
    with open(srt_file, "w", encoding="utf-8") as f:
        f.write(srt_content)

    output_video = "video_with_subtitles.mp4"
    # check=True surfaces ffmpeg failures instead of returning a path to a
    # video that was never written.
    subprocess.run([
        'ffmpeg',
        '-i', video,
        '-vf', f"subtitles={srt_file}",   # render the SRT onto the frames
        output_video,
        '-y',
    ], check=True)

    return transcription_result["text"], output_video
|
|
|
# Gradio UI: one video upload in, (transcript text, subtitled video file) out.
_outputs = [
    gr.Textbox(label="Transcription or Translation"),
    gr.File(label="Download Video with Subtitles"),
]

interface = gr.Interface(
    transcribe_and_generate_subtitles,
    gr.Video(label="Upload Video File"),
    _outputs,
    title="Video Subtitle Generator",
    description="Upload a video in either English, Hausa, Yoruba, or Igbo. The system will detect the language, transcribe or translate if necessary, and generate a video with subtitles embedded.",
    live=False,
)

interface.launch()