# Page-scrape residue (not program code): "Spaces: Sleeping Sleeping"
import gradio as gr | |
import pyperclip | |
import urllib.parse as urlparse | |
from pytube import YouTube | |
import re | |
import subprocess | |
import time | |
from lang_list import ORIGINAL_LANGUAGE_NAME_TO_CODE, S2ST_TARGET_ORIGINAL_LANGUAGE_NAMES | |
import torch | |
import whisper | |
# Run Whisper on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Whisper model used by the transcription callback; loaded once, at import time.
model = whisper.load_model("large-v2", device=device)

# Values written to the hidden "Stream page" textbox to record the kind of URL entered.
YOUTUBE, TWITCH, ERROR = "youtube", "twitch", "error"
def copy_url_from_clipboard():
    """Return the current clipboard text as reported by ``pyperclip.paste()``.

    Used as the click handler of the copy button; the returned text is
    written into the video URL textbox.
    """
    clipboard_text = pyperclip.paste()
    return clipboard_text
def clear_video_url():
    """Reset the UI: empty the URL textbox and hide every dependent widget.

    Returns:
        A 10-tuple in the order of the delete button's ``outputs`` list:
        URL text, thumbnail image, source-language dropdown, target-language
        dropdown, "get audio" button, "transcribe audio" button, original
        audio, transcription textbox, translated audio, translation textbox.
    """
    visible = False
    image = gr.Image(visible=visible, scale=1)
    source_languaje = gr.Dropdown(visible=visible, label="Source languaje", show_label=True, value="English", choices=ORIGINAL_LANGUAGE_NAME_TO_CODE, scale=1, interactive=True)
    target_languaje = gr.Dropdown(visible=visible, label="Target languaje", show_label=True, value="Español", choices=S2ST_TARGET_ORIGINAL_LANGUAGE_NAMES, scale=1, interactive=True)
    get_audio_button = gr.Button(size="lg", value="get audio", min_width="10px", scale=0, visible=visible)
    # Built once only; the original constructed this button twice and discarded the first copy.
    transcribe_audio_button = gr.Button(size="lg", value="transcribe audio", min_width="10px", scale=0, visible=visible)
    original_audio = gr.Audio(label="Original audio", elem_id="original_audio", visible=visible, interactive=False)
    original_audio_transcribed = gr.Textbox(label="Original audio transcribed", elem_id="original_audio_transcribed", interactive=False, visible=visible)
    original_audio_translated = gr.Textbox(label="Original audio translated", elem_id="original_audio_translated", interactive=False, visible=visible)
    translated_audio = gr.Audio(label="Translated audio", elem_id="translated_audio", visible=visible)
    return (
        "",  # clears the URL textbox
        image,
        source_languaje,
        target_languaje,
        get_audio_button,
        transcribe_audio_button,
        original_audio,
        original_audio_transcribed,
        translated_audio,
        original_audio_translated,
    )
def get_youtube_thumbnail(video_id):
    """Return the ``img.youtube.com`` URL of the ``0.jpg`` thumbnail for *video_id*."""
    return f"https://img.youtube.com/vi/{video_id}/0.jpg"
def get_youtube_video_id(url):
    """Return a thumbnail URL for a YouTube link, or ``None`` if no video id is found.

    Despite its name, this returns the thumbnail URL built by
    ``get_youtube_thumbnail``, not the bare video id; the name is kept so
    existing callers keep working.

    The id is extracted locally from the URL text. No ``pytube.YouTube``
    object is created, so the lookup needs no network access and a partially
    typed link yields ``None`` instead of raising.

    Args:
        url: The URL typed by the user.

    Returns:
        The thumbnail URL as a string, or ``None`` when the URL carries no
        recognisable video id.
    """
    # Short link (youtu.be/<id>): the 11-character id follows the host.
    short_link = re.search(r"youtu\.be/([0-9A-Za-z_-]{11})", url, flags=re.IGNORECASE)
    if short_link:
        return get_youtube_thumbnail(short_link.group(1))

    # Watch URL: the id is carried in the ``v`` query parameter.
    query = urlparse.urlparse(url).query
    video_ids = urlparse.parse_qs(query).get("v")
    if video_ids:
        return get_youtube_thumbnail(video_ids[0])
    return None
def _url_outputs(thumbnail, page, visible):
    """Build the 10 component updates returned by ``is_valid_url``.

    Args:
        thumbnail: Value for the thumbnail image (URL or local asset path).
        page: Value stored in the hidden "Stream page" textbox.
        visible: Whether the image and the dependent widgets are shown.

    Returns:
        A tuple ordered like the ``outputs`` list of ``url_textbox.change``.
    """
    return (
        gr.Image(value=thumbnail, visible=visible, show_download_button=False, container=False),
        gr.Dropdown(visible=visible, label="Source languaje", show_label=True, value="English", choices=ORIGINAL_LANGUAGE_NAME_TO_CODE, scale=1, interactive=True),
        gr.Dropdown(visible=visible, label="Target languaje", show_label=True, value="Español", choices=S2ST_TARGET_ORIGINAL_LANGUAGE_NAMES, scale=1, interactive=True),
        gr.Button(size="lg", value="get audio", min_width="10px", scale=0, visible=visible),
        # The stream-page textbox only carries state, so it is never shown.
        gr.Textbox(value=page, label="Stream page", elem_id="stream_page", visible=False),
        gr.Audio(label="Original audio", elem_id="original_audio", visible=visible, interactive=False),
        gr.Textbox(label="Original audio transcribed", elem_id="original_audio_transcribed", interactive=False, visible=visible),
        gr.Audio(label="Translated audio", elem_id="translated_audio", visible=visible),
        gr.Button(size="lg", value="transcribe audio", min_width="10px", scale=0, visible=visible),
        gr.Textbox(label="Original audio translated", elem_id="original_audio_translated", interactive=False, visible=visible),
    )


def is_valid_url(url):
    """Classify the typed URL and return the matching UI updates.

    A URL containing "youtube" or "youtu.be" is treated as YouTube, one
    containing "twitch" as Twitch; anything else hides the widgets and
    records ``ERROR`` as the stream page.

    Args:
        url: Current content of the video URL textbox.

    Returns:
        A 10-tuple of component updates: thumbnail image, source-language
        dropdown, target-language dropdown, "get audio" button, hidden
        stream-page textbox, original audio, transcription textbox,
        translated audio, "transcribe audio" button, translation textbox.
    """
    lowered = (url or "").lower()

    if "youtube" in lowered or "youtu.be" in lowered:
        # Fall back to a placeholder image when no thumbnail can be derived.
        thumbnail = get_youtube_video_id(url) or "assets/youtube-no-thumbnails.webp"
        return _url_outputs(thumbnail, YOUTUBE, visible=True)

    # "twitch.tv" always contains "twitch", so a single test is enough.
    if "twitch" in lowered:
        return _url_outputs("assets/twitch.webp", TWITCH, visible=True)

    return _url_outputs("assets/youtube_error.webp", ERROR, visible=False)
def _abr_kbps(stream):
    """Return a stream's audio bitrate as an int (0 when it is missing)."""
    digits = re.sub(r"\D", "", stream.abr or "")
    return int(digits) if digits else 0


def get_audio_from_video(url, stream_page):
    """Download the audio track of the video at *url*.

    Args:
        url: Video URL entered by the user.
        stream_page: ``YOUTUBE`` or ``TWITCH``, as stored by ``is_valid_url``.

    Returns:
        A 2-tuple: the audio player update pointing at the downloaded file,
        and a hidden textbox update carrying that file's path.

    Raises:
        gr.Error: If no audio stream or video id can be found, if
            ``twitch-dl`` exits with an error, or if *stream_page* is not a
            supported value.
    """
    if stream_page == YOUTUBE:
        audio_streams = YouTube(url).streams.filter(mime_type="audio/mp4")
        # Compare bitrates numerically: as strings, "48kbps" sorts after "128kbps".
        audio_stream = max(audio_streams, key=_abr_kbps, default=None)
        if audio_stream is None:
            raise gr.Error("No audio/mp4 stream is available for this YouTube video.")
        # NOTE: the file keeps its historical ".mp3" name although the stream is audio/mp4.
        filename = "audio.mp3"
        audio_stream.download(filename=filename)
    elif stream_page == TWITCH:
        # The video id is taken to be the first run of 10 digits in the URL.
        id_match = re.search(r"\d{10}", url)
        if id_match is None:
            raise gr.Error("Could not find a 10-digit Twitch video id in the URL.")
        filename = "audio.mkv"
        try:
            subprocess.run(
                ["twitch-dl", "download", "--overwrite", "-q", "audio_only", "--output", filename, id_match.group(0)],
                check=True,  # surface a failed download instead of returning a stale or missing file
            )
        except subprocess.CalledProcessError as err:
            raise gr.Error("twitch-dl failed to download the audio for this video.") from err
    else:
        raise gr.Error(f"Unsupported stream page: {stream_page!r}")

    return (
        gr.Audio(value=filename, label="Original audio", elem_id="original_audio", visible=True, interactive=False),
        gr.Textbox(value=filename, label="Stream page", elem_id="stream_page", visible=False),
    )
def trascribe_audio(audio_path):
    """Transcribe the beginning of an audio file with Whisper and save the text.

    The waveform is padded or trimmed with ``whisper.pad_or_trim`` before
    decoding, so only one fixed-length window is transcribed (30 seconds
    according to the Whisper README), not the whole file.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        A 2-tuple: the transcribed text, and a hidden textbox update carrying
        the path of the file (``result.txt``) the text was written to.
    """
    audio = whisper.pad_or_trim(whisper.load_audio(audio_path))
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # No explicit model.detect_language() call: its result was never used, and
    # whisper.decode() detects the language itself when the options set none.
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(model, mel, options)

    # Persist the transcription so the translation step can read it from disk.
    filename = "result.txt"
    with open(filename, "w") as f:
        f.write(result.text)

    return (
        result.text,
        gr.Textbox(value=filename, label="Original audio transcribed", elem_id="original_audio_transcribed", visible=False),
    )
def translate(original_audio_transcribed_path, source_languaje, target_languaje):
    """Placeholder translation step: pass the transcription through unchanged.

    Reads the transcription file, copies its text to ``translated_text.txt``
    and returns that same text. ``source_languaje`` and ``target_languaje``
    are accepted but not used.

    Returns:
        A 2-tuple: the (untranslated) text, and a hidden textbox update
        carrying the path of the written file.
    """
    with open(original_audio_transcribed_path, "r") as source_file:
        transcription = source_file.read()

    # Write the text where the next pipeline step expects to find it.
    output_path = "translated_text.txt"
    with open(output_path, "w") as target_file:
        target_file.write(transcription)

    path_box = gr.Textbox(value=output_path, label="Original audio translated", elem_id="original_audio_translated", visible=False)
    return transcription, path_box
def tex2speech(original_audio_translated_path):
    """Placeholder for the text-to-speech step; not implemented.

    Ignores its argument and always returns ``None``.
    """
    return None
# NOTE(review): the nesting below is reconstructed from a source whose
# indentation was lost — confirm the layout against the running app.
with gr.Blocks() as demo:
    # --- URL entry row -----------------------------------------------------
    with gr.Row(variant="panel"):
        url_textbox = gr.Textbox(placeholder="Add video URL here", label="Video URL", elem_id="video_url", scale=1, interactive=True)
        copy_button = gr.Button(size="sm", icon="icons/copy.svg", value="", min_width="10px", scale=0)
        delete_button = gr.Button(size="sm", icon="icons/delete.svg", value="", min_width="10px", scale=0)

    # Hidden state: which kind of page the URL points to.
    stream_page = gr.Textbox(label="Stream page", elem_id="stream_page", visible=False)

    # Everything below starts hidden and is revealed by is_valid_url.
    visible = False
    with gr.Row(equal_height=False):
        image = gr.Image(visible=visible, scale=1)
        with gr.Column():
            with gr.Row():
                source_languaje = gr.Dropdown(visible=visible, label="Source languaje", show_label=True, value="English", choices=ORIGINAL_LANGUAGE_NAME_TO_CODE, scale=1, interactive=True)
                target_languaje = gr.Dropdown(visible=visible, label="Target languaje", show_label=True, value="Español", choices=S2ST_TARGET_ORIGINAL_LANGUAGE_NAMES, scale=1, interactive=True)
            with gr.Row():
                get_audio_button = gr.Button(size="lg", value="get audio", min_width="10px", scale=0, visible=visible)
                transcribe_audio_button = gr.Button(size="lg", value="transcribe audio", min_width="10px", scale=0, visible=visible)

    # --- Pipeline widgets; each *_path textbox is hidden and carries a file path
    original_audio = gr.Audio(label="Original audio", elem_id="original_audio", visible=visible, interactive=False)
    original_audio_path = gr.Textbox(label="Stream page", elem_id="stream_page", visible=False)
    original_audio_transcribed = gr.Textbox(label="Original audio transcribed", elem_id="original_audio_transcribed", interactive=False, visible=visible)
    original_audio_transcribed_path = gr.Textbox(label="Original audio transcribed", elem_id="original_audio_transcribed", visible=False)
    original_audio_translated = gr.Textbox(label="Original audio translated", elem_id="original_audio_translated", interactive=False, visible=visible)
    original_audio_translated_path = gr.Textbox(label="Original audio translated", elem_id="original_audio_translated", visible=False)
    translated_audio = gr.Audio(label="Translated audio", elem_id="translated_audio", visible=visible)

    # --- Event wiring ------------------------------------------------------
    copy_button.click(fn=copy_url_from_clipboard, outputs=url_textbox)

    # Order must match the tuple returned by is_valid_url.
    url_check_outputs = [
        image,
        source_languaje,
        target_languaje,
        get_audio_button,
        stream_page,
        original_audio,
        original_audio_transcribed,
        translated_audio,
        transcribe_audio_button,
        original_audio_translated,
    ]
    url_textbox.change(fn=is_valid_url, inputs=url_textbox, outputs=url_check_outputs)

    # Order must match the tuple returned by clear_video_url.
    reset_outputs = [
        url_textbox,
        image,
        source_languaje,
        target_languaje,
        get_audio_button,
        transcribe_audio_button,
        original_audio,
        original_audio_transcribed,
        translated_audio,
        original_audio_translated,
    ]
    delete_button.click(fn=clear_video_url, outputs=reset_outputs)

    # Each step is triggered by a change in the previous step's visible output.
    get_audio_button.click(
        fn=get_audio_from_video,
        inputs=[url_textbox, stream_page],
        outputs=[original_audio, original_audio_path],
    )
    original_audio.change(
        fn=trascribe_audio,
        inputs=original_audio_path,
        outputs=[original_audio_transcribed, original_audio_transcribed_path],
    )
    original_audio_transcribed.change(
        fn=translate,
        inputs=[original_audio_transcribed_path, source_languaje, target_languaje],
        outputs=[original_audio_translated, original_audio_translated_path],
    )
    original_audio_translated.change(
        fn=tex2speech,
        inputs=original_audio_translated_path,
        outputs=translated_audio,
    )

demo.launch()