import gradio as gr
import torch
from transformers import BitsAndBytesConfig, HqqConfig
from whisperplus import (
SpeechToTextPipeline,
download_youtube_to_mp3,
download_youtube_to_mp4,
format_speech_to_dialogue,
)
from whisperplus.pipelines.long_text_summarization import LongTextSummarizationPipeline
from whisperplus.pipelines.summarization import TextSummarizationPipeline
from whisperplus.pipelines.text2speech import TextToSpeechPipeline
from whisperplus.pipelines.whisper_autocaption import WhisperAutoCaptionPipeline
from whisperplus.pipelines.whisper_diarize import ASRDiarizationPipeline
import os
import subprocess

# Hugging Face Spaces workaround: install flash-attn at runtime, skipping the
# CUDA build. Merge os.environ so PATH (and thus pip) stays visible to the shell.
subprocess.run('pip install flash-attn --no-build-isolation',
               env={**os.environ, 'FLASH_ATTENTION_SKIP_CUDA_BUILD': 'TRUE'}, shell=True)
def youtube_url_to_text(url, model_id, language_choice):
"""
Main function that downloads and converts a video to MP3 format, performs speech-to-text conversion using
a specified model, and returns the transcript along with the video path.
Args:
url (str): The URL of the video to download and convert.
model_id (str): The ID of the speech-to-text model to use.
language_choice (str): The language choice for the speech-to-text conversion.
Returns:
transcript (str): The transcript of the speech-to-text conversion.
"""
audio_path = download_youtube_to_mp3(url, output_dir="downloads", filename="test")
hqq_config = HqqConfig(
nbits=4,
group_size=64,
quant_zero=False,
quant_scale=False,
axis=0,
offload_meta=False,
) # axis=0 is used by default
pipeline = SpeechToTextPipeline(
model_id=model_id,
quant_config=hqq_config,
flash_attention_2=True,
)
transcript = pipeline(
audio_path=audio_path,
chunk_length_s=30,
stride_length_s=5,
max_new_tokens=128,
batch_size=100,
language=language_choice,
return_timestamps=False,
)
return transcript
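

# Usage sketch (illustrative only, not executed by the app; the URL below is a
# hypothetical placeholder and the defaults mirror the Gradio tab):
#   transcript = youtube_url_to_text(
#       "https://www.youtube.com/watch?v=<VIDEO_ID>",
#       model_id="openai/whisper-medium",
#       language_choice="en",
#   )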
def summarization(text, model_id="facebook/bart-large-cnn"):
"""
Main function that performs summarization using a specified model and returns the summary.
Args:
text (str): The text to summarize.
model_id (str): The ID of the summarization model to use.
Returns:
summary (str): The summary of the text.
"""
summarizer = TextSummarizationPipeline(model_id=model_id)
summary = summarizer.summarize(text)
return summary[0]["summary_text"]
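

# Usage sketch (illustrative only; the model defaults to facebook/bart-large-cnn):
#   summary = summarization("Some long article text ...")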
def long_text_summarization(text, model_id="facebook/bart-large-cnn"):
"""
Main function that performs summarization using a specified model and returns the summary.
Args:
text (str): The text to summarize.
model_id (str): The ID of the summarization model to use.
Returns:
summary (str): The summary of the text.
"""
summarizer = LongTextSummarizationPipeline(model_id=model_id)
summary_text = summarizer.summarize(text)
return summary_text
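

# Usage sketch (illustrative only; `very_long_text` is a hypothetical variable
# holding input longer than the model's context window):
#   summary_text = long_text_summarization(very_long_text)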
def speaker_diarization(url, model_id, device, num_speakers, min_speaker, max_speaker):
"""
Main function that downloads and converts a video to MP3 format, performs speech-to-text conversion using
a specified model, and returns the transcript along with the video path.
Args:
url (str): The URL of the video to download and convert.
model_id (str): The ID of the speech-to-text model to use.
language_choice (str): The language choice for the speech-to-text conversion.
Returns:
transcript (str): The transcript of the speech-to-text conversion.
video_path (str): The path of the downloaded video.
"""
pipeline = ASRDiarizationPipeline.from_pretrained(
asr_model=model_id,
diarizer_model="pyannote/speaker-diarization",
use_auth_token=False,
chunk_length_s=30,
device=device,
)
audio_path = download_youtube_to_mp3(url)
output_text = pipeline(
audio_path, num_speakers=num_speakers, min_speaker=min_speaker, max_speaker=max_speaker)
dialogue = format_speech_to_dialogue(output_text)
return dialogue, audio_path
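

# Usage sketch (illustrative only; the URL and model ID are hypothetical placeholders):
#   dialogue, audio_path = speaker_diarization(
#       "https://www.youtube.com/watch?v=<VIDEO_ID>",
#       model_id="openai/whisper-medium",
#       device="cpu",
#       num_speakers=2,
#       min_speaker=1,
#       max_speaker=4,
#   )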
def text2speech_bark(text, model_id="suno/bark", voice_preset="v2/en_speaker_6"):
    """Synthesize speech from text with the given model and Bark voice preset."""
    tts = TextToSpeechPipeline(model_id=model_id)
    audio = tts(text=text, voice_preset=voice_preset)
    return audio
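

# Usage sketch (illustrative only):
#   audio = text2speech_bark("Hello world", voice_preset="v2/en_speaker_6")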
def whisper_autocaption(url, language, model_id="openai/whisper-large-v3"):
    """Download a YouTube video as MP4 and burn Whisper-generated captions into it."""
    video_path = download_youtube_to_mp4(url)
    caption = WhisperAutoCaptionPipeline(model_id=model_id)
    output = caption(video_path=video_path, output_path="output.mp4", language=language)
    return output
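

# Usage sketch (illustrative only; the URL is a hypothetical placeholder):
#   captioned = whisper_autocaption(
#       "https://www.youtube.com/watch?v=<VIDEO_ID>", language="en")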
with gr.Blocks() as demo:
with gr.Tab("YouTube URL to Text"):
with gr.Row():
with gr.Column():
url_input = gr.Textbox(label="Enter YouTube URL")
model_id_input = gr.Textbox(label="Enter Model ID", value="openai/whisper-medium")
language_input = gr.Textbox(label="Enter Language", value="en")
submit_btn1 = gr.Button("Submit")
with gr.Column():
output1 = gr.Textbox(label="Transcript")
submit_btn1.click(
youtube_url_to_text, inputs=[url_input, model_id_input, language_input], outputs=output1)
with gr.Tab("Text Summarization"):
with gr.Row():
with gr.Column():
text_input = gr.Textbox(label="Enter Text", lines=5)
model_id_input2 = gr.Textbox(label="Enter Model ID", value="facebook/bart-large-cnn")
submit_btn2 = gr.Button("Summarize")
with gr.Column():
output2 = gr.Textbox(label="Summary")
submit_btn2.click(summarization, inputs=[text_input, model_id_input2], outputs=output2)
with gr.Tab("Long Text Summarization"):
with gr.Row():
with gr.Column():
long_text_input = gr.Textbox(label="Enter Long Text", lines=10)
model_id_input3 = gr.Textbox(label="Enter Model ID", value="facebook/bart-large-cnn")
submit_btn3 = gr.Button("Summarize Long Text")
with gr.Column():
output3 = gr.Textbox(label="Long Text Summary")
submit_btn3.click(long_text_summarization, inputs=[long_text_input, model_id_input3], outputs=output3)
with gr.Tab("Speaker Diarization"):
with gr.Row():
with gr.Column():
url_input2 = gr.Textbox(label="Enter YouTube URL")
                model_id_input4 = gr.Textbox(label="Enter Model ID", value="openai/whisper-medium")
                num_speakers = gr.Number(label="Number of Speakers", value=2, precision=0)
                min_speakers = gr.Number(label="Min Speakers", value=1, precision=0)
                max_speakers = gr.Number(label="Max Speakers", value=4, precision=0)
device = gr.Textbox(label="Device", value="cpu")
submit_btn4 = gr.Button("Diarize")
            with gr.Column():
                output4 = gr.Textbox(label="Dialogue")
                output4_audio = gr.Audio(label="Downloaded Audio")
        submit_btn4.click(
            speaker_diarization,
            inputs=[url_input2, model_id_input4, device, num_speakers, min_speakers, max_speakers],
            outputs=[output4, output4_audio])
with gr.Tab("Text to Speech"):
with gr.Row():
with gr.Column():
text_input2 = gr.Textbox(label="Enter Text", lines=3)
model_id_input5 = gr.Textbox(label="Enter Model ID", value="suno/bark")
voice_preset = gr.Textbox(label="Voice Preset", value="v2/en_speaker_6")
submit_btn5 = gr.Button("Generate Audio")
with gr.Column():
output5 = gr.Audio(label="Generated Audio")
submit_btn5.click(
            text2speech_bark, inputs=[text_input2, model_id_input5, voice_preset], outputs=output5)
with gr.Tab("Whisper Autocaption"):
with gr.Row():
with gr.Column():
url_input3 = gr.Textbox(label="Enter YouTube URL")
language = gr.Textbox(label="Language", value="en")
                model_id_input6 = gr.Textbox(label="Enter Model ID", value="openai/whisper-large-v3")
submit_btn6 = gr.Button("Generate Captions")
with gr.Column():
output6 = gr.Video(label="Captioned Video")
submit_btn6.click(
whisper_autocaption, inputs=[url_input3, language, model_id_input6], outputs=output6)
demo.launch()