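# Audio-WebUI
#
# NOTE(review): the next two lines appear to be residue from a file-viewer page
# rather than part of the program; kept here as comments so the module parses.
# Runtime error
#
# File size: 8,461 Bytes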


import gradio as gr
import torch
from transformers import BitsAndBytesConfig, HqqConfig

from whisperplus import (
    SpeechToTextPipeline,
    download_youtube_to_mp3,
    download_youtube_to_mp4,
    format_speech_to_dialogue,
)
from whisperplus.pipelines.long_text_summarization import LongTextSummarizationPipeline
from whisperplus.pipelines.summarization import TextSummarizationPipeline
from whisperplus.pipelines.text2speech import TextToSpeechPipeline
from whisperplus.pipelines.whisper_autocaption import WhisperAutoCaptionPipeline
from whisperplus.pipelines.whisper_diarize import ASRDiarizationPipeline

import os
import subprocess
import sys

# Best-effort install of flash-attn at startup.
# - The child environment EXTENDS the current one: passing a bare dict as
#   ``env`` would replace it entirely, dropping PATH, HOME, proxy settings, etc.
# - ``sys.executable -m pip`` targets the interpreter running this script, and
#   the argument list avoids going through a shell.
# - ``check=False`` keeps the original behavior of continuing even if the
#   install fails.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "flash-attn", "--no-build-isolation"],
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    check=False,
)

def youtube_url_to_text(url, model_id, language_choice):
    """
    Download the audio of a video as MP3 and run speech-to-text on it.

    Args:
        url (str): The URL of the video to download.
        model_id (str): The ID of the speech-to-text model to load.
        language_choice (str): The language passed to the speech-to-text pipeline.

    Returns:
        transcript: The result produced by the speech-to-text pipeline.
    """
    # NOTE(review): the download target name is fixed, so every call uses the
    # same output_dir/filename pair -- confirm this is safe under concurrent use.
    mp3_path = download_youtube_to_mp3(url, output_dir="downloads", filename="test")

    # 4-bit HQQ quantization settings handed to the pipeline.
    quantization = HqqConfig(
        nbits=4,
        group_size=64,
        quant_zero=False,
        quant_scale=False,
        axis=0,
        offload_meta=False,
    )
    stt = SpeechToTextPipeline(
        model_id=model_id,
        quant_config=quantization,
        flash_attention_2=True,
    )

    # Decoding options, kept together so the call site stays short.
    decode_options = {
        "chunk_length_s": 30,
        "stride_length_s": 5,
        "max_new_tokens": 128,
        "batch_size": 100,
        "language": language_choice,
        "return_timestamps": False,
    }
    return stt(audio_path=mp3_path, **decode_options)


def summarization(text, model_id="facebook/bart-large-cnn"):
    """
    Summarize a piece of text with the given summarization model.

    Args:
        text (str): The text to summarize.
        model_id (str): The ID of the summarization model to use.

    Returns:
        The ``"summary_text"`` entry of the first result returned by the summarizer.
    """
    results = TextSummarizationPipeline(model_id=model_id).summarize(text)

    # Only the first result's summary text is surfaced to the caller.
    first_result = results[0]
    return first_result["summary_text"]


def long_text_summarization(text, model_id="facebook/bart-large-cnn"):
    """
    Summarize a long piece of text with the long-text summarization pipeline.

    Args:
        text (str): The text to summarize.
        model_id (str): The ID of the summarization model to use.

    Returns:
        Whatever the pipeline's ``summarize`` call returns, passed through unchanged.
    """
    return LongTextSummarizationPipeline(model_id=model_id).summarize(text)


def speaker_diarization(url, model_id, device, num_speakers, min_speaker, max_speaker):
    """
    Download a video's audio, run ASR with speaker diarization, and format the
    result as a dialogue.

    Args:
        url (str): The URL of the video to download.
        model_id (str): The ID of the ASR model to use.
        device (str): The device passed to the diarization pipeline.
        num_speakers: Speaker-count value forwarded to the pipeline call.
        min_speaker: Minimum-speaker value forwarded to the pipeline call.
        max_speaker: Maximum-speaker value forwarded to the pipeline call.

    Returns:
        tuple: ``(dialogue, audio_path)`` -- the formatted dialogue and the
        path of the downloaded audio.
    """
    # The pipeline is constructed before the download, so a bad model ID or
    # device fails before any audio is fetched.
    diarizer = ASRDiarizationPipeline.from_pretrained(
        asr_model=model_id,
        diarizer_model="pyannote/speaker-diarization",
        use_auth_token=False,
        chunk_length_s=30,
        device=device,
    )

    mp3_path = download_youtube_to_mp3(url)

    speaker_hints = {
        "num_speakers": num_speakers,
        "min_speaker": min_speaker,
        "max_speaker": max_speaker,
    }
    segments = diarizer(mp3_path, **speaker_hints)

    return format_speech_to_dialogue(segments), mp3_path


def text2spech_bark(text, model_id="suno/bark", voice_preset="v2/en_speaker_6"):
    """
    Generate speech for the given text with a text-to-speech pipeline.

    Args:
        text (str): The text to speak.
        model_id (str): The ID of the text-to-speech model to use.
        voice_preset (str): The voice preset passed to the pipeline.

    Returns:
        The audio produced by the pipeline call, passed through unchanged.
    """
    synthesizer = TextToSpeechPipeline(model_id=model_id)
    return synthesizer(text=text, voice_preset=voice_preset)


def whisper_autocaption(url, language, model_id="openai/whisper-large-v3"):
    """
    Download a video as MP4 and run the Whisper auto-caption pipeline on it.

    Args:
        url (str): The URL of the video to download.
        language (str): The language passed to the captioning pipeline.
        model_id (str): The ID of the Whisper model to use.

    Returns:
        The result of the captioning pipeline, called with ``output.mp4`` as
        its output path.
    """
    # Download first, then build the pipeline (same order as before).
    downloaded_video = download_youtube_to_mp4(url)
    captioner = WhisperAutoCaptionPipeline(model_id=model_id)

    return captioner(video_path=downloaded_video, output_path="output.mp4", language=language)


# Gradio UI: one tab per feature. Each tab collects its inputs in the left
# column, shows the result in the right column, and wires a button to the
# corresponding function defined above.
with gr.Blocks() as demo:
    with gr.Tab("YouTube URL to Text"):
        with gr.Row():
            with gr.Column():
                url_input = gr.Textbox(label="Enter YouTube URL")
                model_id_input = gr.Textbox(label="Enter Model ID", value="openai/whisper-medium")
                language_input = gr.Textbox(label="Enter Language", value="en")
                submit_btn1 = gr.Button("Submit")
            with gr.Column():
                output1 = gr.Textbox(label="Transcript")
        submit_btn1.click(
            youtube_url_to_text, inputs=[url_input, model_id_input, language_input], outputs=output1)

    with gr.Tab("Text Summarization"):
        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(label="Enter Text", lines=5)
                model_id_input2 = gr.Textbox(label="Enter Model ID", value="facebook/bart-large-cnn")
                submit_btn2 = gr.Button("Summarize")
            with gr.Column():
                output2 = gr.Textbox(label="Summary")
        submit_btn2.click(summarization, inputs=[text_input, model_id_input2], outputs=output2)

    with gr.Tab("Long Text Summarization"):
        with gr.Row():
            with gr.Column():
                long_text_input = gr.Textbox(label="Enter Long Text", lines=10)
                model_id_input3 = gr.Textbox(label="Enter Model ID", value="facebook/bart-large-cnn")
                submit_btn3 = gr.Button("Summarize Long Text")
            with gr.Column():
                output3 = gr.Textbox(label="Long Text Summary")
        submit_btn3.click(long_text_summarization, inputs=[long_text_input, model_id_input3], outputs=output3)

    with gr.Tab("Speaker Diarization"):
        with gr.Row():
            with gr.Column():
                url_input2 = gr.Textbox(label="Enter YouTube URL")
                model_id_input4 = gr.Textbox(label="Enter Model ID")
                # precision=0 makes these components deliver integers rather
                # than floats, since they represent speaker counts.
                num_speakers = gr.Number(label="Number of Speakers", value=2, precision=0)
                min_speakers = gr.Number(label="Min Speakers", value=1, precision=0)
                max_speakers = gr.Number(label="Max Speakers", value=4, precision=0)
                device = gr.Textbox(label="Device", value="cpu")
                submit_btn4 = gr.Button("Diarize")
            with gr.Column():
                # NOTE(review): confirm that the dialogue value returned by
                # speaker_diarization is in a shape gr.DataFrame can render.
                output4 = gr.DataFrame(headers=["Speaker", "Text"], datatype=["str", "str"])
                output4_audio_path = gr.Textbox(label="Downloaded Audio Path")
        # speaker_diarization returns two values (dialogue, audio_path), so two
        # output components are declared to receive them.
        submit_btn4.click(
            speaker_diarization,
            inputs=[url_input2, model_id_input4, device, num_speakers, min_speakers, max_speakers],
            outputs=[output4, output4_audio_path])

    with gr.Tab("Text to Speech"):
        with gr.Row():
            with gr.Column():
                text_input2 = gr.Textbox(label="Enter Text", lines=3)
                model_id_input5 = gr.Textbox(label="Enter Model ID", value="suno/bark")
                voice_preset = gr.Textbox(label="Voice Preset", value="v2/en_speaker_6")
                submit_btn5 = gr.Button("Generate Audio")
            with gr.Column():
                output5 = gr.Audio(label="Generated Audio")
        submit_btn5.click(
            text2spech_bark, inputs=[text_input2, model_id_input5, voice_preset], outputs=output5)

    with gr.Tab("Whisper Autocaption"):
        with gr.Row():
            with gr.Column():
                url_input3 = gr.Textbox(label="Enter YouTube URL")
                language = gr.Textbox(label="Language", value="en")
                model_id_input6 = gr.Textbox(label="Enter Model ID", value="openai/whisper-large-v2")
                submit_btn6 = gr.Button("Generate Captions")
            with gr.Column():
                output6 = gr.Video(label="Captioned Video")
        submit_btn6.click(
            whisper_autocaption, inputs=[url_input3, language, model_id_input6], outputs=output6)

# Start the web server for the UI built above.
demo.launch()