# Audio-WebUI / app.py
# Source: Hugging Face Space by kadirnar, commit bd8e31e ("update"), 5.71 kB.
import gradio as gr
from whisperplus.utils.download_utils import download_and_convert_to_mp3
import logging
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
# Configure the root logger once at import time: INFO and above, each record
# rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
class SpeechToTextPipeline:
    """Class for converting audio to text using a pre-trained speech recognition model.

    The model is loaded once at construction time and reused by every call.
    The execution device is resolved at construction time as well: CUDA when
    ``torch.cuda.is_available()`` reports a GPU, otherwise CPU.
    """

    def __init__(self, model_id: str = "openai/whisper-large-v3"):
        """
        Resolves the execution device and loads the model.

        Args:
            model_id (str): Identifier of the pre-trained model to be loaded.
        """
        self.model = None
        # Resolve the device once so that load_model() and __call__() agree on
        # where the model lives instead of relying on a hard-coded "cuda".
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        # Half precision is kept for the GPU path; the CPU fallback uses float32.
        self.torch_dtype = torch.float16 if self.device == "cuda" else torch.float32
        self.load_model(model_id)

    @staticmethod
    def _normalize_language(language):
        """
        Returns ``language`` stripped and lower-cased; ``None`` is passed through.

        The UI offers capitalised names such as "Turkish" while the default of
        ``__call__`` is "turkish"; normalising makes both spellings equivalent.
        """
        if language is None:
            return None
        return language.strip().lower()

    def load_model(self, model_id: str = "openai/whisper-large-v3"):
        """
        Loads the pre-trained speech recognition model and moves it to ``self.device``.

        Args:
            model_id (str): Identifier of the pre-trained model to be loaded.
        """
        logging.info("Loading model...")
        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_id,
            torch_dtype=self.torch_dtype,
            low_cpu_mem_usage=True,
            use_safetensors=True,
        )
        model.to(self.device)
        logging.info("Model loaded successfully.")
        self.model = model

    def __call__(self, audio_path: str, model_id: str = "openai/whisper-large-v3", language: str = "turkish"):
        """
        Converts audio to text using the pre-trained speech recognition model.

        Args:
            audio_path (str): Path to the audio file to be transcribed.
            model_id (str): Identifier used to load the processor (tokenizer and
                feature extractor). The model itself is the one loaded at
                construction time, so this should name the same checkpoint.
            language (str): Language passed to generation; case and surrounding
                whitespace are ignored.

        Returns:
            str: The ``"text"`` field of the pipeline result for ``audio_path``.
        """
        processor = AutoProcessor.from_pretrained(model_id)
        pipe = pipeline(
            "automatic-speech-recognition",
            model=self.model,
            torch_dtype=self.torch_dtype,
            chunk_length_s=30,
            max_new_tokens=128,
            batch_size=24,
            return_timestamps=True,
            device=self.device,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            # NOTE(review): the model is passed in already instantiated, so these
            # model_kwargs may have no effect - confirm against the pipeline docs.
            model_kwargs={"use_flash_attention_2": True},
            generate_kwargs={"language": self._normalize_language(language)},
        )
        logging.info("Transcribing audio...")
        return pipe(audio_path)["text"]
def youtube_url_to_text(url, model_id, language_choice):
    """
    Download a video as MP3, transcribe it, and return the text with the file path.

    A new ``SpeechToTextPipeline`` is constructed for ``model_id`` on every
    invocation and then applied to the downloaded file.

    Args:
        url (str): The URL of the video to download and convert.
        model_id (str): The ID of the speech-to-text model to use.
        language_choice (str): The language choice for the speech-to-text conversion.

    Returns:
        tuple: ``(transcript, path)`` where ``transcript`` is the text produced
        by the pipeline and ``path`` is the value returned by
        ``download_and_convert_to_mp3`` for ``url``.
    """
    audio_file = download_and_convert_to_mp3(url)
    transcriber = SpeechToTextPipeline(model_id)
    text = transcriber(
        audio_path=audio_file,
        model_id=model_id,
        language=language_choice,
    )
    return text, audio_file
def youtube_url_to_text_app():
    """
    Build the "YouTube URL to text" form and connect its button to the transcriber.

    The left column holds the URL field, the language and model dropdowns and
    the trigger button; the right column holds the transcript text box and an
    audio component. Clicking the button calls ``youtube_url_to_text`` with
    (URL, model id, language) and routes its two return values to the text
    box and the audio component, in that order.
    """
    languages = [
        "English",
        "Turkish",
        "Spanish",
        "French",
        "Chinese",
        "Japanese",
        "Korean",
    ]
    model_ids = [
        "openai/whisper-large-v3",
        "openai/whisper-large",
        "openai/whisper-medium",
        "openai/whisper-base",
        "openai/whisper-small",
        "openai/whisper-tiny",
    ]

    with gr.Blocks():
        with gr.Row():
            # Input side: URL, language, model, trigger button.
            with gr.Column():
                url_box = gr.Text(placeholder="Enter Youtube URL", label="Youtube URL")
                language_dropdown = gr.Dropdown(choices=languages, value="Turkish", label="Language")
                model_dropdown = gr.Dropdown(
                    choices=model_ids,
                    value="openai/whisper-large-v3",
                    label="Whisper Model",
                )
                run_button = gr.Button(value="Generator")

            # Output side: transcript and the downloaded audio.
            with gr.Column():
                transcript_box = gr.Textbox(label="Output Text")
                audio_player = gr.Audio(label="Output Audio")

        # Input order must match the parameter order of youtube_url_to_text.
        run_button.click(
            fn=youtube_url_to_text,
            inputs=[url_box, model_dropdown, language_dropdown],
            outputs=[transcript_box, audio_player],
        )
# Top-level page: a title banner, a row of social links, and a single tab that
# hosts the YouTube transcription form.
with gr.Blocks() as gradio_app:
    gr.HTML(
        "\n<h1 style='text-align: center'>\n"
        "WhisperPlus: Advancing Speech-to-Text Processing 🚀\n"
        "</h1>\n"
    )
    gr.HTML(
        "\n<h3 style='text-align: center'>\n"
        "Follow me for more!\n"
        "<a href='https://twitter.com/kadirnar_ai' target='_blank'>Twitter</a> | <a href='https://github.com/kadirnar' target='_blank'>Github</a> | <a href='https://www.linkedin.com/in/kadir-nar/' target='_blank'>Linkedin</a> | <a href='https://www.huggingface.co/kadirnar/' target='_blank'>HuggingFace</a>\n"
        "</h3>\n"
    )
    with gr.Row():
        with gr.Column():
            with gr.Tab(label="Youtube URL to Text"):
                youtube_url_to_text_app()

# Enable request queuing, then start the server in debug mode.
gradio_app.queue().launch(debug=True)