import logging

import gradio as gr
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from whisperplus.utils.download_utils import download_and_convert_to_mp3

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
class SpeechToTextPipeline:
    """Class for converting audio to text using a pre-trained speech recognition model."""

    def __init__(self, model_id: str = "openai/whisper-large-v3"):
        self.model = None
        # Use the GPU when available; the model is loaded in float16, which expects CUDA.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        if self.model is None:
            self.load_model(model_id)
        else:
            logging.info("Model already loaded.")
    def load_model(self, model_id: str = "openai/whisper-large-v3"):
        """
        Loads the pre-trained speech recognition model and moves it to the specified device.

        Args:
            model_id (str): Identifier of the pre-trained model to be loaded.
        """
        logging.info("Loading model...")
        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True, use_safetensors=True)
        model.to(self.device)
        logging.info("Model loaded successfully.")
        self.model = model
    def __call__(self, audio_path: str, model_id: str = "openai/whisper-large-v3", language: str = "turkish"):
        """
        Converts audio to text using the pre-trained speech recognition model.

        Args:
            audio_path (str): Path to the audio file to be transcribed.
            model_id (str): Identifier of the pre-trained model to be used for transcription.
            language (str): Language spoken in the audio, forwarded to Whisper's generation step.

        Returns:
            str: Transcribed text from the audio.
        """
        processor = AutoProcessor.from_pretrained(model_id)
        pipe = pipeline(
            "automatic-speech-recognition",
            model=self.model,
            torch_dtype=torch.float16,
            chunk_length_s=30,
            max_new_tokens=128,
            batch_size=24,
            return_timestamps=True,
            device=self.device,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            model_kwargs={"use_flash_attention_2": True},
            # Whisper expects lowercase language names, e.g. "turkish".
            generate_kwargs={"language": language.lower()},
        )
        logging.info("Transcribing audio...")
        result = pipe(audio_path)["text"]
        return result
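
# A minimal usage sketch of the pipeline class above, assuming a CUDA-capable GPU
# and a local audio file named "sample.mp3" (both hypothetical):
#
#   stt = SpeechToTextPipeline("openai/whisper-large-v3")
#   text = stt(audio_path="sample.mp3", model_id="openai/whisper-large-v3", language="english")
#   print(text)
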
def youtube_url_to_text(url, model_id, language_choice):
    """
    Downloads a video, converts it to MP3, performs speech-to-text conversion using the
    specified model, and returns the transcript along with the path of the extracted audio.

    Args:
        url (str): The URL of the video to download and convert.
        model_id (str): The ID of the speech-to-text model to use.
        language_choice (str): The language choice for the speech-to-text conversion.

    Returns:
        transcript (str): The transcript of the speech-to-text conversion.
        audio_path (str): The path of the extracted MP3 file.
    """
    audio_path = download_and_convert_to_mp3(url)
    stt_pipeline = SpeechToTextPipeline(model_id)
    transcript = stt_pipeline(audio_path=audio_path, model_id=model_id, language=language_choice)
    return transcript, audio_path
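
# Hedged example of calling the helper directly, outside the Gradio UI; the URL below is a
# placeholder and a GPU is assumed to be available:
#
#   transcript, audio_path = youtube_url_to_text(
#       "https://www.youtube.com/watch?v=...", "openai/whisper-large-v3", "English")
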
def youtube_url_to_text_app():
    with gr.Blocks():
        with gr.Row():
            with gr.Column():
                youtube_url_path = gr.Text(placeholder="Enter YouTube URL", label="YouTube URL")

                language_choice = gr.Dropdown(
                    choices=[
                        "English",
                        "Turkish",
                        "Spanish",
                        "French",
                        "Chinese",
                        "Japanese",
                        "Korean",
                    ],
                    value="Turkish",
                    label="Language",
                )
                whisper_model_id = gr.Dropdown(
                    choices=[
                        "openai/whisper-large-v3",
                        "openai/whisper-large",
                        "openai/whisper-medium",
                        "openai/whisper-base",
                        "openai/whisper-small",
                        "openai/whisper-tiny",
                    ],
                    value="openai/whisper-large-v3",
                    label="Whisper Model",
                )
                whisperplus_in_predict = gr.Button(value="Generate")

            with gr.Column():
                output_text = gr.Textbox(label="Output Text")
                output_audio = gr.Audio(label="Output Audio")

        whisperplus_in_predict.click(
            fn=youtube_url_to_text,
            inputs=[
                youtube_url_path,
                whisper_model_id,
                language_choice,
            ],
            outputs=[output_text, output_audio],
        )
gradio_app = gr.Blocks()
with gradio_app:
    gr.HTML(
        """
        <h1 style='text-align: center'>
        WhisperPlus: Advancing Speech-to-Text Processing 🚀
        </h1>
        """)
    gr.HTML(
        """
        <h3 style='text-align: center'>
        Follow me for more!
        <a href='https://twitter.com/kadirnar_ai' target='_blank'>Twitter</a> | <a href='https://github.com/kadirnar' target='_blank'>Github</a> | <a href='https://www.linkedin.com/in/kadir-nar/' target='_blank'>Linkedin</a> | <a href='https://www.huggingface.co/kadirnar/' target='_blank'>HuggingFace</a>
        </h3>
        """)
    with gr.Row():
        with gr.Column():
            with gr.Tab(label="YouTube URL to Text"):
                youtube_url_to_text_app()

gradio_app.queue()
gradio_app.launch(debug=True)
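
# This file is structured as a Hugging Face Spaces entry point (app.py), so the app launches
# at import time. When running locally, gr.Blocks.launch() also accepts share=True to create
# a temporary public link (assumption: a reasonably recent Gradio release).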