import logging

import gradio as gr
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from whisperplus.utils.download_utils import download_and_convert_to_mp3

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
class SpeechToTextPipeline:
    """Class for converting audio to text using a pre-trained speech recognition model."""

    def __init__(self, model_id: str = "openai/whisper-large-v3"):
        self.model = None
        # Use the GPU when available; the model is loaded in float16, which expects CUDA.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        if self.model is None:
            self.load_model(model_id)
        else:
            logging.info("Model already loaded.")
    def load_model(self, model_id: str = "openai/whisper-large-v3"):
        """
        Loads the pre-trained speech recognition model and moves it to the specified device.

        Args:
            model_id (str): Identifier of the pre-trained model to be loaded.
        """
        logging.info("Loading model...")
        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True, use_safetensors=True)
        model.to(self.device)
        logging.info("Model loaded successfully.")
        self.model = model
    def __call__(self, audio_path: str, model_id: str = "openai/whisper-large-v3", language: str = "turkish"):
        """
        Converts audio to text using the pre-trained speech recognition model.

        Args:
            audio_path (str): Path to the audio file to be transcribed.
            model_id (str): Identifier of the pre-trained model to be used for transcription.
            language (str): Language spoken in the audio, forwarded to Whisper's generation step.

        Returns:
            str: Transcribed text from the audio.
        """
        processor = AutoProcessor.from_pretrained(model_id)
        pipe = pipeline(
            "automatic-speech-recognition",
            model=self.model,
            torch_dtype=torch.float16,
            chunk_length_s=30,
            max_new_tokens=128,
            batch_size=24,
            return_timestamps=True,
            device=self.device,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            model_kwargs={"use_flash_attention_2": True},
            # Whisper expects lowercase language names, e.g. "turkish".
            generate_kwargs={"language": language.lower()},
        )
        logging.info("Transcribing audio...")
        result = pipe(audio_path)["text"]
        return result
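
# A minimal usage sketch of the pipeline class above, assuming a CUDA-capable GPU
# and a local audio file named "sample.mp3" (both hypothetical):
#
#   stt = SpeechToTextPipeline("openai/whisper-large-v3")
#   text = stt(audio_path="sample.mp3", model_id="openai/whisper-large-v3", language="english")
#   print(text)
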
def youtube_url_to_text(url, model_id, language_choice):
    """
    Downloads a video, converts it to MP3, performs speech-to-text conversion using the
    specified model, and returns the transcript along with the path of the extracted audio.

    Args:
        url (str): The URL of the video to download and convert.
        model_id (str): The ID of the speech-to-text model to use.
        language_choice (str): The language choice for the speech-to-text conversion.

    Returns:
        transcript (str): The transcript of the speech-to-text conversion.
        audio_path (str): The path of the extracted MP3 file.
    """
    audio_path = download_and_convert_to_mp3(url)
    stt_pipeline = SpeechToTextPipeline(model_id)
    transcript = stt_pipeline(audio_path=audio_path, model_id=model_id, language=language_choice)
    return transcript, audio_path
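
# Hedged example of calling the helper directly, outside the Gradio UI; the URL below is a
# placeholder and a GPU is assumed to be available:
#
#   transcript, audio_path = youtube_url_to_text(
#       "https://www.youtube.com/watch?v=...", "openai/whisper-large-v3", "English")
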
def youtube_url_to_text_app():
    with gr.Blocks():
        with gr.Row():
            with gr.Column():
                youtube_url_path = gr.Text(placeholder="Enter YouTube URL", label="YouTube URL")

                language_choice = gr.Dropdown(
                    choices=[
                        "English",
                        "Turkish",
                        "Spanish",
                        "French",
                        "Chinese",
                        "Japanese",
                        "Korean",
                    ],
                    value="Turkish",
                    label="Language",
                )
                whisper_model_id = gr.Dropdown(
                    choices=[
                        "openai/whisper-large-v3",
                        "openai/whisper-large",
                        "openai/whisper-medium",
                        "openai/whisper-base",
                        "openai/whisper-small",
                        "openai/whisper-tiny",
                    ],
                    value="openai/whisper-large-v3",
                    label="Whisper Model",
                )
                whisperplus_in_predict = gr.Button(value="Generate")

            with gr.Column():
                output_text = gr.Textbox(label="Output Text")
                output_audio = gr.Audio(label="Output Audio")

        whisperplus_in_predict.click(
            fn=youtube_url_to_text,
            inputs=[
                youtube_url_path,
                whisper_model_id,
                language_choice,
            ],
            outputs=[output_text, output_audio],
        )
gradio_app = gr.Blocks()
with gradio_app:
    gr.HTML(
        """
        <h1 style='text-align: center'>
        WhisperPlus: Advancing Speech-to-Text Processing 🚀
        </h1>
        """)
    gr.HTML(
        """
        <h3 style='text-align: center'>
        Follow me for more!
        <a href='https://twitter.com/kadirnar_ai' target='_blank'>Twitter</a> | <a href='https://github.com/kadirnar' target='_blank'>Github</a> | <a href='https://www.linkedin.com/in/kadir-nar/' target='_blank'>Linkedin</a> | <a href='https://www.huggingface.co/kadirnar/' target='_blank'>HuggingFace</a>
        </h3>
        """)
    with gr.Row():
        with gr.Column():
            with gr.Tab(label="YouTube URL to Text"):
                youtube_url_to_text_app()

gradio_app.queue()
gradio_app.launch(debug=True)
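
# This file is structured as a Hugging Face Spaces entry point (app.py), so the app launches
# at import time. When running locally, gr.Blocks.launch() also accepts share=True to create
# a temporary public link (assumption: a reasonably recent Gradio release).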