Spaces:

alakxender
/

asr-dhivehi-demo

Running on Zero

asr-dhivehi-demo / app.py

6fb3e63 about 2 months ago

3 kB

	import spaces
	import torch
	import gradio as gr
	from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
	import tempfile
	import os

	# Checkpoint and chunking configuration. Per the original author's note,
	# this model contains synthetic data.
	MODEL_ID = "alakxender/whisper-small-dv-full"
	BATCH_SIZE = 8
	FILE_LIMIT_MB = 1000  # not referenced anywhere in the visible code
	CHUNK_LENGTH_S = 30
	STRIDE_LENGTH_S = 5

	# Select device and precision together: device index 0 with float16 when
	# CUDA is reported as available, otherwise the CPU with float32.
	if torch.cuda.is_available():
	    device = 0
	    torch_dtype = torch.float16
	else:
	    device = "cpu"
	    torch_dtype = torch.float32

	# Load the speech seq2seq checkpoint in the selected precision, then move
	# it to the selected device.
	model = AutoModelForSpeechSeq2Seq.from_pretrained(
	    MODEL_ID,
	    torch_dtype=torch_dtype,
	    use_safetensors=True,
	    low_cpu_mem_usage=True,
	)
	model.to(device)

	# The processor provides the tokenizer and feature extractor that are
	# handed to the pipeline below.
	processor = AutoProcessor.from_pretrained(MODEL_ID)

	# Build the single module-level ASR pipeline from the pre-loaded parts.
	pipe = pipeline(
	    task="automatic-speech-recognition",
	    model=model,
	    tokenizer=processor.tokenizer,
	    feature_extractor=processor.feature_extractor,
	    # Chunking / batching settings come from the constants above.
	    chunk_length_s=CHUNK_LENGTH_S,
	    stride_length_s=STRIDE_LENGTH_S,
	    batch_size=BATCH_SIZE,
	    # Same precision and device as the model itself.
	    torch_dtype=torch_dtype,
	    device=device,
	)

	# Generation arguments passed along with every pipeline call.

	# Token budget: the model's max_target_positions minus 4
	# (presumably leaves room for decoder prompt tokens - TODO confirm).
	_max_new_tokens = model.config.max_target_positions - 4

	generate_kwargs = {
	    "max_new_tokens": _max_new_tokens,
	    "num_beams": 4,
	    "condition_on_prev_tokens": False,
	    "compression_ratio_threshold": 1.35,
	    # Options the author left disabled:
	    #   "temperature": (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
	    #   "logprob_threshold": -1.0,
	    #   "no_speech_threshold": 0.6,
	    #   "return_timestamps": True,
	}

	@spaces.GPU
	def transcribe(audio_input):
	    """Transcribe one audio input with the module-level ASR pipeline.

	    Args:
	        audio_input: Value delivered by the Gradio audio component (the UI
	            in this file configures it with ``type="filepath"``); ``None``
	            when nothing was uploaded or recorded.

	    Returns:
	        The ``"text"`` entry of the pipeline result.

	    Raises:
	        gr.Error: If no audio was submitted, or if the pipeline call (or
	            reading its result) fails. In the failure case the original
	            exception is attached as the explicit cause.
	    """
	    if audio_input is None:
	        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

	    try:
	        # Run the shared pipeline with the module-level generation settings.
	        result = pipe(audio_input, generate_kwargs=generate_kwargs)
	        return result["text"]
	    except Exception as e:
	        # UI boundary: log the failure, then surface it to the user as a
	        # Gradio error. `from e` keeps the root cause chained explicitly.
	        print(f"Detailed Error: {e}")
	        raise gr.Error(f"Transcription failed: {e}") from e

	# Custom stylesheet passed to gr.Blocks below. It targets the <textarea>
	# inside any element with the "thaana-textbox" class (the output Textbox
	# sets that class) and forces a larger font size, a list of Thaana-capable
	# font families, a taller line height and right-to-left text direction.
	custom_css = """
	.thaana-textbox textarea {
	font-size: 18px !important;
	font-family: 'MV_Faseyha', 'Faruma', 'A_Faruma', 'Noto Sans Thaana', 'MV Boli' !important;
	line-height: 1.8 !important;
	direction: rtl !important;
	}
	"""

	# Top-level container that carries the custom stylesheet.
	demo = gr.Blocks(css=custom_css)

	# Input component: audio from an upload or the microphone, handed to the
	# callback as a file path.
	_audio_component = gr.Audio(
	    sources=["upload", "microphone"],
	    type="filepath",
	    label="Audio file",
	)

	# Output component: unlabeled two-line textbox, right-to-left, tagged with
	# the CSS class styled by custom_css.
	_transcript_component = gr.Textbox(
	    label="",
	    lines=2,
	    elem_classes=["thaana-textbox"],
	    rtl=True,
	)

	# Interface wiring the transcribe callback to the two components above.
	file_transcribe = gr.Interface(
	    fn=transcribe,
	    inputs=[_audio_component],
	    outputs=_transcript_component,
	    title="Transcribe Dhivehi Audio",
	    description="Upload an audio file or record using your microphone to transcribe.",
	    allow_flagging="never",
	    examples=[["sample.mp3"]],
	)

	# Place the interface in a single-tab layout inside the styled container.
	with demo:
	    gr.TabbedInterface([file_transcribe], ["Audio file"])

	# Enable the request queue and start serving.
	demo.queue().launch()