Spaces:

alakxender
/

asr-dhivehi-demo

Running on Zero

asr-dhivehi-demo / app.py

d3ed528 about 1 month ago

4.39 kB

	import spaces
	import torch
	import gradio as gr
	from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
	import tempfile
	import os

	# Model configuration. Author's note: this model contains synthetic data.
	MODEL_ID = "alakxender/whisper-small-dv-full"

	# Forwarded to the ASR pipeline as ``batch_size``.
	BATCH_SIZE = 8

	# Not referenced anywhere in the visible part of this file.
	FILE_LIMIT_MB = 1000

	# Forwarded to the ASR pipeline as ``chunk_length_s``.
	CHUNK_LENGTH_S = 10

	# Forwarded to the ASR pipeline as ``stride_length_s``.
	# NOTE(review): presumably (left, right) overlap in seconds - confirm
	# against the pipeline documentation.
	STRIDE_LENGTH_S = [3, 2]

	# Pick the compute device and the matching tensor dtype in one place:
	# CUDA device 0 with half precision when a GPU is visible, otherwise the
	# CPU with full precision.
	if torch.cuda.is_available():
	    device = 0
	    torch_dtype = torch.float16
	else:
	    device = "cpu"
	    torch_dtype = torch.float32

	# Load the speech seq2seq checkpoint named by MODEL_ID using the
	# memory-conscious loading options, in the dtype chosen for this device.
	model = AutoModelForSpeechSeq2Seq.from_pretrained(
	    MODEL_ID,
	    use_safetensors=True,
	    low_cpu_mem_usage=True,
	    torch_dtype=torch_dtype,
	)
	# Place the loaded weights on the selected device.
	model.to(device)

	# The processor bundles the tokenizer and the feature extractor that are
	# handed to the pipeline below.
	processor = AutoProcessor.from_pretrained(MODEL_ID)

	# One shared ASR pipeline, built once at import time from the already
	# loaded model plus the chunking/batching settings defined at the top of
	# the file.
	pipe = pipeline(
	    task="automatic-speech-recognition",
	    # Components
	    model=model,
	    tokenizer=processor.tokenizer,
	    feature_extractor=processor.feature_extractor,
	    # Placement and precision
	    device=device,
	    torch_dtype=torch_dtype,
	    # Chunked inference settings
	    chunk_length_s=CHUNK_LENGTH_S,
	    stride_length_s=STRIDE_LENGTH_S,
	    batch_size=BATCH_SIZE,
	)

	# Generation arguments passed to the pipeline on each call
	def get_generate_kwargs(is_short_audio=False):
	    """Build the ``generate_kwargs`` dict for a transcription call.

	    Args:
	        is_short_audio: When truthy, return the stricter settings meant
	            for short clips; otherwise return the settings for longer
	            audio.

	    Returns:
	        A new dict holding the shared settings (``max_new_tokens``,
	        ``num_beams``, ``condition_on_prev_tokens``) plus the
	        length-specific thresholds and repetition penalty.
	    """
	    kwargs = {
	        # NOTE(review): the "- 4" presumably leaves room for the decoder
	        # prompt tokens - confirm.
	        "max_new_tokens": model.config.max_target_positions - 4,
	        "num_beams": 4,
	        "condition_on_prev_tokens": False,
	    }

	    if not is_short_audio:
	        # Longer audio: standard compression ratio, light repetition penalty.
	        kwargs.update(
	            compression_ratio_threshold=1.35,
	            repetition_penalty=1.2,
	        )
	        return kwargs

	    # Short audio: tighter compression ratio to avoid repetition, a
	    # no-speech threshold to reduce hallucinations, a stronger repetition
	    # penalty, and timestamps for better segmentation.
	    kwargs.update(
	        compression_ratio_threshold=1.5,
	        no_speech_threshold=0.4,
	        repetition_penalty=1.5,
	        return_timestamps=True,
	    )
	    return kwargs

	# IMPORTANT: workaround for the forced_decoder_ids error.
	# On the generation config the attribute is kept but reset to None.
	if hasattr(model.generation_config, "forced_decoder_ids"):
	    print("Removing forced_decoder_ids from generation config")
	    setattr(model.generation_config, "forced_decoder_ids", None)

	# The model config may carry the attribute as well; there it is deleted
	# outright rather than set to None.
	if hasattr(model.config, "forced_decoder_ids"):
	    print("Removing forced_decoder_ids from model config")
	    del model.config.forced_decoder_ids

	@spaces.GPU
	def transcribe(audio_input):
	    """Transcribe submitted audio with the module-level ASR pipeline.

	    Args:
	        audio_input: Value delivered by the Gradio audio input; it is
	            passed unchanged to ``pipe``. ``None`` means nothing was
	            uploaded or recorded.

	    Returns:
	        The ``"text"`` entry of the pipeline result.

	    Raises:
	        gr.Error: If no audio was submitted, or if transcription fails.
	            In the failure case the underlying exception is attached as
	            ``__cause__``.
	    """
	    if audio_input is None:
	        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

	    try:
	        # get_generate_kwargs() is called with its default, i.e. the
	        # settings for longer audio.
	        result = pipe(
	            audio_input,
	            generate_kwargs=get_generate_kwargs(),
	        )
	        return result["text"]
	    except Exception as e:
	        # UI boundary: log the failure, then surface it to the user as a
	        # Gradio error while explicitly chaining the original exception so
	        # the root cause stays visible in the traceback.
	        print(f"Detailed Error: {e}")
	        raise gr.Error(f"Transcription failed: {e}") from e

	# Custom CSS injected into the Gradio app. It targets the <textarea> of any
	# component tagged with the "thaana-textbox" class and forces a larger font
	# size, a font stack that includes 'Noto Sans Thaana', a taller line height,
	# and right-to-left text direction.
	custom_css = """
	.thaana-textbox textarea {
	font-size: 18px !important;
	font-family: 'MV_Faseyha', 'Faruma', 'A_Faruma', 'Noto Sans Thaana', 'MV Boli' !important;
	line-height: 1.8 !important;
	direction: rtl !important;
	}
	"""

	# Outer container for the app; it carries the custom CSS.
	demo = gr.Blocks(css=custom_css)

	# Input widget: upload or microphone recording, delivered to the callback
	# as a file path.
	_audio_input = gr.Audio(
	    label="Audio file",
	    type="filepath",
	    sources=["upload", "microphone"],
	)

	# Output widget: unlabeled right-to-left textbox styled by the
	# "thaana-textbox" CSS class.
	_transcript_box = gr.Textbox(
	    rtl=True,
	    elem_classes=["thaana-textbox"],
	    lines=2,
	    label="",
	)

	# Single-function interface wrapping transcribe(); flagging, the API
	# endpoint name, and example caching are all turned off.
	file_transcribe = gr.Interface(
	    fn=transcribe,
	    inputs=[_audio_input],
	    outputs=_transcript_box,
	    title="Transcribe Dhivehi Audio",
	    description="Upload an audio file or record using your microphone to transcribe.",
	    examples=[["sample.mp3"]],
	    cache_examples=False,
	    flagging_mode="never",
	    api_name=False,
	)

	# Mount the interface as the only tab inside the outer container.
	with demo:
	    gr.TabbedInterface([file_transcribe], ["Audio file"])

	# Enable request queuing and start the server.
	demo.queue().launch()