# Hugging Face Space, running on Zero (ZeroGPU)
import spaces
import torch
import gradio as gr
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import tempfile
import os

# Model configuration; this model was trained in part on synthetic data
MODEL_ID = "alakxender/whisper-small-dv-full"
BATCH_SIZE = 8
FILE_LIMIT_MB = 1000
CHUNK_LENGTH_S = 10
STRIDE_LENGTH_S = [3, 2]
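
# Note: FILE_LIMIT_MB is declared for reference but nothing below enforces it;
# recent Gradio versions can cap uploads via launch(max_file_size=...), which
# this app does not use.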

# Device and dtype setup
device = 0 if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
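# `device=0` is the first CUDA device; the pipeline API accepts an int GPU
# index or "cpu". float16 halves memory use on GPU, while CPU inference stays
# in float32 for numerical stability.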

# Initialize model with memory optimizations
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_ID,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True
)
model.to(device)

# Initialize processor
processor = AutoProcessor.from_pretrained(MODEL_ID)

# Single pipeline initialization with all components
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=CHUNK_LENGTH_S,
    stride_length_s=STRIDE_LENGTH_S,
    batch_size=BATCH_SIZE,
    torch_dtype=torch_dtype,
    device=device,
)
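
# How the chunked long-form decoding above works: the audio is split into
# CHUNK_LENGTH_S-second windows, and stride_length_s=[3, 2] gives each window
# 3 s of left and 2 s of right overlap with its neighbours. The overlap is
# used as context during decoding and trimmed when the chunk transcripts are
# stitched back together, so words are not cut at chunk boundaries.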

# Define generation arguments, tuned separately for short and long audio
def get_generate_kwargs(is_short_audio=False):
    """
    Get appropriate generation parameters based on audio length.
    Short audio transcription benefits from different parameters.
    """
    common_kwargs = {
        "max_new_tokens": model.config.max_target_positions - 4,
        "num_beams": 4,
        "condition_on_prev_tokens": False,
    }
    if is_short_audio:
        # Parameters optimized for short audio:
        return {
            **common_kwargs,
            "compression_ratio_threshold": 1.5,  # balanced setting to avoid repetition
            "no_speech_threshold": 0.4,          # higher threshold to reduce hallucinations
            "repetition_penalty": 1.5,           # penalize repeated tokens
            "return_timestamps": True,           # timestamps give better segmentation
        }
    else:
        # Parameters for longer audio:
        return {
            **common_kwargs,
            "compression_ratio_threshold": 1.35,  # standard setting for longer audio
            "repetition_penalty": 1.2,            # light penalty for repeated tokens
        }
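
# A sketch of how a caller could pick between the two parameter sets above.
# This helper is an assumption, not part of the original app: it reads the
# clip duration via soundfile (commonly available alongside Gradio's audio
# stack) and is never called by the UI below.
def get_generate_kwargs_for(path: str, short_threshold_s: float = 10.0):
    """Hypothetical helper: choose short/long-audio kwargs from duration."""
    import soundfile as sf  # assumption: soundfile can read the uploaded file
    duration_s = sf.info(path).duration  # clip length in seconds
    return get_generate_kwargs(is_short_audio=duration_s < short_threshold_s)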

# IMPORTANT: fix for the forced_decoder_ids error. Newer transformers
# releases reject generate() calls while the legacy forced_decoder_ids is
# still set, so clear it from both configs.
if hasattr(model.generation_config, 'forced_decoder_ids'):
    print("Removing forced_decoder_ids from generation config")
    model.generation_config.forced_decoder_ids = None

# Also check if it's in the model config
if hasattr(model.config, 'forced_decoder_ids'):
    print("Removing forced_decoder_ids from model config")
    delattr(model.config, 'forced_decoder_ids')
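
# Optional sanity check: after the cleanup above neither config should still
# carry forced_decoder_ids, so generate() will not hit the conflict error.
assert getattr(model.generation_config, "forced_decoder_ids", None) is None
assert getattr(model.config, "forced_decoder_ids", None) is None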

@spaces.GPU  # ZeroGPU: attach a GPU for the duration of this call (this is what the `spaces` import is for)
def transcribe(audio_input):
    if audio_input is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
    try:
        # Default (long-audio) generation parameters; pass is_short_audio=True
        # to get_generate_kwargs for clips where the short-audio set fits better
        result = pipe(
            audio_input,
            generate_kwargs=get_generate_kwargs()
        )
        return result["text"]
    except Exception as e:
        # Log the full exception before surfacing a friendlier error in the UI
        print(f"Detailed Error: {e}")
        raise gr.Error(f"Transcription failed: {str(e)}")

# Custom CSS for rendering Thaana script output (right-to-left, larger font)
custom_css = """
.thaana-textbox textarea {
    font-size: 18px !important;
    font-family: 'MV_Faseyha', 'Faruma', 'A_Faruma', 'Noto Sans Thaana', 'MV Boli' !important;
    line-height: 1.8 !important;
    direction: rtl !important;
}
"""

demo = gr.Blocks(css=custom_css)

file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources=["upload", "microphone"], type="filepath", label="Audio file"),
    ],
    outputs=gr.Textbox(
        label="",
        lines=2,
        elem_classes=["thaana-textbox"],
        rtl=True
    ),
    title="Transcribe Dhivehi Audio",
    description=(
        "Upload an audio file or record using your microphone to transcribe."
    ),
    flagging_mode="never",
    examples=[
        ["sample.mp3"]
    ],
    api_name=False,
    cache_examples=False
)

with demo:
    gr.TabbedInterface([file_transcribe], ["Audio file"])

demo.queue().launch()