import io
from typing import Any, Dict

from faster_whisper import WhisperModel


class EndpointHandler:
    """Endpoint handler that transcribes audio with a faster-whisper model."""

    def __init__(self, model_dir=None):
        """Load the Whisper model on CPU.

        Args:
            model_dir: Model directory or model size name forwarded to
                ``WhisperModel``. When ``None`` or empty, ``"large-v2"``
                is used instead.
        """
        # Treat an empty value (e.g. "") like "not provided" so the handler
        # never forwards a blank model name to WhisperModel.
        model_size = model_dir if model_dir else "large-v2"
        self.model = WhisperModel(model_size, device="cpu")

    def __call__(self, data: Dict) -> Dict[str, Any]:
        """Transcribe the audio found under ``data["inputs"]``.

        Args:
            data: Request payload. ``data["inputs"]`` holds the audio, either
                as the raw bytes of an audio file or as any other value that
                ``WhisperModel.transcribe`` accepts directly.

        Returns:
            A dict with three keys: ``"text"`` (the segment texts joined with
            single spaces), ``"language"`` (``info.language``) and
            ``"language_probability"`` (``info.language_probability``).

        Raises:
            KeyError: If ``data`` has no ``"inputs"`` key.
        """
        audio = data["inputs"]

        # transcribe() is documented to take a path, a file-like object or a
        # waveform array, not raw bytes, so expose byte payloads through an
        # in-memory file object. Any other input type is passed through as-is.
        if isinstance(audio, (bytes, bytearray, memoryview)):
            audio = io.BytesIO(audio)

        segments, info = self.model.transcribe(audio)

        # Combine every segment's text into one transcript string.
        text = " ".join(segment.text for segment in segments)

        return {
            "text": text,
            "language": info.language,
            "language_probability": info.language_probability,
        }