import torch
import whisper
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the T5-based spell-checker model and its tokenizer
tokenizer = AutoTokenizer.from_pretrained("Bhuvana/t5-base-spellchecker")
model = AutoModelForSeq2SeqLM.from_pretrained("Bhuvana/t5-base-spellchecker")


def correct(inputs):
    # Encode the raw transcript and sample a corrected sequence from the spell-checker
    input_ids = tokenizer.encode(inputs, return_tensors='pt')
    sample_output = model.generate(
        input_ids,
        do_sample=True,
        max_length=50,
        top_p=0.99,
        num_return_sequences=1
    )
    res = tokenizer.decode(sample_output[0], skip_special_tokens=True)
    return res


# Load the Whisper speech-to-text model
whisper_model = whisper.load_model("base")


def transcribe(audio_file):
    # Load audio and pad/trim it to fit 30 seconds
    audio = whisper.load_audio(audio_file)
    audio = whisper.pad_or_trim(audio)

    # Convert audio data to a PyTorch tensor with float dtype
    audio = torch.from_numpy(audio).float()

    # Make a log-Mel spectrogram and move it to the same device as the Whisper model
    mel = whisper.log_mel_spectrogram(audio).to(whisper_model.device)

    # Detect the spoken language
    _, probs = whisper_model.detect_language(mel)

    # Decode the audio
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(whisper_model, mel, options)
    result_text = result.text
    print('result_text: ' + result_text)

    # Run the raw transcript through the spell-checker before returning it
    return correct(result_text)
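

# A minimal usage sketch, not part of the pipeline above: "sample.wav" is a
# placeholder path to a local audio file you would supply yourself.
# transcribe() returns the Whisper transcript after it has been passed through correct().
if __name__ == "__main__":
    corrected_text = transcribe("sample.wav")
    print("corrected:", corrected_text)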