|
import os
import subprocess
import sys


def _pip_install(requirement):
    """Install one pip requirement into the interpreter running this script.

    Args:
        requirement: A pip requirement specifier such as ``"numpy==1.23"``.

    Returns:
        The exit status of the pip process (0 on success).
    """
    # `sys.executable -m pip` targets the environment of this interpreter
    # rather than whichever `pip` happens to be first on PATH, and passing an
    # argument list bypasses the shell so "datasets[audio]" is never treated
    # as a glob pattern.
    # check=False keeps the install best-effort: a failed install is not
    # raised here and shows up as an ImportError below if the package is
    # really missing.
    completed = subprocess.run(
        [sys.executable, "-m", "pip", "install", requirement],
        check=False,
    )
    return completed.returncode


# Runtime installs have to finish before the third-party imports below, so
# the version pins are in place before anything imports numpy/transformers.
# NOTE(review): `evaluate` is imported below but never installed here --
# presumably it is provided by the environment; confirm.
for _requirement in (
    "transformers==4.27.0",
    "numpy==1.23",
    "jiwer",
    "datasets[audio]",
):
    _pip_install(_requirement)

import gradio as gr
import torch
from datasets import Audio, disable_caching, load_dataset, set_caching_enabled
from evaluate import evaluator, load
from jiwer import wer
from transformers import (
    AutoFeatureExtractor,
    AutoModelForSequenceClassification,
    AutoProcessor,
    AutoTokenizer,
    BertTokenizer,
    GPT2Model,
    WhisperConfig,
    WhisperFeatureExtractor,
    WhisperForConditionalGeneration,
    WhisperModel,
    WhisperProcessor,
    WhisperTokenizer,
    pipeline,
)
|
|
|
|
|
# Single source of truth for the checkpoint so the processor and the model
# are always loaded from the same repository.
MODEL_ID = "mskov/whisper-small-esc50"

# Prefer the GPU, but fall back to the CPU so the script still starts on a
# machine without CUDA instead of failing inside `.to("cuda")`.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

processor = WhisperProcessor.from_pretrained(MODEL_ID)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID).to(DEVICE)
|
|
|
def map_to_pred(batch):
    """Transcribe one dataset example and attach normalized WER inputs.

    Args:
        batch: A single example with an ``"audio"`` entry whose ``"array"``
            holds the waveform, and a ``"category"`` entry holding the
            reference text.

    Returns:
        The same ``batch`` object with two keys added: ``"reference"`` (the
        normalized ``category``) and ``"prediction"`` (the normalized model
        output).
    """
    audio = batch["audio"]
    # NOTE(review): the rate is hard-coded to 16 kHz rather than read from
    # audio["sampling_rate"]; this assumes the dataset was already resampled
    # to 16 kHz -- confirm against the dataset loading code.
    input_features = processor(
        audio["array"], sampling_rate=16000, return_tensors="pt"
    ).input_features
    batch["reference"] = processor.tokenizer._normalize(batch["category"])

    # Inference only, so autograd is disabled.  The inputs follow the model
    # onto whatever device it lives on instead of assuming "cuda".
    with torch.no_grad():
        predicted_ids = model.generate(input_features.to(model.device))[0]

    transcription = processor.decode(predicted_ids)
    batch["prediction"] = processor.tokenizer._normalize(transcription)
    print(batch["prediction"])
    return batch
|
|
|
# NOTE(review): `librispeech_test_clean` is not defined anywhere in the
# visible file, so this line raises NameError as written.  It has to be bound
# to a dataset exposing the "audio" and "category" columns that map_to_pred
# reads -- confirm which dataset was intended and load it before this point.
result = librispeech_test_clean.map(map_to_pred)

# Bound to `wer_metric` instead of `wer` so the loaded metric no longer
# shadows the `wer` function imported from jiwer at the top of the file.
wer_metric = load("wer")
error_rate = wer_metric.compute(
    references=result["reference"],
    predictions=result["prediction"],
)
# Scaled by 100 so the score is printed as a percentage.
print(100 * error_rate)
|
|
|
|
|
|
|
# Speech-recognition pipeline, created on the first transcription request.
_asr_pipeline = None


def _get_asr_pipeline():
    """Return the shared ASR pipeline, building it on first use."""
    global _asr_pipeline
    if _asr_pipeline is None:
        # Reuse the model and processor already loaded by this script rather
        # than downloading a second copy of the checkpoint.
        _asr_pipeline = pipeline(
            "automatic-speech-recognition",
            model=model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            device=model.device,
        )
    return _asr_pipeline


def transcribe(audio):
    """Transcribe a recorded audio file to text.

    Args:
        audio: Path to the audio file to transcribe, or ``None`` when
            nothing was recorded.

    Returns:
        The transcription as a single string; an empty string when ``audio``
        is ``None``.
    """
    if audio is None:
        # Nothing was recorded, so there is nothing to send to the model.
        return ""
    # Exactly one value is returned because the interface declares a single
    # "text" output.
    return _get_asr_pipeline()(audio)["text"]
|
|
|
# Web demo: audio is captured from the microphone and passed to `transcribe`
# (type="filepath"), and the value it returns is shown in a text output.
microphone_input = gr.Audio(source="microphone", type="filepath")

iface = gr.Interface(
    title="Whisper Small ESC50 Test",
    fn=transcribe,
    inputs=microphone_input,
    outputs="text",
)
iface.launch()
|
|
|
|
|
# Disabled debugging snippet, kept for reference only.  It sits inside a
# string literal, so it is never executed, and it relies on an `inputs`
# object that is not defined in the visible part of this file.
'''
print("check check")
print(inputs)
input_features = inputs.input_features
decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
last_hidden_state = model(input_features, decoder_input_ids=decoder_input_ids).last_hidden_state
list(last_hidden_state.shape)
print(list(last_hidden_state.shape))
'''