import gradio as gr
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import WhisperModel, WhisperFeatureExtractor
import datasets
from datasets import load_dataset, DatasetDict, Audio
from huggingface_hub import PyTorchModelHubMixin
import numpy as np

# [Your existing code for device setup, config, SpeechInferenceDataset, SpeechClassifier]
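# --- Hedged sketch of the pieces elided above. The Space defines its own
# device setup, config, and SpeechClassifier; everything below is an
# assumption made only so this file runs standalone, and the head/layout
# may not match the actual whisper_cleft checkpoint.

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hypothetical config: "encoder" names the Whisper backbone checkpoint,
# "num_labels" is 2 for the binary hypernasality task.
config = {"encoder": "openai/whisper-base", "num_labels": 2}

class SpeechClassifier(nn.Module, PyTorchModelHubMixin):
    """Assumed structure: Whisper encoder-decoder backbone with a linear
    classification head over the mean-pooled decoder hidden states."""

    def __init__(self, config):
        super().__init__()
        self.backbone = WhisperModel.from_pretrained(config["encoder"])
        self.classifier = nn.Linear(self.backbone.config.d_model, config["num_labels"])

    def forward(self, input_features, decoder_input_ids):
        outputs = self.backbone(input_features=input_features,
                                decoder_input_ids=decoder_input_ids)
        pooled = outputs.last_hidden_state.mean(dim=1)  # [batch, d_model]
        return self.classifier(pooled)                  # [batch, num_labels]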

# Prepare data function
def prepare_data(audio_data, sampling_rate, model_checkpoint="openai/whisper-base"):
    # Whisper's feature extractor turns the raw waveform into log-Mel features.
    feature_extractor = WhisperFeatureExtractor.from_pretrained(model_checkpoint)
    inputs = feature_extractor(audio_data, sampling_rate=sampling_rate, return_tensors="pt")
    input_features = inputs.input_features
    # Fixed two-token dummy prompt fed to the decoder; classification does not
    # generate text, so a constant decoder input is sufficient.
    decoder_input_ids = torch.tensor([[1, 1]])
    return input_features.to(device), decoder_input_ids.to(device)
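
# For reference: the extractor pads/truncates to Whisper's fixed 30 s window,
# so with whisper-base input_features comes back as [1, 80, 3000]
# (batch, mel bins, frames) regardless of clip length.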

# Prediction function
def predict(audio_data, sampling_rate, config):
    input_features, decoder_input_ids = prepare_data(audio_data, sampling_rate, config["encoder"])

    # Note: the model is rebuilt and its weights fetched on every call.
    # torch.hub caches the download, but in a long-running app it is cheaper
    # to construct and load the model once at module level.
    model = SpeechClassifier(config).to(device)
    model.load_state_dict(torch.hub.load_state_dict_from_url(
        "https://huggingface.co/jcho02/whisper_cleft/resolve/main/pytorch_model.bin",
        map_location=device))
    model.eval()

    with torch.no_grad():
        logits = model(input_features, decoder_input_ids)
        predicted_class = int(torch.argmax(logits, dim=-1))
    return predicted_class
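
# Quick sanity check (hypothetical usage, assuming the sketched config above):
# one second of silence should run end-to-end and return a class index.
#   silence = np.zeros(16000, dtype=np.float32)
#   print(predict(silence, 16000, config))  # -> 0 or 1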

# Unified Gradio interface function
def gradio_interface(audio_input):
    if audio_input is None:  # nothing recorded or uploaded
        return "No audio provided"
    if isinstance(audio_input, tuple):
        # With type="numpy", Gradio passes (sample_rate, data), in that order.
        sample_rate, audio_data = audio_input
    else:
        # Fallback for a file path; np.frombuffer would misread the WAV header.
        import soundfile as sf
        audio_data, sample_rate = sf.read(audio_input)

    # The model expects mono float audio in [-1, 1]; Gradio delivers int16 PCM.
    if audio_data.dtype == np.int16:
        audio_data = audio_data.astype(np.float32) / 32768.0
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)

    # Whisper only accepts 16 kHz input; linearly resample anything else.
    if sample_rate != 16000:
        new_len = int(len(audio_data) * 16000 / sample_rate)
        positions = np.linspace(0, len(audio_data), new_len, endpoint=False)
        audio_data = np.interp(positions, np.arange(len(audio_data)), audio_data).astype(np.float32)
        sample_rate = 16000

    prediction = predict(audio_data, sample_rate, config)
    label = "Hypernasality Detected" if prediction == 1 else "No Hypernasality Detected"
    return label

# Create Gradio interface
demo = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Audio(type="numpy", label="Upload or Record Audio"),
    outputs=gr.Textbox(label="Prediction")
)

# Launch the demo
demo.launch(debug=True)