subhannadeem1 committed on
Commit d26e658 · 1 Parent(s): a0f51ae

Create app.py

Files changed (1)
  1. app.py +107 -0
app.py ADDED
@@ -0,0 +1,107 @@
+ import os
+ os.system("pip install git+https://github.com/openai/whisper.git")
+ import gradio as gr
+ import whisper
+ from huggingface_hub import from_pretrained_keras
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+ from transformers import pipeline
+ from sklearn.preprocessing import StandardScaler
+ import logging
+ import librosa
+ import numpy as np
+ import pickle
+
+
+ # load the tokenizer and NLP model for text sentiment classification
+ tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
+ model_nlp = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
+
+ # load the Whisper model for audio/speech transcription
+ model = whisper.load_model("small")
+
+ # load the Keras model for audio emotion classification
+ reloaded_model = from_pretrained_keras('jmparejaz/RAVDESS-CREMAD_AudioEmotionClassifier')
+
+ # load the feature scaler and label encoder
+ with open("scaler.pkl", "rb") as f:
+     scaler = pickle.load(f)
+
+ with open("encoder.pkl", "rb") as f:
+     encoder = pickle.load(f)
+
+
+ def inference_audio(audio):
+     # load the audio file and pad/trim it to Whisper's 30-second window
+     audio = whisper.load_audio(audio)
+     audio = whisper.pad_or_trim(audio)
+
+     # compute the log-Mel spectrogram and move it to the model's device
+     mel = whisper.log_mel_spectrogram(audio).to(model.device)
+
+     # detect the spoken language (the probabilities are not used further)
+     _, probs = model.detect_language(mel)
+
+     # decode the audio to text
+     options = whisper.DecodingOptions(fp16=False)
+     result = whisper.decode(model, mel, options)
+
+     return result.text
+
+ def inference_text(audio):
+     # transcribe the audio, then classify the sentiment of the transcript
+     text = inference_audio(audio)
+
+     sentiment_task = pipeline("sentiment-analysis", model=model_nlp, tokenizer=tokenizer)
+     res = sentiment_task(text)[0]
+
+     return text, res['label'], res['score']
+
+
+ def extract_features(data, sample_rate):
+     # ZCR
+     result = np.array([])
+     zcr = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0)
+     result = np.hstack((result, zcr))  # stacking horizontally
+
+     # Chroma_stft
+     stft = np.abs(librosa.stft(data))
+     chroma_stft = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
+     result = np.hstack((result, chroma_stft))  # stacking horizontally
+
+     # MFCC
+     mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sample_rate).T, axis=0)
+     result = np.hstack((result, mfcc))  # stacking horizontally
+
+     # Root Mean Square Value
+     rms = np.mean(librosa.feature.rms(y=data).T, axis=0)
+     result = np.hstack((result, rms))  # stacking horizontally
+
+     # MelSpectrogram
+     mel = np.mean(librosa.feature.melspectrogram(y=data, sr=sample_rate).T, axis=0)
+     result = np.hstack((result, mel))  # stacking horizontally
+
+     return result
+
+ """
+ def audio_emotions(audio):
+     sr, data = audio
+     features_audio = extract_features(data, sr)
+     features_audio = np.array(features_audio).reshape(1, -1)
+     scaled_features = scaler.transform(features_audio)
+     scaled_features = np.expand_dims(scaled_features, axis=2)
+     prediction = reloaded_model.predict(scaled_features)
+     y_pred = encoder.inverse_transform(prediction)
+     return y_pred
+ """
+ def main(audio):
+     r1, r2, r3 = inference_text(audio)
+     # r3 = audio_emotions(audio)
+     return r1, r2, r3
+
+
+ audio = gr.Audio(
+     label="Input Audio",
+     show_label=False,
+     source="microphone",
+     type="filepath"
+ )
+
+
+ app = gr.Interface(title="Sentiment Audio Analysis", fn=main, inputs=audio, outputs=["text", "text", "text"]).launch(debug=True)