Boltz79 committed
Commit def04d4 · verified · 1 Parent(s): 786ea23

Update app.py

Files changed (1):
  app.py +102 -45
app.py CHANGED
@@ -1,53 +1,110 @@
 import gradio as gr
+import numpy as np
+import torch
 from transformers import pipeline
+import librosa
 
-# Load Whisper for speech-to-text
-whisper = pipeline("automatic-speech-recognition", model="openai/whisper-medium")
+class EmotionRecognizer:
+    def __init__(self):
+        self.classifier = pipeline(
+            "audio-classification",
+            model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition",
+            device=0 if torch.cuda.is_available() else -1
+        )
+        self.sample_rate = 16000
 
-# Load a sentiment analysis model
-sentiment_analyzer = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment")
+    def process_audio(self, audio_input):
+        try:
+            # Extract audio data and sample rate from gradio input
+            sample_rate, audio_data = audio_input
+
+            # Convert stereo to mono if necessary
+            if len(audio_data.shape) > 1:
+                audio_data = np.mean(audio_data, axis=1)
+
+            # Convert to float32 and normalize
+            audio_data = audio_data.astype(np.float32)
+            if np.max(np.abs(audio_data)) > 1.0:
+                audio_data = audio_data / np.max(np.abs(audio_data))
+
+            # Resample if necessary
+            if sample_rate != self.sample_rate:
+                audio_data = librosa.resample(
+                    y=audio_data,
+                    orig_sr=sample_rate,
+                    target_sr=self.sample_rate
+                )
+
+            # Ensure the audio isn't too short
+            if len(audio_data) < self.sample_rate:
+                # Pad audio if it's too short
+                audio_data = np.pad(audio_data, (0, self.sample_rate - len(audio_data)))
+            elif len(audio_data) > 10 * self.sample_rate:
+                # Take first 10 seconds if audio is too long
+                audio_data = audio_data[:10 * self.sample_rate]
+
+            # Make prediction
+            result = self.classifier({"array": audio_data, "sampling_rate": self.sample_rate})
+
+            # Format results
+            emotions_text = "\n".join([
+                f"{pred['label']}: {pred['score']*100:.2f}%"
+                for pred in result
+            ])
+
+            # Prepare plot data
+            plot_data = {
+                "labels": [pred['label'] for pred in result],
+                "values": [pred['score'] * 100 for pred in result]
+            }
+
+            return emotions_text, plot_data
+
+        except Exception as e:
+            print(f"Error details: {str(e)}")
+            return f"Error processing audio: {str(e)}", None
 
-# Function to process audio and analyze tone
-def analyze_call(audio_file):
-    try:
-        # Step 1: Transcribe audio to text using Whisper
-        transcription = whisper(audio_file)["text"]
+def create_interface():
+    recognizer = EmotionRecognizer()
+
+    def process_audio_file(audio):
+        if audio is None:
+            return "Please provide an audio input.", None
+        return recognizer.process_audio(audio)
+
+    with gr.Blocks() as interface:
+        gr.Markdown("# Audio Emotion Recognition")
+        gr.Markdown("Record or upload audio to analyze the emotional content. The model works best with clear speech in English.")
 
-        # Step 2: Analyze sentiment of the transcription
-        sentiment_result = sentiment_analyzer(transcription)[0]
+        with gr.Row():
+            with gr.Column():
+                audio_input = gr.Audio(
+                    label="Upload or Record Audio",
+                    type="numpy",
+                    sources=["microphone", "upload"],
+                )
+                analyze_btn = gr.Button("Analyze Emotion")
+                gr.Markdown("Note: Audio will be automatically converted to mono and resampled if needed.")
+
+            with gr.Column():
+                output_text = gr.Textbox(
+                    label="Results",
+                    lines=5
+                )
+                output_plot = gr.BarPlot(
+                    title="Emotion Confidence Scores",
+                    x_title="Emotions",
+                    y_title="Confidence (%)"
+                )
 
-        # Prepare the output
-        output = {
-            "transcription": transcription,
-            "sentiment": sentiment_result["label"],
-            "confidence": round(sentiment_result["score"], 4)
-        }
-        return output
-    except Exception as e:
-        return {"error": str(e)}
-
-# Gradio Interface
-def gradio_interface(audio):
-    if audio is None:
-        return "Please record or upload an audio file."
-    result = analyze_call(audio)
-    if "error" in result:
-        return f"Error: {result['error']}"
-    return (
-        f"**Transcription:** {result['transcription']}\n\n"
-        f"**Sentiment:** {result['sentiment']}\n\n"
-        f"**Confidence:** {result['confidence']}"
-    )
-
-# Create Gradio app
-interface = gr.Interface(
-    fn=gradio_interface,
-    inputs=gr.Audio(type="filepath", label="Record or Upload Audio"),
-    outputs=gr.Textbox(label="Analysis Result", lines=5),
-    title="Real-Time Call Analysis",
-    description="Record or upload audio to analyze tone and sentiment in real time.",
-    live=False  # Set to False to avoid constant re-runs
-)
+        analyze_btn.click(
+            fn=process_audio_file,
+            inputs=[audio_input],
+            outputs=[output_text, output_plot]
+        )
+
+    return interface
 
-# Launch the app
-interface.launch()
+if __name__ == "__main__":
+    demo = create_interface()
+    demo.launch(share=True)
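One note on the new plotting output: the committed process_audio returns its plot data as a plain dict of labels and values, while gr.BarPlot is normally driven by a DataFrame-style value whose x/y column names are declared on the component, so the bar chart may not render as intended. A minimal sketch of one way the scores could be shaped for it, assuming Gradio 4.x with pandas available; the to_plot_frame helper and the emotion/confidence column names are illustrative only, not part of this commit:

import pandas as pd

def to_plot_frame(result):
    # result: list of {"label": str, "score": float} dicts, as returned by the
    # transformers audio-classification pipeline
    return pd.DataFrame({
        "emotion": [pred["label"] for pred in result],
        "confidence": [pred["score"] * 100 for pred in result],
    })

# The BarPlot component would then name those columns explicitly, e.g.:
# output_plot = gr.BarPlot(x="emotion", y="confidence",
#                          title="Emotion Confidence Scores",
#                          x_title="Emotions", y_title="Confidence (%)")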