Boltz79 committed (verified)
Commit cd578af · Parent(s): 88c3f37

Update app.py

Files changed (1): app.py (+32 -19)
app.py CHANGED

```diff
@@ -1,3 +1,4 @@
+# app.py
 import gradio as gr
 import librosa
 import numpy as np
```
```diff
@@ -8,7 +9,7 @@ from speechbrain.inference.interfaces import foreign_class
 import io
 import matplotlib.pyplot as plt
 import librosa.display
-from PIL import Image  # Added for image conversion
+from PIL import Image  # For image conversion
 
 # Try to import noisereduce (if not available, noise reduction will be skipped)
 try:
```
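The `except` branch of this guarded import falls outside the hunk. Presumably it clears a flag so the app can degrade gracefully; a minimal sketch of the pattern, with the `NOISEREDUCE_AVAILABLE` flag and `nr` alias taken from the code below and the exact fallback assumed:

```python
# Optional dependency guard: the app checks NOISEREDUCE_AVAILABLE before calling nr.
try:
    import noisereduce as nr
    NOISEREDUCE_AVAILABLE = True
except ImportError:
    NOISEREDUCE_AVAILABLE = False  # noise reduction is silently skipped
```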
```diff
@@ -43,7 +44,13 @@ classifier = foreign_class(
 )
 
 def preprocess_audio(audio_file, apply_noise_reduction=False):
-    """Load and preprocess the audio file: convert to 16kHz mono, optionally apply noise reduction, and normalize."""
+    """
+    Load and preprocess the audio file:
+      - Convert to 16kHz mono.
+      - Optionally apply noise reduction.
+      - Normalize the audio.
+    Saves the processed audio to a temporary file and returns its path.
+    """
     y, sr = librosa.load(audio_file, sr=16000, mono=True)
     if apply_noise_reduction and NOISEREDUCE_AVAILABLE:
         y = nr.reduce_noise(y=y, sr=sr)
```
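The normalization and temporary-file steps named in the new docstring fall between this hunk and the next, which ends the function with `return temp_file.name`. A sketch of what that tail plausibly looks like; the peak normalization and the `soundfile` writer are assumptions, not shown in the diff:

```python
import tempfile

import numpy as np
import soundfile as sf  # assumed writer; the diff never shows how the file is saved

def save_normalized(y, sr):
    """Peak-normalize a signal and write it to a temp WAV, returning the path."""
    peak = np.max(np.abs(y))
    if peak > 0:
        y = y / peak  # scale into [-1, 1]
    temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    sf.write(temp_file.name, y, sr)
    return temp_file.name
```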
```diff
@@ -55,7 +62,10 @@ def preprocess_audio(audio_file, apply_noise_reduction=False):
     return temp_file.name
 
 def ensemble_prediction(audio_file, apply_noise_reduction=False, segment_duration=3.0, overlap=1.0):
-    """Split longer audio files into overlapping segments, predict each segment, and return the majority-voted emotion label."""
+    """
+    For longer audio files, split into overlapping segments, predict each segment,
+    and return the majority-voted emotion label.
+    """
     y, sr = librosa.load(audio_file, sr=16000, mono=True)
     total_duration = librosa.get_duration(y=y, sr=sr)
 
```
```diff
@@ -80,7 +90,7 @@ def ensemble_prediction(audio_file, apply_noise_reduction=False, segment_duration=3.0, overlap=1.0):
     for seg in segments:
         temp_file = preprocess_audio(seg, apply_noise_reduction)
         _, _, _, label = classifier.classify_file(temp_file)
-        predictions.append(label[0])  # Extract the predicted emotion
+        predictions.append(label[0])
         os.remove(temp_file)
         os.remove(seg)
 
```
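The code that builds `segments` is outside the hunks shown; since the loop calls `os.remove(seg)`, each segment is evidently written to its own temporary file. A sketch of the overlapping split and majority vote this function implies, yielding raw slices for brevity; the hop arithmetic and the `Counter` call are inferences from the `segment_duration`/`overlap` parameters and the `most_common` name below, not code from the diff:

```python
from collections import Counter

def split_with_overlap(y, sr, segment_duration=3.0, overlap=1.0):
    """Yield overlapping slices; consecutive segments share `overlap` seconds."""
    seg_len = int(segment_duration * sr)
    hop = int((segment_duration - overlap) * sr)  # assumes overlap < segment_duration
    for start in range(0, max(len(y) - seg_len, 0) + 1, hop):
        yield y[start:start + seg_len]

def majority_vote(predictions):
    """Return the most frequent label across segment predictions."""
    return Counter(predictions).most_common(1)[0][0]
```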
```diff
@@ -89,7 +99,9 @@ def ensemble_prediction(audio_file, apply_noise_reduction=False, segment_duration=3.0, overlap=1.0):
     return most_common
 
 def predict_emotion(audio_file, use_ensemble=False, apply_noise_reduction=False, segment_duration=3.0, overlap=1.0):
-    """Predict emotion from an audio file and return the emotion with an emoji."""
+    """
+    Predict emotion from an audio file and return the emotion with an emoji.
+    """
     try:
         if use_ensemble:
             label = ensemble_prediction(audio_file, apply_noise_reduction, segment_duration, overlap)
```
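Context for the `result[3][0]` indexing in the next hunk (and the `_, _, _, label` unpacking earlier): the custom interface behind `foreign_class` for this model returns a four-element tuple, conventionally `(out_prob, score, index, text_lab)`, with the text labels in a list. A short illustration; the constructor arguments follow the public model card for speechbrain/emotion-recognition-wav2vec2-IEMOCAP, since the diff shows only the call's closing parenthesis:

```python
from speechbrain.inference.interfaces import foreign_class

# Arguments as on the model card; the app's actual call sits outside the hunks shown.
classifier = foreign_class(
    source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
    pymodule_file="custom_interface.py",
    classname="CustomEncoderWav2vec2Classifier",
)
out_prob, score, index, text_lab = classifier.classify_file("sample.wav")
label = text_lab[0]  # e.g. "neu", "ang", "hap", or "sad"
```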
```diff
@@ -97,18 +109,18 @@ def predict_emotion(audio_file, use_ensemble=False, apply_noise_reduction=False, segment_duration=3.0, overlap=1.0):
             temp_file = preprocess_audio(audio_file, apply_noise_reduction)
             result = classifier.classify_file(temp_file)
             os.remove(temp_file)
-
         if isinstance(result, tuple) and len(result) > 3:
-            label = result[3][0]  # Extract the predicted emotion label
+            label = result[3][0]  # Extract predicted emotion label from the tuple
         else:
-            label = str(result)  # Convert to string if unexpected format
-
-        return add_emoji_to_label(label.lower())  # Format and add an emoji
+            label = str(result)
+        return add_emoji_to_label(label.lower())
     except Exception as e:
         return f"Error processing file: {str(e)}"
 
 def plot_waveform(audio_file):
-    """Generate and return a waveform plot image (as a PIL Image) for the given audio file."""
+    """
+    Generate and return a waveform plot image (as a PIL Image) for the given audio file.
+    """
     y, sr = librosa.load(audio_file, sr=16000, mono=True)
     plt.figure(figsize=(10, 3))
     librosa.display.waveshow(y, sr=sr)
```
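The steps between `waveshow` and the `return Image.open(buf)` at the top of the next hunk are not shown; presumably the figure is saved into an in-memory buffer. A sketch of that standard Matplotlib-to-PIL round trip (the exact savefig options are assumptions):

```python
import io

import matplotlib.pyplot as plt
from PIL import Image

def current_figure_to_pil():
    """Render the current Matplotlib figure into a PIL Image via a memory buffer."""
    buf = io.BytesIO()
    plt.savefig(buf, format="png", bbox_inches="tight")
    plt.close()   # release the figure
    buf.seek(0)   # rewind so Image.open reads from the start
    return Image.open(buf)
```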
```diff
@@ -120,18 +132,19 @@ def plot_waveform(audio_file):
     return Image.open(buf)
 
 def predict_and_plot(audio_file, use_ensemble, apply_noise_reduction, segment_duration, overlap):
-    """Run emotion prediction and generate a waveform plot."""
+    """
+    Run emotion prediction and generate a waveform plot.
+    Returns a tuple: (emotion label with emoji, waveform image as a PIL Image).
+    """
     emotion = predict_emotion(audio_file, use_ensemble, apply_noise_reduction, segment_duration, overlap)
     waveform = plot_waveform(audio_file)
-    return emotion  # Ensure emoji is included here
-
+    return emotion, waveform
 
-# Build the enhanced UI using Gradio Blocks
 with gr.Blocks(css=".gradio-container {background-color: #f7f7f7; font-family: Arial;}") as demo:
-    gr.Markdown("<h1 style='text-align: center;'>Enhanced Emotion Recognition</h1>")
+    gr.Markdown("<h1 style='text-align: center;'>Enhanced Emotion Recognition</h1>")
     gr.Markdown(
         "Upload an audio file, and the model will predict the emotion using a wav2vec2 model fine-tuned on IEMOCAP data. "
-        "The prediction is accompanied by an emoji, and you can also view the audio's waveform. "
+        "The prediction is accompanied by an emoji in the output, and you can also view the audio's waveform. "
         "Use the options below to adjust ensemble prediction and noise reduction settings."
     )
 
```
```diff
@@ -149,7 +162,7 @@ with gr.Blocks(css=".gradio-container {background-color: #f7f7f7; font-family: Arial;}") as demo:
     waveform_image = gr.Image(label="Audio Waveform", type="pil")
 
     predict_button.click(
-        predict_and_plot,
+        predict_and_plot,
         inputs=[audio_input, use_ensemble, apply_noise_reduction, segment_duration, overlap],
         outputs=[result_text, waveform_image]
     )
```
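This wiring is why the `return emotion, waveform` fix above matters: a Gradio event handler must return exactly one value per component in `outputs=[...]`, so the old single-value return left `waveform_image` unfilled. A minimal self-contained illustration of that contract (toy callback, unrelated to the app's model):

```python
import gradio as gr

def two_outputs(name):
    # One return value per component listed in outputs=[...]
    return f"Hello, {name}!", len(name)

with gr.Blocks() as demo:
    name = gr.Textbox(label="Name")
    greeting = gr.Textbox(label="Greeting")
    length = gr.Number(label="Length")
    btn = gr.Button("Run")
    btn.click(two_outputs, inputs=[name], outputs=[greeting, length])

# demo.launch()
```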
```diff
@@ -163,7 +176,7 @@ with gr.Blocks(css=".gradio-container {background-color: #f7f7f7; font-family: Arial;}") as demo:
     - Ensemble Prediction for long audio files.
     - Optional Noise Reduction.
     - Visualization of the audio waveform.
-    - Emoji representation of the predicted emotion.
+    - Emoji representation of the predicted emotion in the output.
 
     **Credits:**
     - [SpeechBrain](https://speechbrain.github.io)
```
 