Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
import re
|
|
|
2 |
|
3 |
import ffmpeg
|
4 |
import gradio as gr
|
@@ -36,6 +37,7 @@ def initialize_model():
|
|
36 |
|
37 |
|
38 |
def handle_user_input(audio_path, video_path):
|
|
|
39 |
audio_asr_result = None
|
40 |
video_asr_result = None
|
41 |
|
@@ -71,6 +73,9 @@ def handle_user_input(audio_path, video_path):
|
|
71 |
# Perform ASR on the audio waveform
|
72 |
video_asr_result = perform_asr(waveform)
|
73 |
|
|
|
|
|
|
|
74 |
return audio_asr_result, video_asr_result
|
75 |
|
76 |
|
@@ -84,6 +89,7 @@ def perform_asr(waveform):
|
|
84 |
else:
|
85 |
raise ValueError(f'Bad audio array shape: "{waveform.shape}"')
|
86 |
|
|
|
87 |
# Split the audio array into smaller frames
|
88 |
audio_frames = []
|
89 |
start_idx = 0
|
@@ -107,11 +113,17 @@ def perform_asr(waveform):
|
|
107 |
audio_frames.append(waveform[start_idx:break_point])
|
108 |
start_idx = break_point
|
109 |
|
|
|
|
|
|
|
|
|
110 |
# Apply noise reduction on each audio frame
|
111 |
audio_frames = [
|
112 |
nr.reduce_noise(y=frame, sr=AUDIO_SAMPLING_RATE)
|
113 |
for frame in audio_frames
|
114 |
]
|
|
|
|
|
115 |
|
116 |
######################### Method 1 - For Loop #########################
|
117 |
|
@@ -135,6 +147,7 @@ def perform_asr(waveform):
|
|
135 |
|
136 |
######################### Method 2 - Batch ############################
|
137 |
|
|
|
138 |
# Process the entire batch of audio frames
|
139 |
inputs = processor(
|
140 |
audio=audio_frames,
|
@@ -154,9 +167,14 @@ def perform_asr(waveform):
|
|
154 |
predicted_ids,
|
155 |
skip_special_tokens=True
|
156 |
)
|
|
|
|
|
157 |
|
|
|
158 |
# Clean the model-generated transcriptions
|
159 |
transcriptions = [clean_model_answer(t) for t in transcriptions]
|
|
|
|
|
160 |
|
161 |
return '\n\n'.join(transcriptions)
|
162 |
|
|
|
1 |
import re
|
2 |
+
import time
|
3 |
|
4 |
import ffmpeg
|
5 |
import gradio as gr
|
|
|
37 |
|
38 |
|
39 |
def handle_user_input(audio_path, video_path):
|
40 |
+
t_start = time.time()
|
41 |
audio_asr_result = None
|
42 |
video_asr_result = None
|
43 |
|
|
|
73 |
# Perform ASR on the audio waveform
|
74 |
video_asr_result = perform_asr(waveform)
|
75 |
|
76 |
+
delta_t = time.time() - t_start
|
77 |
+
print(f'Total Time = {delta_t:5.1f} s\n')
|
78 |
+
|
79 |
return audio_asr_result, video_asr_result
|
80 |
|
81 |
|
|
|
89 |
else:
|
90 |
raise ValueError(f'Bad audio array shape: "{waveform.shape}"')
|
91 |
|
92 |
+
t_start = time.time()
|
93 |
# Split the audio array into smaller frames
|
94 |
audio_frames = []
|
95 |
start_idx = 0
|
|
|
113 |
audio_frames.append(waveform[start_idx:break_point])
|
114 |
start_idx = break_point
|
115 |
|
116 |
+
delta_t = time.time() - t_start
|
117 |
+
print(f'Audio Framing = {delta_t:5.1f} s')
|
118 |
+
|
119 |
+
t_start = time.time()
|
120 |
# Apply noise reduction on each audio frame
|
121 |
audio_frames = [
|
122 |
nr.reduce_noise(y=frame, sr=AUDIO_SAMPLING_RATE)
|
123 |
for frame in audio_frames
|
124 |
]
|
125 |
+
delta_t = time.time() - t_start
|
126 |
+
print(f'Noise Reduction = {delta_t:5.1f} s')
|
127 |
|
128 |
######################### Method 1 - For Loop #########################
|
129 |
|
|
|
147 |
|
148 |
######################### Method 2 - Batch ############################
|
149 |
|
150 |
+
t_start = time.time()
|
151 |
# Process the entire batch of audio frames
|
152 |
inputs = processor(
|
153 |
audio=audio_frames,
|
|
|
167 |
predicted_ids,
|
168 |
skip_special_tokens=True
|
169 |
)
|
170 |
+
delta_t = time.time() - t_start
|
171 |
+
print(f'Text Generation = {delta_t:5.1f} s')
|
172 |
|
173 |
+
t_start = time.time()
|
174 |
# Clean the model-generated transcriptions
|
175 |
transcriptions = [clean_model_answer(t) for t in transcriptions]
|
176 |
+
delta_t = time.time() - t_start
|
177 |
+
print(f'Text Cleaning = {delta_t:5.1f} s')
|
178 |
|
179 |
return '\n\n'.join(transcriptions)
|
180 |
|