Spaces:

alibababeig
/

SpeechT5-Farsi-ASR

Running

App Files Community

alibababeig committed on Feb 3

Commit

6eedc89

verified ·

1 Parent(s): 648ae92

Update app.py

Browse files

Files changed (1) hide show

app.py +18 -0

app.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import re
 import ffmpeg
 import gradio as gr
@@ -36,6 +37,7 @@ def initialize_model():
 def handle_user_input(audio_path, video_path):
     audio_asr_result = None
     video_asr_result = None
@@ -71,6 +73,9 @@ def handle_user_input(audio_path, video_path):
         # Perform ASR on the audio waveform
         video_asr_result = perform_asr(waveform)
     return audio_asr_result, video_asr_result
@@ -84,6 +89,7 @@ def perform_asr(waveform):
     else:
         raise ValueError(f'Bad audio array shape: "{waveform.shape}"')
     # Split the audio array into smaller frames
     audio_frames = []
     start_idx = 0
@@ -107,11 +113,17 @@ def perform_asr(waveform):
         audio_frames.append(waveform[start_idx:break_point])
         start_idx = break_point
     # Apply noise reduction on each audio frame
     audio_frames = [
         nr.reduce_noise(y=frame, sr=AUDIO_SAMPLING_RATE)
         for frame in audio_frames
     ]
     ######################### Method 1 - For Loop #########################
@@ -135,6 +147,7 @@ def perform_asr(waveform):
     ######################### Method 2 - Batch ############################
     # Process the entire batch of audio frames
     inputs = processor(
         audio=audio_frames,
@@ -154,9 +167,14 @@ def perform_asr(waveform):
         predicted_ids,
         skip_special_tokens=True
     )
     # Clean the model-generated transcriptions
     transcriptions = [clean_model_answer(t) for t in transcriptions]
     return '\n\n'.join(transcriptions)

 import re
+import time
 import ffmpeg
 import gradio as gr
 def handle_user_input(audio_path, video_path):
+    t_start = time.time()
     audio_asr_result = None
     video_asr_result = None
         # Perform ASR on the audio waveform
         video_asr_result = perform_asr(waveform)
+    delta_t = time.time() - t_start
+    print(f'Total Time      = {delta_t:5.1f} s\n')
     return audio_asr_result, video_asr_result
     else:
         raise ValueError(f'Bad audio array shape: "{waveform.shape}"')
+    t_start = time.time()
     # Split the audio array into smaller frames
     audio_frames = []
     start_idx = 0
         audio_frames.append(waveform[start_idx:break_point])
         start_idx = break_point
+    delta_t = time.time() - t_start
+    print(f'Audio Framing   = {delta_t:5.1f} s')
+    t_start = time.time()
     # Apply noise reduction on each audio frame
     audio_frames = [
         nr.reduce_noise(y=frame, sr=AUDIO_SAMPLING_RATE)
         for frame in audio_frames
     ]
+    delta_t = time.time() - t_start
+    print(f'Noise Reduction = {delta_t:5.1f} s')
     ######################### Method 1 - For Loop #########################
     ######################### Method 2 - Batch ############################
+    t_start = time.time()
     # Process the entire batch of audio frames
     inputs = processor(
         audio=audio_frames,
         predicted_ids,
         skip_special_tokens=True
     )
+    delta_t = time.time() - t_start
+    print(f'Text Generation = {delta_t:5.1f} s')
+    t_start = time.time()
     # Clean the model-generated transcriptions
     transcriptions = [clean_model_answer(t) for t in transcriptions]
+    delta_t = time.time() - t_start
+    print(f'Text Cleaning   = {delta_t:5.1f} s')
     return '\n\n'.join(transcriptions)