Yehor Smoliakov committed
Commit d6446fc
Parent: 3f925e7

Remove librosa

Files changed (3)
  1. README.md +7 -1
  2. app.py +25 -11
  3. requirements.txt +2 -2
README.md CHANGED
@@ -20,4 +20,10 @@ uv pip install -r requirements.txt
 
 # in development mode
 uv pip install -r requirements-dev.txt
-```
+```
+
+## Run
+
+```shell
+python app.py
+```
app.py CHANGED
@@ -2,7 +2,8 @@ import sys
 import time
 
 import torch
-import librosa
+import torchaudio
+import torchaudio.transforms as T
 
 import gradio as gr
 
@@ -74,7 +75,7 @@ description_head = f"""
 
 This space uses https://huggingface.co/Yehor/w2v-bert-2.0-uk-v2 model to recognize audio files.
 
-> For demo, audio duration **must not** exceed **{max_duration}** seconds.
+> Due to resource limitations, audio duration **must not** exceed **{max_duration}** seconds.
 """.strip()
 
 description_foot = f"""
@@ -93,7 +94,7 @@ Check out other ASR models: https://github.com/egorsmkv/speech-recognition-uk
 transcription_value = """
 Recognized text will appear here.
 
-Choose **an example file** below the Recognize button, upload **your audio file**, or use **the microphone** to record something.
+Choose **an example file** below the Recognize button, upload **your audio file**, or use **the microphone** to record your own voice.
 """.strip()
 
 tech_env = f"""
@@ -108,10 +109,10 @@ tech_env = f"""
 tech_libraries = f"""
 #### Libraries
 
-- PyTorch: {torch.__version__}
-- Transformers: {transformers_version}
-- Librosa: {librosa.version.version}
-- Gradio: {gr.__version__}
+- torch: {torch.__version__}
+- torchaudio: {torchaudio.__version__}
+- transformers: {transformers_version}
+- gradio: {gr.__version__}
 """.strip()
 
@@ -122,8 +123,10 @@ def inference(audio_path, progress=gr.Progress()):
     gr.Info("Starting recognition", duration=2)
 
     progress(0, desc="Recognizing")
+
+    meta = torchaudio.info(audio_path)
+    duration = meta.num_frames / meta.sample_rate
 
-    duration = librosa.get_duration(path=audio_path)
     if duration < min_duration:
         raise gr.Error(
             f"The duration of the file is less than {min_duration} seconds, it is {round(duration, 2)} seconds."
@@ -140,8 +143,19 @@ def inference(audio_path, progress=gr.Progress()):
     for path in progress.tqdm(paths, desc="Recognizing...", unit="file"):
         t0 = time.time()
 
-        audio_duration = librosa.get_duration(path=path, sr=16_000)
-        audio_input, _ = librosa.load(path, mono=True, sr=16_000)
+        meta = torchaudio.info(path)
+        audio_duration = meta.num_frames / meta.sample_rate
+
+        audio_input, sr = torchaudio.load(path)
+
+        if meta.num_channels > 1:
+            audio_input = torch.mean(audio_input, dim=0, keepdim=True)
+
+        if meta.sample_rate != 16_000:
+            resampler = T.Resample(sr, 16_000, dtype=audio_input.dtype)
+            audio_input = resampler(audio_input)
+
+        audio_input = audio_input.squeeze().numpy()
 
         features = processor([audio_input], sampling_rate=16_000).input_features
         features = torch.tensor(features).to(device)
@@ -196,7 +210,7 @@ demo = gr.Blocks(
 with demo:
     gr.Markdown(description_head)
 
-    gr.Markdown("## Demo")
+    gr.Markdown("## Usage")
 
     with gr.Row():
         audio_file = gr.Audio(label="Audio file", type="filepath")
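Taken together, the app.py changes replace `librosa.load(path, mono=True, sr=16_000)` with an explicit torchaudio pipeline: read the metadata, load the raw waveform, mix multi-channel audio down to mono, and resample to 16 kHz. Below is a minimal standalone sketch of those same steps; `load_mono_16k` and `sample.wav` are illustrative names, not part of app.py.

```python
import torch
import torchaudio
import torchaudio.transforms as T


def load_mono_16k(path: str) -> tuple[torch.Tensor, float]:
    """Load an audio file as a mono 16 kHz waveform, as the diff above does.

    Returns the 1-D waveform and the original duration in seconds.
    """
    # Duration comes from container metadata, so no full decode is needed here.
    meta = torchaudio.info(path)
    duration = meta.num_frames / meta.sample_rate

    waveform, sample_rate = torchaudio.load(path)  # shape: (channels, frames)

    # Mix down to mono, which librosa.load(..., mono=True) did implicitly.
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    # Resample to the 16 kHz rate the Wav2Vec2-BERT processor expects.
    if sample_rate != 16_000:
        waveform = T.Resample(sample_rate, 16_000, dtype=waveform.dtype)(waveform)

    return waveform.squeeze(), duration


# Usage, mirroring the inference loop in the diff:
# audio_input, audio_duration = load_mono_16k("sample.wav")
# features = processor([audio_input.numpy()], sampling_rate=16_000).input_features
```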
requirements.txt CHANGED
@@ -3,9 +3,9 @@ gradio
 torch
 torchaudio
 
+soundfile
+
 triton
 setuptools
 
 transformers
-
-librosa
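soundfile joins requirements.txt because torchaudio does not decode audio by itself: `load` and `info` dispatch to an installed backend (ffmpeg, sox, or soundfile), and soundfile previously arrived only as a transitive dependency of librosa. A quick sanity check, assuming a recent torchaudio release (`sample.wav` is a placeholder path):

```python
import torchaudio

# With the soundfile package installed, "soundfile" should appear here,
# giving torchaudio.load and torchaudio.info a decoder to dispatch to.
print(torchaudio.list_audio_backends())

# The backend can also be pinned explicitly per call (torchaudio >= 2.1):
waveform, sample_rate = torchaudio.load("sample.wav", backend="soundfile")
```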