Spaces:

Yehor
/

hubert-uk-demo

Sleeping

App Files Files Community

Yehor Smoliakov committed on Jul 26

Commit

bd540a9

•

1 Parent(s): 9cece8a

Refactor the app

Browse files

Files changed (15) hide show

README.md +1 -1
app.py +117 -31
example_1.wav +0 -0
example_2.wav +0 -0
example_3.wav +0 -0
example_4.wav +0 -0
example_5.wav +0 -0
example_6.wav +0 -0
requirements.txt +3 -0
sample_1.wav +0 -3
sample_2.wav +0 -3
sample_3.wav +0 -3
sample_4.wav +0 -3
sample_5.wav +0 -3
sample_6.wav +0 -3

README.md CHANGED Viewed

@@ -11,7 +11,7 @@ pinned: true
 ## Install
 ```shell
-uv venv --python 3.12
 source .venv/bin/activate

 ## Install
 ```shell
+uv venv --python 3.11
 source .venv/bin/activate

app.py CHANGED Viewed

@@ -1,59 +1,129 @@
 import time
 import torch
 import librosa
 import gradio as gr
 from transformers import AutoModelForCTC, Wav2Vec2BertProcessor
 model_name = "Yehor/w2v-bert-2.0-uk-v2"
-device = "cpu"
-max_duration = 30
-asr_model = AutoModelForCTC.from_pretrained(model_name).to(device)
 processor = Wav2Vec2BertProcessor.from_pretrained(model_name)
-audio_samples = [
- "sample_1.wav",
- "sample_2.wav",
- "sample_3.wav",
- "sample_4.wav",
- "sample_5.wav",
- "sample_6.wav",
 ]
-description_head = """
 # Speech-to-Text for Ukrainian v2
 ## Overview
-This space uses https://huggingface.co/Yehor/w2v-bert-2.0-uk-v2 model that solves
-a Speech-to-Text task for the Ukrainian language.
 """.strip()
-description_foot = """
 ## Community
-- Join our Discord server - https://discord.gg/yVAjkBgmt4 - where we're talking about Data Science,
-Machine Learning, Deep Learning, and Artificial Intelligence.
-- Join our Speech Recognition Group in Telegram: https://t.me/speech_recognition_uk
-## Authors
-Yehor Smoliakov: https://github.com/egorsmkv on GitHub, and [email protected] for private discussions.
 """.strip()
 def inference(audio_path, progress=gr.Progress()):
- gr.Info("Starting process", duration=2)
- progress(0, desc="Starting")
  duration = librosa.get_duration(path=audio_path)
  if duration > max_duration:
- raise gr.Error("The duration of the file exceeds 10 seconds.")
  paths = [
  audio_path,
@@ -70,12 +140,18 @@ def inference(audio_path, progress=gr.Progress()):
  features = processor([audio_input], sampling_rate=16_000).input_features
  features = torch.tensor(features).to(device)
  with torch.inference_mode():
  logits = asr_model(features).logits
  predicted_ids = torch.argmax(logits, dim=-1)
  predictions = processor.batch_decode(predicted_ids)
  elapsed_time = round(time.time() - t0, 2)
  rtf = round(elapsed_time / audio_duration, 4)
  audio_duration = round(audio_duration, 2)
@@ -89,7 +165,7 @@ def inference(audio_path, progress=gr.Progress()):
  }
  )
- gr.Info("Finished...", duration=2)
  result_texts = []
@@ -113,24 +189,34 @@ demo = gr.Blocks(
 with demo:
  gr.Markdown(description_head)
- gr.Markdown(f"## Demo (max. duration: **{max_duration}** seconds)")
  with gr.Row():
  audio_file = gr.Audio(label="Audio file", type="filepath")
  transcription = gr.Markdown(
  label="Transcription",
- value="Recognized text will appear here. Use **an example file** below the Recognize button,"
- "upload **your audio file**, or use **the microphone** to record something...",
  )
- gr.Button("Recognize").click(inference, inputs=audio_file, outputs=transcription)
  with gr.Row():
- gr.Examples(
- label="Choose an example audio", inputs=audio_file, examples=audio_samples
- )
  gr.Markdown(description_foot)
 if __name__ == "__main__":
  demo.launch()

+import sys
 import time
 import torch
+import torchaudio
 import librosa
 import gradio as gr
 from transformers import AutoModelForCTC, Wav2Vec2BertProcessor
+# Config
 model_name = "Yehor/w2v-bert-2.0-uk-v2"
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+torch_dtype = torch.float16
+min_duration = 0.5
+max_duration = 60
+concurrency_limit = 1
+use_torch_compile = False
+# Load the model
+asr_model = AutoModelForCTC.from_pretrained(model_name, torch_dtype=torch_dtype).to(device)
 processor = Wav2Vec2BertProcessor.from_pretrained(model_name)
+if use_torch_compile:
+ asr_model = torch.compile(asr_model)
+# Elements
+examples = [
+ "example_1.wav",
+ "example_2.wav",
+ "example_3.wav",
+ "example_4.wav",
+ "example_5.wav",
+ "example_6.wav",
 ]
+examples_table = '''
+| File | Text |
+| ------------- | ------------- |
+| `example_1.wav` | тема про яку не люблять говорити офіційні джерела у генштабі і міноборони це хімічна зброя окупанти вже тривалий час використовують хімічну зброю заборонену |
+| `example_2.wav` | всіма конвенціями якщо спочатку це були гранати з дронів то тепер фіксують випадки застосування |
+| `example_3.wav` | хімічних снарядів причому склад отруйної речовони різний а отже й наслідки для наших військових теж різні |
+| `example_4.wav` | використовує на фронті все що має і хімічна зброя не нийняток тож з чим маємо справу розбиралася марія моганисян |
+| `example_5.wav` | двох тисяч випадків застосування росіянами боєприпасів споряджених небезпечними хімічними речовинами |
+| `example_6.wav` | на всі писані норми марія моганисян олександр моторний спецкор марафон єдині новини |
+'''.strip()
+# https://www.tablesgenerator.com/markdown_tables
+authors_table = '''
+## Authors
+Follow them in social networks and **contact** if you need any help or have any questions:
+| <img src="https://avatars.githubusercontent.com/u/7875085?v=4" width="100"> **Yehor Smoliakov** |
+|-------------------------------------------------------------------------------------------------|
+| https://t.me/smlkw in Telegram |
+| https://x.com/yehor_smoliakov at X |
+| https://github.com/egorsmkv at GitHub |
+| https://huggingface.co/Yehor at Hugging Face |
+| or use [email protected] |
+'''.strip()
+description_head = f"""
 # Speech-to-Text for Ukrainian v2
 ## Overview
+This space uses https://huggingface.co/Yehor/w2v-bert-2.0-uk-v2 model to recognize audio files.
+> For demo, audio duration **must not** exceed **{max_duration}** seconds.
 """.strip()
+description_foot = f"""
 ## Community
+- Join our Discord server where we talk about AI/ML/DL: https://discord.gg/yVAjkBgmt4
+- Join our Speech Recognition group in Telegram: https://t.me/speech_recognition_uk
+## More
+Check out other ASR models: https://github.com/egorsmkv/speech-recognition-uk
+{authors_table}
+""".strip()
+transcription_value = """
+Recognized text will appear here.
+Choose **an example file** below the Recognize button, upload **your audio file**, or use **the microphone** to record something.
+""".strip()
+tech_env = f"""
+#### Environment
+- Python: {sys.version}
+- Torch device: {device}
+- Torch dtype: {torch_dtype}
+- Use torch.compile: {use_torch_compile}
+""".strip()
+tech_libraries = f"""
+#### Libraries
+- PyTorch: {torch.__version__}
+- Transformers: {torch.__version__}
+- Librosa: {librosa.version.version}
+- Gradio: {gr.__version__}
 """.strip()
 def inference(audio_path, progress=gr.Progress()):
+ if not audio_path:
+ raise gr.Error("Please upload an audio file.")
+ gr.Info("Starting recognition", duration=2)
+ progress(0, desc="Recognizing")
  duration = librosa.get_duration(path=audio_path)
+ if duration < min_duration:
+ raise gr.Error(f"The duration of the file is less than {min_duration} seconds, it is {round(duration, 2)} seconds.")
  if duration > max_duration:
+ raise gr.Error(f"The duration of the file exceeds {max_duration} seconds.")
  paths = [
  audio_path,
  features = processor([audio_input], sampling_rate=16_000).input_features
  features = torch.tensor(features).to(device)
+ if torch_dtype == torch.float16:
+ features = features.half()
  with torch.inference_mode():
  logits = asr_model(features).logits
  predicted_ids = torch.argmax(logits, dim=-1)
  predictions = processor.batch_decode(predicted_ids)
+ if not predictions:
+ predictions = '-'
  elapsed_time = round(time.time() - t0, 2)
  rtf = round(elapsed_time / audio_duration, 4)
  audio_duration = round(audio_duration, 2)
  }
  )
+ gr.Info("Finished!", duration=2)
  result_texts = []
 with demo:
  gr.Markdown(description_head)
+ gr.Markdown("## Demo")
  with gr.Row():
  audio_file = gr.Audio(label="Audio file", type="filepath")
  transcription = gr.Markdown(
  label="Transcription",
+ value=transcription_value,
  )
+ gr.Button("Recognize").click(
+ inference,
+ concurrency_limit=concurrency_limit,
+ inputs=audio_file,
+ outputs=transcription,
+ )
  with gr.Row():
+ gr.Examples(label="Choose an example", inputs=audio_file, examples=examples)
+ gr.Markdown(examples_table)
  gr.Markdown(description_foot)
+ gr.Markdown('### Gradio app uses the following technologies:')
+ with gr.Row():
+ gr.Markdown(tech_env)
+ gr.Markdown(tech_libraries)
 if __name__ == "__main__":
+ demo.queue()
  demo.launch()

example_1.wav ADDED Viewed

Binary file (273 kB). View file

example_2.wav ADDED Viewed

Binary file (200 kB). View file

example_3.wav ADDED Viewed

Binary file (193 kB). View file

example_4.wav ADDED Viewed

Binary file (241 kB). View file

example_5.wav ADDED Viewed

Binary file (193 kB). View file

example_6.wav ADDED Viewed

Binary file (186 kB). View file

requirements.txt CHANGED Viewed

@@ -3,6 +3,9 @@ gradio
 torch
 torchaudio
 transformers
 librosa

 torch
 torchaudio
+triton
+setuptools
 transformers
 librosa

sample_1.wav DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:172ade978b299f4a0c47e3b76666d1a06161e6001fbb5591b82038a1bbc4b5ad
-size 272568

sample_2.wav DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:98fe42f22f8ea632714081a958dc035f3d507523fd340b320a1223ac2f55ccac
-size 199942

sample_3.wav DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:83c0b7375beada8cee74b5de226da494368fcc6a3ce692913b3302dcda0bd9a2
-size 192842

sample_4.wav DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:19e466ee9c0c129c1eecf93eb6791a44c2ee8d68dce2c3e8fd3734b87f28324a
-size 241442

sample_5.wav DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:5af19120c92859846a08496e0a617c21877cae2db5807d211f0a431d95163a3e
-size 193388

sample_6.wav DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ac877968d5749438930339497f7548046003390a848496136f6cbe8a74c51629
-size 186290