import time

import torch
import librosa

import gradio as gr

from transformers import AutoModelForCTC, Wav2Vec2BertProcessor

model_name = "Yehor/w2v-bert-2.0-uk"
device = "cpu"
max_duration = 30
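# Note: inference is pinned to the CPU; if a CUDA GPU is available, the device
# could instead be chosen dynamically, e.g.:
#   device = "cuda" if torch.cuda.is_available() else "cpu"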

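# Load the CTC acoustic model and its paired processor (feature extractor +
# tokenizer) from the Hugging Face Hub.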
asr_model = AutoModelForCTC.from_pretrained(model_name).to(device)
processor = Wav2Vec2BertProcessor.from_pretrained(model_name)

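# Example audio files shipped with the Space, surfaced via gr.Examples below.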
audio_samples = [
    "sample_1.wav",
    "sample_2.wav",
    "sample_3.wav",
    "sample_4.wav",
    "sample_5.wav",
    "sample_6.wav",
]

description_head = """
# Speech-to-Text for Ukrainian

## Overview

This Space uses the https://huggingface.co/Yehor/w2v-bert-2.0-uk model to perform
Speech-to-Text for the Ukrainian language.
""".strip()

description_foot = """
## Community

- Join our Discord server - https://discord.gg/yVAjkBgmt4 - where we talk about Data Science,
Machine Learning, Deep Learning, and Artificial Intelligence.

- Join our Speech Recognition group on Telegram: https://t.me/speech_recognition_uk
""".strip()


def inference(audio_path, progress=gr.Progress()):
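    """Transcribe one uploaded or recorded audio file; return Markdown results."""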
    gr.Info("Starting process", duration=2)

    progress(0, desc="Starting")

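    # Enforce the maximum supported duration before doing any heavy work.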
    duration = librosa.get_duration(path=audio_path)
    if duration > max_duration:
        raise gr.Error(f"The duration of the file exceeds {max_duration} seconds.")

    paths = [
        audio_path,
    ]

    results = []

    for path in progress.tqdm(paths, desc="Recognizing...", unit="file"):
        t0 = time.time()

        audio_duration = librosa.get_duration(path=path)
        audio_input, _ = librosa.load(path, mono=True, sr=16_000)

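        # Turn the 16 kHz waveform into the model's input features (the
        # processor computes log-mel filter banks for Wav2Vec2-BERT).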
        features = processor(
            [audio_input], sampling_rate=16_000, return_tensors="pt"
        ).input_features.to(device)

        with torch.inference_mode():
            logits = asr_model(features).logits

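        # Greedy CTC decoding: take the argmax token per frame, then let
        # batch_decode collapse repeats and strip blank tokens.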
        predicted_ids = torch.argmax(logits, dim=-1)
        predictions = processor.batch_decode(predicted_ids)

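        # Real-Time Factor (RTF) = processing time / audio duration;
        # RTF < 1 means transcription runs faster than real time.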
        elapsed_time = round(time.time() - t0, 2)
        rtf = round(elapsed_time / audio_duration, 4)
        audio_duration = round(audio_duration, 2)

        results.append(
            {
                "path": path.split("/")[-1],
                "transcription": "\n".join(predictions),
                "audio_duration": audio_duration,
                "rtf": rtf,
            }
        )

    gr.Info("Finished...", duration=2)

    result_texts = []

    for result in results:
        result_texts.append(f'**{result["path"]}**')
        result_texts.append("\n\n")
        result_texts.append(f'> {result["transcription"]}')
        result_texts.append("\n\n")
        result_texts.append(f'**Audio duration**: {result["audio_duration"]}')
        result_texts.append("\n")
        result_texts.append(f'**Real-Time Factor**: {result["rtf"]}')

    return "\n".join(result_texts)


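# Assemble the UI: description, audio input and transcription output side by
# side, a Recognize button, and clickable example files.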
demo = gr.Blocks(
    title="Speech-to-Text for Ukrainian",
    analytics_enabled=False,
)

with demo:
    gr.Markdown(description_head)

    gr.Markdown(f"## Demo (max. duration: **{max_duration}** seconds)")

    with gr.Row():
        audio_file = gr.Audio(label="Audio file", type="filepath")
        transcription = gr.Markdown(
            label="Transcription",
            value="Recognized text will appear here. Use **an example file** below the Recognize button,"
            "upload **your audio file**, or use **the microphone** to record something...",
        )

    gr.Button("Recognize").click(inference, inputs=audio_file, outputs=transcription)

    with gr.Row():
        gr.Examples(
            label="Choose an example audio", inputs=audio_file, examples=audio_samples
        )

    gr.Markdown(description_foot)

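# When run directly (e.g. as `python app.py`, assuming this file is the
# Space's app.py), Gradio serves the UI on http://127.0.0.1:7860 by default.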
if __name__ == "__main__":
    demo.launch()