from difflib import Differ

import gradio as gr

import torch
from transformers import (
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    pipeline,
)

description = """
<div>
    <p>Roll up, roll up, come test your diction against a 🤖</p>
</div>
"""

# The reference phrase the user is asked to read aloud.
REFERENCE_PHRASE = "How now brown cow"

diction_text = f"""
    <div>
        <p>{REFERENCE_PHRASE}</p>
    </div>
"""

diction = gr.HTML(diction_text)


device = "cpu"
# device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

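# whisper-large-v3 (~1.5B parameters) is heavy for CPU-only hosts; a smaller
# checkpoint such as "openai/whisper-small" trades accuracy for speed.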
model_id = "openai/whisper-large-v3"


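# Load the model once at startup; the checkpoint is downloaded from the Hub on first run.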
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

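# The processor bundles the feature extractor (audio -> log-mel spectrogram) and the tokenizer.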
processor = AutoProcessor.from_pretrained(model_id)

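# Chunked ASR pipeline: 30-second windows with batching let the app handle
# recordings longer than Whisper's native 30-second input.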
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=8,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)


def transcribe_audio(_diction, audio):
    """Transcribe the recording and diff it, word by word, against the reference phrase."""
    if audio is None:
        return "No audio recorded - please try again."
    result = pipe(audio)
    transcription = result["text"].strip()
    # Naive word-level diff: "-" marks reference words the model did not hear,
    # "+" marks extra (or differently spelled/punctuated) words it heard instead.
    diff = Differ().compare(
        REFERENCE_PHRASE.lower().split(), transcription.lower().split()
    )
    return transcription + "\n\n" + "\n".join(diff)


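# Record from the microphone; type="filepath" hands the pipeline a path to the saved audio file.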
input_audio = gr.Audio(
    sources=["microphone"],
    type="filepath",
    waveform_options=gr.WaveformOptions(
        waveform_color="#01C6FF",
        waveform_progress_color="#0066B4",
        skip_length=2,
        show_controls=False,
    ),
)
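# The HTML panel is display-only: listing it first in `inputs` renders the phrase
# above the recorder, and its value arrives as transcribe_audio's unused first argument.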
demo = gr.Interface(
    fn=transcribe_audio,
    inputs=[diction, input_audio],
    outputs="text",
    title="Test your diction",
    description=description,
    theme="abidlabs/Lime",
)

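# Entry point: launch() starts a local Gradio server (a Hugging Face Space runs this file the same way).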
if __name__ == "__main__":
    demo.launch()