File size: 4,177 Bytes
75795f2
6186718
 
 
3ae01f4
 
f0698ec
33bef80
 
 
 
 
8197c6e
10f0b14
ab3ecd5
9a88197
04716fa
 
ab3ecd5
 
e384807
 
 
 
 
 
 
 
 
8197c6e
d3e099a
ce1af45
f0698ec
8197c6e
34aef7d
 
 
 
04716fa
09a1814
 
 
 
 
04716fa
34aef7d
 
8197c6e
a61e46a
8197c6e
a61e46a
8197c6e
 
 
f0698ec
8197c6e
 
e0a729c
8197c6e
50a8cbc
a61e46a
f0698ec
8197c6e
e0a729c
f0698ec
 
8197c6e
 
 
6186718
3ae01f4
 
 
 
 
 
 
ce1af45
1a27263
 
 
 
ce1af45
1a27263
 
 
ab3ecd5
 
 
 
 
9a88197
 
ab3ecd5
 
 
ce1af45
9333506
3ae01f4
 
 
 
ab3ecd5
 
1a27263
04716fa
 
cec13d4
7e182ab
 
 
1a27263
 
 
cec13d4
1a27263
 
7e182ab
1a27263
6186718
7e182ab
1596c0d
6186718
 
0e42c86
0547be0
6186718
 
 
 
 
 
 
a10fa69
b802d71
 
cec13d4
 
04716fa
b802d71
 
 
6186718
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
from difflib import Differ

import gradio as gr

import string

import torch
from transformers import (
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    pipeline,
)


# Abridged excerpt from Sir Redmond Barry's 1869 speech: the script users are
# asked to read aloud, and the reference text transcriptions are diffed against.
diction_text = """
How is this leisure to be disposed of? In the public-house? the singing hall? the dancing-saloon?
which hold out seductions somewhat more dangerous, methinks, to honest labour than those presented by a library...
We may well rejoice, then, when we see a room such as this filled with attentive and reflective readers.
"""


def set_text(text_for_display=diction_text):
    """Return the text to display; passed as the value callable of the
    read-along Textbox below."""

    return text_for_display


# Read-only textbox showing the script the user is asked to read aloud.
diction_script = gr.Textbox(
    set_text, interactive=False, show_label=False, placeholder=diction_text
)

# NOTE(review): device is pinned to CPU, yet torch_dtype below still switches
# to float16 when CUDA is available — confirm this combination is intended;
# float16 inference on CPU can be slow or unsupported for some ops.
device = "cpu"

torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Hugging Face checkpoint used for transcription; also interpolated into the
# description so the blurb stays accurate if the model is swapped out.
model_id = "openai/whisper-large-v3"

# HTML blurb shown above the Gradio interface.
# Fixed: the second paragraph previously ended with the malformed tag
# "1869<./p>", which rendered stray characters instead of closing the <p>.
description = f"""
 <div>
    <p>Welcome to Redmond Barry-oke! </p>
    <p>This app aims to demonstrate the potential of using machine learning to transcribe audio. Users are invited to record themselves reading a brief and abridged excerpt from a speech delivered by Sir Redmond Barry at the opening of The Free Public Library of Ballarat Est in 1869.</p>
    <p>Once recorded the audio can be submitted which will invoke the {model_id} machine learning model that is designed to convert the audio to text</p>
    <p>When a transcript is ready, any punctuation is stripped out and it's compared with a stripped version of the original text</p>
    <p>Any differences are highlighted using colour</p>
    <p>Finally the differences are calculated as a percentage of the total number of characters, giving an accuracy score</p>
    <p>A full transcript of Sir Redmond Barry's speech can be read in the <a href="https://latrobejournal.slv.vic.gov.au/latrobejournal/issue/latrobe-26/t1-g-t3.html" target="_blank">La Trobe Journal</a></p>
 </div>
"""


# Load the Whisper checkpoint; use_safetensors avoids pickle-based weights and
# low_cpu_mem_usage reduces peak memory while loading.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

# Processor bundles the tokenizer and feature extractor for this checkpoint.
processor = AutoProcessor.from_pretrained(model_id)

# ASR pipeline: 30-second chunking with batching so recordings longer than the
# model's context window can still be transcribed.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=8,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)


def prepare_text_for_comparison(text_to_clean: str):
    """Normalise text for diffing: drop ASCII punctuation, then casefold."""
    without_punctuation = "".join(
        character
        for character in text_to_clean
        if character not in string.punctuation
    )
    return without_punctuation.casefold()


def diff_texts(diction_text: str, audio_input: str):
    """Character-level diff of two strings for gr.HighlightedText.

    Returns a list of (character, code) pairs, where code is difflib's
    two-way diff code: " " unchanged, "-" only in diction_text, "+" only
    in audio_input (Differ may also emit "?" hint entries).
    """
    differ = Differ()

    # Differ.compare yields tokens like "  a" / "- c" / "+ d": index 0 is the
    # diff code and the payload starts at index 2. The code is never "", so
    # the original `token[0] if token[0] != "" else None` always took the
    # first branch — the dead conditional is removed here (behavior unchanged).
    # NOTE(review): the author may have intended None (un-highlighted) for
    # unchanged characters, but calc_score filters on the " " code, so both
    # functions would need to change together — confirm before altering.
    return [(token[2:], token[0]) for token in differ.compare(diction_text, audio_input)]


def calc_score(diff_texts: list) -> float:
    """Return the accuracy percentage for a character diff.

    diff_texts is a list of (character, code) pairs as produced by the
    diff step; entries whose code is not " " count as differences. The
    score is 100 minus the percentage of differing entries.
    """
    # Guard: an empty diff would otherwise raise ZeroDivisionError; treat
    # "nothing to compare" as a perfect match.
    if not diff_texts:
        return 100.0

    diff_chars = [char for char in diff_texts if char[1] != " "]
    score = float((len(diff_chars) / len(diff_texts)) * 100)

    return 100 - score


def transcribe_audio(diction_text, audio):
    """Transcribe the recording and score it against the script.

    Returns (raw transcript, character diff for highlighting, score string
    such as "97.5%").
    """
    transcription = pipe(audio)["text"]

    # Diff the punctuation-stripped, casefolded versions of both texts.
    diff_text = diff_texts(
        prepare_text_for_comparison(diction_text),
        prepare_text_for_comparison(transcription),
    )

    accuracy = round(calc_score(diff_text), 3)

    return transcription, diff_text, f"{accuracy}%"


# Output: raw transcript returned by the ASR pipeline.
transcribed_text = gr.Textbox(label="Transcribed text")


# Output: per-character diff between script and transcript; "+"/"-" codes are
# coloured via color_map and adjacent same-code characters are merged.
highlighted_results = gr.HighlightedText(
    label="Text highlighted with diffs",
    combine_adjacent=True,
    show_legend=True,
    color_map={"+": "green", "-": "red"},
)

# Output: accuracy percentage, initialised to "0%".
score = gr.Textbox("0%", label="Barry-oke score")


# Input: microphone recording, delivered to the handler as a file path
# (type="filepath"), which the pipeline accepts directly.
input_audio = gr.Audio(
    sources=["microphone"],
    type="filepath",
    waveform_options=gr.WaveformOptions(
        waveform_color="#01C6FF",
        waveform_progress_color="#0066B4",
        skip_length=2,
        show_controls=False,
    ),
)

# Wire the app together: inputs are the fixed script plus the recording;
# outputs are the transcript, the highlighted diff, and the score.
demo = gr.Interface(
    fn=transcribe_audio,
    inputs=[diction_script, input_audio],
    outputs=[transcribed_text, highlighted_results, score],
    title="Redmond Barry-oke",
    description=description,
)


# Launch the Gradio app only when run as a script (not when imported).
if __name__ == "__main__":
    demo.launch()