File size: 3,730 Bytes
75795f2
6186718
 
 
3ae01f4
 
f0698ec
33bef80
 
 
 
 
8197c6e
10f0b14
ab3ecd5
9a88197
04716fa
 
ab3ecd5
 
eeb2fc4
8197c6e
d3e099a
ce1af45
f0698ec
8197c6e
34aef7d
 
 
 
04716fa
 
 
 
34aef7d
 
8197c6e
a61e46a
8197c6e
a61e46a
8197c6e
 
 
f0698ec
8197c6e
 
e0a729c
8197c6e
50a8cbc
a61e46a
f0698ec
8197c6e
e0a729c
f0698ec
 
8197c6e
 
 
6186718
3ae01f4
 
 
 
 
 
 
ce1af45
1a27263
 
 
 
ce1af45
1a27263
 
 
ab3ecd5
 
 
 
 
9a88197
 
ab3ecd5
 
 
ce1af45
9333506
3ae01f4
 
 
 
ab3ecd5
 
1a27263
04716fa
 
7e182ab
 
 
 
1a27263
 
 
 
 
 
7e182ab
1a27263
6186718
7e182ab
1596c0d
6186718
 
0e42c86
0547be0
6186718
 
 
 
 
 
 
a10fa69
b802d71
 
7e182ab
1596c0d
04716fa
b802d71
 
 
6186718
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
from difflib import Differ

import gradio as gr

import string

import torch
from transformers import (
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    pipeline,
)


diction_text = """
How is this leisure to be disposed of? In the public-house? the singing hall? the dancing-saloon?
which hold out seductions somewhat more dangerous, methinks, to honest labour than those presented by a library...
We may well rejoice, then, when we see a room such as this filled with attentive and reflective readers.
"""

diction_script = gr.Textbox(diction_text, interactive=False, show_label=False)

device = "cpu"

torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model_id = "openai/whisper-large-v3"

description = f"""
 <div>
    <p>Welcome to Redmond Barry-oke! </p>
    <p>This app aims to demonstrate the potential of using machine learning to transcribe audio. Users are invited to record themselves reading a brief and abridged excerpt from a speech delivered by Sir Redmond Barry at the opening of The Free Public Library of Ballarat Est in 1869. Once recorded and submitted the app will transcribe and return a "diction" score.</p>
    <p>This app uses {model_id} to perform automated transcription</p>
    <p>A full transcript of Sir Redmond Barry's speech can be read in the <a href="https://latrobejournal.slv.vic.gov.au/latrobejournal/issue/latrobe-26/t1-g-t3.html" target="_blank">La Trobe Journal</a></p>
 </div>
"""


# Load the Whisper checkpoint. low_cpu_mem_usage streams weights to keep
# peak RAM down; use_safetensors avoids pickle-based weight files.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

# Bundles the tokenizer and audio feature extractor for this checkpoint.
processor = AutoProcessor.from_pretrained(model_id)

# End-to-end ASR pipeline: 30 s chunking so recordings longer than
# Whisper's window are handled, batched inference, timestamped output.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=8,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)


def prepare_text_for_comparison(text_to_clean: str) -> str:
    """Normalize text for diffing: strip all punctuation, then casefold."""
    without_punctuation = text_to_clean.translate(
        str.maketrans("", "", string.punctuation)
    )
    return without_punctuation.casefold()


def diff_texts(diction_text: str, audio_input: str):
    """Character-diff the reference text against the transcription.

    Returns a list of (character, marker) pairs for gr.HighlightedText:
    marker is "+" for an inserted character, "-" for a deleted one, and
    None for an unchanged one (so it is left unhighlighted).
    """
    d = Differ()

    # Differ prefixes each element with a 2-char code: "  ", "+ " or "- ".
    # The original condition compared the marker against "" — which never
    # matches — so unchanged characters were tagged " " instead of None
    # and HighlightedText could never leave them unstyled.
    return [
        (token[2:], token[0] if token[0] != " " else None)
        for token in d.compare(diction_text, audio_input)
    ]


def calc_score(diff_texts: list) -> float:
    """Score a diff as the percentage of characters left unchanged.

    Accepts unchanged markers as either None (current diff_texts output)
    or " " (legacy output), so it works with both.
    """
    # An empty diff means both texts were empty — a perfect match — and
    # would otherwise raise ZeroDivisionError below.
    if not diff_texts:
        return 100.0

    # Characters that differ between the two texts ("+" or "-").
    diff_chars = [char for char in diff_texts if char[1] not in (None, " ")]
    score = float((len(diff_chars) / len(diff_texts)) * 100)

    score = 100 - score

    return score


def transcribe_audio(diction_text, audio):
    """Transcribe *audio*, diff it against *diction_text*, and score it.

    Parameters:
        diction_text: the reference passage the user read aloud.
        audio: filepath to the recorded audio (gr.Audio type="filepath").

    Returns a 3-tuple: (raw transcription, highlighted diff pairs,
    formatted percentage score).
    """
    # pipe is the module-level Whisper ASR pipeline.
    result = pipe(audio)
    transcription = result["text"]

    # Normalize both sides so punctuation and case don't count against
    # the reader's diction score.
    cleaned_result = prepare_text_for_comparison(transcription)
    cleaned_diction_text = prepare_text_for_comparison(diction_text)
    diff_text = diff_texts(cleaned_diction_text, cleaned_result)

    score = calc_score(diff_text)

    formatted_score = f"{round(score, 3)}%"

    # Fix: return the actual transcription (not the reference text) so the
    # component labeled "Transcribed text" shows what was recognized.
    return transcription, diff_text, formatted_score


transcribed_text = gr.Textbox(label="Transcribed text")


highlighted_results = gr.HighlightedText(
    label="Diff",
    combine_adjacent=True,
    show_legend=True,
    color_map={"+": "green", "-": "red"},
)

score = gr.Textbox("0%", label="Barry-oke score")


input_audio = gr.Audio(
    sources=["microphone"],
    type="filepath",
    waveform_options=gr.WaveformOptions(
        waveform_color="#01C6FF",
        waveform_progress_color="#0066B4",
        skip_length=2,
        show_controls=False,
    ),
)

demo = gr.Interface(
    fn=transcribe_audio,
    inputs=[transcribed_text, diction_script, input_audio],
    outputs=[highlighted_results, score],
    title="Redmond Barry-oke",
    description=description,
)


if __name__ == "__main__":
    demo.launch()