# NOTE: lines of Hugging Face Spaces page chrome (status, file size, commit
# hashes, line-number gutter) were captured here by the scraper; they are not
# part of the source file and have been removed so the module parses.
from difflib import Differ
import gradio as gr
import string
import torch
from transformers import (
AutoModelForSpeechSeq2Seq,
AutoProcessor,
pipeline,
)
# Abridged excerpt from Sir Redmond Barry's 1869 speech that the user is
# asked to read aloud.
diction_text = """
How is this leisure to be disposed of? In the public-house? the singing hall? the dancing-saloon?
which hold out seductions somewhat more dangerous, methinks, to honest labour than those presented by a library...
We may well rejoice, then, when we see a room such as this filled with attentive and reflective readers.
"""


def set_text(text_for_display=diction_text):
    """Return the script text to display; defaults to the Barry excerpt."""
    displayed = text_for_display
    return displayed
# Read-only textbox pre-filled (Gradio calls the set_text callable for the
# initial value) with the excerpt the user is asked to read aloud.
diction_script = gr.Textbox(
    set_text, interactive=False, show_label=False, placeholder=diction_text
)
# Inference device. NOTE(review): hard-coded to CPU even when CUDA is
# available, yet torch_dtype below still selects float16 on CUDA machines;
# float16 on CPU is slow/unsupported for some ops — confirm intent.
device = "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Hugging Face checkpoint id for the ASR model; interpolated into the
# description below and used by the from_pretrained calls.
model_id = "openai/whisper-large-v3"

# HTML blurb rendered above the Gradio interface.
# Fix: the second paragraph ended with the malformed tag "1869<./p>", which
# broke the closing </p>; corrected to "1869.</p>".
description = f"""
<div>
<p>Welcome to Redmond Barry-oke! </p>
<p>This app aims to demonstrate the potential of using machine learning to transcribe audio. Users are invited to record themselves reading a brief and abridged excerpt from a speech delivered by Sir Redmond Barry at the opening of The Free Public Library of Ballarat Est in 1869.</p>
<p>Once recorded the audio can be submitted which will invoke the {model_id} machine learning model that is designed to convert the audio to text</p>
<p>When a transcript is ready, any punctuation is stripped out and it's compared with a stripped version of the original text</p>
<p>Any differences are highlighted using colour</p>
<p>Finally the differences are calculated as a percentage of the total number of characters, giving an accuracy score</p>
<p>A full transcript of Sir Redmond Barry's speech can be read in the <a href="https://latrobejournal.slv.vic.gov.au/latrobejournal/issue/latrobe-26/t1-g-t3.html" target="_blank">La Trobe Journal</a></p>
</div>
"""
# Load the Whisper checkpoint once at import time (safetensors weights,
# low_cpu_mem_usage to reduce peak RAM during load), then move it to the
# configured device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

# Processor bundles the tokenizer and feature extractor for this checkpoint.
processor = AutoProcessor.from_pretrained(model_id)

# ASR pipeline used by transcribe_audio; 30-second chunking with timestamps
# so recordings longer than the model's context are transcribed fully.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=8,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)
def prepare_text_for_comparison(text_to_clean: str):
    """Strip all ASCII punctuation from *text_to_clean* and casefold it.

    Casefold (rather than lower) gives an aggressive, locale-independent
    lowercase suitable for comparison.
    """
    without_punctuation = "".join(
        character for character in text_to_clean
        if character not in string.punctuation
    )
    return without_punctuation.casefold()
def diff_texts(diction_text: str, audio_input: str):
    """Character-level diff of two strings for gr.HighlightedText.

    Returns a list of (char, code) pairs where code is difflib's marker:
    " " unchanged, "-" only in diction_text, "+" only in audio_input,
    "?" for hint entries.

    Fix: the original guard ``token[0] if token[0] != "" else None`` was dead
    code — Differ markers are always one of " ", "+", "-", "?", never the
    empty string — so the expression always evaluated to ``token[0]``. The
    guard is removed without changing behavior. Unchanged characters
    deliberately keep the " " code because calc_score filters on it.
    """
    differ = Differ()
    return [
        (token[2:], token[0])
        for token in differ.compare(diction_text, audio_input)
    ]
def calc_score(diff_texts: list) -> float:
    """Return the percentage of diff entries that are unchanged.

    Args:
        diff_texts: list of (char, code) pairs from diff_texts(); a code of
            " " marks an unchanged character, anything else is a difference.
            (NOTE: the parameter shadows the module-level diff_texts
            function; name kept for interface compatibility.)

    Returns:
        Accuracy as a float in [0, 100].

    Fix: an empty diff list previously raised ZeroDivisionError; it now
    scores 100.0 (nothing differed).
    """
    if not diff_texts:
        return 100.0
    changed = [entry for entry in diff_texts if entry[1] != " "]
    return 100 - float((len(changed) / len(diff_texts)) * 100)
def transcribe_audio(diction_text, audio):
    """Transcribe *audio*, diff it against *diction_text*, and score it.

    Returns a 3-tuple: the raw transcript, the (char, code) diff pairs for
    the HighlightedText component, and the accuracy as a "NN.NNN%" string.
    """
    transcription = pipe(audio)
    raw_text = transcription["text"]
    spoken = prepare_text_for_comparison(raw_text)
    expected = prepare_text_for_comparison(diction_text)
    highlighted = diff_texts(expected, spoken)
    accuracy = calc_score(highlighted)
    formatted_score = f"{str(round(accuracy, 3))}%"
    return raw_text, highlighted, formatted_score
# Output: the raw transcript returned by the ASR pipeline.
transcribed_text = gr.Textbox(label="Transcribed text")

# Output: per-character diff. "+" renders green (spoken but not in the
# script), "-" renders red (in the script but not spoken); unchanged
# characters carry a " " code and take the default colour.
highlighted_results = gr.HighlightedText(
    label="Text highlighted with diffs",
    combine_adjacent=True,
    show_legend=True,
    color_map={"+": "green", "-": "red"},
)

# Output: formatted accuracy percentage, e.g. "97.5%".
score = gr.Textbox("0%", label="Barry-oke score")

# Input: microphone recording, handed to the pipeline as a temp-file path
# (type="filepath").
input_audio = gr.Audio(
    sources=["microphone"],
    type="filepath",
    waveform_options=gr.WaveformOptions(
        waveform_color="#01C6FF",
        waveform_progress_color="#0066B4",
        skip_length=2,
        show_controls=False,
    ),
)
# Wire the components to transcribe_audio: inputs are (script text, audio),
# outputs are (transcript, highlighted diff, score) — matching its return.
demo = gr.Interface(
    fn=transcribe_audio,
    inputs=[diction_script, input_audio],
    outputs=[transcribed_text, highlighted_results, score],
    title="Redmond Barry-oke",
    description=description,
)

# Launch the Gradio server only when run as a script (Spaces imports it too).
if __name__ == "__main__":
    demo.launch()