# Spaces: Sleeping
from difflib import Differ | |
import gradio as gr | |
import string | |
import torch | |
from transformers import ( | |
AutoModelForSpeechSeq2Seq, | |
AutoProcessor, | |
pipeline, | |
) | |
# Reference passage that the user is asked to read aloud. It is shown in the
# script textbox and is the text the transcription is compared against.
diction_text = """
How is this leisure to be disposed of? In the public-house? the singing hall? the dancing-saloon?
which hold out seductions somewhat more dangerous, methinks, to honest labour than those presented by a library...
We may well rejoice, then, when we see a room such as this filled with attentive and reflective readers.
"""
def set_text(text_for_display=diction_text):
    """Return the text that should be displayed in the script textbox.

    Called without an argument it yields the module-level ``diction_text``;
    any other value passed in is handed back unchanged.
    """
    displayed = text_for_display
    return displayed
# Read-only, label-less textbox holding the passage to read aloud. Its value
# is supplied by the ``set_text`` callable; the same passage is used as the
# placeholder.
diction_script = gr.Textbox(
    value=set_text,
    placeholder=diction_text,
    show_label=False,
    interactive=False,
)
# Device the model and the pipeline run on.
device = "cpu"
# Pick the precision from the device that was actually selected above, not
# merely from whether CUDA exists on the host: half precision is only used
# when inference really runs on a CUDA device, otherwise full precision.
torch_dtype = (
    torch.float16
    if device.startswith("cuda") and torch.cuda.is_available()
    else torch.float32
)
# Hugging Face Hub identifier of the speech-recognition checkpoint.
model_id = "openai/whisper-large-v3"
# HTML blurb rendered under the app title. It is an f-string so that the
# model identifier shown to the user always matches ``model_id``.
description = f"""
<div>
<p>Welcome to Redmond Barry-oke! </p>
<p>This app aims to demonstrate the potential of using machine learning to transcribe audio. Users are invited to record themselves reading a brief and abridged excerpt from a speech delivered by Sir Redmond Barry at the opening of The Free Public Library of Ballarat East in 1869.</p>
<p>Once recorded the audio can be submitted which will invoke the {model_id} machine learning model that is designed to convert the audio to text</p>
<p>When a transcript is ready, any punctuation is stripped out and it's compared with a stripped version of the original text</p>
<p>Any differences are highlighted using colour</p>
<p>Finally the differences are calculated as a percentage of the total number of characters, giving an accuracy score</p>
<p>A full transcript of Sir Redmond Barry's speech can be read in the <a href="https://latrobejournal.slv.vic.gov.au/latrobejournal/issue/latrobe-26/t1-g-t3.html" target="_blank">La Trobe Journal</a></p>
</div>
"""
# Load the checkpoint in the same dtype that is handed to the pipeline below.
# The pipeline is configured with ``torch_dtype``, so the model weights must
# use that dtype too; otherwise the two can disagree.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)

# The processor bundles the tokenizer and the audio feature extractor that
# belong to the checkpoint.
processor = AutoProcessor.from_pretrained(model_id)

# Speech-recognition pipeline used by ``transcribe_audio``. Audio is handled
# in 30-second chunks, batched eight at a time.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=8,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)
# Translation table that deletes every ASCII punctuation character. Built once
# at import time rather than on every call.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def prepare_text_for_comparison(text_to_clean: str) -> str:
    """Normalise *text_to_clean* so two texts can be compared fairly.

    The text has its ASCII punctuation removed, every run of whitespace
    (including line breaks and leading/trailing whitespace) collapsed to a
    single space, and is then case-folded.

    Args:
        text_to_clean: Raw text, e.g. the reference passage or a transcript.

    Returns:
        The normalised text; an empty string if nothing but punctuation and
        whitespace was supplied.
    """
    without_punctuation = text_to_clean.translate(_PUNCTUATION_TABLE)
    # The reference passage contains line breaks and a transcript may start
    # with a space; without this step those layout differences would be
    # reported as reading errors.
    single_spaced = " ".join(without_punctuation.split())
    return single_spaced.casefold()
def diff_texts(diction_text: str, audio_input: str):
    """Compare two strings character by character.

    Args:
        diction_text: The reference text.
        audio_input: The text to compare against the reference.

    Returns:
        A list of ``(character, label)`` tuples in diff order. ``label`` is
        ``"-"`` for a character only present in *diction_text*, ``"+"`` for a
        character only present in *audio_input*, and ``None`` for a character
        common to both, so that unchanged text carries no highlight category.
    """
    differ = Differ()
    return [
        # Each token is "<code> <char>"; the code for an unchanged element is
        # a space, which is mapped to None.
        (token[2:], token[0] if token[0] != " " else None)
        for token in differ.compare(diction_text, audio_input)
    ]


def calc_score(diff_texts: list) -> float:
    """Turn a character diff into an accuracy percentage.

    Args:
        diff_texts: Sequence of ``(character, label)`` pairs as produced by
            ``diff_texts``. A label of ``None`` (or the legacy ``" "``) marks
            an unchanged character; any other label counts as a difference.

    Returns:
        ``100`` minus the percentage of entries that are differences. An
        empty diff means there was nothing that differed, so it scores
        ``100.0`` instead of dividing by zero.
    """
    if not diff_texts:
        return 100.0
    changed = sum(1 for entry in diff_texts if entry[1] not in (None, " "))
    percent_changed = float((changed / len(diff_texts)) * 100)
    return 100 - percent_changed
def transcribe_audio(diction_text, audio):
    """Transcribe a recording and score it against the reference text.

    Args:
        diction_text: The reference passage the user was asked to read.
        audio: The recording to transcribe, passed straight to the
            speech-recognition pipeline.

    Returns:
        A tuple ``(transcript, diff, score)`` where ``transcript`` is the raw
        text returned by the pipeline, ``diff`` is the character diff between
        the cleaned reference and the cleaned transcript, and ``score`` is the
        accuracy formatted as a percentage string rounded to three decimals.

    Raises:
        gr.Error: If no recording was supplied.
    """
    # Submitting without recording yields no audio; report that to the user
    # instead of letting the pipeline fail on a missing input.
    if not audio:
        raise gr.Error(
            "No audio was recorded. Please record yourself reading the text and try again."
        )

    result = pipe(audio)
    transcript = result["text"]

    # Both sides go through the same clean-up so only the words are compared.
    cleaned_result = prepare_text_for_comparison(transcript)
    cleaned_diction_text = prepare_text_for_comparison(diction_text)

    diff_text = diff_texts(cleaned_diction_text, cleaned_result)
    score = calc_score(diff_text)
    formatted_score = f"{round(score, 3)}%"
    return transcript, diff_text, formatted_score
# Output widgets, in the order ``transcribe_audio`` returns its values.

# Raw transcript as returned by the model.
transcribed_text = gr.Textbox(label="Transcribed text")

# Character diff: additions are shown in green, removals in red.
highlighted_results = gr.HighlightedText(
    color_map={"+": "green", "-": "red"},
    show_legend=True,
    combine_adjacent=True,
    label="Text highlighted with diffs",
)

# Accuracy score; shows "0%" until a recording has been processed.
score = gr.Textbox(value="0%", label="Barry-oke score")
# Microphone-only recorder. The recording is handed to the callback as a
# file path, and the waveform display uses custom colours with its controls
# hidden.
input_audio = gr.Audio(
    type="filepath",
    sources=["microphone"],
    waveform_options=gr.WaveformOptions(
        show_controls=False,
        skip_length=2,
        waveform_progress_color="#0066B4",
        waveform_color="#01C6FF",
    ),
)
# Wire everything together: the script text and the recording go in; the
# transcript, the highlighted diff and the score come out.
demo = gr.Interface(
    title="Redmond Barry-oke",
    description=description,
    fn=transcribe_audio,
    inputs=[diction_script, input_audio],
    outputs=[transcribed_text, highlighted_results, score],
)

# Only start the web server when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()