Spaces:
Sleeping
Sleeping
from difflib import Differ | |
import gradio as gr | |
import torch | |
from transformers import ( | |
AutoModelForSpeechSeq2Seq, | |
AutoProcessor, | |
pipeline, | |
) | |
# diction_text = """ | |
# How is this leisure to be disposed of? In the public-house? the singing hall? the dancing-saloon? which hold out seductions somewhat more dangerous, methinks, to honest labor than those presented by a library; or in listless inaction, in weary unoccupied solitude? That cannot be. While man is a social animal society he must have, and better a thousand times that he should seek relief from the tedium of unemployed hours in the improving conversation of worthy authors, dead or living, than in the debasing, brutalising communications from which it is so difficult otherwise to escape. | |
# """ | |
# Script shown to the reader: an abridged excerpt of Sir Redmond Barry's 1869
# address. The transcription is diffed against this exact string, so every
# character in it (including the leading and trailing newline) affects the score.
diction_text = """
How is this leisure to be disposed of? In the public-house? the singing hall? the dancing-saloon?
which hold out seductions somewhat more dangerous, methinks, to honest labour than those presented by a library...
We may well rejoice, then, when we see a room such as this filled with attentive and reflective readers.
"""

# Read-only, label-less textbox that displays the script and is passed to the
# interface as its first input.
diction_script = gr.Textbox(
    value=diction_text,
    show_label=False,
    interactive=False,
)
# Inference device. Pinned to the CPU; change this one value to run on a GPU.
device = "cpu"

# Half precision is only selected when the chosen device is a CUDA device.
# Deriving the dtype from `device` (rather than from CUDA availability alone)
# keeps the two settings from disagreeing on a CUDA-capable host that is still
# running the model on the CPU.
torch_dtype = torch.float16 if device.startswith("cuda") else torch.float32

# model_id = "openai/whisper-base "
model_id = "openai/whisper-large-v3"
# HTML blurb rendered under the interface title. It interpolates `model_id`,
# so the page always names the checkpoint that is actually loaded.
description = f"""
<div>
<p>Welcome to Redmond Barry-oke! </p>
<p>This app aims to demonstrate the potential of using machine learning to transcribe audio. Users are invited to record themselves reading a brief and abridged excerpt from a speech delivered by Sir Redmond Barry at the opening of The Free Public Library of Ballarat East in 1869. Once recorded and submitted the app will transcribe and return a "diction" score.</p>
<p>This app uses {model_id} to perform automated transcription</p>
<p>A full transcript of Sir Redmond Barry's speech can be read in the <a href="https://latrobejournal.slv.vic.gov.au/latrobejournal/issue/latrobe-26/t1-g-t3.html" target="_blank">La Trobe Journal</a></p>
</div>
"""
# Load the speech-to-text checkpoint. The dtype is passed explicitly so the
# weights are loaded in the same precision that is handed to the pipeline
# below, instead of relying on the library's default load dtype.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)

# The processor supplies both the tokenizer and the feature extractor that
# the pipeline needs.
processor = AutoProcessor.from_pretrained(model_id)

# Module-level speech-recognition pipeline, built once at import time and
# reused for every transcription request.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=8,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)
def diff_texts(diction_text: str, audio_input: str) -> list:
    """Return a character-level diff of the script against the transcription.

    The two strings are compared character by character with
    ``difflib.Differ``.

    Args:
        diction_text: Reference text the speaker was asked to read.
        audio_input: Transcribed text to compare against the reference.

    Returns:
        A list of ``(character, label)`` tuples in diff order. ``label`` is
        ``"-"`` for a character found only in ``diction_text``, ``"+"`` for a
        character found only in ``audio_input`` and ``None`` for a character
        common to both.
    """
    labelled = []
    for token in Differ().compare(diction_text, audio_input):
        # Each item is a two-character code ("  ", "- " or "+ ") followed by
        # the compared element. The previous check compared the code against
        # the empty string, which never matched, so unchanged characters were
        # labelled " " instead of None.
        code = token[0]
        labelled.append((token[2:], None if code == " " else code))
    return labelled


def calc_score(diff_texts: list) -> float:
    """Return the percentage of diff entries that are unchanged.

    Args:
        diff_texts: ``(character, label)`` tuples as returned by the
            ``diff_texts`` function. An entry counts as unchanged when its
            label is ``None``; the older ``" "`` label is accepted as well.
            (The parameter name shadows the function inside this body; it is
            kept so existing callers are unaffected.)

    Returns:
        A score from 0.0 (every entry differs) to 100.0 (nothing differs).
        An empty diff scores 100.0, since there is nothing that differs.
    """
    if not diff_texts:
        # Two empty texts produce an empty diff; avoid dividing by zero.
        return 100.0
    changed = sum(1 for entry in diff_texts if entry[1] not in (None, " "))
    return float(100 - (changed / len(diff_texts)) * 100)
def transcribe_audio(diction_text, audio):
    """Transcribe a recording and score it against the script.

    Args:
        diction_text: Reference text the speaker was asked to read.
        audio: Path to the recorded audio file, or ``None`` when the form was
            submitted without a recording.

    Returns:
        A ``(diff, score)`` tuple: the labelled character diff between the
        script and the transcription, and the score rounded to three decimal
        places and formatted as a percentage string.

    Raises:
        gr.Error: If no recording was supplied.
    """
    if audio is None:
        # Show a readable message in the UI instead of letting the pipeline
        # fail on a missing input.
        raise gr.Error("Please record yourself reading the script before submitting.")

    transcription = pipe(audio)["text"]
    diff_text = diff_texts(diction_text, transcription)
    score = calc_score(diff_text)
    return diff_text, f"{round(score, 3)}%"
# Output 1: the labelled diff, with "+" entries in red and "-" entries in
# green; neighbouring entries with the same label are merged.
highlighted_results = gr.HighlightedText(
    color_map={"+": "red", "-": "green"},
    show_legend=True,
    combine_adjacent=True,
    label="Diff",
)

# Output 2: the formatted score, showing "0%" until a result is returned.
score = gr.Textbox(value="0%", label="Score")

# Input: microphone-only recorder that hands the callback a file path.
input_audio = gr.Audio(
    type="filepath",
    sources=["microphone"],
    waveform_options=gr.WaveformOptions(
        show_controls=False,
        skip_length=2,
        waveform_progress_color="#0066B4",
        waveform_color="#01C6FF",
    ),
)

# Wire the script textbox and the recorder into the transcription callback.
demo = gr.Interface(
    title="Redmond Barry-oke",
    description=description,
    fn=transcribe_audio,
    inputs=[diction_script, input_audio],
    outputs=[highlighted_results, score],
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()