Spaces:
Sleeping
Sleeping
File size: 3,730 Bytes
75795f2 6186718 3ae01f4 f0698ec 33bef80 8197c6e 10f0b14 ab3ecd5 9a88197 04716fa ab3ecd5 eeb2fc4 8197c6e d3e099a ce1af45 f0698ec 8197c6e 34aef7d 04716fa 34aef7d 8197c6e a61e46a 8197c6e a61e46a 8197c6e f0698ec 8197c6e e0a729c 8197c6e 50a8cbc a61e46a f0698ec 8197c6e e0a729c f0698ec 8197c6e 6186718 3ae01f4 ce1af45 1a27263 ce1af45 1a27263 ab3ecd5 9a88197 ab3ecd5 ce1af45 9333506 3ae01f4 ab3ecd5 1a27263 04716fa 7e182ab 1a27263 7e182ab 1a27263 6186718 7e182ab 1596c0d 6186718 0e42c86 0547be0 6186718 a10fa69 b802d71 7e182ab 1596c0d 04716fa b802d71 6186718 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 |
from difflib import Differ
import gradio as gr
import string
import torch
from transformers import (
AutoModelForSpeechSeq2Seq,
AutoProcessor,
pipeline,
)
# Reference script the user is asked to read aloud; also the text the
# transcription is compared against.
diction_text = """
How is this leisure to be disposed of? In the public-house? the singing hall? the dancing-saloon?
which hold out seductions somewhat more dangerous, methinks, to honest labour than those presented by a library...
We may well rejoice, then, when we see a room such as this filled with attentive and reflective readers.
"""

# Read-only, unlabelled textbox pre-filled with the script above.
diction_script = gr.Textbox(
    value=diction_text,
    interactive=False,
    show_label=False,
)
# Inference device. Kept on the CPU, as in the original configuration.
device = "cpu"
# The dtype follows the device that is actually used rather than mere CUDA
# availability: half precision is only selected for a CUDA device, so a
# CUDA-capable host that still runs on the CPU keeps float32.
torch_dtype = torch.float16 if device.startswith("cuda") else torch.float32
# Hugging Face Hub identifier of the speech-recognition checkpoint.
model_id = "openai/whisper-large-v3"
# HTML blurb shown on the page; interpolates the model identifier so the text
# always names the checkpoint that is actually loaded.
description = f"""
<div>
<p>Welcome to Redmond Barry-oke! </p>
<p>This app aims to demonstrate the potential of using machine learning to transcribe audio. Users are invited to record themselves reading a brief and abridged excerpt from a speech delivered by Sir Redmond Barry at the opening of The Free Public Library of Ballarat East in 1869. Once recorded and submitted the app will transcribe and return a "diction" score.</p>
<p>This app uses {model_id} to perform automated transcription</p>
<p>A full transcript of Sir Redmond Barry's speech can be read in the <a href="https://latrobejournal.slv.vic.gov.au/latrobejournal/issue/latrobe-26/t1-g-t3.html" target="_blank">La Trobe Journal</a></p>
</div>
"""
# Load the checkpoint in the same dtype that is handed to the pipeline below,
# so the model weights and the pipeline's dtype setting cannot disagree.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)

# The processor bundles the tokenizer and the audio feature extractor.
processor = AutoProcessor.from_pretrained(model_id)

# Speech-recognition pipeline used by transcribe_audio().
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=8,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)
# Translation table that deletes every ASCII punctuation character; built once
# instead of on every call.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def prepare_text_for_comparison(text_to_clean: str) -> str:
    """Normalise text so two strings can be compared character by character.

    ASCII punctuation is removed, every run of whitespace (including newlines
    and leading/trailing whitespace) is collapsed to a single space, and the
    result is case-folded.

    Args:
        text_to_clean: The raw text, e.g. a reference script or a transcript.

    Returns:
        The normalised text.
    """
    without_punctuation = text_to_clean.translate(_PUNCTUATION_TABLE)
    # split() with no argument splits on any whitespace run and drops empty
    # strings, so layout differences (line breaks, a leading space) do not
    # show up as character differences in the diff.
    return " ".join(without_punctuation.split()).casefold()
def diff_texts(diction_text: str, audio_input: str) -> list:
    """Return a character-level diff of two strings.

    Both strings are handed to ``difflib.Differ.compare``, which iterates
    them character by character.

    Args:
        diction_text: The reference text.
        audio_input: The text to compare against the reference.

    Returns:
        A list of ``(character, label)`` tuples. The label is ``"-"`` for a
        character only in ``diction_text``, ``"+"`` for a character only in
        ``audio_input`` and ``None`` for a character common to both.
    """
    differ = Differ()
    return [
        # Each Differ entry starts with a two-character code; a leading space
        # marks an unchanged item, which is reported with no label.
        (token[2:], token[0] if token[0] != " " else None)
        for token in differ.compare(diction_text, audio_input)
    ]


def calc_score(diff_texts: list) -> float:
    """Return the percentage of diff entries that are unchanged.

    Args:
        diff_texts: ``(character, label)`` tuples as produced by the
            ``diff_texts`` function. An entry counts as unchanged when its
            label is ``None`` or ``" "``.

    Returns:
        A score from 0 to 100. An empty diff contains no differences and
        scores 100.0.
    """
    if not diff_texts:
        # Avoid dividing by zero when there is nothing to compare.
        return 100.0
    changed = sum(1 for entry in diff_texts if entry[1] not in (None, " "))
    return 100 - float((changed / len(diff_texts)) * 100)
def transcribe_audio(diction_text, audio):
    """Transcribe a recording and score it against the reference script.

    Args:
        diction_text: The reference script the speaker was asked to read.
        audio: The recording to transcribe; passed unchanged to ``pipe``.

    Returns:
        A 3-tuple of the raw transcript, the diff between the cleaned
        reference and the cleaned transcript, and the score formatted as a
        percentage string rounded to three decimal places.

    Raises:
        gr.Error: If no audio was supplied.
    """
    if audio is None:
        # Fail with a readable message instead of an error from inside the
        # speech-recognition pipeline.
        raise gr.Error("Please record yourself reading the script before submitting.")

    result = pipe(audio)
    transcript = result["text"]

    cleaned_result = prepare_text_for_comparison(transcript)
    cleaned_diction_text = prepare_text_for_comparison(diction_text)
    diff_text = diff_texts(cleaned_diction_text, cleaned_result)

    formatted_score = f"{round(calc_score(diff_text), 3)}%"
    # The first element is what the model heard, not the reference script.
    return transcript, diff_text, formatted_score
# Textbox for the transcription text.
transcribed_text = gr.Textbox(label="Transcribed text")

# Highlighted diff view: "+" entries are shown green, "-" entries red.
highlighted_results = gr.HighlightedText(
    label="Diff",
    combine_adjacent=True,
    show_legend=True,
    color_map={"+": "green", "-": "red"},
)

# Score display; shows "0%" until a score is produced.
score = gr.Textbox(value="0%", label="Barry-oke score")

# Microphone-only recorder that hands the recording over as a file path.
input_audio = gr.Audio(
    sources=["microphone"],
    type="filepath",
    waveform_options=gr.WaveformOptions(
        waveform_color="#01C6FF",
        waveform_progress_color="#0066B4",
        skip_length=2,
        show_controls=False,
    ),
)
# Inputs map positionally onto transcribe_audio(diction_text, audio) and
# outputs onto its three return values, so the component lists must have two
# and three entries respectively.
demo = gr.Interface(
    fn=transcribe_audio,
    inputs=[diction_script, input_audio],
    outputs=[transcribed_text, highlighted_results, score],
    title="Redmond Barry-oke",
    description=description,
)
# Start the Gradio app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()
|