from difflib import Differ
import gradio as gr
import torch
from transformers import (
AutoModelForSpeechSeq2Seq,
AutoProcessor,
pipeline,
)
description = """
<div>
<p>Roll up, roll up, come test your diction against a 🤖</p>
</div>
"""
diction_text = "How now brown cow"
test_text = f"""
<div>
<p>{diction_text}</p>
</div>
"""
# Instantiated up front; rendered inside the Blocks layout below.
diction = gr.HTML(test_text)
device = "cpu"
# device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Half precision is only safe on GPU, so tie the dtype to the device actually
# in use rather than to torch.cuda.is_available().
torch_dtype = torch.float16 if device != "cpu" else torch.float32
model_id = "openai/whisper-large-v3"
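# whisper-large-v3 is the largest, most accurate Whisper checkpoint (~1.5B
# parameters); on CPU, transcribing a clip can take several times its duration.
# A smaller checkpoint such as "openai/whisper-small" (an illustrative
# alternative, not what this app uses) trades accuracy for speed.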
model = AutoModelForSpeechSeq2Seq.from_pretrained(
model_id, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)
processor = AutoProcessor.from_pretrained(model_id)
pipe = pipeline(
task="automatic-speech-recognition",
model=model,
tokenizer=processor.tokenizer,
feature_extractor=processor.feature_extractor,
max_new_tokens=128,
chunk_length_s=30,
batch_size=8,
return_timestamps=True,
torch_dtype=torch_dtype,
device=device,
)
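# Sketch of the pipeline's output (hypothetical path and values): the ASR
# pipeline returns a dict with the full transcription under "text" and, since
# return_timestamps=True, a "chunks" list of timestamped segments, e.g.
#   result = pipe("recording.wav")
#   result["text"]    -> " How now brown cow."
#   result["chunks"]  -> [{"timestamp": (0.0, 2.1), "text": " How now brown cow."}]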
def diff_texts(audio_input: str):
    """Character-level diff of the transcription against the target phrase.

    Differ.compare yields tokens prefixed with '- ', '+ ', or '  '; unchanged
    characters get a None label so HighlightedText leaves them uncoloured.
    """
    d = Differ()
    return [
        (token[2:], token[0] if token[0] != " " else None)
        for token in d.compare(diction_text, audio_input)
    ]
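# Example (hypothetical transcription): against the target "How now brown cow",
# diff_texts("How now brown sow") yields mostly (char, None) pairs, plus
# ("c", "-") for the character the speaker dropped and ("s", "+") for the one
# they said instead.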
def transcribe_audio(audio):
    """Transcribe the recording at `audio` and diff it against the target."""
    result = pipe(audio)
    print(f'TRANSCRIPTION {result["text"]}')
    return diff_texts(result["text"])
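# HighlightedText colours each character by its diff label: "+" (red) marks
# characters present only in the transcription, "-" (green) marks characters
# of the target phrase that went unsaid; unlabelled characters matched.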
highlighted_results = gr.HighlightedText(
label="Diff",
combine_adjacent=True,
show_legend=True,
color_map={"+": "red", "-": "green"},
)
input_audio = gr.Audio(
sources=["microphone"],
type="filepath",
waveform_options=gr.WaveformOptions(
waveform_color="#01C6FF",
waveform_progress_color="#0066B4",
skip_length=2,
show_controls=False,
),
)
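# type="filepath" writes the microphone capture to a temporary file and hands
# its path to transcribe_audio, a format the transformers ASR pipeline
# accepts directly.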
# demo = gr.Interface(
# fn=transcribe_audio,
# inputs=[diction, input_audio],
# outputs=highlighted_results,
# title="Test your diction",
# description=description,
# theme="abidlabs/Lime",
# )
with gr.Blocks() as demo:
    gr.HTML(description)
    diction.render()
    with gr.Row():
        # Components created outside a Blocks context must be explicitly
        # rendered; assigning them to a local name does not attach them.
        input_audio.render()
        highlighted_results.render()
    btn = gr.Button("Run")
    btn.click(fn=transcribe_audio, inputs=input_audio, outputs=highlighted_results)
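# demo.launch() binds to http://127.0.0.1:7860 by default; share=True (not set
# here) would create a temporary public link.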
if __name__ == "__main__":
demo.launch()