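# Gradio app: transcribe microphone recordings with Whisper (openai/whisper-large-v3) on CPU.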
import logging
import gradio as gr
# import torch
from transformers import (
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    pipeline,
)

logging.basicConfig(level=logging.INFO)  # ensure the INFO-level transcription log is emitted
device = "cpu"
# device = "cuda:0" if torch.cuda.is_available() else "cpu"
# torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
model_id = "openai/whisper-large-v3"
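# Note: whisper-large-v3 has ~1.5B parameters, so CPU inference is slow; a smaller
# checkpoint (e.g. openai/whisper-small) may be a better fit if latency matters.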
# model = AutoModelForSpeechSeq2Seq.from_pretrained(
#     model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
# )
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)
# The processor must match the checkpoint: whisper-large-v3 uses 128 mel bins and a
# multilingual tokenizer, so the whisper-base.en processor is incompatible with it.
# processor = WhisperProcessor.from_pretrained("openai/whisper-base.en")
processor = AutoProcessor.from_pretrained(model_id)
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    # max_new_tokens=128,
    chunk_length_s=30,  # split long recordings into 30 s chunks for batched decoding
    batch_size=8,
    # return_timestamps=True,
    # torch_dtype=torch_dtype,
    device=device,
)
def transcribe_audio(audio):
    # `audio` is a filepath (type="filepath" below); the pipeline handles loading and resampling.
    result = pipe(audio)
    logging.info(f'TRANSCRIPTION {result["text"]}')
    return result["text"]  # the Interface output is "text", so return the string, not the dict
input_audio = gr.Audio(
    sources=["microphone"],  # Gradio 4.x API: `source=` and `optional=` were removed
    type="filepath",
    waveform_options=gr.WaveformOptions(
        waveform_color="#01C6FF",
        waveform_progress_color="#0066B4",
        skip_length=2,
        show_controls=False,
    ),
)
demo = gr.Interface(fn=transcribe_audio, inputs=input_audio, outputs="text")
if __name__ == "__main__":
    demo.launch()