import logging

import gradio as gr

# import torch
from transformers import (
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    pipeline,
)


# Make the logging.info call in transcribe_audio visible; the root logger
# defaults to WARNING, so nothing would be emitted otherwise.
logging.basicConfig(level=logging.INFO)

device = "cpu"
# device = "cuda:0" if torch.cuda.is_available() else "cpu"
# torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

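# Multilingual Whisper large-v3 checkpoint from the Hugging Face Hub.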
model_id = "openai/whisper-large-v3"

# model = AutoModelForSpeechSeq2Seq.from_pretrained(
#     model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
# )

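# Load the checkpoint on CPU; low_cpu_mem_usage reduces peak RAM during loading
# and use_safetensors avoids the slower pickle-based weight format.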
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

# The processor must match the checkpoint: whisper-large-v3 expects 128 mel
# bins, while whisper-base.en uses 80, so mixing them breaks feature extraction.
processor = AutoProcessor.from_pretrained(model_id)

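# Chunked long-form transcription: audio is split into 30-second windows
# (chunk_length_s) and decoded in batches of 8 (batch_size).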
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    # max_new_tokens=128,
    chunk_length_s=30,
    batch_size=8,
    # return_timestamps=True,
    # torch_dtype=torch_dtype,
    device=device,
)


def transcribe_audio(audio):
    # Gradio passes None when no recording was made.
    if audio is None:
        return ""
    result = pipe(audio)
    logging.info(f'TRANSCRIPTION {result["text"]}')
    # The Interface output is a "text" component, so return just the transcript
    # string instead of the full result dict.
    return result["text"]


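# Microphone recorder component; type="filepath" passes the recording to
# transcribe_audio as a path to a temporary audio file.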
input_audio = gr.Audio(
    # Gradio 4.x takes a `sources` list instead of `source`, and the old
    # `optional` argument no longer exists (missing input arrives as None).
    sources=["microphone"],
    type="filepath",
    waveform_options=gr.WaveformOptions(
        waveform_color="#01C6FF",
        waveform_progress_color="#0066B4",
        skip_length=2,
        show_controls=False,
    ),
)
demo = gr.Interface(fn=transcribe_audio, inputs=input_audio, outputs="text")

if __name__ == "__main__":
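    # Start a local Gradio server (http://127.0.0.1:7860 by default).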
    demo.launch()