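"""Minimal Gradio demo: record audio from the microphone and transcribe it
locally with the facebook/s2t-small-librispeech-asr Speech2Text model."""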
import os
import gradio as gr
import numpy as np
import librosa
import torch
from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration
from dotenv import load_dotenv

load_dotenv()

# Make the bundled ffprobe binary discoverable (Windows virtualenv layout);
# use a raw string and os.pathsep so the PATH entry is appended correctly.
os.environ["PATH"] += os.pathsep + r".\env\Lib\site-packages\ffprobe"

# Optional settings read from .env; they are not referenced further in this script.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
MODEL = os.getenv("MODEL")

# Load the pretrained Speech2Text model and its processor
# (feature extractor for log-mel filterbank inputs + tokenizer for decoding).
model = Speech2TextForConditionalGeneration.from_pretrained(
    "facebook/s2t-small-librispeech-asr"
)
processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")


def transcribe(audio):
    # librosa resamples the recording to the 16 kHz rate the model was trained on.
    speech, rate = librosa.load(audio, sr=16000)
    inputs = processor(speech, sampling_rate=rate, return_tensors="pt")
    generated_ids = model.generate(
        inputs["input_features"], attention_mask=inputs["attention_mask"]
    )
    # batch_decode returns a list with one string per input; return the single result.
    transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return transcription[0]


# Simple web UI: record from the microphone, pass the temporary file path to transcribe().
gr.Interface(
    fn=transcribe, inputs=gr.Audio(source="microphone", type="filepath"), outputs="text"
).launch()
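# launch() starts a local Gradio server (by default at http://127.0.0.1:7860);
# open it in a browser, record a short clip, and the transcription appears in the text box.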