File size: 1,164 Bytes
42c9935
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import copy
import os
import time

import gradio as gr
import numpy as np
import torch
import torchaudio
from loguru import logger
from transformers import pipeline

# Pick the compute device once at import time: CUDA when torch reports it
# as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Shared speech-recognition pipeline used by the functions below. It is built
# a single time here so that individual requests do not reload the model.
transcriber = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-base.en",
    device=device,
)


def save_audio_as_wav(data, sample_rate, file_path):
    """Write audio samples to ``file_path`` as a 16-bit signed PCM file.

    Args:
        data: Array-like of audio samples. Whatever its shape, it is
            flattened into a single row (one channel) before saving.
            NOTE(review): multi-channel input would therefore be merged
            into one long channel -- confirm callers only pass mono data.
        sample_rate: Sampling rate forwarded to ``torchaudio.save``.
        file_path: Destination path handed to ``torchaudio.save``.
    """
    # torchaudio.save wants a 2-D tensor, so lay the samples out as (1, N).
    samples = torch.tensor(data)
    waveform = samples.reshape(1, -1)

    torchaudio.save(
        file_path,
        waveform,
        sample_rate=sample_rate,
        encoding="PCM_S",
        bits_per_sample=16,
    )


def save_and_transcribe_audio(audio):
    """Save a recorded clip to disk and return its transcription.

    The raw samples are first written to ``recordings/audio<timestamp>.wav``
    via ``save_audio_as_wav``; a float32, mono, peak-normalised copy is then
    passed to the module-level ``transcriber`` pipeline.

    Args:
        audio: A ``(sample_rate, data)`` tuple where ``data`` is a NumPy
            array of samples. Presumably this comes from a Gradio audio
            component, i.e. 1-D mono or ``(samples, channels)`` -- confirm
            against the caller.

    Returns:
        The transcribed text.

    Raises:
        Exception: If saving or transcription fails for any reason. The
            underlying error is logged and attached as ``__cause__``.
    """
    sample_rate, data = audio
    try:
        # The target directory is not created by torchaudio; make sure it
        # exists so the first recording does not fail.
        os.makedirs("recordings", exist_ok=True)

        # add timestamp to file name
        filename = f"recordings/audio{time.time()}.wav"
        save_audio_as_wav(data, sample_rate, filename)

        # Work on a float32 version; never modify the caller's array in place.
        samples = np.asarray(data, dtype=np.float32)
        if samples.size == 0:
            raise ValueError("Received empty audio.")

        # The ASR pipeline accepts single-channel input only, so average the
        # channels of a (samples, channels) recording down to mono.
        if samples.ndim > 1:
            samples = samples.mean(axis=1)

        # Peak-normalise, but skip the division for an all-silent clip:
        # 0 / 0 would fill the buffer with NaNs.
        peak = np.max(np.abs(samples))
        if peak > 0:
            samples = samples / peak

        text = transcriber({"sampling_rate": sample_rate, "raw": samples})["text"]
        gr.Info(f"Transcribed text is: {text}\nProcessing the input...")

    except Exception as e:
        logger.error(f"Error: {e}")
        # Chain the original exception so the root cause stays in tracebacks.
        raise Exception("Error transcribing audio.") from e
    return text