# NeMo Parakeet TDT-CTC transcription demo (HuggingFace Space, runs on a T4 GPU).
import subprocess
import torch
import gradio as gr
import yt_dlp
import pandas as pd
from nemo.collections.asr.models import ASRModel
from nemo_align import align_tdt_to_ctc_timestamps
import os
# Run inference on GPU when available; everything below (model + alignment) uses this device.
device = "cuda" if torch.cuda.is_available() else "cpu"
def process_audio(input_file, output_file):
    """Convert an audio file to mono 16 kHz using the external ``sox`` tool.

    Args:
        input_file: Path to the source audio file.
        output_file: Path where the converted audio is written.

    Returns:
        ``output_file`` on success.

    Raises:
        gr.Error: If ``sox`` is missing or the conversion fails.
    """
    command = [
        'sox', input_file,
        output_file,
        'channels', '1',
        'rate', '16000',
    ]
    try:
        subprocess.run(command, check=True)
    # Narrowed from a bare `except:` which also swallowed KeyboardInterrupt/SystemExit.
    # CalledProcessError covers a failing sox run; OSError covers sox not being installed.
    except (subprocess.CalledProcessError, OSError) as err:
        raise gr.Error("Failed to convert audio to single channel and sampling rate to 16000") from err
    return output_file
def get_dataframe_segments(segments):
    """Build a DataFrame of timestamped transcript rows from aligner output.

    Args:
        segments: Sequence of ``(text, start_time, end_time)`` tuples, as
            produced by ``align_tdt_to_ctc_timestamps``.

    Returns:
        pandas.DataFrame with columns ``start_time``, ``end_time``, ``text``.
        A single all-zero/empty row is returned for empty input so the Gradio
        DataFrame component always has something to render.
    """
    columns = ['start_time', 'end_time', 'text']
    if not segments:
        return pd.DataFrame([(0, 0, '')], columns=columns)
    # Build all rows first, then construct the frame once — appending via
    # df.loc[len(df)] in a loop is quadratic in the number of segments.
    rows = [
        (round(start_time, 2), round(end_time, 2), text)
        for text, start_time, end_time in segments
        if text  # drop empty-text segments emitted by the aligner
    ]
    return pd.DataFrame(rows, columns=columns)
def get_video_info(url):
    """Fetch YouTube video metadata (duration, id, ...) without downloading media.

    Args:
        url: YouTube video URL.

    Returns:
        The yt-dlp info dict for the video.

    Raises:
        gr.Error: If metadata extraction fails for any reason.
    """
    ydl_opts = {
        'quiet': True,
        # BUG FIX: yt-dlp option keys use underscores; 'skip-download' was
        # silently ignored as an unknown key.
        'skip_download': True,
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        try:
            info = ydl.extract_info(url, download=False)
        # Narrowed from a bare `except:`; chain the cause for debuggability.
        except Exception as err:
            raise gr.Error("Failed to extract video info from Youtube") from err
    return info
def download_audio(url):
    """Download the best available audio track from YouTube as FLAC.

    The raw download is written to ``audio_file`` and post-processed to FLAC
    by yt-dlp, then normalized to mono 16 kHz via :func:`process_audio`.

    Args:
        url: YouTube video URL.

    Returns:
        Path of the processed (mono, 16 kHz) FLAC file.

    Raises:
        gr.Error: If the download fails.
    """
    options = {
        'format': 'bestaudio/best,channels:1',
        'quiet': True,
        'outtmpl': 'audio_file',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'flac',
            'preferredquality': '192',
        }],
    }
    with yt_dlp.YoutubeDL(options) as downloader:
        try:
            downloader.download([url])
        except yt_dlp.utils.DownloadError as err:
            raise gr.Error(str(err))
    return process_audio('audio_file.flac', 'processed_file.flac')
def get_audio_from_youtube(url):
    """Download audio for a YouTube URL and build an embeddable preview.

    Args:
        url: YouTube video URL.

    Returns:
        Tuple ``(audio_path, html)`` — path of the processed audio file and an
        ``<iframe>`` snippet that embeds the video player.

    Raises:
        gr.Error: If the video is longer than the single-pass limit.
    """
    info = get_video_info(url)
    duration = info.get('duration', 0)  # Duration in seconds
    video_id = info.get('id', None)
    html = f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
    if duration > 2 * 60 * 60:  # 2 hrs change later based on GPU
        # BUG FIX: the original `return gr.Error(...)` handed the exception
        # object back to the caller instead of raising it, so over-long videos
        # were still downloaded. Raise so Gradio surfaces the error.
        raise gr.Error("For GPU {}, single pass maximum audio can be 2hrs")
    return download_audio(url), html
def get_transcripts(audio_path, model):
with torch.cuda.amp.autocast(enabled=False, dtype=torch.bfloat16):
with torch.inference_mode():
text = model.transcribe(audio=[audio_path], )
return text
def pick_asr_model():
    """Load the Parakeet TDT-CTC 1.1B model and set greedy-batch decoding.

    Returns:
        The model in eval mode, moved to the module-level ``device``.
    """
    checkpoint = 'nvidia/parakeet-tdt_ctc-1.1b'
    model = ASRModel.from_pretrained(checkpoint).to(device)
    decoding_cfg = model.cfg.decoding
    decoding_cfg.strategy = "greedy_batch"
    model.change_decoding_strategy(decoding_cfg)
    model.eval()
    return model
# Load the ASR model once at import time; shared by every Gradio request below.
asr_model = pick_asr_model()
def run_nemo_models(url, microphone, audio_path):
    """Transcribe audio from YouTube, microphone, or an uploaded file.

    Args:
        url: Optional YouTube link; used when it looks non-empty.
        microphone: Optional filepath from the microphone component.
        audio_path: Optional filepath from the upload component.

    Returns:
        Tuple ``(df, html)`` — DataFrame of timestamped segments, and an embed
        snippet for the YouTube preview (None for local audio).
    """
    html = None
    use_youtube = url is not None and len(url) >= 2
    if use_youtube:
        gr.Info("Downloading and processing audio from Youtube")
        source_path, html = get_audio_from_youtube(url)
    else:
        # Prefer the microphone recording when both local sources are set.
        source_path = microphone if microphone else audio_path
    gr.Info("Running NeMo Model")
    text = get_transcripts(source_path, asr_model)
    segments = align_tdt_to_ctc_timestamps(text, asr_model, source_path)
    return get_dataframe_segments(segments), html
def clear_youtube_link():
    """Delete leftover ``.flac`` downloads from the working directory.

    Returns:
        None, which Gradio uses to reset the YouTube link textbox.
    """
    leftovers = [name for name in os.listdir() if name.endswith(".flac")]
    for name in leftovers:
        os.remove(name)
    return None
# TODO: speaker diarization (run_speaker_diarization) not implemented yet.
# ---- Gradio UI: three input tabs feed one transcription pipeline ----
with gr.Blocks(
    title="NeMo Parakeet Model",
    css="""
    textarea { font-size: 18px;}
    #model_output_text_box span {
        font-size: 18px;
        font-weight: bold;
    }
    """,
    theme=gr.themes.Default(text_size=gr.themes.sizes.text_lg)  # make text slightly bigger (default is text_md )
) as demo:
    gr.HTML("<h1 style='text-align: center'>Transcription with timestamps using Parakeet TDT-CTC</h1>")
    gr.Markdown('''
    Choose between different sources of audio (Microphone, Audio File, Youtube Video) to transcribe along with timestamps.
    Parakeet models with limited attention are quite fast due to their limited attention mechanism. The current model with 1.1B parameters can transcribe very long audios upto 11 hrs on A6000 GPU in a single pass.
    Model used: [nvidia/parakeet-tdt_ctc-1.1b](https://huggingface.co/nvidia/parakeet-tdt_ctc-1.1b).
    ''')
    # Input source tabs — exactly one of the three is used per request.
    with gr.Tab('Audio from Youtube'):
        with gr.Row():
            yt_link = gr.Textbox(value=None, label='Enter Youtube Link', type='text')
            yt_render = gr.HTML()  # filled with the <iframe> video preview
    with gr.Tab('Audio From File'):
        file_input = gr.Audio(sources='upload', label='Upload Audio', type='filepath')
    with gr.Tab('Audio From Microphone'):
        mic_input = gr.Audio(sources='microphone', label='Record Audio', type='filepath')
    # b1 = gr.Button("Get Transcription with Punctuation and Capitalization")
    gr.Markdown('''Speech Recognition''')
    # text_output = gr.Textbox(label='Transcription', type='text')
    b2 = gr.Button("Get timestamps with text")
    time_stamp = gr.DataFrame(wrap=True, label='Speech Recognition with TimeStamps',
                              row_count=(1, "dynamic"), headers=['start_time', 'end_time', 'text'])
    # b1.click(run_nemo_models, inputs=[file_input, mic_input, yt_link], outputs=[text_output, yt_render])
    # NOTE(review): run_nemo_models signature is (url, microphone, audio_path), but
    # inputs pass file_input as the 2nd and mic_input as the 3rd argument — it still
    # works because only one local source is set at a time; confirm the intended order.
    # The chained .then() resets the YouTube link and purges downloaded .flac files.
    b2.click(run_nemo_models, inputs=[yt_link, file_input, mic_input], outputs=[time_stamp, yt_render]).then(
        clear_youtube_link, None, yt_link, queue=False)  # here clean up passing None to audio.
demo.queue(True)
demo.launch(share=True, debug=True)