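"""Gradio demo: speech recognition with segment timestamps using NVIDIA NeMo's
Parakeet TDT-CTC 1.1B model.

Audio can come from a YouTube link, an uploaded file, or the microphone.
Timestamps are produced by aligning the TDT transcript against CTC frame
probabilities (see nemo_align.align_tdt_to_ctc_timestamps).
"""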
import subprocess
import torch
import gradio as gr
import yt_dlp
import pandas as pd
from nemo.collections.asr.models import ASRModel
from nemo_align import align_tdt_to_ctc_timestamps
import os


device = "cuda" if torch.cuda.is_available() else "cpu"

def process_audio(input_file, output_file):
    """Convert audio to mono 16 kHz using sox (must be installed on the host)."""
    command = [
        'sox', input_file,
        output_file,
        'channels', '1',
        'rate', '16000'
    ]
    try:
        subprocess.run(command, check=True)
        return output_file
    except (subprocess.CalledProcessError, FileNotFoundError):
        raise gr.Error("Failed to convert audio to a single channel at a 16000 Hz sampling rate")

def get_dataframe_segments(segments):
    """Build a DataFrame from (text, start_time, end_time) segment tuples."""
    df = pd.DataFrame(columns=['start_time', 'end_time', 'text'])
    if len(segments) == 0:
        df.loc[0] = 0, 0, ''
        return df

    for segment in segments:
        text, start_time, end_time = segment
        if len(text) > 0:
            df.loc[len(df)] = round(start_time, 2), round(end_time, 2), text

    return df


def get_video_info(url):
    ydl_opts = {
        'quiet': True,
        'skip_download': True,
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        try:
            info = ydl.extract_info(url, download=False)
        except yt_dlp.utils.DownloadError:
            raise gr.Error("Failed to extract video info from YouTube")
        return info

def download_audio(url):
    ydl_opts = {
        'format': 'bestaudio/best',  # downmixing to mono is handled later by process_audio
        'quiet': True,
        'outtmpl': 'audio_file',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'flac',
            'preferredquality': '192',
        }],
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        try:
            ydl.download([url])
        except yt_dlp.utils.DownloadError as err:
            raise gr.Error(str(err))

    # The FLAC postprocessor appends the extension to outtmpl.
    return process_audio('audio_file.flac', 'processed_file.flac')


def get_audio_from_youtube(url):
    info = get_video_info(url)
    duration = info.get('duration', 0)  # duration in seconds
    video_id = info.get('id', None)

    html = f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'

    if duration > 2 * 60 * 60:  # 2 hrs; adjust later based on the GPU
        raise gr.Error("On this GPU, a single pass can handle at most 2 hours of audio")
    else:
        return download_audio(url), html


def get_transcripts(audio_path, model):
    # Autocast is explicitly disabled here; transcription runs in the model's native precision.
    with torch.cuda.amp.autocast(enabled=False, dtype=torch.bfloat16):
        with torch.inference_mode():
            text = model.transcribe(audio=[audio_path])
    return text

def pick_asr_model():
    model = 'nvidia/parakeet-tdt_ctc-1.1b'
    asr_model = ASRModel.from_pretrained(model).to(device)
    # Use greedy batched decoding for speed on long-form audio.
    asr_model.cfg.decoding.strategy = "greedy_batch"
    asr_model.change_decoding_strategy(asr_model.cfg.decoding)
    asr_model.eval()
    return asr_model

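# Load the model once at import time so it stays resident on the device across requests.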
asr_model = pick_asr_model()

def run_nemo_models(url, microphone, audio_path):
    html = None
    if url is None or len(url) < 2:
        path1 = microphone if microphone else audio_path
    else:
        gr.Info("Downloading and processing audio from YouTube")
        path1, html = get_audio_from_youtube(url)

    gr.Info("Running NeMo Model")
    text = get_transcripts(path1, asr_model)

    # Align the TDT transcript against CTC frame probabilities to recover timestamps.
    segments = align_tdt_to_ctc_timestamps(text, asr_model, path1)

    df = get_dataframe_segments(segments)

    return df, html

def clear_youtube_link():
    # Remove leftover .flac files from the current directory.
    file_list = os.listdir()
    for file in file_list:
        if file.endswith(".flac"):
            os.remove(file)

    return None


# def run_speaker_diarization()

with gr.Blocks(
    title="NeMo Parakeet Model",
    css="""
        textarea { font-size: 18px;}
        #model_output_text_box span {
            font-size: 18px;
            font-weight: bold;
        }
    """,
    theme=gr.themes.Default(text_size=gr.themes.sizes.text_lg)  # make text slightly bigger (default is text_md)
) as demo:
    gr.HTML("<h1 style='text-align: center'>Transcription with timestamps using Parakeet TDT-CTC</h1>")
    gr.Markdown('''
    Choose a source of audio (microphone, audio file, or YouTube video) to transcribe along with timestamps.

    Parakeet models are fast thanks to their limited-context attention mechanism. The current 1.1B-parameter model can transcribe very long audio, up to 11 hours on an A6000 GPU, in a single pass.

    Model used: [nvidia/parakeet-tdt_ctc-1.1b](https://huggingface.co/nvidia/parakeet-tdt_ctc-1.1b).
    ''')
    # Input tabs: YouTube link, uploaded file, or microphone recording.
    with gr.Tab('Audio from Youtube'):
        with gr.Row():
            yt_link = gr.Textbox(value=None, label='Enter YouTube Link', type='text')
            yt_render = gr.HTML()
    
    with gr.Tab('Audio From File'):
        file_input = gr.Audio(sources='upload', label='Upload Audio', type='filepath')

    with gr.Tab('Audio From Microphone'):
        mic_input = gr.Audio(sources='microphone', label='Record Audio', type='filepath')
        
 
    # b1 = gr.Button("Get Transcription with Punctuation and Capitalization")

    gr.Markdown('''Speech Recognition''')

    # text_output = gr.Textbox(label='Transcription', type='text')

    b2 = gr.Button("Get timestamps with text")

    time_stamp = gr.DataFrame(wrap=True, label='Speech Recognition with TimeStamps',
        row_count=(1, "dynamic"), headers=['start_time', 'end_time', 'text'])

    # b1.click(run_nemo_models, inputs=[file_input, mic_input, yt_link], outputs=[text_output, yt_render])

    # Input order must match the run_nemo_models(url, microphone, audio_path) signature.
    b2.click(run_nemo_models, inputs=[yt_link, mic_input, file_input], outputs=[time_stamp, yt_render]).then(
        clear_youtube_link, None, yt_link, queue=False)  # clean up: clear the YouTube link after the run.

    demo.queue()
    demo.launch(share=True, debug=True)