Aditya02 committed
Commit da20b78 · 1 Parent(s): 54772df

Added the file

Files changed (3)
  1. app.py +236 -0
  2. packages.txt +1 -0
  3. requirements.txt +21 -0
app.py ADDED
@@ -0,0 +1,236 @@
+ from faster_whisper import WhisperModel
+ import datetime
+ import subprocess
+ import gradio as gr
+ from pathlib import Path
+ import pandas as pd
+ import re
+ import time
+ import os
+ import numpy as np
+ from sklearn.cluster import AgglomerativeClustering
+ from sklearn.metrics import silhouette_score
+ import pyannote.audio
+ from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
+ from pyannote.audio import Audio
+ from pyannote.core import Segment
+ import torch
+ from gpuinfo import GPUInfo
+ import wave
+ import contextlib
+ from transformers import pipeline
+ import psutil
+
+
+ embedding_model = PretrainedSpeakerEmbedding(
+     "speechbrain/spkrec-ecapa-voxceleb",
+     device="cuda")
+     # device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
+
+
+ def convert_time(secs):
+     return datetime.timedelta(seconds=round(secs))
+
+ def speech_to_text(audio_file_path, selected_source_lang, whisper_model, num_speakers):
+     """
+     # Transcribe audio using OpenAI Whisper
+     1. Using OpenAI's Whisper model to separate audio into segments and generate transcripts.
+     2. Generating speaker embeddings for each segment.
+     3. Applying agglomerative clustering on the embeddings to identify the speaker for each segment.
+
+     Speech recognition is based on models from OpenAI Whisper https://github.com/openai/whisper
+     Speaker diarization model and pipeline from https://github.com/pyannote/pyannote-audio
+     """
+
+     model = WhisperModel(whisper_model, compute_type="int8")
+     time_start = time.time()
+
+     try:
+         # Get the file extension and convert the input to 16 kHz mono WAV
+         _, file_ending = os.path.splitext(f'{audio_file_path}')
+         print(f'file ending is {file_ending}')
+         audio_file = audio_file_path.replace(file_ending, ".wav")
+         # mp3 to wav format
+         os.system(f'ffmpeg -i "{audio_file_path}" -ar 16000 -ac 1 -acodec pcm_s16le "{audio_file}"')
+
+         # Video to audio
+         # os.system(f'ffmpeg -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{audio_file}"')
+
+         # Get duration
+         with contextlib.closing(wave.open(audio_file, 'r')) as f:
+             frames = f.getnframes()
+             rate = f.getframerate()
+             duration = frames / float(rate)
+
+         print(f"conversion to wav ready, duration of audio file: {duration}")
+
+         # Transcribe audio
+         options = dict(language=selected_source_lang, beam_size=5, best_of=5)
+         transcribe_options = dict(task="transcribe", **options)
+         segments_raw, info = model.transcribe(audio_file, **transcribe_options)
+
+         # Convert back to the original OpenAI segment format
+         segments = []
+         i = 0
+         for segment_chunk in segments_raw:
+             chunk = {}
+             chunk["start"] = segment_chunk.start
+             chunk["end"] = segment_chunk.end
+             chunk["text"] = segment_chunk.text
+             segments.append(chunk)
+             i += 1
+         print("transcribe audio done with faster-whisper")
+     except Exception as e:
+         raise RuntimeError("Error converting audio to wav") from e
+
+     try:
+         # Create an embedding for each segment
+         def segment_embedding(segment):
+             audio = Audio()
+             start = segment["start"]
+             # Whisper overshoots the end timestamp in the last segment
+             end = min(duration, segment["end"])
+             clip = Segment(start, end)
+             waveform, sample_rate = audio.crop(audio_file, clip)
+             return embedding_model(waveform[None])
+
+         embeddings = np.zeros(shape=(len(segments), 192))
+         for i, segment in enumerate(segments):
+             embeddings[i] = segment_embedding(segment)
+         embeddings = np.nan_to_num(embeddings)
+         print(f'Embedding shape: {embeddings.shape}')
+
+         if num_speakers == 0:
+             # Find the best number of speakers via silhouette score
+             score_num_speakers = {}
+
+             for num_speakers in range(2, 10 + 1):
+                 clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
+                 score = silhouette_score(embeddings, clustering.labels_, metric='euclidean')
+                 score_num_speakers[num_speakers] = score
+             best_num_speaker = max(score_num_speakers, key=lambda x: score_num_speakers[x])
+             print(f"The best number of speakers: {best_num_speaker} with {score_num_speakers[best_num_speaker]} score")
+         else:
+             best_num_speaker = num_speakers
+
+         # Assign speaker labels
+         clustering = AgglomerativeClustering(best_num_speaker).fit(embeddings)
+         labels = clustering.labels_
+         for i in range(len(segments)):
+             segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
+
+         # Make output
+         objects = {
+             'Start': [],
+             'End': [],
+             'Speaker': [],
+             'Text': []
+         }
+         text = ''
+         for (i, segment) in enumerate(segments):
+             if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
+                 objects['Start'].append(str(convert_time(segment["start"])))
+                 objects['Speaker'].append(segment["speaker"])
+                 if i != 0:
+                     objects['End'].append(str(convert_time(segments[i - 1]["end"])))
+                     objects['Text'].append(text)
+                     text = ''
+             text += segment["text"] + ' '
+         objects['End'].append(str(convert_time(segments[-1]["end"])))  # end of the final speaker block
+         objects['Text'].append(text)
+
+         time_end = time.time()
+         time_diff = time_end - time_start
+         memory = psutil.virtual_memory()
+         gpu_utilization, gpu_memory = GPUInfo.gpu_usage()
+         gpu_utilization = gpu_utilization[0] if len(gpu_utilization) > 0 else 0
+         gpu_memory = gpu_memory[0] if len(gpu_memory) > 0 else 0
+         system_info = f"""
+         *Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB.*
+         *Processing time: {time_diff:.5} seconds.*
+         *GPU Utilization: {gpu_utilization}%, GPU Memory: {gpu_memory}MiB.*
+         """
+         save_path = "transcript_result.csv"
+         df_results = pd.DataFrame(objects)
+         df_results.to_csv(save_path)
+         return df_results, system_info, save_path
+     except Exception as e:
+         raise RuntimeError("Error running inference with local model") from e
+
+ # Code inspired by https://huggingface.co/spaces/vumichien/Whisper_speaker_diarization/blob/main/app.py
+
+ whisper_models = ["tiny", "base", "small", "medium", "large-v1", "large-v2"]
+ source_languages = {
+     "en": "English",
+     "zh": "Chinese"}
+
+
+ # Gradio app
+
+ memory = psutil.virtual_memory()
+ microphone = gr.inputs.Audio(source="microphone", type="filepath", optional=True)
+ upload = gr.inputs.Audio(source="upload", type="filepath", optional=True)
+ df_init = pd.DataFrame(columns=['Start', 'End', 'Speaker', 'Text'])
+ selected_source_lang = gr.Dropdown(choices=source_languages, type="value", value="en", label="Spoken language in audio",
+                                    interactive=True)
+ selected_whisper_model = gr.Dropdown(choices=whisper_models, type="value", value="base", label="Selected Whisper model",
+                                      interactive=True)
+ number_speakers = gr.Number(precision=0, value=0,
+                             label="Input number of speakers for better results. If value=0, the model will automatically find the best number of speakers",
+                             interactive=True)
+ transcription_df = gr.DataFrame(value=df_init, label="Transcription dataframe", row_count=(0, "dynamic"), max_rows=10,
+                                 wrap=True, overflow_row_behaviour='paginate')
+ download_transcript = gr.File(label="Download transcript")
+ system_info = gr.Markdown(
+     f"*Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB*")
+ title = "Whisper speaker diarization"
+ demo = gr.Blocks(title=title)
+ demo.encrypt = False
+
+ with demo:
+     with gr.Tab("Whisper speaker diarization"):
+         gr.Markdown('''
+             <div>
+             <h1 style='text-align: center'>Whisper speaker diarization</h1>
+             This space uses Whisper models from <a href='https://github.com/openai/whisper' target='_blank'><b>OpenAI</b></a> with <a href='https://github.com/guillaumekln/faster-whisper' target='_blank'><b>CTranslate2</b></a>, a fast inference engine for Transformer models (about 4 times faster than the original OpenAI implementation with the same accuracy), to recognize speech,
+             and the ECAPA-TDNN model from <a href='https://github.com/speechbrain/speechbrain' target='_blank'><b>SpeechBrain</b></a> to encode and classify speakers.
+             </div>
+         ''')
+
+         # with gr.Row():
+         #     gr.Markdown('''
+         #     ### Transcribe audio using OpenAI Whisper
+         #     ##### 1. Using OpenAI's Whisper model to separate audio into segments and generate transcripts.
+         #     ##### 2. Generating speaker embeddings for each segment.
+         #     ##### 3. Applying agglomerative clustering on the embeddings to identify the speaker for each segment.
+         #     ''')
+
+         with gr.Row():
+             with gr.Column():
+                 with gr.Column():
+                     gr.Markdown('''
+                     ##### Here you can start the transcription process.
+                     ##### Please select the source language for transcription.
+                     ##### You can select a range of assumed numbers of speakers.
+                     ''')
+                     selected_source_lang.render()
+                     selected_whisper_model.render()
+                     number_speakers.render()
+                     upload.render()
+                     transcribe_btn = gr.Button("Transcribe audio and diarization")
+                     transcribe_btn.click(speech_to_text,
+                                          [upload, selected_source_lang, selected_whisper_model, number_speakers],
+                                          [transcription_df, system_info, download_transcript]
+                                          )
+         with gr.Row():
+             gr.Markdown('''
+             ##### Here you will get the transcription output
+             ##### ''')
+
+         with gr.Row():
+             with gr.Column():
+                 download_transcript.render()
+                 transcription_df.render()
+                 system_info.render()
+
+ demo.launch(debug=True)
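
For reference, the speaker-count search described in the speech_to_text docstring (agglomerative clustering scored by silhouette) can be exercised on its own. Below is a minimal sketch, assuming only scikit-learn and NumPy from requirements.txt, with synthetic 192-dimensional vectors standing in for the ECAPA-TDNN embeddings:

# Standalone sketch of the speaker-count search in app.py (synthetic data, not real ECAPA-TDNN embeddings).
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

rng = np.random.default_rng(0)
# Fake embeddings: 3 "speakers", 10 segments each, 192 dimensions like the real model output.
embeddings = np.vstack([rng.normal(loc=c, scale=0.1, size=(10, 192)) for c in (0.0, 1.0, 2.0)])

score_num_speakers = {}
for k in range(2, 10 + 1):
    labels = AgglomerativeClustering(k).fit(embeddings).labels_
    score_num_speakers[k] = silhouette_score(embeddings, labels, metric="euclidean")

best_num_speaker = max(score_num_speakers, key=score_num_speakers.get)
print(f"best number of speakers: {best_num_speaker}")  # expected: 3 for this synthetic data

With well-separated synthetic clusters the search settles on 3 speakers; real embeddings are noisier, which is why the app also lets the user fix the speaker count directly via the number_speakers input.
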
packages.txt ADDED
@@ -0,0 +1 @@
+ ffmpeg
requirements.txt ADDED
@@ -0,0 +1,21 @@
+ git+https://github.com/huggingface/transformers
+ git+https://github.com/pyannote/pyannote-audio
+ git+https://github.com/openai/whisper.git
+ gradio==3.12
+ ffmpeg-python
+ pandas==1.5.0
+ pytube==12.1.0
+ sacremoses
+ sentencepiece
+ tokenizers
+ torch
+ torchaudio
+ tqdm==4.64.1
+ EasyNMT==2.0.2
+ nltk
+ transformers
+ pysrt
+ psutil==5.9.2
+ requests
+ gpuinfo
+ faster-whisper