vumichien committed on
Commit
9c701cc
·
1 Parent(s): e318194

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +321 -92
app.py CHANGED
@@ -1,97 +1,326 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import torch
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
- import gradio as gr
4
- import pytube as pt
5
- from transformers import pipeline
6
- from huggingface_hub import model_info
7
-
8
- MODEL_NAME = "vumichien/whisper-medium-jp" #this always needs to stay in line 8 :D sorry for the hackiness
9
- lang = "ja"
10
-
11
- device = 0 if torch.cuda.is_available() else "cpu"
12
- pipe = pipeline(
13
- task="automatic-speech-recognition",
14
- model=MODEL_NAME,
15
- chunk_length_s=30,
16
- device=device,
17
- )
18
-
19
- pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")
20
-
21
- def transcribe(microphone, file_upload):
22
- warn_output = ""
23
- if (microphone is not None) and (file_upload is not None):
24
- warn_output = (
25
- "WARNING: You've uploaded an audio file and used the microphone. "
26
- "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
27
- )
28
-
29
- elif (microphone is None) and (file_upload is None):
30
- return "ERROR: You have to either use the microphone or upload an audio file"
31
-
32
- file = microphone if microphone is not None else file_upload
33
-
34
- text = pipe(file)["text"]
35
-
36
- return warn_output + text
37
-
38
-
39
- def _return_yt_html_embed(yt_url):
40
- video_id = yt_url.split("?v=")[-1]
41
- HTML_str = (
42
- f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
43
- " </center>"
44
- )
45
- return HTML_str
46
-
47
-
48
- def yt_transcribe(yt_url):
49
- yt = pt.YouTube(yt_url)
50
- html_embed_str = _return_yt_html_embed(yt_url)
51
- stream = yt.streams.filter(only_audio=True)[0]
52
- stream.download(filename="audio.mp3")
53
-
54
- text = pipe("audio.mp3")["text"]
55
-
56
- return html_embed_str, text
57
-
58
-
59
- demo = gr.Blocks()
60
-
61
- mf_transcribe = gr.Interface(
62
- fn=transcribe,
63
- inputs=[
64
- gr.inputs.Audio(source="microphone", type="filepath", optional=True),
65
- gr.inputs.Audio(source="upload", type="filepath", optional=True),
66
- ],
67
- outputs="text",
68
- layout="horizontal",
69
- theme="huggingface",
70
- title="Whisper Medium Japanese: Transcribe Audio",
71
- description=(
72
- "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the the fine-tuned"
73
- f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
74
- " of arbitrary length."
75
- ),
76
- allow_flagging="never",
77
- )
78
-
79
- yt_transcribe = gr.Interface(
80
- fn=yt_transcribe,
81
- inputs=[gr.inputs.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL")],
82
- outputs=["html", "text"],
83
- layout="horizontal",
84
- theme="huggingface",
85
- title="Whisper Medium Japanese: Transcribe YouTube",
86
- description=(
87
- "Transcribe long-form YouTube videos with the click of a button! Demo uses the the fine-tuned checkpoint:"
88
- f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files of"
89
- " arbitrary length."
90
- ),
91
- allow_flagging="never",
92
- )
93
 
94
  with demo:
95
- gr.TabbedInterface([mf_transcribe, yt_transcribe], ["Transcribe Audio", "Transcribe YouTube"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
 
97
- demo.launch(enable_queue=True)
 
1
+ import whisper
2
+ import datetime
3
+ import subprocess
4
+ import gradio as gr
5
+ from pathlib import Path
6
+ import pandas as pd
7
+ import re
8
+ import time
9
+ import os
10
+ import numpy as np
11
+ from sklearn.cluster import AgglomerativeClustering
12
+
13
+ from pytube import YouTube
14
  import torch
15
+ import pyannote.audio
16
+ from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
17
+ from pyannote.audio import Audio
18
+ from pyannote.core import Segment
19
+
20
+ import wave
21
+ import contextlib
22
+
23
+ import psutil
24
+ num_cores = psutil.cpu_count()
25
+ os.environ["OMP_NUM_THREADS"] = f"{num_cores}"
26
+
27
# Whisper checkpoints selectable in the UI.
whisper_models = [
    "base",
    "small",
    "medium",
    "large",
    "base.en",
]

# Language code -> display name. Insertion order is significant: the UI
# dropdown lists the codes in exactly this order.
source_languages = {
    "en": "English", "zh": "Chinese", "de": "German",
    "es": "Spanish", "ru": "Russian", "ko": "Korean",
    "fr": "French", "ja": "Japanese", "pt": "Portuguese",
    "tr": "Turkish", "pl": "Polish", "ca": "Catalan",
    "nl": "Dutch", "ar": "Arabic", "sv": "Swedish",
    "it": "Italian", "id": "Indonesian", "hi": "Hindi",
    "fi": "Finnish", "vi": "Vietnamese", "he": "Hebrew",
    "uk": "Ukrainian", "el": "Greek", "ms": "Malay",
    "cs": "Czech", "ro": "Romanian", "da": "Danish",
    "hu": "Hungarian", "ta": "Tamil", "no": "Norwegian",
    "th": "Thai", "ur": "Urdu", "hr": "Croatian",
    "bg": "Bulgarian", "lt": "Lithuanian", "la": "Latin",
    "mi": "Maori", "ml": "Malayalam", "cy": "Welsh",
    "sk": "Slovak", "te": "Telugu", "fa": "Persian",
    "lv": "Latvian", "bn": "Bengali", "sr": "Serbian",
    "az": "Azerbaijani", "sl": "Slovenian", "kn": "Kannada",
    "et": "Estonian", "mk": "Macedonian", "br": "Breton",
    "eu": "Basque", "is": "Icelandic", "hy": "Armenian",
    "ne": "Nepali", "mn": "Mongolian", "bs": "Bosnian",
    "kk": "Kazakh", "sq": "Albanian", "sw": "Swahili",
    "gl": "Galician", "mr": "Marathi", "pa": "Punjabi",
    "si": "Sinhala", "km": "Khmer", "sn": "Shona",
    "yo": "Yoruba", "so": "Somali", "af": "Afrikaans",
    "oc": "Occitan", "ka": "Georgian", "be": "Belarusian",
    "tg": "Tajik", "sd": "Sindhi", "gu": "Gujarati",
    "am": "Amharic", "yi": "Yiddish", "lo": "Lao",
    "uz": "Uzbek", "fo": "Faroese", "ht": "Haitian creole",
    "ps": "Pashto", "tk": "Turkmen", "nn": "Nynorsk",
    "mt": "Maltese", "sa": "Sanskrit", "lb": "Luxembourgish",
    "my": "Myanmar", "bo": "Tibetan", "tl": "Tagalog",
    "mg": "Malagasy", "as": "Assamese", "tt": "Tatar",
    "haw": "Hawaiian", "ln": "Lingala", "ha": "Hausa",
    "ba": "Bashkir", "jw": "Javanese", "su": "Sundanese",
}
129
# Pick the GPU when one is present, otherwise fall back to the CPU. This is
# resolved *before* any model is loaded so every model lands on the same,
# actually-available device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("DEVICE IS: ")
print(device)

# Speaker-embedding model used for diarization; loaded once at import time.
# Previously this was pinned to torch.device("cuda"), which fails on
# CPU-only hosts even though a CPU fallback is computed for `device`.
embedding_model = PretrainedSpeakerEmbedding(
    "speechbrain/spkrec-ecapa-voxceleb",
    device=device)

# Language codes offered in the UI, in the dictionary's insertion order.
source_language_list = list(source_languages)

# Output directory, created eagerly so later writes cannot fail on a missing folder.
videos_out_path = Path("./videos_out")
videos_out_path.mkdir(parents=True, exist_ok=True)
141
+
142
+
143
def time(secs):
    """Return *secs* as a ``datetime.timedelta`` rounded to whole seconds.

    NOTE: this name shadows the ``time`` module imported at the top of the
    file; it is kept because other code in this file calls ``time(...)``.
    """
    whole_seconds = round(secs)
    return datetime.timedelta(seconds=whole_seconds)
145
+
146
def get_youtube(video_url):
    """Download a YouTube video and return the path of the saved file.

    The highest-resolution progressive (audio + video) MP4 stream is chosen.

    Args:
        video_url: URL of the YouTube video.

    Returns:
        The path returned by the stream's ``download()`` call.

    Raises:
        ValueError: if the video exposes no progressive MP4 stream.
    """
    yt = YouTube(video_url)
    stream = (
        yt.streams
        .filter(progressive=True, file_extension='mp4')
        .order_by('resolution')
        .desc()
        .first()
    )
    # first() yields None for an empty query; fail with a clear message
    # instead of an AttributeError on None.download().
    if stream is None:
        raise ValueError(f"No progressive mp4 stream available for {video_url}")

    abs_video_path = stream.download()
    print("Success download video")
    print(abs_video_path)
    return abs_video_path
152
+
153
+
154
def _wav_path_for(video_file_path):
    """Return the path of the WAV file to derive from *video_file_path*."""
    root, ext = os.path.splitext(video_file_path)
    # Swap only the final extension. str.replace() would also rewrite earlier
    # occurrences of the extension text, and with an empty extension it would
    # insert ".wav" between every character of the path.
    if ext.lower() == ".wav":
        # ffmpeg cannot read from and write to the same file.
        return root + "_16k.wav"
    return root + ".wav"


def _convert_to_wav(video_file_path, audio_file):
    """Extract the audio track as 16 kHz mono 16-bit PCM WAV using ffmpeg."""
    # Argument list (no shell): file names containing quotes, spaces or shell
    # metacharacters are passed through verbatim. "-y" overwrites a stale WAV
    # from an earlier run instead of prompting on stdin.
    subprocess.run(
        [
            "ffmpeg", "-y",
            "-i", video_file_path,
            "-ar", "16000",
            "-ac", "1",
            "-c:a", "pcm_s16le",
            audio_file,
        ],
        check=True,
    )


def _wav_duration(audio_file):
    """Return the duration of a WAV file in seconds."""
    with contextlib.closing(wave.open(audio_file, 'r')) as f:
        return f.getnframes() / float(f.getframerate())


def _label_speakers(segments, audio_file, duration, num_speakers):
    """Add a 'speaker' key to every segment by clustering speaker embeddings."""
    audio = Audio()
    # 192 columns per embedding; presumably the output size of the
    # module-level embedding model -- TODO confirm if the model is changed.
    embeddings = np.zeros(shape=(len(segments), 192))
    for i, segment in enumerate(segments):
        # Whisper overshoots the end timestamp in the last segment
        end = min(duration, segment["end"])
        waveform, _sample_rate = audio.crop(audio_file, Segment(segment["start"], end))
        embeddings[i] = embedding_model(waveform[None])
    embeddings = np.nan_to_num(embeddings)
    print(f'Embedding shape: {embeddings.shape}')

    if len(segments) < 2:
        # Clustering needs at least two samples; a lone segment is speaker 1.
        labels = [0] * len(segments)
    else:
        # Never ask for more clusters than there are segments.
        n_clusters = max(1, min(int(num_speakers), len(segments)))
        labels = AgglomerativeClustering(n_clusters).fit(embeddings).labels_

    for segment, label in zip(segments, labels):
        segment["speaker"] = 'SPEAKER ' + str(label + 1)


def _group_by_speaker(segments):
    """Merge consecutive same-speaker segments into Start/End/Speaker/Text columns."""
    rows = {
        'Start': [],
        'End': [],
        'Speaker': [],
        'Text': [],
    }
    text = ''
    for i, segment in enumerate(segments):
        if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
            if i != 0:
                # Close the previous speaker's turn.
                rows['End'].append(str(time(segments[i - 1]["end"])))
                rows['Text'].append(text)
                text = ''
            rows['Start'].append(str(time(segment["start"])))
            rows['Speaker'].append(segment["speaker"])
        text += segment["text"] + ' '
    if segments:
        # The final turn ends with the LAST segment (not the one before it).
        rows['End'].append(str(time(segments[-1]["end"])))
        rows['Text'].append(text)
    return rows


def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_speakers):
    """Transcribe a video file and attribute each passage to a speaker.

    The audio track is converted to a 16 kHz mono WAV with ffmpeg, transcribed
    with the requested Whisper model, and the resulting segments are clustered
    into speakers using embeddings from the module-level ``embedding_model``.

    Args:
        video_file_path: path (string) of the input video file.
        selected_source_lang: language code passed to Whisper's ``transcribe``.
        whisper_model: name of the Whisper model to load.
        num_speakers: requested number of speakers (clusters); capped at the
            number of transcribed segments.

    Returns:
        A pandas DataFrame with columns ``Start``, ``End``, ``Speaker`` and
        ``Text``, one row per uninterrupted speaker turn. Empty when Whisper
        returns no segments.

    Raises:
        ValueError: if ``video_file_path`` is None.
        RuntimeError: if conversion/transcription or diarization fails; the
            original exception is chained as ``__cause__``.
    """
    # Validate before paying for the model load.
    if video_file_path is None:
        raise ValueError("Error no video input")
    print(video_file_path)

    model = whisper.load_model(whisper_model)

    try:
        # Read and convert youtube video
        _, file_ending = os.path.splitext(f'{video_file_path}')
        print(f'file enging is {file_ending}')
        print("starting conversion to wav")
        audio_file = _wav_path_for(video_file_path)
        _convert_to_wav(video_file_path, audio_file)

        # Get duration
        duration = _wav_duration(audio_file)
        print(f"conversion to wav ready, duration of audio file: {duration}")

        # Transcribe audio
        result = model.transcribe(audio_file, task="transcribe", language=selected_source_lang)
        segments = result["segments"]
        print("starting whisper done with whisper")
    except Exception as e:
        raise RuntimeError("Error converting video to audio") from e

    try:
        _label_speakers(segments, audio_file, duration, num_speakers)
        return pd.DataFrame(_group_by_speaker(segments))
    except Exception as e:
        raise RuntimeError("Error Running inference with local model", e) from e
242
+
243
+
244
# ---- Gradio Layout -----
# Components are created here and placed later via .render() inside the
# `with demo:` layout. Creation order is kept unchanged.
video_in = gr.Video(
    label="Video file",
    mirror_webcam=False,
)
youtube_url_in = gr.Textbox(
    label="Youtube url",
    lines=1,
    interactive=True,
)
video_out = gr.Video(
    label="Video Out",
    mirror_webcam=False,
)


# Empty frame shown in the results table before the first transcription.
df_init = pd.DataFrame(columns=['Start','End', 'Speaker', 'Text'])

selected_source_lang = gr.Dropdown(
    choices=source_language_list,
    type="value",
    value="en",
    label="Spoken language in video",
    interactive=True,
)
selected_whisper_model = gr.Dropdown(
    choices=whisper_models,
    type="value",
    value="base",
    label="Selected Whisper model",
    interactive=True,
)
number_speakers = gr.Number(
    precision=0,
    value=2,
    label="Selected number of speakers",
    interactive=True,
)

transcription_df = gr.DataFrame(
    value=df_init,
    label="Transcription dataframe",
    row_count=(0, "dynamic"),
    max_rows=10,
    wrap=True,
    overflow_row_behaviour='paginate',
)

# NOTE(review): the `#\31 3` selector appears to target the element whose id
# is "13" -- presumably an auto-assigned component id; confirm before adding
# or removing components.
demo = gr.Blocks(css='''
#cut_btn, #reset_btn { align-self:stretch; }
#\\31 3 { max-width: 540px; }
.output-markdown {max-width: 65ch !important;}
''')
demo.encrypt = False
264
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
265
 
266
# Page layout: intro + examples, URL download row, video + transcription
# controls, and finally the results table.
with demo:
    transcription_var = gr.Variable()

    # --- Row 1: description / system info (left), example links (right) ---
    with gr.Row():
        with gr.Column():
            gr.Markdown('''
            ### This space allows you to:
            ##### 1. Download youtube video with a given URL
            ##### 2. Watch it in the first video component
            ##### 3. Run automatic speech recognition and diarization (speaker identification)
            ''')
            # Memory snapshot taken once, when the layout is built.
            memory = psutil.virtual_memory()
            system_info = gr.Markdown(f"*Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB*")

        with gr.Column():
            gr.Markdown('''
            ### Insert Youtube URL below. Some test youtube links below:
            ''')
            examples = gr.Examples(
                examples=[
                    "https://www.youtube.com/watch?v=j7BfEzAFuYc&t=32s",
                    "https://www.youtube.com/watch?v=-UX0X45sYe4",
                    "https://www.youtube.com/watch?v=7minSgqi-Gw",
                ],
                label="Examples",
                inputs=[youtube_url_in],
            )

    # --- Row 2: URL box and download button; the download feeds video_in ---
    with gr.Row():
        with gr.Column():
            youtube_url_in.render()
            download_youtube_btn = gr.Button("Download Youtube video")
            download_youtube_btn.click(
                get_youtube,
                [youtube_url_in],
                [video_in],
            )
            print(video_in)

    # --- Row 3: video player (left), transcription settings (right) ---
    with gr.Row():
        with gr.Column():
            video_in.render()
        with gr.Column():
            gr.Markdown('''
            ##### Here you can start the transcription process.
            ##### Please select source language for transcription.
            ##### Please select number of speakers for getting better results.
            ''')
            selected_source_lang.render()
            selected_whisper_model.render()
            number_speakers.render()
            transcribe_btn = gr.Button("Transcribe audio and diarization")
            transcribe_btn.click(
                speech_to_text,
                [video_in, selected_source_lang, selected_whisper_model, number_speakers],
                transcription_df,
            )

    # --- Row 4: heading for the results ---
    with gr.Row():
        gr.Markdown('''
        ##### Here you will get transcription output
        ##### ''')

    # --- Row 5: results table ---
    with gr.Row():
        with gr.Column():
            transcription_df.render()

demo.launch(debug=True)