RustX committed on
Commit
d99f4b8
·
1 Parent(s): 54a7e83

Create app.py

Files changed (1)
  1. app.py +29 -1104
app.py CHANGED
@@ -1,1118 +1,43 @@
1
- # Models
2
- import torch
3
- from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
4
- from pyannote.audio import Pipeline
5
 
6
- # Audio Manipulation
7
- import audioread
8
- from pydub import AudioSegment, silence
9
- import yt_dlp
10
- from yt_dlp import DownloadError
11
 
12
- # Others
13
- import pandas as pd
14
- from datetime import timedelta
15
- import os
16
- import streamlit as st
17
- import time
18
- import pickle
19
 
20
- def config():
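- """Set up the Streamlit page, create the data directory and initialize all the session state variables."""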
21
-
22
- st.set_page_config(page_title="Speech to Text / μŒμ„±μ„ ν…μŠ€νŠΈλ‘œ", page_icon="πŸ“")
23
-
24
- # Create a data directory to store our audio files
25
- if not os.path.exists("../data"):
26
- os.makedirs("../data")
27
-
28
- # Initialize session state variables
29
- if 'page_index' not in st.session_state:
30
- st.session_state['page_index'] = -1 # Handle which page should be displayed (token page, home page, results page, rename page)
31
- st.session_state['txt_transcript'] = "" # Save the transcript as .txt so we can display it again on the results page
32
- st.session_state["process"] = [] # Save the results obtained so we can display them again on the results page
33
- st.session_state['srt_txt'] = "" # Save the transcript in a subtitles case to display it on the results page
34
- st.session_state['srt_token'] = 0 # Is subtitles parameter enabled or not
35
- st.session_state['audio_file'] = None # Save the audio file provided by the user so we can display it again on the results page
36
- st.session_state["start_time"] = 0 # Default audio player starting point (0s)
37
- st.session_state["summary"] = "" # Save the summary of the transcript so we can display it on the results page
38
- st.session_state["number_of_speakers"] = 0 # Save the number of speakers detected in the conversation (diarization)
39
- st.session_state["chosen_mode"] = 0 # Save the mode chosen by the user (Diarization or not, timestamps or not)
40
- st.session_state["btn_token_list"] = [] # List of tokens that indicates what options are activated to adapt the display on results page
41
- st.session_state["my_HF_token"] = "ACCESS_TOKEN_GOES_HERE" # User's Token that allows the use of the diarization model
42
- st.session_state["disable"] = True # Default appearance of the button to change your token
43
-
44
- # Display Text and CSS
45
- st.title("Speech to Text App / μŒμ„±μ„ ν…μŠ€νŠΈλ‘œ μ•± πŸ“")
46
-
47
- st.markdown("""
48
- <style>
49
- .block-container.css-12oz5g7.egzxvld2{
50
- padding: 1%;}
51
-
52
- .stRadio > label:nth-child(1){
53
- font-weight: bold;
54
- }
55
- .stRadio > div{flex-direction:row;}
56
- p, span{
57
- text-align: justify;
58
- }
59
- span{
60
- text-align: center;
61
- }
62
- """, unsafe_allow_html=True)
63
-
64
- st.subheader("You want to extract text from an audio/video? You are in the right place! / μ˜€λ””μ˜€/λΉ„λ””μ˜€μ—μ„œ ν…μŠ€νŠΈλ₯Ό μΆ”μΆœν•˜κ³  μ‹ΆμŠ΅λ‹ˆκΉŒ? 당신은 λ°”λ‘œ 이곳에 μžˆμŠ΅λ‹ˆλ‹€!")
65
-
66
- def load_options(audio_length, dia_pipeline):
67
- """
68
- Display options so the user can customize the result (summarize the transcript? trim the audio? ...)
69
- The user chooses his parameters with sliders & checkboxes, both displayed in an st.form so the page doesn't
70
- reload when interacting with an element (a reload would make the user lose fluidity).
71
- :return: the chosen parameters
72
- """
73
- # Create an st.form()
74
- with st.form("form"):
75
- st.markdown("""<h6>
76
- You can transcribe a specific part of your audio by setting start and end values below (in seconds). Then,
77
- choose your parameters. / μ•„λž˜μ˜ μ‹œμž‘ 및 μ’…λ£Œ κ°’(초 λ‹¨μœ„)을 μ„€μ •ν•˜μ—¬ μ˜€λ””μ˜€μ˜ νŠΉμ • 뢀뢄을 전사할 수 μžˆμŠ΅λ‹ˆλ‹€. 그런 λ‹€μŒ λ§€κ°œλ³€μˆ˜λ₯Ό μ„ νƒν•˜μ„Έμš”.</h6>""", unsafe_allow_html=True)
78
-
79
- # Possibility to trim / cut the audio on a specific part (=> transcribe less seconds will result in saving time)
80
- # To perform that, user selects his time intervals thanks to sliders, displayed in 2 different columns
81
  col1, col2 = st.columns(2)
82
- with col1:
83
- start = st.slider("Start value (s) / μ‹œμž‘ κ°’(초)", 0, audio_length, value=0)
84
- with col2:
85
- end = st.slider("End value (s) / μ’…λ£Œ κ°’(초)", 0, audio_length, value=audio_length)
86
-
87
- # Create 3 new columns to display other options
88
- col1, col2, col3 = st.columns(3)
89
 
90
- # User selects his preferences with checkboxes
91
  with col1:
92
- # Differentiate Speakers
93
- if dia_pipeline == None:
94
- st.write("Diarization model unavailable / λΆ„ν•  λͺ¨λΈμ„ μ‚¬μš©ν•  수 μ—†μŒ")
95
- diarization_token = False
96
- else:
97
- diarization_token = st.checkbox("Differentiate speakers / μŠ€ν”Όμ»€λ₯Ό μ°¨λ³„ν™”ν•˜μ„Έμš”")
98
 
 
99
  with col2:
100
- # Summarize the transcript
101
- summarize_token = st.checkbox("Generate a summary / μš”μ•½μ„ μƒμ„±ν•˜μ„Έμš”", value=False)
102
-
103
- # Generate an SRT file instead of a TXT file (shorter timestamps)
104
- srt_token = st.checkbox("Generate subtitles file / μžλ§‰ 파일 μƒμ„±ν•˜μ„Έμš”", value=False)
105
-
106
- with col3:
107
- # Display the timestamp of each transcribed part
108
- timestamps_token = st.checkbox("Show timestamps / νƒ€μž„μŠ€νƒ¬ν”„λ₯Ό λ³΄μ—¬μ£Όμ„Έμš”", value=True)
109
-
110
- # Improve transcript with another model (better transcript but longer to obtain)
111
- choose_better_model = st.checkbox("Change STT Model / STT λͺ¨λΈμ„ λ³€κ²½ν•˜μ„Έμš”")
112
-
113
- # Srt option requires timestamps so it can match text with time => Need to correct the following case
114
- if not timestamps_token and srt_token:
115
- timestamps_token = True
116
- st.warning("Srt option requires timestamps. We activated it for you. / Srt μ˜΅μ…˜μ—λŠ” νƒ€μž„μŠ€νƒ¬ν”„κ°€ ν•„μš”ν•©λ‹ˆλ‹€. μš°λ¦¬λŠ” 당신을 μœ„ν•΄ 그것을 ν™œμ„±ν™”ν–ˆμŠ΅λ‹ˆλ‹€.")
117
-
118
- # Validate choices with a button
119
- transcript_btn = st.form_submit_button("Transcribe audio! / μ˜€λ””μ˜€λ₯Ό μ „μ‚¬ν•˜μ„Έμš”!")
120
-
121
- return transcript_btn, start, end, diarization_token, timestamps_token, srt_token, summarize_token, choose_better_model
122
-
123
- @st.cache_resource
124
- def load_models():
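- """Load the Whisper STT processor/model, the summarizer and the pyannote diarization pipeline, from local pickles when available, otherwise from the Hugging Face Hub."""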
125
-
126
- # Load Whisper (Transcriber model)
127
- with st.spinner("Loading Speech to Text Model / μŒμ„±μ„ ν…μŠ€νŠΈ λͺ¨λΈλ‘œ λ‘œλ“œ 쀑"):
128
- try:
129
- stt_tokenizer = pickle.load(open("models/STT_processor_whisper-large-v2.sav", 'rb'))
130
- except FileNotFoundError:
131
- stt_tokenizer = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
132
-
133
- try:
134
- stt_model = pickle.load(open("models/STT_model_whisper-large-v2.sav", 'rb'))
135
- except FileNotFoundError:
136
- stt_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
137
-
138
- # Load summarizer model
139
- with st.spinner("Loading Summarization Model / μš”μ•½ λͺ¨λΈ λ‘œλ“œ 쀑"):
140
- try:
141
- summarizer = pickle.load(open("models/summarizer.sav", 'rb'))
142
- except FileNotFoundError:
143
- summarizer = pipeline("summarization", model="ainize/kobart-news")
144
-
145
- # Load Diarization model (Differentiate speakers)
146
- with st.spinner("Loading Diarization Model / λΆ„ν•  λͺ¨λΈ λ‘œλ“œ 쀑"):
147
- try:
148
- dia_pipeline = pickle.load(open("models/dia_pipeline.sav", 'rb'))
149
- except FileNotFoundError:
150
- dia_pipeline = Pipeline.from_pretrained("pyannote/[email protected]", use_auth_token=st.session_state["my_HF_token"])
151
-
152
- return stt_tokenizer, stt_model, summarizer, dia_pipeline
153
-
154
- def transcript_from_url(stt_tokenizer, stt_model, summarizer, dia_pipeline):
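- """Ask the user for a YouTube URL, extract the audio of the video and launch the transcription process."""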
155
-
156
- url = st.text_input("Enter the YouTube video URL then press Enter to confirm! / YouTube λ™μ˜μƒ URL을 μž…λ ₯ν•œ λ‹€μŒ Enter ν‚€λ₯Ό 눌러 ν™•μΈν•˜μ„Έμš”!")
157
-
158
- # If link seems correct, we try to transcribe
159
- if "youtu" in url:
160
- filename = extract_audio_from_yt_video(url)
161
- if filename is not None:
162
- transcription(stt_tokenizer, stt_model, summarizer, dia_pipeline, filename)
163
- else:
164
- st.error("We were unable to extract the audio. Please verify your link, retry or choose another video / μ˜€λ””μ˜€λ₯Ό μΆ”μΆœν•  수 μ—†μŠ΅λ‹ˆλ‹€. 링크λ₯Ό ν™•μΈν•˜κ³  λ‹€μ‹œ μ‹œλ„ν•˜κ±°λ‚˜ λ‹€λ₯Έ λ™μ˜μƒμ„ μ„ νƒν•˜μ„Έμš”")
165
-
166
- def transcript_from_file(stt_tokenizer, stt_model, summarizer, dia_pipeline):
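- """Let the user upload an mp3/mp4/wav file and launch the transcription process."""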
167
-
168
- uploaded_file = st.file_uploader("Upload your file! It can be a .mp3, .mp4 or .wav / νŒŒμΌμ„ μ—…λ‘œλ“œν•˜μ„Έμš”! .mp3, .mp4 λ˜λŠ” .wav일 수 μžˆμŠ΅λ‹ˆλ‹€.", type=["mp3", "mp4", "wav"], on_change=update_session_state, args=("page_index", 0,))
169
-
170
- if uploaded_file is not None:
171
- # get name and launch transcription function
172
- filename = uploaded_file.name
173
- transcription(stt_tokenizer, stt_model, summarizer, dia_pipeline, filename, uploaded_file)
174
-
175
- def transcription(stt_tokenizer, stt_model, summarizer, dia_pipeline, filename, uploaded_file=None):
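- """Display the audio and the options form, then run the chosen transcription mode and show the results and download buttons."""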
176
-
177
- # If the audio comes from the YouTube extraction mode, the audio has already been downloaded, so uploaded_file
178
- # should simply be the filename. We need to set uploaded_file, which is currently None
179
- if uploaded_file is None:
180
- uploaded_file = filename
181
-
182
- # Get audio length of the file(s)
183
- myaudio = AudioSegment.from_file(uploaded_file)
184
- audio_length = myaudio.duration_seconds
185
-
186
- # Save Audio (so we can display it on another page ("DISPLAY RESULTS"), otherwise it is lost)
187
- update_session_state("audio_file", uploaded_file)
188
-
189
- # Display audio file
190
- st.audio(uploaded_file)
191
-
192
- # Is transcription possible
193
- if audio_length > 0:
194
-
195
- # We display options and user shares his wishes
196
- transcript_btn, start, end, diarization_token, timestamps_token, srt_token, summarize_token, choose_better_model = load_options(int(audio_length), dia_pipeline)
197
-
198
- # If the end value hasn't been changed, set it to the exact audio length so we don't cut off a few ms: the end value comes from st.slider, which returns an int (e.g. 12 s instead of end=12.9 s)
199
- if end == int(audio_length):
200
- end = audio_length
201
-
202
- # Switching model for the better one
203
- if choose_better_model:
204
- with st.spinner("We are loading the better model. Please wait... / 더 λ‚˜μ€ λͺ¨λΈμ„ λ‘œλ“œν•˜κ³  μžˆμŠ΅λ‹ˆλ‹€. κΈ°λ‹€λ¦¬μ„Έμš”..."):
205
- try:
206
- stt_tokenizer = pickle.load(open("models/STT_processor2_whisper-large.sav", 'rb'))
207
- except FileNotFoundError:
208
- stt_tokenizer = WhisperProcessor.from_pretrained("openai/whisper-large")
209
- try:
210
- stt_model = pickle.load(open("models/STT_model2_whisper-large.sav", 'rb'))
211
- except FileNotFoundError:
212
- stt_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
213
-
214
- # Validate options and launch the transcription process thanks to the form's button
215
- if transcript_btn:
216
- # Check if start & end values are correct
217
- start, end = correct_values(start, end, audio_length)
218
-
219
- # If the start and/or end value has changed, we trim/cut the audio according to the new start/end values.
220
- if start != 0 or end != audio_length:
221
- myaudio = myaudio[start * 1000:end * 1000] # Works in milliseconds (*1000)
222
-
223
- # Transcribe process is running
224
- with st.spinner("We are transcribing your audio. Please wait / κ·€ν•˜μ˜ μ˜€λ””μ˜€λ₯Ό μ „μ‚¬ν•˜κ³  μžˆμŠ΅λ‹ˆλ‹€. κΈ°λ‹€λ¦¬μ„Έμš”"):
225
-
226
- # Init variables
227
- txt_text, srt_text, save_result = init_transcription(start, int(end))
228
- min_space, max_space = silence_mode_init(srt_token)
229
-
230
- # Differentiate speakers mode
231
- if diarization_token:
232
- # Save mode chosen by user, to display expected results
233
- if not timestamps_token:
234
- update_session_state("chosen mode", "DIA")
235
- elif timestamps_token:
236
- update_session_state("chosen mode", "DIA_TS")
237
-
238
- # Convert mp3/mp4 to wav (Differentiate speakers mode only accepts wav files)
239
- if filename.endswith((".mp3", ".mp4")):
240
- myaudio, filename = convert_file_to_wav(myaudio, filename)
241
- else:
242
- filename = "../data/" + filename
243
- myaudio.export(filename, format="wav")
244
-
245
- # Differentiate speakers process
246
- diarization_timestamps, number_of_speakers = diarization_treatment(filename, dia_pipeline, max_space, srt_token)
247
-
248
- # Saving the number of detected speakers
249
- update_session_state("number_of_speakers", number_of_speakers)
250
-
251
- # Transcribe process with Diarization Mode
252
- save_result, txt_text, srt_text = transcription_diarization(filename, diarization_timestamps, stt_model, stt_tokenizer, diarization_token, srt_token, summarize_token, timestamps_token, myaudio, start, save_result, txt_text, srt_text)
253
-
254
- # Non Diarization Mode
255
- else:
256
- # Save mode chosen by user, to display expected results
257
- if not timestamps_token:
258
- update_session_state("chosen mode", "NODIA")
259
- if timestamps_token:
260
- update_session_state("chosen mode", "NODIA_TS")
261
-
262
- filename = "../data/" + filename
263
- # Transcribe process with non Diarization Mode
264
- save_result, txt_text, srt_text = transcription_non_diarization(filename, myaudio, start, end, diarization_token, timestamps_token, srt_token, summarize_token, stt_model, stt_tokenizer, min_space, max_space, save_result, txt_text, srt_text)
265
-
266
- # Save results
267
- update_session_state("process", save_result)
268
- update_session_state("srt_txt", srt_text)
269
-
270
- # Get final text
271
- # Diarization Mode
272
- if diarization_token:
273
- # Create txt text from the process
274
- txt_text = create_txt_text_from_process()
275
- # Non diarization Mode
276
- else:
277
- my_split_text_list = split_text(txt_text, 448)
278
- txt_text = ""
279
- for my_split_text in my_split_text_list:
280
- txt_text += my_split_text
281
-
282
- # Delete files
283
- clean_directory("../data") # clean folder that contains generated files
284
-
285
- # Display the final transcript
286
- if txt_text != "":
287
- st.subheader("Final text is / μ΅œμ’… ν…μŠ€νŠΈλŠ”")
288
-
289
- # Save txt_text and display it
290
- update_session_state("txt_transcript", txt_text)
291
- st.markdown(txt_text, unsafe_allow_html=True)
292
-
293
- # Summarize the transcript
294
- if summarize_token:
295
- with st.spinner("We are summarizing your audio / μ˜€λ””μ˜€λ₯Ό μš”μ•½ν•˜κ³  μžˆμŠ΅λ‹ˆλ‹€"):
296
- # Display the summary in an st.expander widget so we don't write too much text on the page
297
- with st.expander("Summary / μš”μ•½"):
298
- # Need to split the text into blocks of at most 142 characters since the model has a limited input size
299
- if diarization_token:
300
- # In diarization mode, the text to summarize is contained in the "summary" session state variable
301
- my_split_text_list = split_text(st.session_state["summary"], 142)
302
- else:
303
- # In non-diarization mode, it is contained in the txt_text
304
- my_split_text_list = split_text(txt_text, 142)
305
-
306
- summary = ""
307
- # Summarize each text block
308
- for my_split_text in my_split_text_list:
309
- summary += summarizer(my_split_text)[0]['summary_text']
310
-
311
- # Display summary and save it
312
- st.write(summary)
313
- update_session_state("summary", summary)
314
-
315
- # Display buttons to interact with results
316
- # We have 4 possible buttons depending on the user's choices. But we can't set 4 columns
317
- # for 4 buttons. Indeed, if the user displays only 3 buttons, it is possible that one of
318
- # the column 1, 2 or 3 is empty which would be ugly. We want the activated options to be in
319
- # the first column, so that the empty columns are not noticed. To do that, let's create a btn_token_list
320
- btn_token_list = [[diarization_token, "dia_token"], [True, "useless_txt_token"], [srt_token, "srt_token"], [summarize_token, "summarize_token"]]
321
-
322
- # Save this list to be able to reach it on the other pages of the app
323
- update_session_state("btn_token_list", btn_token_list)
324
-
325
- # Create 4 columns
326
- col1, col2, col3, col4 = st.columns(4)
327
-
328
- # Create a column list
329
- col_list = [col1, col2, col3, col4]
330
-
331
- # Check value of each token, if True, we put the respective button of the token in a column
332
- col_index = 0
333
- for elt in btn_token_list:
334
- if elt[0]:
335
- mycol = col_list[col_index]
336
- if elt[1] == "useless_txt_token":
337
- # Download your transcript.txt
338
- with mycol:
339
- st.download_button("Download as TXT / TXT둜 λ‹€μš΄λ‘œλ“œ", txt_text, file_name="my_transcription.txt", on_click=update_session_state, args=("page_index", 1,))
340
- elif elt[1] == "srt_token":
341
- # Download your transcript.srt
342
- with mycol:
343
- update_session_state("srt_token", srt_token)
344
- st.download_button("Download as SRT / SRT둜 λ‹€μš΄λ‘œλ“œ", srt_text, file_name="my_transcription.srt", on_click=update_session_state, args=("page_index", 1,))
345
- elif elt[1] == "dia_token":
346
- with mycol:
347
- # Rename the speakers detected in your audio
348
- st.button("Rename Speakers / μŠ€ν”Όμ»€ 이름 λ°”κΎΈκΈ°", on_click=update_session_state, args=("page_index", 2,))
349
- elif elt[1] == "summarize_token":
350
- with mycol:
351
- # Download the summary of your transcript.txt
352
- st.download_button("Download Summary / μš”μ•½ λ‹€μš΄λ‘œλ“œ", st.session_state["summary"], file_name="my_summary.txt", on_click=update_session_state, args=("page_index", 1,))
353
- col_index += 1
354
-
355
- else:
356
- st.write("Transcription impossible, a problem occurred with your audio or your parameters, we apologize :( / λ…ΉμŒμ΄ λΆˆκ°€λŠ₯ν•©λ‹ˆλ‹€. μ˜€λ””μ˜€ λ˜λŠ” λ§€κ°œλ³€μˆ˜μ— λ¬Έμ œκ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€. μ£„μ†‘ν•©λ‹ˆλ‹€ :(")
357
-
358
- else:
359
- st.error("Seems your audio is 0 s long, please change your file / μ˜€λ””μ˜€ 길이가 0초인 것 κ°™μŠ΅λ‹ˆλ‹€. νŒŒμΌμ„ λ³€κ²½ν•˜μ„Έμš”")
360
- time.sleep(3)
361
- st.stop()
362
-
363
- def create_txt_text_from_process():
364
- """
365
- Extracts speaker identities and spoken sentences from a process list and returns a formatted transcript.
366
- Uses st.session_state["process"]: a list where each element holds a speaker identity and a spoken sentence.
367
- :return: Final transcript (without timestamps)
368
- """
369
-
370
- txt_text = ""
371
- process = st.session_state.get("process", [])
372
- for elt in process:
373
- if len(elt) < 3:
374
- continue
375
- speaker, sentence = elt[1], elt[2][2:]
376
- txt_text += f"{speaker}: {sentence}\n\n"
377
- return txt_text
378
-
379
- def rename_speakers_window():
380
- """
381
- Load a new page which allows the user to rename the different speakers from the diarization process
382
- For example, the user can switch from "Speaker1: I wouldn't say that" to "Mat: I wouldn't say that"
383
- """
384
-
385
- st.subheader("Here you can rename the speakers as you want / μ—¬κΈ°μ—μ„œ μ›ν•˜λŠ” λŒ€λ‘œ μŠ€ν”Όμ»€ 이름을 λ°”κΏ€ 수 μžˆμŠ΅λ‹ˆλ‹€")
386
- number_of_speakers = st.session_state["number_of_speakers"]
387
-
388
- if number_of_speakers > 0:
389
- # Handle displayed text according to the number_of_speakers
390
- if number_of_speakers == 1:
391
- st.write(str(number_of_speakers) + " speaker has been detected in your audio")
392
- else:
393
- st.write(str(number_of_speakers) + " speakers have been detected in your audio")
394
-
395
- # Saving the Speaker Name and its ID in a list, example : [1, 'Speaker1']
396
- list_of_speakers = []
397
- for elt in st.session_state["process"]:
398
- if st.session_state["chosen_mode"] == "DIA_TS":
399
- if [elt[1], elt[2]] not in list_of_speakers:
400
- list_of_speakers.append([elt[1], elt[2]])
401
- elif st.session_state["chosen_mode"] == "DIA":
402
- if [elt[0], elt[1]] not in list_of_speakers:
403
- list_of_speakers.append([elt[0], elt[1]])
404
-
405
- # Sorting (by ID)
406
- list_of_speakers.sort() # [[1, 'Speaker1'], [0, 'Speaker0']] => [[0, 'Speaker0'], [1, 'Speaker1']]
407
-
408
- # Display saved names so the user can modify them
409
- initial_names = ""
410
- for elt in list_of_speakers:
411
- initial_names += elt[1] + "\n"
412
-
413
- names_input = st.text_area("Just replace the names without changing the format (one per line) / ν˜•μ‹μ„ λ³€κ²½ν•˜μ§€ μ•Šκ³  μ΄λ¦„λ§Œ λ°”κΎΈμ‹­μ‹œμ˜€(ν•œ 쀄에 ν•˜λ‚˜μ”©)",
414
- value=initial_names)
415
-
416
- # Display Options (Cancel / Save)
417
- col1, col2 = st.columns(2)
418
- with col1:
419
- # Cancel changes by clicking a button - callback function to return to the results page
420
- st.button("Cancel / μ·¨μ†Œ", on_click=update_session_state, args=("page_index", 1,))
421
- with col2:
422
- # Confirm changes by clicking a button - callback function to apply changes and return to the results page
423
- st.button("Save changes / λ³€κ²½ 사항을 μ €μž₯", on_click=click_confirm_rename_btn, args=(names_input, number_of_speakers,))
424
-
425
- # Don't have anyone to rename
426
- else:
427
- st.error("0 speakers have been detected. Seems there is an issue with diarization / 0λͺ…μ˜ μŠ€ν”Όμ»€κ°€ κ°μ§€λ˜μ—ˆμŠ΅λ‹ˆλ‹€. 뢄할에 λ¬Έμ œκ°€ μžˆλŠ” 것 κ°™μŠ΅λ‹ˆλ‹€")
428
- with st.spinner("Redirecting to transcription page / 전사 νŽ˜μ΄μ§€λ‘œ λ¦¬λ””λ ‰μ…˜ 쀑"):
429
- time.sleep(4)
430
- # return to the results page
431
- update_session_state("page_index", 1)
432
-
433
- def click_confirm_rename_btn(names_input, number_of_speakers):
434
- """
435
- If the user decides to rename speakers and confirms his choices, we apply the modifications to our transcript
436
- Then we return to the results page of the app
437
- :param names_input: string containing one speaker name per line
438
- :param number_of_speakers: Number of detected speakers in the audio file
439
- """
440
-
441
- try:
442
- names_input = names_input.split("\n")[:number_of_speakers]
443
-
444
- for elt in st.session_state["process"]:
445
- elt[2] = names_input[elt[1]]
446
-
447
- txt_text = create_txt_text_from_process()
448
- update_session_state("txt_transcript", txt_text)
449
- update_session_state("page_index", 1)
450
-
451
- except TypeError:  # "list indices must be integers or slices, not str" (can happen when nonsense names are entered)
452
- st.error("Please respect the 1 name per line format / ν•œ 쀄에 1개의 이름 ν˜•μ‹μ„ μ€€μˆ˜ν•˜μ‹­μ‹œμ˜€")
453
- with st.spinner("We are relaunching the page / νŽ˜μ΄μ§€λ₯Ό λ‹€μ‹œ μ‹œμž‘ν•˜κ³  μžˆμŠ΅λ‹ˆλ‹€"):
454
- time.sleep(3)
455
- update_session_state("page_index", 1)
456
-
457
- def transcription_diarization(filename, diarization_timestamps, stt_model, stt_tokenizer, diarization_token, srt_token,
458
- summarize_token, timestamps_token, myaudio, start, save_result, txt_text, srt_text):
459
- """
460
- Performs transcription with the diarization mode
461
- :param filename: name of the audio file
462
- :param diarization_timestamps: timestamps of each audio part (ex 10 to 50 secs)
463
- :param stt_model: Speech to text model
464
- :param stt_tokenizer: Speech to text model's tokenizer
465
- :param diarization_token: Differentiate or not the speakers (choice fixed by user)
466
- :param srt_token: Enable/Disable generate srt file (choice fixed by user)
467
- :param summarize_token: Summarize or not the transcript (choice fixed by user)
468
- :param timestamps_token: Display and save or not the timestamps (choice fixed by user)
469
- :param myaudio: AudioSegment file
470
- :param start: int value (s) given by st.slider() (fixed by user)
471
- :param save_result: whole process
472
- :param txt_text: generated .txt transcript
473
- :param srt_text: generated .srt transcript
474
- :return: results of transcribing action
475
- """
476
- # Numeric counter that identifies each sequential subtitle
477
- srt_index = 1
478
-
479
- # Handle a rare case: there is only one interval, so we get a flat list instead of a list of lists
480
- if not isinstance(diarization_timestamps[0], list):
481
- diarization_timestamps = [diarization_timestamps]
482
-
483
- # Transcribe each audio chunk (from timestamp to timestamp) and display transcript
484
- for index, elt in enumerate(diarization_timestamps):
485
- sub_start = elt[0]
486
- sub_end = elt[1]
487
-
488
- transcription = transcribe_audio_part(filename, stt_model, stt_tokenizer, myaudio, sub_start, sub_end,
489
- index)
490
-
491
- # Initial audio has been split with start & end values
492
- # It begins at 0 s, so the timestamps need to be adjusted by +start*1000 to account for the offset
493
- if transcription != "":
494
- save_result, txt_text, srt_text, srt_index = display_transcription(diarization_token, summarize_token,
495
- srt_token, timestamps_token,
496
- transcription, save_result, txt_text,
497
- srt_text,
498
- srt_index, sub_start + start * 1000,
499
- sub_end + start * 1000, elt)
500
- return save_result, txt_text, srt_text
501
-
502
- def transcription_non_diarization(filename, myaudio, start, end, diarization_token, timestamps_token, srt_token, summarize_token, stt_model, stt_tokenizer, min_space, max_space, save_result, txt_text, srt_text):
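- """Perform transcription without diarization: split the audio on detected silences and transcribe each chunk."""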
503
- # Numeric counter identifying each sequential subtitle
504
- srt_index = 1
505
-
506
- # get silences
507
- silence_list = detect_silences(myaudio)
508
- if silence_list != []:
509
- silence_list = get_middle_silence_time(silence_list)
510
- silence_list = silences_distribution(silence_list, min_space, max_space, start, end, srt_token)
511
- else:
512
- silence_list = generate_regular_split_till_end(silence_list, int(end), min_space, max_space)
513
-
514
- # Transcribe each audio chunk (from timestamp to timestamp) and display transcript
515
- for i in range(0, len(silence_list) - 1):
516
- sub_start = silence_list[i]
517
- sub_end = silence_list[i + 1]
518
-
519
- transcription = transcribe_audio_part(filename, stt_model, stt_tokenizer, myaudio, sub_start, sub_end, i)
520
-
521
- if transcription != "":
522
- save_result, txt_text, srt_text, srt_index = display_transcription(diarization_token, summarize_token, srt_token, timestamps_token, transcription, save_result, txt_text, srt_text, srt_index, sub_start + start * 1000, sub_end + start * 1000)
523
-
524
- return save_result, txt_text, srt_text
525
-
526
- def silence_mode_init(srt_token):
527
- """
528
- Fix min_space and max_space values
529
- If the user wants an srt file, we need to have tiny timestamps
530
- :param srt_token: Enable/Disable generate srt file option (choice fixed by user)
531
- :return: min_space and max_space values
532
- """
533
-
534
- if srt_token:
535
- # We need short intervals to keep each subtitle short
536
- min_space = 1000 # 1 sec
537
- max_space = 8000 # 8 secs
538
-
539
- else:
540
-
541
- min_space = 25000 # 25 secs
542
- max_space = 45000 # 45 secs
543
- return min_space, max_space
544
-
545
- def detect_silences(audio):
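- """Detect silences longer than 750 ms, using a threshold relative to the audio's average dBFS."""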
546
-
547
- # Get the average loudness (dBFS) so silence detection depends on the audio instead of a fixed value
548
- dbfs = audio.dBFS
549
-
550
- # Get timestamps of silences longer than 750 ms
551
- silence_list = silence.detect_silence(audio, min_silence_len=750, silence_thresh=dbfs-14)
552
-
553
- return silence_list
554
-
555
- def generate_regular_split_till_end(time_list, end, min_space, max_space):
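- """Extend time_list with regularly spaced (max_space) cut points until the end value is reached."""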
556
-
557
- # A range loop can't handle float values, so we convert them to int
558
- int_last_value = int(time_list[-1])
559
- int_end = int(end)
560
-
561
- # Repeatedly add max_space to the last value and append the result to the list
562
- for i in range(int_last_value, int_end, max_space):
563
- value = i + max_space
564
- if value < end:
565
- time_list.append(value)
566
-
567
- # Fix last automatic cut
568
- # If small gap (ex: 395 000, with end = 400 000)
569
- if end - time_list[-1] < min_space:
570
- time_list[-1] = end
571
- else:
572
- # If the gap is large (ex: 311 000 then 356 000 with end = 400 000: we can't just replace the last value, or we would jump from 311k to 400k)
573
- time_list.append(end)
574
- return time_list
575
-
576
- def get_middle_silence_time(silence_list):
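- """Replace each silence interval by its middle point, or by two points 1.5 s inside each end for silences longer than 3.5 s."""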
577
-
578
- length = len(silence_list)
579
- index = 0
580
- while index < length:
581
- diff = (silence_list[index][1] - silence_list[index][0])
582
- if diff < 3500:
583
- silence_list[index] = silence_list[index][0] + diff/2
584
- index += 1
585
- else:
586
- adapted_diff = 1500
587
- silence_list.insert(index + 1, silence_list[index][1] - adapted_diff)
588
- silence_list[index] = silence_list[index][0] + adapted_diff
589
- length += 1
590
- index += 2
591
-
592
- return silence_list
593
-
594
- def silences_distribution(silence_list, min_space, max_space, start, end, srt_token=False):
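- """Turn the detected silences into a list of cut points, keeping the gaps between consecutive cuts roughly between min_space and max_space (ms)."""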
595
-
596
- # If start != 0, we need to adjust the end value since silence detection is performed on the trimmed/cut audio
597
- # (and not on the original audio) (ex: trim audio from 20s to 2m will be 0s to 1m40 = 2m-20s)
598
-
599
- # Shift the end according to the start value
600
- end -= start
601
- start = 0
602
- end *= 1000
603
-
604
- # Step 1 - Add start value
605
- newsilence = [start]
606
-
607
- # Step 2 - Create a regular distribution between start and the first element of silence_list so we don't get a gap > max_space and run out of memory
608
- # example newsilence = [0] and silence_list starts with 100000 => It will create a massive gap [0, 100000]
609
-
610
- if silence_list[0] - max_space > newsilence[0]:
611
- for i in range(int(newsilence[0]), int(silence_list[0]), max_space): # int bc float can't be in a range loop
612
- value = i + max_space
613
- if value < silence_list[0]:
614
- newsilence.append(value)
615
-
616
- # Step 3 - Create a regular distribution until the last value of the silence_list
617
- min_desired_value = newsilence[-1]
618
- max_desired_value = newsilence[-1]
619
- nb_values = len(silence_list)
620
-
621
- while nb_values != 0:
622
- max_desired_value += max_space
623
-
624
- # Get a window of the values greater than min_desired_value and lower than max_desired_value
625
- silence_window = list(filter(lambda x: min_desired_value < x <= max_desired_value, silence_list))
626
-
627
- if silence_window != []:
628
- # Get the nearest value we can to min_desired_value or max_desired_value depending on srt_token
629
- if srt_token:
630
- nearest_value = min(silence_window, key=lambda x: abs(x - min_desired_value))
631
- nb_values -= silence_window.index(nearest_value) + 1 # (index begins at 0, so we add 1)
632
- else:
633
- nearest_value = min(silence_window, key=lambda x: abs(x - max_desired_value))
634
- # Max value index = len of the list
635
- nb_values -= len(silence_window)
636
-
637
- # Append the nearest value to our list
638
- newsilence.append(nearest_value)
639
-
640
- # If silence_window is empty we add the max_space value to the last one to create an automatic cut and avoid multiple audio cutting
641
- else:
642
- newsilence.append(newsilence[-1] + max_space)
643
-
644
- min_desired_value = newsilence[-1]
645
- max_desired_value = newsilence[-1]
646
-
647
- # Step 4 - Add the final value (end)
648
-
649
- if end - newsilence[-1] > min_space:
650
- # Gap > Min Space
651
- if end - newsilence[-1] < max_space:
652
- newsilence.append(end)
653
- else:
654
- # Gap too important between the last list value and the end value
655
- # We need to create automatic max_space cut till the end
656
- newsilence = generate_regular_split_till_end(newsilence, end, min_space, max_space)
657
- else:
658
- # Gap < Min Space <=> Final value and last value of new silence are too close, need to merge
659
- if len(newsilence) >= 2:
660
- if end - newsilence[-2] <= max_space:
661
- # Replace if gap is not too important
662
- newsilence[-1] = end
663
- else:
664
- newsilence.append(end)
665
-
666
- else:
667
- if end - newsilence[-1] <= max_space:
668
- # Replace if gap is not too important
669
- newsilence[-1] = end
670
- else:
671
- newsilence.append(end)
672
-
673
- return newsilence
674
-
675
- def init_transcription(start, end):
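- """Reset the summary, announce the transcribed interval and return empty txt/srt texts and an empty results list."""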
676
- update_session_state("summary", "")
677
- st.write("Transcription between", start, "and", end, "seconds in process.\n\n")
678
- txt_text = ""
679
- srt_text = ""
680
- save_result = []
681
- return txt_text, srt_text, save_result
682
-
683
- def transcribe_audio_part(filename, stt_model, stt_tokenizer, myaudio, sub_start, sub_end, index):
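- """Export the audio chunk between sub_start and sub_end to an mp3 file and transcribe it with a Whisper ASR pipeline (a new "openai/whisper-large-v2" pipeline is built for each chunk)."""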
684
- stt_model = "openai/whisper-large-v2"
685
- stt_tokenizer = "openai/whisper-large-v2"
686
- device = "cuda:0" if torch.cuda.is_available() else "cpu"
687
-
688
- pipe = pipeline(
689
- task="automatic-speech-recognition",
690
- model=stt_model,
691
- tokenizer=stt_tokenizer,
692
- chunk_length_s=30,
693
- device=device,
694
- )
695
-
696
- try:
697
- with torch.no_grad():
698
- new_audio = myaudio[sub_start:sub_end] # Works in milliseconds
699
- path = filename[:-3] + "audio_" + str(index) + ".mp3"
700
- new_audio.export(path) # Exports to a mp3 file in the current path
701
-
702
- # Decode
703
- transcription = pipe(path, generate_kwargs={"language": "korean", "task": "transcribe"})["text"]
704
- return transcription
705
-
706
- except audioread.NoBackendError:
707
- # Means we have a chunk with a [value1 : value2] case with value1>value2
708
- st.error("Sorry, seems we have a problem on our side. Please change start & end values.")
709
- time.sleep(3)
710
- st.stop()
711
-
712
- def optimize_subtitles(transcription, srt_index, sub_start, sub_end, srt_text):
713
- """
714
- Optimize the subtitles (avoid a too long reading when many words are said in a short time)
715
- :param transcription: transcript generated for an audio chunk
716
- :param srt_index: Numeric counter that identifies each sequential subtitle
717
- :param sub_start: beginning of the transcript
718
- :param sub_end: end of the transcript
719
- :param srt_text: generated .srt transcript
720
- """
721
-
722
- transcription_length = len(transcription)
723
-
724
- # Length of the transcript should be limited to about 42 characters per line to avoid this problem
725
- if transcription_length > 42:
726
- # Split the timestamp and its transcript in two parts
727
- # Get the middle timestamp
728
- diff = (timedelta(milliseconds=sub_end) - timedelta(milliseconds=sub_start)) / 2
729
- middle_timestamp = str(timedelta(milliseconds=sub_start) + diff).split(".")[0]
730
-
731
- # Get the closest middle index to a space (we don't divide transcription_length/2 to avoid cutting a word)
732
- space_indexes = [pos for pos, char in enumerate(transcription) if char == " "]
733
- nearest_index = min(space_indexes, key=lambda x: abs(x - transcription_length / 2))
734
-
735
- # First transcript part
736
- first_transcript = transcription[:nearest_index]
737
-
738
- # Second transcript part
739
- second_transcript = transcription[nearest_index + 1:]
740
-
741
- # Add both transcript parts to the srt_text
742
- srt_text += str(srt_index) + "\n" + str(timedelta(milliseconds=sub_start)).split(".")[0] + " --> " + middle_timestamp + "\n" + first_transcript + "\n\n"
743
- srt_index += 1
744
- srt_text += str(srt_index) + "\n" + middle_timestamp + " --> " + str(timedelta(milliseconds=sub_end)).split(".")[0] + "\n" + second_transcript + "\n\n"
745
- srt_index += 1
746
- else:
747
- # Add transcript without operations
748
- srt_text += str(srt_index) + "\n" + str(timedelta(milliseconds=sub_start)).split(".")[0] + " --> " + str(timedelta(milliseconds=sub_end)).split(".")[0] + "\n" + transcription + "\n\n"
749
-
750
- return srt_text, srt_index
751
-
752
- def display_transcription(diarization_token, summarize_token, srt_token, timestamps_token, transcription, save_result, txt_text, srt_text, srt_index, sub_start, sub_end, elt=None):
753
- """
754
- Display results
755
- :param diarization_token: Differentiate or not the speakers (choice fixed by user)
756
- :param summarize_token: Summarize or not the transcript (choice fixed by user)
757
- :param srt_token: Enable/Disable generate srt file (choice fixed by user)
758
- :param timestamps_token: Display and save or not the timestamps (choice fixed by user)
759
- :param transcription: transcript of the considered audio
760
- :param save_result: whole process
761
- :param txt_text: generated .txt transcript
762
- :param srt_text: generated .srt transcript
763
- :param srt_index : numeric counter that identifies each sequential subtitle
764
- :param sub_start: start value (s) of the considered audio part to transcribe
765
- :param sub_end: end value (s) of the considered audio part to transcribe
766
- :param elt: timestamp (diarization case only, otherwise elt = None)
767
- """
768
- # Display will be different depending on the mode (dia, no dia, dia_ts, nodia_ts)
769
-
770
- # diarization mode
771
- if diarization_token:
772
- if summarize_token:
773
- update_session_state("summary", transcription + " ", concatenate_token=True)
774
-
775
- if not timestamps_token:
776
- temp_transcription = elt[2] + " : " + transcription
777
- st.write(temp_transcription + "\n\n")
778
-
779
- save_result.append([int(elt[2][-1]), elt[2], " : " + transcription])
780
-
781
- elif timestamps_token:
782
- temp_timestamps = str(timedelta(milliseconds=sub_start)).split(".")[0] + " --> " + \
783
- str(timedelta(milliseconds=sub_end)).split(".")[0] + "\n"
784
- temp_transcription = elt[2] + " : " + transcription
785
- temp_list = [temp_timestamps, int(elt[2][-1]), elt[2], " : " + transcription, int(sub_start / 1000)]
786
- save_result.append(temp_list)
787
- st.button(temp_timestamps, on_click=click_timestamp_btn, args=(sub_start,))
788
- st.write(temp_transcription + "\n\n")
789
-
790
- if srt_token:
791
- srt_text, srt_index = optimize_subtitles(transcription, srt_index, sub_start, sub_end, srt_text)
792
-
793
- # Non diarization case
794
- else:
795
- if not timestamps_token:
796
- save_result.append([transcription])
797
- st.write(transcription + "\n\n")
798
-
799
- else:
800
- temp_timestamps = str(timedelta(milliseconds=sub_start)).split(".")[0] + " --> " + \
801
- str(timedelta(milliseconds=sub_end)).split(".")[0] + "\n"
802
- temp_list = [temp_timestamps, transcription, int(sub_start / 1000)]
803
- save_result.append(temp_list)
804
- st.button(temp_timestamps, on_click=click_timestamp_btn, args=(sub_start,))
805
- st.write(transcription + "\n\n")
806
-
807
- if srt_token:
808
- srt_text, srt_index = optimize_subtitles(transcription, srt_index, sub_start, sub_end, srt_text)
809
-
810
- txt_text += transcription + " " # So x seconds sentences are separated
811
-
812
- return save_result, txt_text, srt_text, srt_index
813
-
814
- def convert_file_to_wav(aud_seg, filename):
815
- """
816
- Convert an mp3/mp4 in a wav format
817
- Needs to be modified if you want to convert a format whose extension has more or fewer than 3 letters
818
-
819
- :param aud_seg: pydub.AudioSegment
820
- :param filename: name of the file
821
- :return: name of the converted file
822
- """
823
- filename = "../data/my_wav_file_" + filename[:-3] + "wav"
824
- aud_seg.export(filename, format="wav")
825
-
826
- newaudio = AudioSegment.from_file(filename)
827
-
828
- return newaudio, filename
829
-
830
- def get_diarization(dia_pipeline, filename):
831
- """
832
- Diarize an audio (find number of speakers, when they speak, ...)
833
- :param dia_pipeline: Pyannote's library (diarization pipeline)
834
- :param filename: name of a wav audio file
835
- :return: str list containing audio's diarization time intervals
836
- """
837
- # Get diarization of the audio
838
- diarization = dia_pipeline({'audio': filename}, num_speakers=2)
839
- listmapping = diarization.labels()
840
- listnewmapping = []
841
-
842
- # Rename default speakers' names (Default is A, B, ...), we want Speaker0, Speaker1, ...
843
- number_of_speakers = len(listmapping)
844
- for i in range(number_of_speakers):
845
- listnewmapping.append("Speaker" + str(i))
846
-
847
- mapping_dict = dict(zip(listmapping, listnewmapping))
848
- diarization.rename_labels(mapping_dict, copy=False) # copy set to False so we don't create a new annotation, we replace the actual one
849
-
850
- return diarization, number_of_speakers
851
-
852
- def confirm_token_change(hf_token, page_index):
853
- """
854
- A function that saves the hugging face token entered by the user.
855
- It also updates the page index variable so we can indicate we now want to display the home page instead of the token page
856
- :param hf_token: user's token
857
- :param page_index: number that represents the home page index
858
- """
859
- update_session_state("my_HF_token", hf_token)
860
- update_session_state("page_index", page_index)
861
-
862
- def convert_str_diarlist_to_timedelta(diarization_result):
863
- """
864
- Extract from Diarization result the given speakers with their respective speaking times and transform them in pandas timedelta objects
865
- :param diarization_result: result of diarization
866
- :return: list with timedelta intervals and their respective speaker
867
- """
868
-
869
- # get speaking intervals from diarization
870
- segments = diarization_result.for_json()["content"]
871
- diarization_timestamps = []
872
- for sample in segments:
873
- # Convert segment in a pd.Timedelta object
874
- new_seg = [pd.Timedelta(seconds=round(sample["segment"]["start"], 2)),
875
- pd.Timedelta(seconds=round(sample["segment"]["end"], 2)), sample["label"]]
876
- # Start and end = speaking duration
877
- # label = who is speaking
878
- diarization_timestamps.append(new_seg)
879
-
880
- return diarization_timestamps
881
-
882
- def merge_speaker_times(diarization_timestamps, max_space, srt_token):
883
- """
884
- Merge nearby time intervals of the same speaker (same speaker during 1-2 s and 3-4 s -> same speaker during 1-4 s)
885
- :param diarization_timestamps: diarization list
886
- :param max_space: Maximum temporal distance between two silences
887
- :param srt_token: Enable/Disable generate srt file (choice fixed by user)
888
- :return: list with timedelta intervals and their respective speaker
889
- """
890
- if not srt_token:
891
- threshold = pd.Timedelta(seconds=max_space/1000)
892
-
893
- index = 0
894
- length = len(diarization_timestamps) - 1
895
-
896
- while index < length:
897
- if diarization_timestamps[index + 1][2] == diarization_timestamps[index][2] and \
898
- diarization_timestamps[index + 1][1] - threshold <= diarization_timestamps[index][0]:
899
- diarization_timestamps[index][1] = diarization_timestamps[index + 1][1]
900
- del diarization_timestamps[index + 1]
901
- length -= 1
902
- else:
903
- index += 1
904
- return diarization_timestamps
905
-
906
- def extending_timestamps(new_diarization_timestamps):
907
- """
908
- Extend each diarization interval toward its neighbours when possible, so we avoid cutting words
909
- :param new_diarization_timestamps: list
910
- :return: list with merged times
911
- """
912
- for i in range(1, len(new_diarization_timestamps)):
913
- if new_diarization_timestamps[i][0] - new_diarization_timestamps[i - 1][1] <= timedelta(milliseconds=3000) and new_diarization_timestamps[i][0] - new_diarization_timestamps[i - 1][1] >= timedelta(milliseconds=100):
914
- middle = (new_diarization_timestamps[i][0] - new_diarization_timestamps[i - 1][1]) / 2
915
- new_diarization_timestamps[i][0] -= middle
916
- new_diarization_timestamps[i - 1][1] += middle
917
-
918
- # Converting list so we have a milliseconds format
919
- for elt in new_diarization_timestamps:
920
- elt[0] = elt[0].total_seconds() * 1000
921
- elt[1] = elt[1].total_seconds() * 1000
922
-
923
- return new_diarization_timestamps
924
-
925
- def clean_directory(path):
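- """Remove every file in the given directory."""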
926
-
927
- for file in os.listdir(path):
928
- os.remove(os.path.join(path, file))
929
-
930
- def correct_values(start, end, audio_length):
931
- """
932
- Start or/and end value(s) can be in conflict, so we check these values
933
- :param start: int value (s) given by st.slider() (fixed by user)
934
- :param end: int value (s) given by st.slider() (fixed by user)
935
- :param audio_length: audio duration (s)
936
- :return: approved values
937
- """
938
- # Start & end Values need to be checked
939
-
940
- if start >= audio_length or start >= end:
941
- start = 0
942
- st.write("Start value has been set to 0s because of conflicts with other values / λ‹€λ₯Έ κ°’κ³Ό μΆ©λŒν•˜μ—¬ μ‹œμž‘ 값을 0으둜 μ„€μ •ν–ˆμŠ΅λ‹ˆλ‹€")
943
-
944
- if end > audio_length or end == 0:
945
- end = audio_length
946
- st.write("End value has been set to maximum value because of conflicts with other values / λ‹€λ₯Έ κ°’κ³Ό μΆ©λŒν•˜μ—¬ 끝 값이 μ΅œλŒ€ κ°’μœΌλ‘œ μ„€μ •λ˜μ—ˆμŠ΅λ‹ˆλ‹€")
947
-
948
- return start, end
949
-
950
- def split_text(my_text, max_size):
951
- """
952
- Split a text
953
- Maximum sequence length for this model is max_size.
954
- If the transcript is longer, it needs to be split at the nearest possible index below max_size.
955
- To avoid cutting words, we cut on "." characters, and on " " when there is no "."
956
-
957
- :return: split text
958
- """
959
-
960
- cut2 = max_size
961
-
962
- # First, we get indexes of "."
963
- my_split_text_list = []
964
- nearest_index = 0
965
- length = len(my_text)
966
- # We split the transcript in text blocks of size <= max_size.
967
- if cut2 == length:
968
- my_split_text_list.append(my_text)
969
- else:
970
- while cut2 <= length:
971
- cut1 = nearest_index
972
- cut2 = nearest_index + max_size
973
- # Find the best index to split
974
-
975
- dots_indexes = [index for index, char in enumerate(my_text[cut1:cut2]) if
976
- char == "."]
977
- if dots_indexes != []:
978
- nearest_index = max(dots_indexes) + 1 + cut1
979
- else:
980
- spaces_indexes = [index for index, char in enumerate(my_text[cut1:cut2]) if
981
- char == " "]
982
- if spaces_indexes != []:
983
- nearest_index = max(spaces_indexes) + 1 + cut1
984
- else:
985
- nearest_index = cut2 + cut1
986
- my_split_text_list.append(my_text[cut1: nearest_index])
987
-
988
- return my_split_text_list
989
-
990
- def update_session_state(var, data, concatenate_token=False):
991
- """
992
- A simple function to update a session state variable
993
- :param var: variable's name
994
- :param data: new value of the variable
995
- :param concatenate_token: do we replace or concatenate
996
- """
997
-
998
- if concatenate_token:
999
- st.session_state[var] += data
1000
- else:
1001
- st.session_state[var] = data
1002
-
1003
- def display_results():
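- """Results page: show the audio player, the saved transcription steps, the final text, the summary and the download buttons."""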
1004
-
1005
- st.button("Load another file / λ‹€λ₯Έ νŒŒμΌμ„ λ‘œλ“œν•˜μ„Έμš”", on_click=update_session_state, args=("page_index", 0,))
1006
- st.audio(st.session_state['audio_file'], start_time=st.session_state["start_time"])
1007
-
1008
- # Display results of transcription by steps
1009
- if st.session_state["process"] != []:
1010
- if st.session_state["chosen_mode"] == "NODIA": # Non diarization, non timestamps case
1011
- for elt in (st.session_state['process']):
1012
- st.write(elt[0])
1013
-
1014
- elif st.session_state["chosen_mode"] == "DIA": # Diarization without timestamps case
1015
- for elt in (st.session_state['process']):
1016
- st.write(elt[1] + elt[2])
1017
-
1018
- elif st.session_state["chosen_mode"] == "NODIA_TS": # Non diarization with timestamps case
1019
- for elt in (st.session_state['process']):
1020
- st.button(elt[0], on_click=update_session_state, args=("start_time", elt[2],))
1021
- st.write(elt[1])
1022
-
1023
- elif st.session_state["chosen_mode"] == "DIA_TS": # Diarization with timestamps case
1024
- for elt in (st.session_state['process']):
1025
- st.button(elt[0], on_click=update_session_state, args=("start_time", elt[4],))
1026
- st.write(elt[2] + elt[3])
1027
-
1028
- # Display final text
1029
- st.subheader("Final text is / μ΅œμ’… ν…μŠ€νŠΈλŠ”")
1030
- st.write(st.session_state["txt_transcript"])
1031
-
1032
- # Display Summary
1033
- if st.session_state["summary"] != "":
1034
- with st.expander("Summary / μš”μ•½"):
1035
- st.write(st.session_state["summary"])
1036
-
1037
- # Display the buttons in a list to avoid having empty columns
1038
- col1, col2, col3, col4 = st.columns(4)
1039
- col_list = [col1, col2, col3, col4]
1040
- col_index = 0
1041
-
1042
- for elt in st.session_state["btn_token_list"]:
1043
- if elt[0]:
1044
- mycol = col_list[col_index]
1045
- if elt[1] == "useless_txt_token":
1046
- # Download your transcription.txt
1047
- with mycol:
1048
- st.download_button("Download as TXT / TXT둜 λ‹€μš΄λ‘œλ“œ", st.session_state["txt_transcript"], file_name="my_transcription.txt")
1049
-
1050
- elif elt[1] == "srt_token":
1051
- # Download your transcription.srt
1052
- with mycol:
1053
- st.download_button("Download as SRT / SRT둜 λ‹€μš΄λ‘œλ“œ", st.session_state["srt_txt"], file_name="my_transcription.srt")
1054
-
1055
- elif elt[1] == "dia_token":
1056
- with mycol:
1057
- # Rename the speakers detected in your audio
1058
- st.button("Rename Speakers / μŠ€ν”Όμ»€ 이름 λ°”κΎΈκΈ°", on_click=update_session_state, args=("page_index", 2,))
1059
-
1060
- elif elt[1] == "summarize_token":
1061
- with mycol:
1062
- st.download_button("Download Summary / μš”μ•½ λ‹€μš΄λ‘œλ“œ", st.session_state["summary"], file_name="my_summary.txt")
1063
- col_index += 1
1064
-
1065
- def click_timestamp_btn(sub_start):
1066
- """
1067
- When the user clicks a timestamp button, we go to the display results page and st.audio starts at the sub_start value
1068
- It allows the user to listen to the considered part of the audio
1069
- :param sub_start: Beginning of the considered transcript (ms)
1070
- """
1071
- update_session_state("page_index", 1)
1072
- update_session_state("start_time", int(sub_start / 1000)) # division to convert ms to s
1073
-
1074
- def diarization_treatment(filename, dia_pipeline, max_space, srt_token):
1075
- """
1076
- Launch the whole diarization process to get speakers time intervals as pandas timedelta objects
1077
- :param filename: name of the audio file
1078
- :param dia_pipeline: Diarization Model (Differentiate speakers)
1079
- :param max_space: Maximum temporal distance between two silences
1080
- :param srt_token: Enable/Disable generate srt file (choice fixed by user)
1081
- :return: speakers time intervals list and number of different detected speakers
1082
- """
1083
- # initialization
1084
- diarization_timestamps = []
1085
-
1086
- # whole diarization process
1087
- diarization, number_of_speakers = get_diarization(dia_pipeline, filename)
1088
 
1089
- if len(diarization) > 0:
1090
- diarization_timestamps = convert_str_diarlist_to_timedelta(diarization)
1091
- diarization_timestamps = merge_speaker_times(diarization_timestamps, max_space, srt_token)
1092
- diarization_timestamps = extending_timestamps(diarization_timestamps)
1093
 
1094
- return diarization_timestamps, number_of_speakers
1095
 
1096
- def extract_audio_from_yt_video(url):
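- """Download the audio of a YouTube video as an mp3 file with yt_dlp; return None if the download fails."""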
1097
-
1098
- filename = "yt_download_" + url[-11:] + ".mp3"
1099
- try:
1100
 
1101
- ydl_opts = {
1102
- 'format': 'bestaudio/best',
1103
- 'outtmpl': filename,
1104
- 'postprocessors': [{
1105
- 'key': 'FFmpegExtractAudio',
1106
- 'preferredcodec': 'mp3',
1107
- }],
1108
- }
1109
- with st.spinner("We are extracting the audio from the video / λΉ„λ””μ˜€μ—μ„œ μ˜€λ””μ˜€λ₯Ό μΆ”μΆœν•˜κ³  μžˆμŠ΅λ‹ˆλ‹€"):
1110
- #with youtube_dl.YoutubeDL(ydl_opts) as ydl:
1111
- with yt_dlp.YoutubeDL(ydl_opts) as ydl:
1112
- ydl.download([url])
1113
 
1114
- # Handle DownloadError: ERROR: unable to download video data: HTTP Error 403: Forbidden / happens sometimes
1115
- except DownloadError:
1116
- filename = None
1117
 
1118
- return filename
 
 
 
1
+ from main import *
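+ # The helper functions used below (config, load_models, transcript_from_url, ...) are imported from main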
 
 
 
2
 
3
+ if __name__ == '__main__':
4
+ config()
 
 
 
5
 
6
+ if st.session_state['page_index'] == -1:
7
+ # Specify token page (mandatory to use the diarization option)
8
+ st.warning('You must specify a token to use the diarization model. Otherwise, the app will be launched without this model. You can learn how to create your token here: https://huggingface.co/pyannote/speaker-diarization / λΆ„ν•  λͺ¨λΈμ„ μ‚¬μš©ν•˜λ €λ©΄ 토큰을 지정해야 ν•©λ‹ˆλ‹€. 그렇지 μ•ŠμœΌλ©΄ 이 λͺ¨λΈ 없이 앱이 μ‹€ν–‰λ©λ‹ˆλ‹€. μ—¬κΈ°μ—μ„œ 토큰을 λ§Œλ“œλŠ” 방법을 배울 수 μžˆμŠ΅λ‹ˆλ‹€: https://huggingface.co/pyannote/speaker-diarization')
9
+ text_input = st.text_input("Enter your Hugging Face token: / Hugging Face 토큰을 μž…λ ₯ν•˜μ„Έμš”:", placeholder="ACCESS_TOKEN_GOES_HERE", type="password")
 
 
 
10
 
11
+ # Confirm or continue without the option
12
  col1, col2 = st.columns(2)
13
 
14
+ # Save changes button
15
  with col1:
16
+ confirm_btn = st.button("I have changed my token / 토큰을 λ³€κ²½ν–ˆμŠ΅λ‹ˆλ‹€", on_click=confirm_token_change, args=(text_input, 0), disabled=st.session_state["disable"])
17
+ # If the text has been changed, the button becomes clickable
18
+ if text_input != "ACCESS_TOKEN_GOES_HERE":
19
+ st.session_state["disable"] = False
 
 
20
 
21
+ # Continue without a token (there will be no diarization option)
22
  with col2:
23
+ dont_mind_btn = st.button("Continue without this option / 이 μ˜΅μ…˜ 없이 κ³„μ†ν•˜μ‹­μ‹œμ˜€", on_click=update_session_state, args=("page_index", 0))
 
24
 
25
+ if st.session_state['page_index'] == 0:
26
+ # Home page
27
+ choice = st.radio("Features / νŠΉμ§•", ["By a video URL / λΉ„λ””μ˜€ URL둜", "By uploading a file / νŒŒμΌμ„ μ—…λ‘œλ“œν•˜μ—¬"])
 
28
 
29
+ stt_tokenizer, stt_model, summarizer, dia_pipeline = load_models()
30
 
31
+ if choice == "By a video URL / λΉ„λ””μ˜€ URL둜":
32
+ transcript_from_url(stt_tokenizer, stt_model, summarizer, dia_pipeline)
 
 
33
 
34
+ elif choice == "By uploading a file / νŒŒμΌμ„ μ—…λ‘œλ“œν•˜μ—¬":
35
+ transcript_from_file(stt_tokenizer, stt_model, summarizer, dia_pipeline)
36
 
37
+ elif st.session_state['page_index'] == 1:
38
+ # Display Results page
39
+ display_results()
40
 
41
+ elif st.session_state['page_index'] == 2:
42
+ # Rename speakers page
43
+ rename_speakers_window()