Spaces:
Runtime error
Runtime error
Commit
·
6c164b4
0
Parent(s):
Duplicate from BatuhanYilmaz/Whisper-Auto-Subtitled-Video-Generator
Browse filesCo-authored-by: Batuhan Yilmaz <[email protected]>
- .streamlit/config.toml +8 -0
- 01_🎥_Input_YouTube_Link.py +258 -0
- LICENSE +21 -0
- README.md +13 -0
- languages.py +101 -0
- packages.txt +1 -0
- pages/02_📼_Upload_Video_File.py +230 -0
- pages/03_📝_Upload_Video_File_and_Transcript.py +130 -0
- pages/04_🔊_Upload_Audio_File.py +205 -0
- requirements.txt +9 -0
- utils.py +96 -0
.streamlit/config.toml
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[theme]
|
2 |
+
primaryColor="#F63366"
|
3 |
+
backgroundColor="#FFFFFF"
|
4 |
+
secondaryBackgroundColor="#F0F2F6"
|
5 |
+
textColor="#262730"
|
6 |
+
font="sans serif"
|
7 |
+
[server]
|
8 |
+
maxUploadSize=1028
|
01_🎥_Input_YouTube_Link.py
ADDED
@@ -0,0 +1,258 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import whisper
|
2 |
+
from pytube import YouTube
|
3 |
+
import requests
|
4 |
+
import time
|
5 |
+
import streamlit as st
|
6 |
+
from streamlit_lottie import st_lottie
|
7 |
+
import numpy as np
|
8 |
+
import os
|
9 |
+
from typing import Iterator
|
10 |
+
from io import StringIO
|
11 |
+
from utils import write_vtt, write_srt
|
12 |
+
import ffmpeg
|
13 |
+
from languages import LANGUAGES
|
14 |
+
|
15 |
+
st.set_page_config(page_title="Auto Subtitled Video Generator", page_icon=":movie_camera:", layout="wide")
|
16 |
+
|
17 |
+
# Define a function that we can use to load lottie files from a link.
|
18 |
+
@st.cache()
def load_lottieurl(url: str):
    """Fetch a Lottie animation definition from ``url``.

    Returns:
        The decoded JSON payload, or ``None`` when the request fails or the
        server answers with anything other than HTTP 200.
    """
    try:
        # A timeout keeps an unresponsive asset host from hanging the page.
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        # Same "no animation" signal the non-200 path already uses.
        return None
    if response.status_code != 200:
        return None
    return response.json()
|
24 |
+
|
25 |
+
col1, col2 = st.columns([1, 3])
|
26 |
+
with col1:
|
27 |
+
lottie = load_lottieurl("https://assets8.lottiefiles.com/packages/lf20_jh9gfdye.json")
|
28 |
+
st_lottie(lottie)
|
29 |
+
|
30 |
+
with col2:
|
31 |
+
st.write("""
|
32 |
+
## Auto Subtitled Video Generator
|
33 |
+
##### Input a YouTube video link and get a video with subtitles.
|
34 |
+
###### ➠ If you want to transcribe the video in its original language, select the task as "Transcribe"
|
35 |
+
###### ➠ If you want to translate the subtitles to English, select the task as "Translate"
|
36 |
+
###### I recommend starting with the base model and then experimenting with the larger models, the small and medium models often work well. """)
|
37 |
+
|
38 |
+
|
39 |
+
@st.cache(allow_output_mutation=True)
def populate_metadata(link):
    """Look up descriptive metadata for the YouTube video behind ``link``.

    Returns:
        A 6-tuple ``(author, title, description, thumbnail_url, length,
        views)`` read from the ``YouTube`` object.
    """
    video = YouTube(link)
    return (
        video.author,
        video.title,
        video.description,
        video.thumbnail_url,
        video.length,
        video.views,
    )
|
49 |
+
|
50 |
+
|
51 |
+
@st.cache(allow_output_mutation=True)
def download_video(link):
    """Download the highest-resolution progressive MP4 stream of a video.

    Args:
        link: URL of the YouTube video.

    Returns:
        Whatever ``Stream.download()`` returns for the selected stream.

    Raises:
        ValueError: If the video exposes no progressive MP4 stream.
    """
    candidates = YouTube(link).streams.filter(progressive=True, file_extension='mp4')
    best = candidates.order_by('resolution').desc().first()
    if best is None:
        # first() yields None on an empty query; fail with a clear message
        # instead of an AttributeError on None.download().
        raise ValueError("No progressive MP4 stream is available for this video.")
    return best.download()
|
56 |
+
|
57 |
+
|
58 |
+
def convert(seconds):
    """Format a number of seconds as an ``HH:MM:SS`` clock string.

    The value is interpreted through ``time.gmtime``, so only the
    time-of-day part of the resulting UTC timestamp is rendered.
    """
    moment = time.gmtime(seconds)
    return time.strftime("%H:%M:%S", moment)
|
60 |
+
|
61 |
+
|
62 |
+
loaded_model = whisper.load_model("base")
|
63 |
+
current_size = "None"
|
64 |
+
|
65 |
+
|
66 |
+
@st.cache(allow_output_mutation=True)
def change_model(current_size, size):
    """Load the Whisper model named by ``size``.

    Args:
        current_size: Size label considered to be loaded already.
        size: Size label of the model to load.

    Returns:
        The model returned by ``whisper.load_model(size)``.

    Raises:
        Exception: If ``size`` equals ``current_size``.
    """
    if current_size == size:
        raise Exception("Model size is the same as the current size.")
    return whisper.load_model(size)
|
73 |
+
|
74 |
+
|
75 |
+
@st.cache(allow_output_mutation=True)
def inference(link, loaded_model, task):
    """Download a video's audio track and run Whisper on it.

    Args:
        link: URL of the YouTube video.
        loaded_model: Model object exposing ``transcribe``.
        task: ``"Transcribe"`` or ``"Translate"``.

    Returns:
        A tuple ``(text, vtt, srt, language)`` built from the model result.

    Raises:
        ValueError: If ``task`` is neither ``"Transcribe"`` nor ``"Translate"``.
    """
    whisper_tasks = {"Transcribe": "transcribe", "Translate": "translate"}
    # Validate before downloading so an unsupported task fails immediately.
    if task not in whisper_tasks:
        raise ValueError("Task not supported")

    yt = YouTube(link)
    path = yt.streams.filter(only_audio=True)[0].download(filename="audio.mp3")

    results = loaded_model.transcribe(path, task=whisper_tasks[task], best_of=5)
    segments = results["segments"]
    vtt = getSubs(segments, "vtt", 80)
    srt = getSubs(segments, "srt", 80)
    return results["text"], vtt, srt, results["language"]
|
95 |
+
|
96 |
+
|
97 |
+
@st.cache(allow_output_mutation=True)
def getSubs(segments: Iterator[dict], format: str, maxLineWidth: int) -> str:
    """Render ``segments`` as subtitle text.

    Args:
        segments: Segments handed on to the subtitle writer.
        format: ``'vtt'`` or ``'srt'``.
        maxLineWidth: Forwarded unchanged to the writer.

    Returns:
        Everything the writer emitted, as one string.

    Raises:
        Exception: If ``format`` is neither ``'vtt'`` nor ``'srt'``.
    """
    if format not in ('vtt', 'srt'):
        raise Exception("Unknown format " + format)

    writer = write_vtt if format == 'vtt' else write_srt
    buffer = StringIO()
    writer(segments, file=buffer, maxLineWidth=maxLineWidth)
    return buffer.getvalue()
|
110 |
+
|
111 |
+
|
112 |
+
def get_language_code(language):
    """Map a language key to its entry in the ``LANGUAGES`` table.

    Raises:
        ValueError: If ``language`` is not a key of ``LANGUAGES``.
    """
    if language not in LANGUAGES:
        raise ValueError("Language not supported")
    return LANGUAGES[language]
|
118 |
+
|
119 |
+
|
120 |
+
def generate_subtitled_video(video, audio, transcript):
    """Burn subtitles into a video and pair it with an audio track.

    ffmpeg writes the result to ``final.mp4`` in the working directory,
    overwriting any previous output.

    Args:
        video: Path of the video input.
        audio: Path of the audio input.
        transcript: Path of the subtitle file given to the ``subtitles`` filter.

    Returns:
        A binary file-like object positioned at the start of the rendered
        MP4. It is an in-memory stream, so no handle on ``final.mp4`` stays
        open after the call.
    """
    # Local import: the module itself only imports StringIO from io.
    from io import BytesIO

    video_file = ffmpeg.input(video)
    audio_file = ffmpeg.input(audio)
    subtitled = video_file.filter("subtitles", transcript)
    ffmpeg.concat(subtitled, audio_file, v=1, a=1).output("final.mp4").run(quiet=True, overwrite_output=True)
    with open("final.mp4", "rb") as rendered:
        return BytesIO(rendered.read())
|
126 |
+
|
127 |
+
|
128 |
+
def _save_transcript(file_name, text):
    """Write ``text`` to ``file_name`` as UTF-8 and return the file's bytes."""
    path = os.path.join(os.getcwd(), file_name)
    with open(path, "w", encoding='utf8') as f:
        f.write(text)
    with open(path, "rb") as f:
        return f.read()


def main():
    """Render the YouTube-link page and run the selected Whisper task.

    Both tasks share one code path; only the button label and the task
    value passed to ``inference`` differ.
    """
    size = st.selectbox("Select Model Size (The larger the model, the more accurate the transcription will be, but it will take longer)", ["tiny", "base", "small", "medium", "large"], index=1)
    loaded_model = change_model(current_size, size)
    st.write(f"Model is {'multilingual' if loaded_model.is_multilingual else 'English-only'} "
             f"and has {sum(np.prod(p.shape) for p in loaded_model.parameters()):,} parameters.")
    link = st.text_input("YouTube Link (The longer the video, the longer the processing time)")
    task = st.selectbox("Select Task", ["Transcribe", "Translate"], index=0)

    button_labels = {"Transcribe": "Transcribe", "Translate": "Translate to English"}
    if task not in button_labels:
        st.error("Please select a task.")
        return
    if not st.button(button_labels[task]):
        return
    if not link.strip():
        # Without a link the YouTube lookup below can only fail.
        st.warning("Please enter a YouTube link first.")
        return

    title = populate_metadata(link)[1]
    transcript, vtt, srt, lang = inference(link, loaded_model, task)
    video = download_video(link)
    # Raises ValueError when the detected language is missing from LANGUAGES.
    get_language_code(lang)

    col3, col4 = st.columns(2)
    col5, col6, col7, col8 = st.columns(4)
    col9, col10 = st.columns(2)
    with col3:
        st.video(video)

    # The files are written to disk because transcript.srt is later handed
    # to generate_subtitled_video by path.
    payloads = {
        ext: _save_transcript(f"transcript.{ext}", text)
        for ext, text in (("txt", transcript), ("vtt", vtt), ("srt", srt))
    }
    for column, ext in ((col5, "txt"), (col6, "vtt"), (col7, "srt")):
        with column:
            st.download_button(label=f"Download Transcript (.{ext})",
                               data=payloads[ext],
                               file_name=f"transcript.{ext}")
    with col9:
        st.success("You can download the transcript in .srt format, edit it (if you need to) and upload it to YouTube to create subtitles for your video.")
    with col10:
        st.info("Streamlit refreshes after the download button is clicked. The data is cached so you can download the transcript again without having to transcribe the video again.")

    with col4:
        with st.spinner("Generating Subtitled Video"):
            video_with_subs = generate_subtitled_video(video, "audio.mp3", "transcript.srt")
        st.video(video_with_subs)
        st.balloons()
    with col8:
        st.download_button(label="Download Subtitled Video",
                           data=video_with_subs,
                           file_name=f"{title} with subtitles.mp4")
|
254 |
+
|
255 |
+
|
256 |
+
if __name__ == "__main__":
|
257 |
+
main()
|
258 |
+
st.markdown("###### Made with :heart: by [@BatuhanYılmaz](https://twitter.com/batuhan3326) [![this is an image link](https://i.imgur.com/thJhzOO.png)](https://www.buymeacoffee.com/batuhanylmz)")
|
LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2022 Batuhan Yılmaz
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
README.md
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Whisper-Auto-Subtitled-Video-Generator
|
3 |
+
emoji: 🎥
|
4 |
+
colorFrom: blue
|
5 |
+
colorTo: purple
|
6 |
+
sdk: streamlit
|
7 |
+
sdk_version: 1.10.0
|
8 |
+
app_file: 01_🎥_Input_YouTube_Link.py
|
9 |
+
pinned: false
|
10 |
+
duplicated_from: BatuhanYilmaz/Whisper-Auto-Subtitled-Video-Generator
|
11 |
+
---
|
12 |
+
|
13 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
languages.py
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
LANGUAGES = {
|
2 |
+
"en": "eng",
|
3 |
+
"zh": "zho",
|
4 |
+
"de": "deu",
|
5 |
+
"es": "spa",
|
6 |
+
"ru": "rus",
|
7 |
+
"ko": "kor",
|
8 |
+
"fr": "fra",
|
9 |
+
"ja": "jpn",
|
10 |
+
"pt": "por",
|
11 |
+
"tr": "tur",
|
12 |
+
"pl": "pol",
|
13 |
+
"ca": "cat",
|
14 |
+
"nl": "nld",
|
15 |
+
"ar": "ara",
|
16 |
+
"sv": "swe",
|
17 |
+
"it": "ita",
|
18 |
+
"id": "ind",
|
19 |
+
"hi": "hin",
|
20 |
+
"fi": "fin",
|
21 |
+
"vi": "vie",
|
22 |
+
"iw": "heb",
|
23 |
+
"uk": "ukr",
|
24 |
+
"el": "ell",
|
25 |
+
"ms": "msa",
|
26 |
+
"cs": "ces",
|
27 |
+
"ro": "ron",
|
28 |
+
"da": "dan",
|
29 |
+
"hu": "hun",
|
30 |
+
"ta": "tam",
|
31 |
+
"no": "nor",
|
32 |
+
"th": "tha",
|
33 |
+
"ur": "urd",
|
34 |
+
"hr": "hrv",
|
35 |
+
"bg": "bul",
|
36 |
+
"lt": "lit",
|
37 |
+
"la": "lat",
|
38 |
+
"mi": "mri",
|
39 |
+
"ml": "mal",
|
40 |
+
"cy": "cym",
|
41 |
+
"sk": "slk",
|
42 |
+
"te": "tel",
|
43 |
+
"fa": "fas",
|
44 |
+
"lv": "lav",
|
45 |
+
"bn": "ben",
|
46 |
+
"sr": "srp",
|
47 |
+
"az": "aze",
|
48 |
+
"sl": "slv",
|
49 |
+
"kn": "kan",
|
50 |
+
"et": "est",
|
51 |
+
"mk": "mkd",
|
52 |
+
"br": "bre",
|
53 |
+
"eu": "eus",
|
54 |
+
"is": "isl",
|
55 |
+
"hy": "hye",
|
56 |
+
"ne": "nep",
|
57 |
+
"mn": "mon",
|
58 |
+
"bs": "bos",
|
59 |
+
"kk": "kaz",
|
60 |
+
"sq": "sqi",
|
61 |
+
"sw": "swa",
|
62 |
+
"gl": "glg",
|
63 |
+
"mr": "mar",
|
64 |
+
"pa": "pan",
|
65 |
+
"si": "sin",
|
66 |
+
"km": "khm",
|
67 |
+
"sn": "sna",
|
68 |
+
"yo": "yor",
|
69 |
+
"so": "som",
|
70 |
+
"af": "afr",
|
71 |
+
"oc": "oci",
|
72 |
+
"ka": "kat",
|
73 |
+
"be": "bel",
|
74 |
+
"tg": "tgk",
|
75 |
+
"sd": "snd",
|
76 |
+
"gu": "guj",
|
77 |
+
"am": "amh",
|
78 |
+
"yi": "yid",
|
79 |
+
"lo": "lao",
|
80 |
+
"uz": "uzb",
|
81 |
+
"fo": "fao",
|
82 |
+
"ht": "hat",
|
83 |
+
"ps": "pus",
|
84 |
+
"tk": "tuk",
|
85 |
+
"nn": "nno",
|
86 |
+
"mt": "mlt",
|
87 |
+
"sa": "san",
|
88 |
+
"lb": "ltz",
|
89 |
+
"my": "mya",
|
90 |
+
"bo": "bod",
|
91 |
+
"tl": "tgl",
|
92 |
+
"mg": "mlg",
|
93 |
+
"as": "asm",
|
94 |
+
"tt": "tat",
|
95 |
+
"haw": "haw",
|
96 |
+
"ln": "lin",
|
97 |
+
"ha": "hau",
|
98 |
+
"ba": "bak",
|
99 |
+
"jw": "jav",
|
100 |
+
"su": "sun",
|
101 |
+
}
|
packages.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
ffmpeg
|
pages/02_📼_Upload_Video_File.py
ADDED
@@ -0,0 +1,230 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import whisper
|
2 |
+
import streamlit as st
|
3 |
+
from streamlit_lottie import st_lottie
|
4 |
+
from utils import write_vtt, write_srt
|
5 |
+
import ffmpeg
|
6 |
+
import requests
|
7 |
+
from typing import Iterator
|
8 |
+
from io import StringIO
|
9 |
+
import numpy as np
|
10 |
+
import pathlib
|
11 |
+
import os
|
12 |
+
|
13 |
+
st.set_page_config(page_title="Auto Subtitled Video Generator", page_icon=":movie_camera:", layout="wide")
|
14 |
+
|
15 |
+
# Define a function that we can use to load lottie files from a link.
|
16 |
+
@st.cache(allow_output_mutation=True)
def load_lottieurl(url: str):
    """Fetch a Lottie animation definition from ``url``.

    Returns:
        The decoded JSON payload, or ``None`` when the request fails or the
        server answers with anything other than HTTP 200.
    """
    try:
        # A timeout keeps an unresponsive asset host from hanging the page.
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        # Same "no animation" signal the non-200 path already uses.
        return None
    if response.status_code != 200:
        return None
    return response.json()
|
22 |
+
|
23 |
+
|
24 |
+
APP_DIR = pathlib.Path(__file__).parent.absolute()
|
25 |
+
|
26 |
+
LOCAL_DIR = APP_DIR / "local"
|
27 |
+
LOCAL_DIR.mkdir(exist_ok=True)
|
28 |
+
save_dir = LOCAL_DIR / "output"
|
29 |
+
save_dir.mkdir(exist_ok=True)
|
30 |
+
|
31 |
+
|
32 |
+
loaded_model = whisper.load_model("base")
|
33 |
+
current_size = "None"
|
34 |
+
|
35 |
+
|
36 |
+
col1, col2 = st.columns([1, 3])
|
37 |
+
with col1:
|
38 |
+
lottie = load_lottieurl("https://assets1.lottiefiles.com/packages/lf20_HjK9Ol.json")
|
39 |
+
st_lottie(lottie)
|
40 |
+
|
41 |
+
with col2:
|
42 |
+
st.write("""
|
43 |
+
## Auto Subtitled Video Generator
|
44 |
+
##### Upload a video file and get a video with subtitles.
|
45 |
+
###### ➠ If you want to transcribe the video in its original language, select the task as "Transcribe"
|
46 |
+
###### ➠ If you want to translate the subtitles to English, select the task as "Translate"
|
47 |
+
###### I recommend starting with the base model and then experimenting with the larger models, the small and medium models often work well. """)
|
48 |
+
|
49 |
+
|
50 |
+
@st.cache(allow_output_mutation=True)
def change_model(current_size, size):
    """Load the Whisper model named by ``size``.

    Args:
        current_size: Size label considered to be loaded already.
        size: Size label of the model to load.

    Returns:
        The model returned by ``whisper.load_model(size)``.

    Raises:
        Exception: If ``size`` equals ``current_size``.
    """
    if current_size == size:
        raise Exception("Model size is the same as the current size.")
    return whisper.load_model(size)
|
57 |
+
|
58 |
+
|
59 |
+
@st.cache(allow_output_mutation=True)
def inferecence(loaded_model, uploaded_file, task):
    """Save an uploaded video, extract its audio and run Whisper on it.

    The upload is written to ``input.mp4`` and its audio to ``output.wav``,
    both inside ``save_dir``.

    Args:
        loaded_model: Model object exposing ``transcribe``.
        uploaded_file: Readable binary object holding the video.
        task: ``"Transcribe"`` or ``"Translate"``.

    Returns:
        A tuple ``(text, vtt, srt, language)`` built from the model result.

    Raises:
        ValueError: If ``task`` is neither ``"Transcribe"`` nor ``"Translate"``.
    """
    whisper_tasks = {"Transcribe": "transcribe", "Translate": "translate"}
    # Validate before touching the disk or ffmpeg so a bad task fails fast.
    if task not in whisper_tasks:
        raise ValueError("Task not supported")

    video_path = f"{save_dir}/input.mp4"
    audio_path = f"{save_dir}/output.wav"
    with open(video_path, "wb") as f:
        f.write(uploaded_file.read())
    audio = ffmpeg.input(video_path)
    audio = ffmpeg.output(audio, audio_path, acodec="pcm_s16le", ac=1, ar="16k")
    ffmpeg.run(audio, overwrite_output=True)

    results = loaded_model.transcribe(audio_path, task=whisper_tasks[task], best_of=5)
    segments = results["segments"]
    vtt = getSubs(segments, "vtt", 80)
    srt = getSubs(segments, "srt", 80)
    return results["text"], vtt, srt, results["language"]
|
82 |
+
|
83 |
+
|
84 |
+
def getSubs(segments: Iterator[dict], format: str, maxLineWidth: int) -> str:
    """Render ``segments`` as subtitle text.

    Args:
        segments: Segments handed on to the subtitle writer.
        format: ``'vtt'`` or ``'srt'``.
        maxLineWidth: Forwarded unchanged to the writer.

    Returns:
        Everything the writer emitted, as one string.

    Raises:
        Exception: If ``format`` is neither ``'vtt'`` nor ``'srt'``.
    """
    if format not in ('vtt', 'srt'):
        raise Exception("Unknown format " + format)

    writer = write_vtt if format == 'vtt' else write_srt
    buffer = StringIO()
    writer(segments, file=buffer, maxLineWidth=maxLineWidth)
    return buffer.getvalue()
|
96 |
+
|
97 |
+
|
98 |
+
def generate_subtitled_video(video, audio, transcript):
    """Burn subtitles into a video and pair it with an audio track.

    ffmpeg writes the result to ``final.mp4`` in the working directory,
    overwriting any previous output.

    Args:
        video: Path of the video input.
        audio: Path of the audio input.
        transcript: Path of the subtitle file given to the ``subtitles`` filter.

    Returns:
        A binary file-like object positioned at the start of the rendered
        MP4. It is an in-memory stream, so no handle on ``final.mp4`` stays
        open after the call.
    """
    # Local import: the module itself only imports StringIO from io.
    from io import BytesIO

    video_file = ffmpeg.input(video)
    audio_file = ffmpeg.input(audio)
    subtitled = video_file.filter("subtitles", transcript)
    ffmpeg.concat(subtitled, audio_file, v=1, a=1).output("final.mp4").run(quiet=True, overwrite_output=True)
    with open("final.mp4", "rb") as rendered:
        return BytesIO(rendered.read())
|
104 |
+
|
105 |
+
|
106 |
+
def _save_transcript(file_name, text):
    """Write ``text`` to ``file_name`` as UTF-8 and return the file's bytes."""
    path = os.path.join(os.getcwd(), file_name)
    with open(path, "w", encoding='utf8') as f:
        f.write(text)
    with open(path, "rb") as f:
        return f.read()


def main():
    """Render the video-upload page and run the selected Whisper task.

    Both tasks share one code path; only the button label and the task
    value passed to ``inferecence`` differ.
    """
    size = st.selectbox("Select Model Size (The larger the model, the more accurate the transcription will be, but it will take longer)", ["tiny", "base", "small", "medium", "large"], index=1)
    loaded_model = change_model(current_size, size)
    st.write(f"Model is {'multilingual' if loaded_model.is_multilingual else 'English-only'} "
             f"and has {sum(np.prod(p.shape) for p in loaded_model.parameters()):,} parameters.")
    input_file = st.file_uploader("File", type=["mp4", "avi", "mov", "mkv"])
    task = st.selectbox("Select Task", ["Transcribe", "Translate"], index=0)

    button_labels = {"Transcribe": "Transcribe", "Translate": "Translate to English"}
    if task not in button_labels:
        st.error("Please select a task.")
        return
    if not st.button(button_labels[task]):
        return
    if input_file is None:
        # Nothing to process yet; inferecence would fail on None.read().
        st.warning("Please upload a video file first.")
        return

    # Strip the extension, whatever its length, for the download file name.
    filename = os.path.splitext(input_file.name)[0]

    transcript, vtt, srt, _ = inferecence(loaded_model, input_file, task)

    col3, col4 = st.columns(2)
    col5, col6, col7, col8 = st.columns(4)
    col9, col10 = st.columns(2)
    with col3:
        st.video(input_file)

    # The files are written to disk because transcript.srt is later handed
    # to generate_subtitled_video by path.
    payloads = {
        ext: _save_transcript(f"transcript.{ext}", text)
        for ext, text in (("txt", transcript), ("vtt", vtt), ("srt", srt))
    }
    for column, ext in ((col5, "txt"), (col6, "vtt"), (col7, "srt")):
        with column:
            st.download_button(label=f"Download Transcript (.{ext})",
                               data=payloads[ext],
                               file_name=f"transcript.{ext}")
    with col9:
        st.success("You can download the transcript in .srt format, edit it (if you need to) and upload it to YouTube to create subtitles for your video.")
    with col10:
        st.info("Streamlit refreshes after the download button is clicked. The data is cached so you can download the transcript again without having to transcribe the video again.")

    with col4:
        with st.spinner("Generating Subtitled Video"):
            video_with_subs = generate_subtitled_video(f"{save_dir}/input.mp4", f"{save_dir}/output.wav", "transcript.srt")
        st.video(video_with_subs)
        st.snow()
    with col8:
        st.download_button(label="Download Video with Subtitles",
                           data=video_with_subs,
                           file_name=f"{filename}_with_subs.mp4")
|
226 |
+
|
227 |
+
|
228 |
+
if __name__ == "__main__":
|
229 |
+
main()
|
230 |
+
st.markdown("###### Made with :heart: by [@BatuhanYılmaz](https://twitter.com/batuhan3326) [![this is an image link](https://i.imgur.com/thJhzOO.png)](https://www.buymeacoffee.com/batuhanylmz)")
|
pages/03_📝_Upload_Video_File_and_Transcript.py
ADDED
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from streamlit_lottie import st_lottie
|
3 |
+
from utils import write_vtt, write_srt
|
4 |
+
import ffmpeg
|
5 |
+
import requests
|
6 |
+
from typing import Iterator
|
7 |
+
from io import StringIO
|
8 |
+
import numpy as np
|
9 |
+
import pathlib
|
10 |
+
import os
|
11 |
+
|
12 |
+
|
13 |
+
st.set_page_config(page_title="Auto Subtitled Video Generator", page_icon=":movie_camera:", layout="wide")
|
14 |
+
|
15 |
+
# Define a function that we can use to load lottie files from a link.
|
16 |
+
@st.cache(allow_output_mutation=True)
def load_lottieurl(url: str):
    """Fetch a Lottie animation definition from ``url``.

    Returns:
        The decoded JSON payload, or ``None`` when the request fails or the
        server answers with anything other than HTTP 200.
    """
    try:
        # A timeout keeps an unresponsive asset host from hanging the page.
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        # Same "no animation" signal the non-200 path already uses.
        return None
    if response.status_code != 200:
        return None
    return response.json()
|
22 |
+
|
23 |
+
|
24 |
+
APP_DIR = pathlib.Path(__file__).parent.absolute()
|
25 |
+
|
26 |
+
LOCAL_DIR = APP_DIR / "local_transcript"
|
27 |
+
LOCAL_DIR.mkdir(exist_ok=True)
|
28 |
+
save_dir = LOCAL_DIR / "output"
|
29 |
+
save_dir.mkdir(exist_ok=True)
|
30 |
+
|
31 |
+
|
32 |
+
col1, col2 = st.columns([1, 3])
|
33 |
+
with col1:
|
34 |
+
lottie = load_lottieurl("https://assets6.lottiefiles.com/packages/lf20_cjnxwrkt.json")
|
35 |
+
st_lottie(lottie)
|
36 |
+
|
37 |
+
with col2:
|
38 |
+
st.write("""
|
39 |
+
## Auto Subtitled Video Generator
|
40 |
+
##### ➠ Upload a video file and a transcript as .srt or .vtt file and get a video with subtitles.
|
41 |
+
##### ➠ Processing time will increase as the video length increases. """)
|
42 |
+
|
43 |
+
|
44 |
+
def getSubs(segments: Iterator[dict], format: str, maxLineWidth: int) -> str:
    """Render ``segments`` as subtitle text.

    Args:
        segments: Segments handed on to the subtitle writer.
        format: ``'vtt'`` or ``'srt'``.
        maxLineWidth: Forwarded unchanged to the writer.

    Returns:
        Everything the writer emitted, as one string.

    Raises:
        Exception: If ``format`` is neither ``'vtt'`` nor ``'srt'``.
    """
    if format not in ('vtt', 'srt'):
        raise Exception("Unknown format " + format)

    writer = write_vtt if format == 'vtt' else write_srt
    buffer = StringIO()
    writer(segments, file=buffer, maxLineWidth=maxLineWidth)
    return buffer.getvalue()
|
56 |
+
|
57 |
+
|
58 |
+
def split_video_audio(uploaded_file):
    """Store an uploaded video and extract its audio track with ffmpeg.

    The upload is written to ``input.mp4`` inside ``save_dir``; ffmpeg then
    writes ``output.wav`` next to it (``pcm_s16le`` codec, ``ac=1``,
    ``ar=16k``), overwriting any previous output.
    """
    video_path = f"{save_dir}/input.mp4"
    audio_path = f"{save_dir}/output.wav"

    with open(video_path, "wb") as target:
        target.write(uploaded_file.read())

    extraction = ffmpeg.output(
        ffmpeg.input(video_path),
        audio_path,
        acodec="pcm_s16le",
        ac=1,
        ar="16k",
    )
    ffmpeg.run(extraction, overwrite_output=True)
|
64 |
+
|
65 |
+
|
66 |
+
def _burn_subtitles(transcript_path):
    """Render ``final.mp4`` by burning *transcript_path* into the saved input video."""
    video_file = ffmpeg.input(f"{save_dir}/input.mp4")
    audio_file = ffmpeg.input(f"{save_dir}/output.wav")
    ffmpeg.concat(
        video_file.filter("subtitles", transcript_path), audio_file, v=1, a=1
    ).output("final.mp4").run(quiet=True, overwrite_output=True)


def main():
    """Drive the page: collect a video and a transcript, then build the subtitled video.

    The transcript upload is saved as ``uploaded_transcript.<ext>`` in the
    working directory. When the button is pressed, the video is split with
    ``split_video_audio``, the subtitles are burned in, and the original and
    subtitled videos are shown side by side with a download button.
    """
    uploaded_video = st.file_uploader("Upload Video File", type=["mp4", "avi", "mov", "mkv"])
    transcript_file = st.file_uploader("Upload Transcript File", type=["srt", "vtt"])

    if uploaded_video is None or transcript_file is None:
        st.info("Please upload a video file and a transcript file")
        return

    # Compare the extension case-insensitively so "SUBS.SRT" is accepted too.
    extension = pathlib.Path(transcript_file.name).suffix.lower().lstrip(".")
    if extension not in ("srt", "vtt"):
        st.error("Please upload a .srt or .vtt file")
        return

    transcript_path = f"uploaded_transcript.{extension}"
    with open(transcript_path, "wb") as f:
        f.write(transcript_file.read())

    if not st.button("Generate Video with Subtitles"):
        return

    with st.spinner("Generating Subtitled Video"):
        split_video_audio(uploaded_video)
        _burn_subtitles(transcript_path)

    # Read the result into memory so no file handle is left open across reruns.
    with open("final.mp4", "rb") as f:
        video_with_subs = f.read()

    filename = pathlib.Path(uploaded_video.name).stem
    col3, col4 = st.columns(2)
    with col3:
        st.video(uploaded_video)
    with col4:
        st.video(video_with_subs)
        st.download_button(label="Download Video with Subtitles",
                           data=video_with_subs,
                           file_name=f"{filename}_with_subs.mp4")
|
126 |
+
|
127 |
+
|
128 |
+
# Build the page when this file is executed as a script.
if __name__ == "__main__":
    main()
|
130 |
+
|
pages/04_🔊_Upload_Audio_File.py
ADDED
@@ -0,0 +1,205 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import whisper
|
2 |
+
import streamlit as st
|
3 |
+
from streamlit_lottie import st_lottie
|
4 |
+
from utils import write_vtt, write_srt
|
5 |
+
import ffmpeg
|
6 |
+
import requests
|
7 |
+
from typing import Iterator
|
8 |
+
from io import StringIO
|
9 |
+
import numpy as np
|
10 |
+
import pathlib
|
11 |
+
import os
|
12 |
+
|
13 |
+
# Configure the browser tab title and icon, and use the wide page layout.
st.set_page_config(page_title="Auto Transcriber", page_icon="🔊", layout="wide")
|
14 |
+
|
15 |
+
# Define a function that we can use to load lottie files from a link.
|
16 |
+
@st.cache(allow_output_mutation=True)
def load_lottieurl(url: str):
    """Download a Lottie animation and return its decoded JSON.

    Args:
        url: Address of the Lottie JSON document.

    Returns:
        The parsed JSON payload, or ``None`` when the request fails, the
        server answers with a status other than 200, or the body is not
        valid JSON.
    """
    try:
        # A timeout keeps a stalled animation host from hanging the page.
        r = requests.get(url, timeout=10)
    except requests.RequestException:
        return None
    if r.status_code != 200:
        return None
    try:
        return r.json()
    except ValueError:
        # The body was not JSON; treat it like any other failed download.
        return None
|
22 |
+
|
23 |
+
|
24 |
+
# Working directories live next to this script and are created if missing.
APP_DIR = pathlib.Path(__file__).parent.absolute()

LOCAL_DIR = APP_DIR / "local_audio"
LOCAL_DIR.mkdir(exist_ok=True)
# inferecence() writes input.mp3 and output.wav into this directory.
save_dir = LOCAL_DIR / "output"
save_dir.mkdir(exist_ok=True)


# Page header: animation in the narrow left column, title and usage notes on the right.
col1, col2 = st.columns([1, 3])
with col1:
    lottie = load_lottieurl("https://assets1.lottiefiles.com/packages/lf20_1xbk4d2v.json")
    st_lottie(lottie)

with col2:
    st.write("""
## Auto Transcriber
##### Input an audio file and get a transcript.
###### ➠ If you want to transcribe the audio in its original language, select the task as "Transcribe"
###### ➠ If you want to translate the transcription to English, select the task as "Translate"
###### I recommend starting with the base model and then experimenting with the larger models, the small and medium models often work well. """)

# Module-level default model; main() binds its own local loaded_model via change_model().
loaded_model = whisper.load_model("base")
# Passed to change_model() as the "previous" size; it matches none of the selectable sizes.
current_size = "None"
|
47 |
+
|
48 |
+
|
49 |
+
@st.cache(allow_output_mutation=True)
def change_model(current_size, size):
    """Load and return the Whisper model of the requested size.

    Args:
        current_size: Size label considered to be loaded already.
        size: Size label of the model to load.

    Returns:
        The model returned by ``whisper.load_model(size)``.

    Raises:
        Exception: If ``size`` equals ``current_size``.
    """
    if current_size == size:
        raise Exception("Model size is the same as the current size.")
    return whisper.load_model(size)
|
56 |
+
|
57 |
+
@st.cache(allow_output_mutation=True)
def inferecence(loaded_model, uploaded_file, task):
    """Transcribe or translate an uploaded audio file with a Whisper model.

    The upload is saved as ``input.mp3`` in ``save_dir`` and converted by
    ffmpeg to ``output.wav`` (``pcm_s16le`` codec, one channel, 16k sample
    rate) before being passed to ``loaded_model.transcribe``.

    Args:
        loaded_model: Model object exposing ``transcribe``.
        uploaded_file: File-like upload whose ``read()`` yields the audio bytes.
        task: ``"Transcribe"`` or ``"Translate"``.

    Returns:
        A tuple ``(text, vtt, srt, language)`` built from the model result.

    Raises:
        ValueError: If ``task`` is not one of the supported values.
    """
    whisper_tasks = {"Transcribe": "transcribe", "Translate": "translate"}
    # Reject an unknown task before spending time on file I/O and ffmpeg.
    if task not in whisper_tasks:
        raise ValueError("Task not supported")

    source_path = f"{save_dir}/input.mp3"
    wav_path = f"{save_dir}/output.wav"
    with open(source_path, "wb") as f:
        f.write(uploaded_file.read())
    audio = ffmpeg.input(source_path)
    audio = ffmpeg.output(audio, wav_path, acodec="pcm_s16le", ac=1, ar="16k")
    ffmpeg.run(audio, overwrite_output=True)

    options = dict(task=whisper_tasks[task], best_of=5)
    results = loaded_model.transcribe(wav_path, **options)
    vtt = getSubs(results["segments"], "vtt", 80)
    srt = getSubs(results["segments"], "srt", 80)
    return results["text"], vtt, srt, results["language"]
|
80 |
+
|
81 |
+
|
82 |
+
def getSubs(segments: Iterator[dict], format: str, maxLineWidth: int) -> str:
    """Render transcript segments as subtitle text.

    Args:
        segments: Transcript segments handed to the subtitle writer.
        format: Either ``'vtt'`` or ``'srt'``.
        maxLineWidth: Line width forwarded to the subtitle writer.

    Returns:
        The complete subtitle document as a string.

    Raises:
        Exception: If ``format`` is neither ``'vtt'`` nor ``'srt'``.
    """
    if format not in ('vtt', 'srt'):
        raise Exception("Unknown format " + format)

    buffer = StringIO()
    writer = write_vtt if format == 'vtt' else write_srt
    writer(segments, file=buffer, maxLineWidth=maxLineWidth)
    return buffer.getvalue()
|
94 |
+
|
95 |
+
|
96 |
+
def _transcript_bytes(path, text):
    """Write *text* to *path* as UTF-8 and return the bytes stored on disk."""
    target = pathlib.Path(path)
    target.write_text(text, encoding="utf8")
    return target.read_bytes()


def main():
    """Drive the page: pick a model and task, run Whisper, and offer downloads.

    After the action button is pressed, the uploaded audio is passed to
    ``inferecence`` and the resulting text, VTT and SRT transcripts are
    written to ``transcript.txt`` / ``.vtt`` / ``.srt`` in the working
    directory and exposed through download buttons.
    """
    size = st.selectbox("Select Model Size (The larger the model, the more accurate the transcription will be, but it will take longer)", ["tiny", "base", "small", "medium", "large"], index=1)
    loaded_model = change_model(current_size, size)
    st.write(f"Model is {'multilingual' if loaded_model.is_multilingual else 'English-only'} "
             f"and has {sum(np.prod(p.shape) for p in loaded_model.parameters()):,} parameters.")
    input_file = st.file_uploader("Upload an audio file", type=["mp3", "wav", "m4a"])
    task = st.selectbox("Select Task", ["Transcribe", "Translate"], index=0)

    # Each task only differs by the label on its action button.
    button_labels = {"Transcribe": "Transcribe", "Translate": "Translate to English"}
    if task not in button_labels:
        st.error("Please select a task.")
        return
    if not st.button(button_labels[task]):
        return
    # Without this guard a click with no upload fails inside inferecence().
    if input_file is None:
        st.error("Please upload an audio file before starting.")
        return

    results = inferecence(loaded_model, input_file, task)
    col3, _ = st.columns(2)
    col5, col6, col7 = st.columns(3)
    col9, col10 = st.columns(2)

    with col3:
        st.audio(input_file)

    downloads = (
        (col5, "Download Transcript (.txt)", "transcript.txt", results[0]),
        (col6, "Download Transcript (.vtt)", "transcript.vtt", results[1]),
        (col7, "Download Transcript (.srt)", "transcript.srt", results[2]),
    )
    for column, label, file_name, text in downloads:
        data = _transcript_bytes(file_name, text)
        with column:
            st.download_button(label=label,
                               data=data,
                               file_name=file_name)

    with col9:
        st.success("You can download the transcript in .srt format, edit it (if you need to) and upload it to YouTube to create subtitles for your video.")
    with col10:
        st.info("Streamlit refreshes after the download button is clicked. The data is cached so you can download the transcript again without having to transcribe the video again.")
|
201 |
+
|
202 |
+
|
203 |
+
# Build the page when this file is executed as a script, then render the author credit.
if __name__ == "__main__":
    main()
    # NOTE(review): the footer is assumed to sit under the guard; confirm against the original indentation.
    st.markdown("###### Made with :heart: by [@BatuhanYılmaz](https://twitter.com/batuhan3326) [![this is an image link](https://i.imgur.com/thJhzOO.png)](https://www.buymeacoffee.com/batuhanylmz)")
|
requirements.txt
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
git+https://github.com/openai/whisper.git
|
2 |
+
ffmpeg==1.4
|
3 |
+
ffmpeg_python==0.2.0
|
4 |
+
numpy==1.23.3
|
5 |
+
pytube==12.1.0
|
6 |
+
requests==2.28.1
|
7 |
+
streamlit==1.13.0
|
8 |
+
streamlit_lottie==0.0.3
|
9 |
+
whisper
|
utils.py
ADDED
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import textwrap
|
2 |
+
import zlib
|
3 |
+
from typing import Iterator, TextIO
|
4 |
+
|
5 |
+
|
6 |
+
def exact_div(x, y):
    """Return ``x // y``, asserting that ``y`` divides ``x`` with no remainder."""
    quotient, remainder = divmod(x, y)
    assert remainder == 0
    return quotient
|
9 |
+
|
10 |
+
|
11 |
+
def str2bool(string):
    """Convert the exact strings ``"True"`` / ``"False"`` to booleans.

    Raises:
        ValueError: If ``string`` is anything else.
    """
    choices = {"True": True, "False": False}
    if string not in choices:
        raise ValueError(f"Expected one of {set(choices.keys())}, got {string}")
    return choices[string]
|
17 |
+
|
18 |
+
|
19 |
+
def optional_int(string):
    """Parse ``string`` as an int, mapping the literal ``"None"`` to ``None``."""
    if string == "None":
        return None
    return int(string)
|
21 |
+
|
22 |
+
|
23 |
+
def optional_float(string):
    """Parse ``string`` as a float, mapping the literal ``"None"`` to ``None``."""
    if string == "None":
        return None
    return float(string)
|
25 |
+
|
26 |
+
|
27 |
+
def compression_ratio(text) -> float:
    """Return the zlib compression ratio of ``text``.

    Both sizes are measured in UTF-8 bytes, so multi-byte characters are
    weighted the same way in the numerator and the denominator.

    Args:
        text: String to measure.

    Returns:
        Encoded byte length divided by compressed byte length.
    """
    text_bytes = text.encode("utf-8")
    return len(text_bytes) / len(zlib.compress(text_bytes))
|
29 |
+
|
30 |
+
|
31 |
+
def format_timestamp(seconds: float, always_include_hours: bool = False, fractionalSeperator: str = '.'):
    """Format a duration in seconds as ``[HH:]MM:SS<sep>mmm``.

    Args:
        seconds: Non-negative duration; rounded to the nearest millisecond.
        always_include_hours: Emit the hours field even when it is zero.
        fractionalSeperator: Text placed between seconds and milliseconds.

    Returns:
        The formatted timestamp string.
    """
    assert seconds >= 0, "non-negative timestamp expected"
    total_ms = round(seconds * 1000.0)

    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    whole_seconds, millis = divmod(remainder, 1_000)

    prefix = ""
    if always_include_hours or hours > 0:
        prefix = f"{hours:02d}:"
    return f"{prefix}{minutes:02d}:{whole_seconds:02d}{fractionalSeperator}{millis:03d}"
|
46 |
+
|
47 |
+
|
48 |
+
def write_txt(transcript: Iterator[dict], file: TextIO):
    """Write the stripped text of each segment to ``file``, one segment per line."""
    stripped_lines = (segment['text'].strip() for segment in transcript)
    for line in stripped_lines:
        print(line, file=file, flush=True)
|
51 |
+
|
52 |
+
|
53 |
+
def write_vtt(transcript: Iterator[dict], file: TextIO, maxLineWidth=None):
    """Write a transcript to ``file`` in WebVTT format.

    Args:
        transcript: Segments with ``'start'``, ``'end'`` and ``'text'`` keys.
        file: Text stream that receives the output.
        maxLineWidth: Optional wrap width passed to ``processText``.
    """
    print("WEBVTT\n", file=file)
    for segment in transcript:
        # Strip surrounding whitespace, as write_srt does, so cues do not
        # start with the segment's leading space. "-->" is reserved for the
        # cue timing line, so it is softened inside the cue text.
        text = processText(segment['text'].strip(), maxLineWidth).replace('-->', '->')

        print(
            f"{format_timestamp(segment['start'])} --> {format_timestamp(segment['end'])}\n"
            f"{text}\n",
            file=file,
            flush=True,
        )
|
64 |
+
|
65 |
+
|
66 |
+
def write_srt(transcript: Iterator[dict], file: TextIO, maxLineWidth=None):
    """Write a transcript to ``file`` as numbered SRT cues.

    Each segment becomes one cue: a 1-based index, a timing line using
    ``HH:MM:SS,mmm`` timestamps, and the stripped (optionally wrapped) text.

    Args:
        transcript: Segments with ``'start'``, ``'end'`` and ``'text'`` keys.
        file: Text stream that receives the output.
        maxLineWidth: Optional wrap width passed to ``processText``.
    """
    cue_number = 0
    for segment in transcript:
        cue_number += 1
        body = processText(segment['text'].strip(), maxLineWidth).replace('-->', '->')
        begin = format_timestamp(segment['start'], always_include_hours=True, fractionalSeperator=',')
        finish = format_timestamp(segment['end'], always_include_hours=True, fractionalSeperator=',')

        cue = f"{cue_number}\n{begin} --> {finish}\n{body}\n"
        print(cue, file=file, flush=True)
|
90 |
+
|
91 |
+
def processText(text: str, maxLineWidth=None):
    """Wrap ``text`` to ``maxLineWidth`` columns, joining lines with newlines.

    Args:
        text: Text to wrap.
        maxLineWidth: Maximum line width. ``None``, zero or a negative value
            disables wrapping.

    Returns:
        The wrapped text, or ``text`` unchanged when wrapping is disabled.
    """
    # textwrap.wrap rejects widths <= 0, so treat them all as "no wrapping".
    if maxLineWidth is None or maxLineWidth <= 0:
        return text

    lines = textwrap.wrap(text, width=maxLineWidth, tabsize=4)
    return '\n'.join(lines)
|