kazuk committed on
Commit
4e962a7
1 Parent(s): c450fd6

Add application file

Browse files
Files changed (2) hide show
  1. app.py +66 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import whisper
3
+ from pytube import YouTube
4
+
5
def get_audio(url):
    """Download the first audio-only stream of a YouTube video.

    Args:
        url: Address of the YouTube video, handed to ``pytube.YouTube``.

    Returns:
        Whatever ``Stream.download`` returns for the saved file (per the
        pytube documentation this is the path of the written file). The file
        is always saved under the fixed name ``tmp.mp4``.
    """
    audio_streams = YouTube(url).streams.filter(only_audio=True)
    first_stream = audio_streams[0]
    return first_stream.download(filename="tmp.mp4")
8
+
9
def get_transcript(url, model_size, lang, format):
    """Transcribe the audio of a YouTube video with Whisper.

    Args:
        url: Address of the YouTube video to fetch via ``get_audio``.
        model_size: Name of the Whisper model passed to ``whisper.load_model``.
        lang: Language passed to ``transcribe``; the string ``"None"`` is
            converted to ``None``.
        format: ``"None"`` to get the plain text, ``".srt"`` to get the
            segments rendered by ``format_to_srt``.

    Returns:
        The transcription as a string, or ``None`` when ``format`` is neither
        of the two supported values.
    """
    # The model is loaded before the download starts, on every call.
    whisper_model = whisper.load_model(model_size)

    # The UI delivers the literal string "None" when no language is selected.
    language = None if lang == "None" else lang

    audio_path = get_audio(url)
    transcription = whisper_model.transcribe(audio_path, fp16=False, language=language)

    if format == ".srt":
        return format_to_srt(transcription["segments"])
    if format == "None":
        return transcription["text"]
    return None
22
+
23
def format_to_srt(segments):
    """Render transcription segments as SubRip (.srt) text.

    Args:
        segments: Iterable of mappings with ``'start'`` and ``'end'`` (seconds)
            and ``'text'`` keys.

    Returns:
        A string with one numbered cue per segment, each followed by a blank
        line. An empty iterable yields an empty string.
    """
    cues = []
    for index, segment in enumerate(segments, start=1):
        start = format_timestamp(segment["start"])
        end = format_timestamp(segment["end"])
        # Surrounding whitespace is noise in a cue, and a literal "-->" inside
        # the text could be mistaken for the timing separator by SRT parsers.
        text = segment["text"].strip().replace("-->", "->")
        cues.append(f"{index}\n{start} --> {end}\n{text}\n\n")
    return "".join(cues)
30
+
31
def format_timestamp(t):
    """Format a duration in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded once to whole milliseconds and then split with
    integer arithmetic, so float representation error cannot drop a
    millisecond (e.g. ``1.001`` -> ``00:00:01,001``) and a rounded-up
    fraction carries into the seconds field.

    Args:
        t: Time offset in seconds (int or float). Negative values are
            clamped to zero.

    Returns:
        The formatted timestamp string.
    """
    total_ms = max(0, int(round(t * 1000)))
    hh, remainder = divmod(total_ms, 3_600_000)
    mm, remainder = divmod(remainder, 60_000)
    ss, ms = divmod(remainder, 1000)
    return f"{hh:02d}:{mm:02d}:{ss:02d},{ms:03d}"
37
+
38
+
39
# Dropdown choices. "None" is the sentinel string that get_transcript turns
# into language=None before calling Whisper.
langs = ["None"] + sorted(whisper.tokenizer.LANGUAGES.values())
# Public accessor for the model names; avoids reaching into whisper._MODELS
# and keeps the list under its own name instead of reusing `model_size`.
model_sizes = whisper.available_models()

# NOTE(review): the nesting below was reconstructed from a flattened diff
# view — confirm it matches the intended layout.
with gr.Blocks() as demo:

    with gr.Row():

        # Left column: inputs and the trigger button.
        with gr.Column():

            with gr.Row():
                url = gr.Textbox(placeholder='Youtube video URL', label='URL')

            with gr.Row():
                model_size = gr.Dropdown(choices=model_sizes, value='tiny', label="Model")
                lang = gr.Dropdown(choices=langs, value="None", label="Language (Optional)")
                format = gr.Dropdown(choices=["None", ".srt"], value="None", label="Timestamps? (Optional)")

            with gr.Row():
                gr.Markdown("Larger models are more accurate, but slower. For 1min video, it'll take ~30s (tiny), ~1min (base), ~3min (small), ~5min (medium), etc.")
                transcribe_btn = gr.Button('Transcribe')

        # Right column: transcription result.
        with gr.Column():
            outputs = gr.Textbox(placeholder='Transcription of the video', label='Transcription')

    # Event listeners have to be registered inside the Blocks context.
    transcribe_btn.click(get_transcript, inputs=[url, model_size, lang, format], outputs=outputs)

demo.launch(debug=True)
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ transformers
2
+ pytube
3
+ git+https://github.com/openai/whisper.git