storresbusquets committed on
Commit
3299970
1 Parent(s): 2937856

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -6
app.py CHANGED
@@ -1,17 +1,29 @@
 
1
  import gradio as gr
2
  import whisper
3
  from pytube import YouTube
4
  from transformers import pipeline, T5Tokenizer, T5ForConditionalGeneration
5
  from wordcloud import WordCloud
6
 
7
-
8
  class GradioInference:
9
  def __init__(self):
 
 
10
  self.sizes = list(whisper._MODELS.keys())
 
 
11
  self.langs = ["none"] + sorted(list(whisper.tokenizer.LANGUAGES.values()))
 
 
12
  self.current_size = "base"
 
 
13
  self.loaded_model = whisper.load_model(self.current_size)
 
 
14
  self.yt = None
 
 
15
  self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
16
 
17
  # Initialize VoiceLabT5 model and tokenizer
@@ -26,8 +38,20 @@ class GradioInference:
26
  self.classifier = pipeline("text-classification")
27
 
28
  def __call__(self, link, lang, size):
 
 
 
 
 
 
 
 
 
 
29
  if self.yt is None:
30
  self.yt = YouTube(link)
 
 
31
  path = self.yt.streams.filter(only_audio=True)[0].download(filename="tmp.mp4")
32
 
33
  if lang == "none":
@@ -37,6 +61,7 @@ class GradioInference:
37
  self.loaded_model = whisper.load_model(size)
38
  self.current_size = size
39
 
 
40
  results = self.loaded_model.transcribe(path, language=lang)
41
 
42
  # Perform summarization on the transcription
@@ -56,8 +81,13 @@ class GradioInference:
56
  predicted = self.keyword_tokenizer.decode(output[0], skip_special_tokens=True)
57
  keywords = [x.strip() for x in predicted.split(",") if x.strip()]
58
 
 
59
  label = self.classifier(results["text"])[0]["label"]
 
 
60
  wordcloud = WordCloud().generate(results["text"])
 
 
61
  wordcloud_image = wordcloud.to_image()
62
 
63
  return (
@@ -69,10 +99,24 @@ class GradioInference:
69
  )
70
 
71
  def populate_metadata(self, link):
 
 
 
 
 
72
  self.yt = YouTube(link)
73
  return self.yt.thumbnail_url, self.yt.title
74
 
75
  def from_audio_input(self, lang, size, audio_file):
 
 
 
 
 
 
 
 
 
76
  if lang == "none":
77
  lang = None
78
 
@@ -99,7 +143,10 @@ class GradioInference:
99
  predicted = self.keyword_tokenizer.decode(output[0], skip_special_tokens=True)
100
  keywords = [x.strip() for x in predicted.split(",") if x.strip()]
101
 
 
102
  label = self.classifier(results["text"])[0]["label"]
 
 
103
  wordcloud = WordCloud().generate(
104
  results["text"]
105
  )
@@ -161,12 +208,10 @@ with block as demo:
161
  label="Keywords", placeholder="Keywords Output...", lines=5
162
  ).style(show_copy_button=True, container=True)
163
  label = gr.Label(label="Sentiment Analysis")
164
- with gr.Row().style(equal_height=True):
165
- # Display the Word Cloud
166
  wordcloud_image = gr.Image()
167
  with gr.Row().style(equal_height=True):
168
  clear = gr.ClearButton(
169
- [link, title, img, text, summary, keywords, label], scale=1
170
  )
171
  btn = gr.Button("Get video insights", variant="primary", scale=1)
172
  btn.click(
@@ -200,11 +245,12 @@ with block as demo:
200
  label="Keywords", placeholder="Keywords Output", lines=5
201
  )
202
  label = gr.Label(label="Sentiment Analysis")
 
203
  with gr.Row().style(equal_height=True):
204
- clear = gr.ClearButton([text], scale=1)
205
  btn = gr.Button(
206
  "Get video insights", variant="primary", scale=1
207
- ) # Updated button label
208
  btn.click(
209
  gio.from_audio_input,
210
  inputs=[lang, size, audio_file],
 
1
+ # Imports
2
  import gradio as gr
3
  import whisper
4
  from pytube import YouTube
5
  from transformers import pipeline, T5Tokenizer, T5ForConditionalGeneration
6
  from wordcloud import WordCloud
7
 
 
8
  class GradioInference:
9
  def __init__(self):
10
+
11
+ # OpenAI's Whisper model sizes
12
  self.sizes = list(whisper._MODELS.keys())
13
+
14
+ # Whisper's available languages for ASR
15
  self.langs = ["none"] + sorted(list(whisper.tokenizer.LANGUAGES.values()))
16
+
17
+ # Default size
18
  self.current_size = "base"
19
+
20
+ # Load the Whisper model of the default size
21
  self.loaded_model = whisper.load_model(self.current_size)
22
+
23
+ # Initialize Pytube Object
24
  self.yt = None
25
+
26
+ # Initialize summary model
27
  self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
28
 
29
  # Initialize VoiceLabT5 model and tokenizer
 
38
  self.classifier = pipeline("text-classification")
39
 
40
  def __call__(self, link, lang, size):
41
+ """
42
+ Call the Gradio Inference python class.
43
+ This method accesses a YouTube video using the Pytube library and downloads its audio.
44
+ Then it uses the Whisper model to perform Automatic Speech Recognition (i.e., Speech-to-Text).
45
+ Once the function has the transcription of the video, it processes it to obtain:
46
+ - Summary: using Facebook's BART transformer.
47
+ - KeyWords: using VoiceLabT5 keyword extractor.
48
+ - Sentiment Analysis: using Hugging Face's default sentiment classifier
49
+ - WordCloud: using the wordcloud python library.
50
+ """
51
  if self.yt is None:
52
  self.yt = YouTube(link)
53
+
54
+ # Use the Pytube library to access the YouTube audio stream
55
  path = self.yt.streams.filter(only_audio=True)[0].download(filename="tmp.mp4")
56
 
57
  if lang == "none":
 
61
  self.loaded_model = whisper.load_model(size)
62
  self.current_size = size
63
 
64
+ # Transcribe the audio extracted from pytube
65
  results = self.loaded_model.transcribe(path, language=lang)
66
 
67
  # Perform summarization on the transcription
 
81
  predicted = self.keyword_tokenizer.decode(output[0], skip_special_tokens=True)
82
  keywords = [x.strip() for x in predicted.split(",") if x.strip()]
83
 
84
+ # Sentiment label
85
  label = self.classifier(results["text"])[0]["label"]
86
+
87
+ # Generate WordCloud object
88
  wordcloud = WordCloud().generate(results["text"])
89
+
90
+ # WordCloud image to display
91
  wordcloud_image = wordcloud.to_image()
92
 
93
  return (
 
99
  )
100
 
101
  def populate_metadata(self, link):
102
+ """
103
+ Access the YouTube video title and thumbnail image so they can be displayed.
104
+ params:
105
+ - link: a YouTube URL.
106
+ """
107
  self.yt = YouTube(link)
108
  return self.yt.thumbnail_url, self.yt.title
109
 
110
  def from_audio_input(self, lang, size, audio_file):
111
+ """
112
+ Call the Gradio Inference python class.
113
+ Uses the Whisper model directly to perform Automatic Speech Recognition (i.e., Speech-to-Text).
114
+ Once the function has the transcription of the audio, it processes it to obtain:
115
+ - Summary: using Facebook's BART transformer.
116
+ - KeyWords: using VoiceLabT5 keyword extractor.
117
+ - Sentiment Analysis: using Hugging Face's default sentiment classifier
118
+ - WordCloud: using the wordcloud python library.
119
+ """
120
  if lang == "none":
121
  lang = None
122
 
 
143
  predicted = self.keyword_tokenizer.decode(output[0], skip_special_tokens=True)
144
  keywords = [x.strip() for x in predicted.split(",") if x.strip()]
145
 
146
+ # Sentiment label
147
  label = self.classifier(results["text"])[0]["label"]
148
+
149
+ # WordCloud object
150
  wordcloud = WordCloud().generate(
151
  results["text"]
152
  )
 
208
  label="Keywords", placeholder="Keywords Output...", lines=5
209
  ).style(show_copy_button=True, container=True)
210
  label = gr.Label(label="Sentiment Analysis")
 
 
211
  wordcloud_image = gr.Image()
212
  with gr.Row().style(equal_height=True):
213
  clear = gr.ClearButton(
214
+ [link, title, img, text, summary, keywords, label, wordcloud_image], scale=1
215
  )
216
  btn = gr.Button("Get video insights", variant="primary", scale=1)
217
  btn.click(
 
245
  label="Keywords", placeholder="Keywords Output", lines=5
246
  )
247
  label = gr.Label(label="Sentiment Analysis")
248
+ wordcloud_image = gr.Image()
249
  with gr.Row().style(equal_height=True):
250
+ clear = gr.ClearButton([audio_file,text, summary, keywords, label, wordcloud_image], scale=1)
251
  btn = gr.Button(
252
  "Get video insights", variant="primary", scale=1
253
+ )
254
  btn.click(
255
  gio.from_audio_input,
256
  inputs=[lang, size, audio_file],