demo3

Runtime error

App Files Files Community

storresbusquets committed on Sep 10, 2023

Commit

3299970

•

1 Parent(s): 2937856

Update app.py

Browse files

Files changed (1) hide show

app.py +52 -6

app.py CHANGED Viewed

@@ -1,17 +1,29 @@
 import gradio as gr
 import whisper
 from pytube import YouTube
 from transformers import pipeline, T5Tokenizer, T5ForConditionalGeneration
 from wordcloud import WordCloud
 class GradioInference:
  def __init__(self):
  self.sizes = list(whisper._MODELS.keys())
  self.langs = ["none"] + sorted(list(whisper.tokenizer.LANGUAGES.values()))
  self.current_size = "base"
  self.loaded_model = whisper.load_model(self.current_size)
  self.yt = None
  self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
  # Initialize VoiceLabT5 model and tokenizer
@@ -26,8 +38,20 @@ class GradioInference:
  self.classifier = pipeline("text-classification")
  def __call__(self, link, lang, size):
  if self.yt is None:
  self.yt = YouTube(link)
  path = self.yt.streams.filter(only_audio=True)[0].download(filename="tmp.mp4")
  if lang == "none":
@@ -37,6 +61,7 @@ class GradioInference:
  self.loaded_model = whisper.load_model(size)
  self.current_size = size
  results = self.loaded_model.transcribe(path, language=lang)
  # Perform summarization on the transcription
@@ -56,8 +81,13 @@ class GradioInference:
  predicted = self.keyword_tokenizer.decode(output[0], skip_special_tokens=True)
  keywords = [x.strip() for x in predicted.split(",") if x.strip()]
  label = self.classifier(results["text"])[0]["label"]
  wordcloud = WordCloud().generate(results["text"])
  wordcloud_image = wordcloud.to_image()
  return (
@@ -69,10 +99,24 @@ class GradioInference:
  )
  def populate_metadata(self, link):
  self.yt = YouTube(link)
  return self.yt.thumbnail_url, self.yt.title
  def from_audio_input(self, lang, size, audio_file):
  if lang == "none":
  lang = None
@@ -99,7 +143,10 @@ class GradioInference:
  predicted = self.keyword_tokenizer.decode(output[0], skip_special_tokens=True)
  keywords = [x.strip() for x in predicted.split(",") if x.strip()]
  label = self.classifier(results["text"])[0]["label"]
  wordcloud = WordCloud().generate(
  results["text"]
  )
@@ -161,12 +208,10 @@ with block as demo:
  label="Keywords", placeholder="Keywords Output...", lines=5
  ).style(show_copy_button=True, container=True)
  label = gr.Label(label="Sentiment Analysis")
- with gr.Row().style(equal_height=True):
- # Display the Word Cloud
  wordcloud_image = gr.Image()
  with gr.Row().style(equal_height=True):
  clear = gr.ClearButton(
- [link, title, img, text, summary, keywords, label], scale=1
  )
  btn = gr.Button("Get video insights", variant="primary", scale=1)
  btn.click(
@@ -200,11 +245,12 @@ with block as demo:
  label="Keywords", placeholder="Keywords Output", lines=5
  )
  label = gr.Label(label="Sentiment Analysis")
  with gr.Row().style(equal_height=True):
- clear = gr.ClearButton([text], scale=1)
  btn = gr.Button(
  "Get video insights", variant="primary", scale=1
- ) # Updated button label
  btn.click(
  gio.from_audio_input,
  inputs=[lang, size, audio_file],

+# Imports
 import gradio as gr
 import whisper
 from pytube import YouTube
 from transformers import pipeline, T5Tokenizer, T5ForConditionalGeneration
 from wordcloud import WordCloud
 class GradioInference:
  def __init__(self):
+ # OpenAI's Whisper model sizes
  self.sizes = list(whisper._MODELS.keys())
+ # Whisper's available languages for ASR
  self.langs = ["none"] + sorted(list(whisper.tokenizer.LANGUAGES.values()))
+ # Default size
  self.current_size = "base"
+ # Load the Whisper model for the default size
  self.loaded_model = whisper.load_model(self.current_size)
+ # Initialize Pytube Object
  self.yt = None
+ # Initialize summary model
  self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
  # Initialize VoiceLabT5 model and tokenizer
  self.classifier = pipeline("text-classification")
  def __call__(self, link, lang, size):
+ """
+ Call the Gradio Inference python class.
+ This class gets access to a YouTube video using python's library Pytube and downloads its audio.
+ Then it uses the Whisper model to perform Automatic Speech Recognition (i.e. Speech-to-Text).
+ Once the function has the transcription of the video, it processes it to obtain:
+ - Summary: using Facebook's BART transformer.
+ - KeyWords: using VoiceLabT5 keyword extractor.
+ - Sentiment Analysis: using Hugging Face's default sentiment classifier
+ - WordCloud: using the wordcloud python library.
+ """
  if self.yt is None:
  self.yt = YouTube(link)
+ # Use the Pytube library to access the YouTube audio stream
  path = self.yt.streams.filter(only_audio=True)[0].download(filename="tmp.mp4")
  if lang == "none":
  self.loaded_model = whisper.load_model(size)
  self.current_size = size
+ # Transcribe the audio extracted from pytube
  results = self.loaded_model.transcribe(path, language=lang)
  # Perform summarization on the transcription
  predicted = self.keyword_tokenizer.decode(output[0], skip_special_tokens=True)
  keywords = [x.strip() for x in predicted.split(",") if x.strip()]
+ # Sentiment label
  label = self.classifier(results["text"])[0]["label"]
+ # Generate WordCloud object
  wordcloud = WordCloud().generate(results["text"])
+ # WordCloud image to display
  wordcloud_image = wordcloud.to_image()
  return (
  )
  def populate_metadata(self, link):
+ """
+ Access the YouTube video's title and thumbnail image so they can be displayed.
+ params:
+ - link: a YouTube URL.
+ """
  self.yt = YouTube(link)
  return self.yt.thumbnail_url, self.yt.title
  def from_audio_input(self, lang, size, audio_file):
+ """
+ Call the Gradio Inference python class.
+ Uses the Whisper model directly to perform Automatic Speech Recognition (i.e. Speech-to-Text).
+ Once the function has the transcription, it processes it to obtain:
+ - Summary: using Facebook's BART transformer.
+ - KeyWords: using VoiceLabT5 keyword extractor.
+ - Sentiment Analysis: using Hugging Face's default sentiment classifier
+ - WordCloud: using the wordcloud python library.
+ """
  if lang == "none":
  lang = None
  predicted = self.keyword_tokenizer.decode(output[0], skip_special_tokens=True)
  keywords = [x.strip() for x in predicted.split(",") if x.strip()]
+ # Sentiment label
  label = self.classifier(results["text"])[0]["label"]
+ # WordCloud object
  wordcloud = WordCloud().generate(
  results["text"]
  )
  label="Keywords", placeholder="Keywords Output...", lines=5
  ).style(show_copy_button=True, container=True)
  label = gr.Label(label="Sentiment Analysis")
  wordcloud_image = gr.Image()
  with gr.Row().style(equal_height=True):
  clear = gr.ClearButton(
+ [link, title, img, text, summary, keywords, label, wordcloud_image], scale=1
  )
  btn = gr.Button("Get video insights", variant="primary", scale=1)
  btn.click(
  label="Keywords", placeholder="Keywords Output", lines=5
  )
  label = gr.Label(label="Sentiment Analysis")
+ wordcloud_image = gr.Image()
  with gr.Row().style(equal_height=True):
+ clear = gr.ClearButton([audio_file,text, summary, keywords, label, wordcloud_image], scale=1)
  btn = gr.Button(
  "Get video insights", variant="primary", scale=1
+ )
  btn.click(
  gio.from_audio_input,
  inputs=[lang, size, audio_file],