"""Ask a question about a YouTube video and get the video played from the answer's timestamp."""

import gradio as gr
from youtube_transcript_api import YouTubeTranscriptApi
from transformers import AutoTokenizer
from transformers import pipeline
from transformers import AutoModelForQuestionAnswering
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import torch

#from IPython.display import HTML, IFrame 
#from IPython.display import YouTubeVideo

#input - video link, output - (full transcript text, raw transcript segments, video id)
def _video_id_from_link(link):
  """Extract the video id from a YouTube link.

  Handles ``watch?v=<id>`` links (with or without extra query parameters such
  as ``&t=42s``), ``youtu.be/<id>`` short links and ``/embed/<id>``,
  ``/shorts/<id>``, ``/live/<id>`` and ``/v/<id>`` paths.

  Raises:
    ValueError: if no video id can be found in the link.
  """
  from urllib.parse import parse_qs, urlparse

  candidate = (link or "").strip()
  # urlparse only fills in the host when a scheme is present.
  if "://" not in candidate:
    candidate = "https://" + candidate
  parsed = urlparse(candidate)

  # Standard watch link: the id is the "v" query parameter, wherever it appears.
  ids = parse_qs(parsed.query).get("v")
  if ids:
    return ids[0]

  segments = [part for part in parsed.path.split("/") if part]
  host = (parsed.hostname or "").lower()
  if segments and (host == "youtu.be" or host.endswith(".youtu.be")):
    return segments[0]
  if len(segments) >= 2 and segments[0] in ("embed", "shorts", "live", "v"):
    return segments[1]

  raise ValueError(f"could not find a YouTube video id in link: {link!r}")


def get_transcript(link):
  """Fetch the transcript of the YouTube video that ``link`` points to.

  Args:
    link: URL of the video (watch, youtu.be, embed or shorts form).

  Returns:
    A tuple ``(final_transcript, transcript, video_id)`` where
    ``final_transcript`` is the text of every transcript entry joined with
    single spaces, ``transcript`` is the list returned by
    ``YouTubeTranscriptApi.get_transcript`` and ``video_id`` is the id
    extracted from the link.

  Raises:
    ValueError: if no video id can be extracted from ``link``.
  """
  print("******** Inside get_transcript ********")
  print(f"link to be extracted is : {link}")
  video_id = _video_id_from_link(link)
  print(f"video id extracted is : {video_id}")
  transcript = YouTubeTranscriptApi.get_transcript(video_id)
  final_transcript = ' '.join(entry['text'] for entry in transcript)
  return final_transcript, transcript, video_id
  
  
#input - question and transcript, output - answer timestamp
# Models already loaded by this process, keyed by (kind, checkpoint name), so
# that each question does not reload the weights.
_LOADED_MODELS = {}


def _load_once(key, loader):
  """Return the object stored under ``key``, building it with ``loader()`` on first use."""
  if key not in _LOADED_MODELS:
    _LOADED_MODELS[key] = loader()
  return _LOADED_MODELS[key]


def get_answers_timestamp(question, final_transcript, transcript):
  """Find the second at which the video should start to answer ``question``.

  The transcript text is split into overlapping token windows, an extractive
  question-answering pipeline is run on every window, and the first 50
  characters starting at the best-scoring answer are matched against the
  individual transcript entries with sentence embeddings to recover a
  timestamp.

  Args:
    question: the question asked about the video.
    final_transcript: the whole transcript as one string.
    transcript: list of transcript entries, each a mapping with at least the
      keys ``'text'`` and ``'start'``.

  Returns:
    The start time of the selected transcript entry, rounded to an int.

  Raises:
    ValueError: if ``transcript`` is empty or no usable context window can be
      built from ``final_transcript``.
  """
  print("******** Inside get_answers_timestamp ********")
  if not transcript:
    raise ValueError("transcript is empty; cannot locate an answer timestamp")

  model_ckpt = "deepset/minilm-uncased-squad2"
  tokenizer = _load_once(
      ("tokenizer", model_ckpt),
      lambda: AutoTokenizer.from_pretrained(model_ckpt))
  context = final_transcript
  print(f"Input Question is : {question}")
  print(f"Type of trancript is : {type(context)}, Length of transcript is : {len(context)}")

  # Only the transcript may be truncated/windowed; the question must stay
  # intact in every window, hence truncation="only_second".
  inputs = tokenizer(question, context, truncation="only_second",
                     return_overflowing_tokens=True, max_length=512, stride=25)

  # Decode each overlapping window back to text and keep only the context
  # part, i.e. what sits between the first and second [SEP] marker.
  windows = []
  for window_ids in inputs["input_ids"]:
    pieces = tokenizer.decode(window_ids).split('[SEP]')
    window_text = pieces[1].strip() if len(pieces) > 1 else ""
    if window_text:
      windows.append(window_text)
  if not windows:
    raise ValueError("no transcript text available to search for an answer")

  pipe = _load_once(
      ("qa-pipeline", model_ckpt),
      lambda: pipeline("question-answering",
                       model=AutoModelForQuestionAnswering.from_pretrained(model_ckpt),
                       tokenizer=tokenizer))
  results = [pipe(question=question, context=window_text) for window_text in windows]

  # Use the window whose answer got the highest score.
  # NOTE(review): the previous code tried to use the runner-up, but looked its
  # index up in a list the maximum had already been removed from, so the index
  # no longer lined up with the results (and a single window raised).
  best = max(range(len(results)), key=lambda i: results[i]['score'])

  # 'start' is a character offset into the window that was passed to the
  # pipeline, so the snippet has to be sliced out of that same window.
  answer_start = results[best]['start']
  sentence_keyword = windows[best][answer_start:answer_start + 50]

  st_name = 'sentence-transformers/all-MiniLM-L6-v2'
  modelST = _load_once(("sentence-transformer", st_name),
                       lambda: SentenceTransformer(st_name))
  segment_texts = [entry['text'] for entry in transcript]
  embedding_1 = modelST.encode(segment_texts, convert_to_tensor=True)
  embedding_2 = modelST.encode(sentence_keyword, convert_to_tensor=True)

  similarity_tensor = util.pytorch_cos_sim(embedding_1, embedding_2)
  idx = int(torch.argmax(similarity_tensor))

  # The entry after the best match is used, as before; clamp so a match on the
  # last entry no longer indexes past the end.
  # NOTE(review): confirm the +1 offset is intended.
  target = min(idx + 1, len(transcript) - 1)
  start_timestamp = int(round(float(transcript[target]['start'])))

  return start_timestamp
   
    
def display_vid(url, question):
  """Build an embeddable YouTube player that starts at the answer to ``question``.

  Args:
    url: link of the YouTube video.
    question: question to answer from the video's transcript.

  Returns:
    An ``<iframe>`` HTML string pointing at
    ``https://www.youtube.com/embed/<video_id>?start=<seconds>``.
  """
  from urllib.parse import quote

  print("******** display_vid ********")

  #get transcript
  final_transcript, transcript, video_id = get_transcript(url)

  #get answer timestamp
  #input - question and transcript, output - answer timestamp
  ans_timestamp = get_answers_timestamp(question, final_transcript, transcript)

  # The id comes from user input, so percent-encode it before placing it in
  # the src attribute; a well-formed id passes through unchanged. The start
  # parameter is forced to a whole number of seconds.
  #sample - smUHQndcmOY?start=234
  embed_src = ("https://www.youtube.com/embed/" + quote(str(video_id), safe="")
               + "?start=" + str(int(ans_timestamp)))
  html_out = "<iframe width='560' height='315' src='" + embed_src + "' title='YouTube video player' frameborder='0' allow='accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture' allowfullscreen></iframe>"
  print(f"html output is : {html_out}")

  return html_out

def set_example_question(example):
    """Return a Radio update whose value is the first element of ``example``."""
    chosen_question = example[0]
    return gr.Radio.update(value=chosen_question)

# Gradio UI: a video link box and a question box, an HTML area for the embedded
# player, a list of sample questions and a button that runs display_vid.
demo = gr.Blocks()

with demo:
  gr.Markdown("<h1><center>Ask a Question to a YouTube Video and get the Video played from the answer timestamp</center></h1>")
  # "<Work in Progress>" is written with &lt; / &gt; so it is shown as text
  # instead of being parsed as an (invisible) HTML tag.
  gr.Markdown(
        "<div>How many times have you seen a long video/podcast on Youtube and wondered only if there would have been 'explanatory' timestamps it would have been so much better..</div>"
        "<div>Well, using this Space/App you can provide a YouTube video link and then provide some questions that you would like, and the App will generate timestamps/play video at those timestamps for you in the space provided. Idea is that your question could be like 'Is this xxxx thing covered in the video?', or maybe 'does the host talks about the architecture of the model', or maybe 'Does host talk about alien doorway on Mars?' and so on.</div><br> <br> <div> This App is still little bit &lt;Work in Progress&gt; with some sharp edges still left, please bear with me.<br><br></div>"
    )
  with gr.Row():
    input_url = gr.Textbox(label="Input a Youtube video link")
    input_ques = gr.Textbox(label="Ask a Question")
    output_vid = gr.HTML(label="Video will play at the answer timestamp")

  with gr.Row():
    # Each choice is a one-element list because set_example_question reads
    # example[0].
    example_question = gr.Radio(
                    [
                    ["Does video talk about different modalities"],
                    ["Can the model do classification"],
                    ["Does the model pushes state of the art in image classification"],
                    ["Is deepmind copying openai"],
                    ["Is flamingo good enough"],
                    ["Has flamingo passed andre karpathy challenge yet?"],
                    ["Are there cool examples from flamingo in the video?"],
                    ["Does the video talk about cat?"],
                    ["Any funny examples in video?"]], label= "Choose a sample Question")

  # Copy the selected sample question into the question box. The previous
  # `example_question.update(set_example_question)` call only built an update
  # dict and discarded it, so choosing a sample had no effect.
  example_question.change(set_example_question, inputs=example_question, outputs=input_ques)

  b1 = gr.Button("Publish Video")

  b1.click(display_vid, inputs=[input_url,input_ques], outputs=output_vid)


demo.launch(enable_queue=True, debug=True)