File size: 2,805 Bytes
8396eb2
 
 
 
 
 
 
 
 
 
 
ca67b20
 
 
 
 
8396eb2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82cd0cb
8396eb2
 
87c9ce9
82cd0cb
8396eb2
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import streamlit as st
import requests
from gtts import gTTS
from urllib.parse import urlparse, parse_qs
from youtube_transcript_api import YouTubeTranscriptApi
import unicodedata
from deepmultilingualpunctuation import PunctuationModel
from transformers import pipeline


def summarize_video(url, chunk_size=5000):
    """Fetch a YouTube video's transcript, restore punctuation, and return
    an abstractive summary of it.

    Parameters
    ----------
    url : str
        A YouTube watch URL (``.../watch?v=ID``) or a short
        ``youtu.be/ID`` share link.
    chunk_size : int, optional
        Maximum number of characters fed to the summarizer per chunk
        (default 5000, matching the original hard-coded value).

    Returns
    -------
    str
        The summaries of all transcript chunks joined with spaces.

    Raises
    ------
    ValueError
        If no ``v`` video id can be extracted from *url*.
    """
    # Normalize short share links to the canonical watch URL so that the
    # video id ends up in the query string.
    if "watch" not in url:
        url = url.replace("youtu.be/", "www.youtube.com/watch?v=")

    parsed_url = urlparse(url)
    video_ids = parse_qs(parsed_url.query).get('v')
    if not video_ids:
        # Original code raised a bare KeyError here; fail with a clear message.
        raise ValueError(f"Could not extract a video id from {url!r}")
    video_id = video_ids[0]

    # Get the transcript and join all caption fragments into one paragraph.
    transcript = YouTubeTranscriptApi.get_transcript(video_id)
    video_transcript = " ".join(entry["text"] for entry in transcript)
    print("Text transcript created")

    print(video_transcript)

    # Text normalization (NFKD). BUG FIX: the original computed this into an
    # unused variable and fed the *un-normalized* text to the punctuation
    # model; the normalized text is now actually used downstream.
    normalized_transcript = unicodedata.normalize('NFKD', video_transcript)
    print("Text normalized")

    # Restore punctuation — YouTube auto-transcripts come back unpunctuated,
    # which hurts summarization quality.
    model = PunctuationModel()
    result = model.restore_punctuation(normalized_transcript)
    print("Punctuation restored")

    # Instantiate the summarization pipeline (t5-base model + tokenizer).
    summarization_pipeline = pipeline(
        "summarization",
        model="t5-base",  # swappable for another seq2seq model if needed
        tokenizer="t5-base",
    )

    # Split the input into character chunks so each call stays manageable.
    # NOTE(review): t5-base truncates inputs past its token limit, so very
    # large chunk_size values may silently drop text — TODO confirm.
    chunks = [result[i:i + chunk_size]
              for i in range(0, len(result), chunk_size)]

    # Summarize each chunk separately, then stitch the pieces together.
    summaries = [
        summarization_pipeline(
            chunk, max_length=200, min_length=30, do_sample=False,
        )[0]['summary_text']
        for chunk in chunks
    ]

    return " ".join(summaries)

# --- Streamlit front-end -------------------------------------------------
st.title("YouTube Summarizer")

# Build the input form: one URL text field plus a submit button.
url_form = st.form(key="input_form")
url_text = url_form.text_input("Enter a YouTube video URL")
submitted = url_form.form_submit_button("Summarize Video")

# On submit: summarize, display, then offer an mp3 rendition for download.
if submitted:
    summary = summarize_video(url_text)

    # Show the generated summary to the user.
    st.subheader("Summary")
    st.write(summary)

    # Render the summary as speech and save it to disk.
    speech = gTTS(summary)
    print("converting text to audio")
    speech.save('Summary.mp3')

    # Serve the saved audio file through a download button.
    with open('Summary.mp3', 'rb') as audio_file:
        st.download_button('Download mp3', audio_file, file_name='Summary.mp3')