# YouTube Summarizer — Streamlit app.
# Fetches a YouTube video's transcript, restores punctuation, summarizes it
# with a t5-base transformers pipeline, and offers the summary as mp3 audio.
import streamlit as st
import requests
from gtts import gTTS
from urllib.parse import urlparse, parse_qs
from youtube_transcript_api import YouTubeTranscriptApi
import unicodedata
from deepmultilingualpunctuation import PunctuationModel
from transformers import pipeline
def summarize_video(url):
    """Fetch a YouTube video's transcript and return an abstractive summary.

    Parameters
    ----------
    url : str
        A YouTube video URL, either the long form
        (``https://www.youtube.com/watch?v=ID``) or the short form
        (``https://youtu.be/ID``).

    Returns
    -------
    str
        The concatenated summaries of all transcript chunks.

    Raises
    ------
    KeyError
        If no ``v=`` query parameter can be found after URL normalization.
    """
    # Normalize short-form links so the video id is readable from the query
    # string (same rewrite as before, just without the empty `pass` branch).
    if "watch" not in url:
        url = url.replace("youtu.be/", "www.youtube.com/watch?v=")
    parsed_url = urlparse(url)
    video_id = parse_qs(parsed_url.query)['v'][0]

    # Get the transcript: a list of caption entries, each with a "text" key.
    transcript = YouTubeTranscriptApi.get_transcript(video_id)

    # Join all caption fragments into one paragraph.
    video_transcript = " ".join(entry["text"] for entry in transcript)
    print("Text transcript created")
    print(video_transcript)

    # Unicode normalization (NFKD) so the punctuation model sees clean text.
    normalized_text = unicodedata.normalize('NFKD', video_transcript)
    print("Text normalized")

    # Add punctuation.  BUG FIX: the original passed the *unnormalized*
    # transcript here, so the normalization result was computed and discarded.
    model = PunctuationModel()
    result = model.restore_punctuation(normalized_text)
    print("Punctuation restored")

    # SUMMARIZATION: t5-base can only attend to a limited context, so the
    # text is split into fixed-size character chunks summarized one by one.
    summarization_pipeline = pipeline(
        "summarization",
        model="t5-base",      # swappable for a different summarization model
        tokenizer="t5-base",  # tokenizer must match the chosen model
    )

    chunk_size = 5000
    chunks = [result[i:i + chunk_size] for i in range(0, len(result), chunk_size)]

    # Summarize each chunk separately, then stitch the partial summaries.
    summaries = []
    for chunk in chunks:
        summary = summarization_pipeline(chunk, max_length=200, min_length=30, do_sample=False)
        summaries.append(summary[0]['summary_text'])

    return " ".join(summaries)
# --- Streamlit app ----------------------------------------------------------
st.title("YouTube Summarizer")

# Input form: one URL text box plus a submit button.
form = st.form(key="input_form")
video_url = form.text_input("Enter a YouTube video URL")
submit_button = form.form_submit_button("Summarize Video")

# Handle form submissions.
if submit_button:
    # Robustness: an empty URL would crash summarize_video with a KeyError,
    # so warn the user instead of attempting the lookup.
    if not video_url.strip():
        st.warning("Please enter a YouTube video URL.")
    else:
        # Summarize and show the result.
        summary = summarize_video(video_url)
        st.subheader("Summary")
        st.write(summary)

        # Convert the text summary into audio and offer it for download.
        tts = gTTS(summary)
        print("converting text to audio")
        tts.save('Summary.mp3')
        with open('Summary.mp3', 'rb') as f:
            st.download_button('Download mp3', f, file_name='Summary.mp3')
|