DeepSoft-Tech committed on
Commit
1d45d6c
·
1 Parent(s): 7a28690

Upload 2 files

Browse files
Files changed (2) hide show
  1. app (1).py +105 -0
  2. requirements (2).txt +70 -0
app (1).py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import requests
3
+ from gtts import gTTS
4
+ from urllib.parse import urlparse, parse_qs
5
+ from youtube_transcript_api import YouTubeTranscriptApi
6
+ import unicodedata
7
+ from deepmultilingualpunctuation import PunctuationModel
8
+ from transformers import pipeline
9
+
10
+
11
def summarize_video(url):
    """Fetch a YouTube video's transcript, restore punctuation, and return
    an abstractive summary of it.

    Parameters
    ----------
    url : str
        A YouTube video URL, either in short ``youtu.be/<id>`` form or the
        canonical ``youtube.com/watch?v=<id>`` form.

    Returns
    -------
    str
        The summaries of all transcript chunks joined into one paragraph.

    Raises
    ------
    KeyError
        If no ``v=`` query parameter can be found after URL normalization.
    """
    # Rewrite short-form links to the canonical watch URL so the video id
    # can be read from the query string below.
    if "watch" not in url:
        url = url.replace("youtu.be/", "www.youtube.com/watch?v=")

    parsed_url = urlparse(url)
    video_id = parse_qs(parsed_url.query)['v'][0]

    # Get the transcript: a list of caption dicts, each with a "text" key.
    transcript = YouTubeTranscriptApi.get_transcript(video_id)

    # Join all caption fragments into one paragraph.
    video_transcript = " ".join(entry["text"] for entry in transcript)
    print("Text transcript created")
    print(video_transcript)

    # Unicode text normalization (NFKD folds compatibility characters).
    normalized_transcript = unicodedata.normalize('NFKD', video_transcript)
    print("Text normalized")

    # Restore punctuation. BUG FIX: the original computed the normalized
    # text but then punctuated the raw transcript; use the normalized text.
    model = PunctuationModel()
    result = model.restore_punctuation(normalized_transcript)
    print("Punctuation restored")

    # SUMMARIZATION: instantiate the summarization pipeline.
    summarization_pipeline = pipeline(
        "summarization",
        model="t5-base",      # you can choose a different model, depending on your requirements
        tokenizer="t5-base",  # you can choose a different tokenizer, depending on your requirements
    )

    # t5-base cannot accept arbitrarily long inputs, so summarize the text
    # in fixed-size character chunks and stitch the partial summaries.
    input_text = result
    chunk_size = 5000
    chunks = [input_text[i:i + chunk_size]
              for i in range(0, len(input_text), chunk_size)]

    # Summarize each chunk separately.
    summaries = []
    for chunk in chunks:
        summary = summarization_pipeline(
            chunk, max_length=200, min_length=30, do_sample=False)
        summaries.append(summary[0]['summary_text'])

    # Combine the summaries of all chunks into a single summary.
    final_summary = " ".join(summaries)
    return final_summary
71
+
72
# --- Streamlit UI ---
st.title("YouTube Summarizer")

# Input form: a URL text field plus a submit button.
input_form = st.form(key="input_form")
youtube_url = input_form.text_input("Enter a YouTube video URL")
summarize_clicked = input_form.form_submit_button("Summarize Video")

# On submit: summarize, display, and offer an audio rendition.
if summarize_clicked:
    # Produce and show the text summary.
    video_summary = summarize_video(youtube_url)
    st.subheader("Summary")
    st.write(video_summary)

    # Narrate the summary with gTTS and save it as an mp3.
    speech = gTTS(video_summary)
    print("converting text to audio")
    speech.save('Summary.mp3')

    # Let the user download the audio transcript.
    with open('Summary.mp3', 'rb') as audio_file:
        st.download_button('Download mp3', audio_file, file_name='Summary.mp3')
101
+
102
+
103
+
104
+
105
+
requirements (2).txt ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ altair==4.2.2
2
+ anyio==3.6.2
3
+ attrs==23.1.0
4
+ blinker==1.6.2
5
+ cachetools==5.3.0
6
+ certifi==2022.12.7
7
+ charset-normalizer==3.1.0
8
+ click==8.1.3
9
+ decorator==5.1.1
10
+ deepmultilingualpunctuation==1.0.1
11
+ entrypoints==0.4
12
+ fastapi==0.95.1
13
+ filelock==3.12.0
14
+ fsspec==2023.4.0
15
+ gitdb==4.0.10
16
+ GitPython==3.1.31
17
+ gTTS==2.3.2
18
+ h11==0.14.0
19
+ huggingface-hub==0.14.1
20
+ idna==3.4
21
+ importlib-metadata==6.6.0
22
+ Jinja2==3.1.2
23
+ jsonschema==4.17.3
24
+ markdown-it-py==2.2.0
25
+ MarkupSafe==2.1.2
26
+ mdurl==0.1.2
27
+ mpmath==1.3.0
28
+ networkx==3.1
29
+ numpy==1.24.3
30
+ packaging==23.1
31
+ pandas==2.0.1
32
+ Pillow==9.5.0
33
+ protobuf==3.20.1
34
+ pyarrow==12.0.0
35
+ pydantic==1.10.7
36
+ pydeck==0.8.1b0
37
+ Pygments==2.15.1
38
+ Pympler==1.0.1
39
+ pyrsistent==0.19.3
40
+ python-dateutil==2.8.2
41
+ pytz==2023.3
42
+ pytz-deprecation-shim==0.1.0.post0
43
+ PyYAML==6.0
44
+ regex==2023.5.5
45
+ requests==2.30.0
46
+ rich==13.3.5
47
+ sentencepiece==0.1.99
48
+ six==1.16.0
49
+ smmap==5.0.0
50
+ sniffio==1.3.0
51
+ starlette==0.26.1
52
+ streamlit==1.22.0
53
+ sympy==1.11.1
54
+ tenacity==8.2.2
55
+ tokenizers==0.13.3
56
+ toml==0.10.2
57
+ toolz==0.12.0
58
+ torch==2.0.0
59
+ tornado==6.3.1
60
+ tqdm==4.65.0
61
+ transformers==4.28.1
62
+ typing_extensions==4.5.0
63
+ tzdata==2023.3
64
+ tzlocal==4.3
65
+ urllib3==2.0.2
66
+ uvicorn==0.22.0
67
+ validators==0.20.0
68
+ watchdog==3.0.0
69
+ youtube-transcript-api==0.6.0
70
+ zipp==3.15.0