Spaces:
Sleeping
Sleeping
Upload 3 files
Browse files
- apiSearch.py +92 -0
- app3.py +90 -0
- preprocessText.py +33 -0
apiSearch.py
ADDED
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import pandas as pd
|
3 |
+
from urllib.parse import urlparse, parse_qs
|
4 |
+
from preprocessText import preprocess
|
5 |
+
from googleapiclient.discovery import build
|
6 |
+
import isodate
|
7 |
+
|
8 |
+
import os


def parse_api_keys(raw):
    """Split a comma-separated string of API keys into a de-duplicated list.

    Args:
        raw: Comma-separated keys, e.g. ``"key1, key2"``. ``None`` or an
            empty string yields an empty list.

    Returns:
        The keys in first-seen order, stripped of surrounding whitespace,
        with blanks and repeated entries removed.
    """
    keys = []
    for candidate in (raw or "").split(","):
        candidate = candidate.strip()
        # Skip blanks and repeats so the rotation never wastes a slot
        if candidate and candidate not in keys:
            keys.append(candidate)
    return keys


# SECURITY: API keys must not be committed to source control. They are read
# from the YOUTUBE_API_KEYS environment variable (comma-separated) instead.
# Any key that was previously published in this file should be revoked.
api_keys = parse_api_keys(os.environ.get("YOUTUBE_API_KEYS", ""))

# Rotation cursor into api_keys. get_next_api_key() advances it before
# reading, so with several keys the first key handed out is at index 1.
current_key_index = 0
|
22 |
+
|
23 |
+
def get_video_id(url):
    """Extract the YouTube video ID from a video URL.

    Recognised forms:
      * ``https://youtu.be/<id>``
      * ``https://[www.|m.|music.]youtube.com/watch?v=<id>``
      * ``https://[www.|m.|music.]youtube.com/{shorts,embed,live,v}/<id>``

    Args:
        url: The URL to inspect. Non-string values are treated as empty.

    Returns:
        The video ID string, or ``None`` when the URL is not a recognised
        YouTube video link.
    """
    parsed_url = urlparse(url if isinstance(url, str) else "")

    # hostname is lower-cased and carries no port, unlike netloc
    host = (parsed_url.hostname or "").lower()
    for prefix in ("www.", "m.", "music."):
        if host.startswith(prefix):
            host = host[len(prefix):]
            break

    # Non-empty path segments, so "/abc/" and "/abc" are treated alike
    path_parts = [part for part in parsed_url.path.split("/") if part]

    video_id = None
    if host == "youtu.be":
        if path_parts:
            video_id = path_parts[0]
    elif host == "youtube.com":
        query_params = parse_qs(parsed_url.query)
        if "v" in query_params:
            video_id = query_params["v"][0]
        elif len(path_parts) >= 2 and path_parts[0] in ("shorts", "embed", "live", "v"):
            video_id = path_parts[1]
    return video_id
|
34 |
+
|
35 |
+
def get_next_api_key():
    """Advance the round-robin cursor and return the API key it points at.

    The module-level ``current_key_index`` is incremented (wrapping at the
    end of ``api_keys``) before the key is read.

    Returns:
        The API key string at the new cursor position.

    Raises:
        RuntimeError: If ``api_keys`` is empty. Without this guard the
            modulo below would fail with an opaque ``ZeroDivisionError``.
    """
    global current_key_index
    if not api_keys:
        raise RuntimeError("No YouTube API keys are configured")
    current_key_index = (current_key_index + 1) % len(api_keys)
    return api_keys[current_key_index]
|
39 |
+
|
40 |
+
def get_video_metadata(video_id):
    """Fetch metadata for a single video from the YouTube Data API v3.

    Args:
        video_id: The YouTube video ID. A falsy value (``None``, ``""``)
            short-circuits to ``None`` without calling the API.

    Returns:
        A dict with the keys ``title``, ``description``, ``channel_title``,
        ``publish_date``, ``duration``, ``views``, ``likes``, ``comments``,
        ``category_id`` and ``thumbnail_link``; or ``None`` when the video
        was not found or the request failed. ``views``, ``likes`` and
        ``comments`` are ``None`` when the API response omits them.
    """
    # No ID means the URL could not be parsed; do not spend quota on it
    if not video_id:
        return None

    try:
        # Rotate to the next API key for this request
        api_key = get_next_api_key()

        # Set up the YouTube Data API client
        youtube = build('youtube', 'v3', developerKey=api_key)

        # Call the API to retrieve video metadata
        response = youtube.videos().list(
            part='snippet,contentDetails,statistics',
            id=video_id
        ).execute()

        items = response.get('items') or []
        if items:
            video = items[0]
            snippet = video['snippet']
            # Individual counters can be missing from the response (e.g.
            # hidden likes or disabled comments), so read them with .get()
            # instead of failing the whole lookup with a KeyError.
            statistics = video.get('statistics', {})
            return {
                'title': snippet['title'],
                'description': snippet['description'],
                'channel_title': snippet['channelTitle'],
                'publish_date': snippet['publishedAt'],
                'duration': video['contentDetails']['duration'],
                'views': statistics.get('viewCount'),
                'likes': statistics.get('likeCount'),
                'comments': statistics.get('commentCount'),
                'category_id': snippet['categoryId'],
                'thumbnail_link': snippet['thumbnails']['default']['url'],
            }

    except Exception as e:
        # Best-effort lookup: report the failure and fall through to None
        print("An error occurred:", str(e))

    return None
|
75 |
+
|
76 |
+
def get_metadata(url):
    """Build a one-row DataFrame of video metadata for a YouTube URL.

    The row holds the fields returned by ``get_video_metadata`` with
    ``duration`` converted from ISO 8601 text to seconds, plus three derived
    columns: ``cleanTitle`` (preprocessed title tokens joined by spaces),
    ``titleLength`` and ``descriptionLength``.

    Args:
        url: The video URL to look up.

    Returns:
        The DataFrame described above, or the integer ``0`` when no
        metadata could be retrieved.
    """
    metadata = get_video_metadata(get_video_id(url))
    if metadata is None:
        # Callers receive 0, not None, on failure
        return 0

    df = pd.DataFrame([metadata])
    df['duration'] = df['duration'].apply(
        lambda iso_text: isodate.parse_duration(iso_text).total_seconds()
    )
    df['cleanTitle'] = df['title'].apply(
        lambda raw_title: ' '.join(preprocess(raw_title))
    )
    df['titleLength'] = df['title'].apply(len)
    df['descriptionLength'] = df['description'].apply(len)
    return df
|
92 |
+
|
app3.py
ADDED
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pickle
|
3 |
+
import pandas as pd
|
4 |
+
import joblib
|
5 |
+
from preprocessText import preprocess
|
6 |
+
from apiSearch import get_metadata
|
7 |
+
|
8 |
+
|
9 |
+
# Load the model
|
10 |
+
# Loaded once at import time; predict_trend() uses this object for both
# predict_proba() and predict().
# NOTE(review): joblib.load is pickle-based and can execute code embedded in
# the file, so '85pct.pkl' must come from a trusted source -- confirm.
model = joblib.load('85pct.pkl')
|
11 |
+
|
12 |
+
# Define the categories
|
13 |
+
# Display name -> numeric category id. The ids are compared against the
# `category_id` value fetched for a video, and the insertion order is the
# order in which the names appear in the category select box.
categories = {
    'Film & Animation': 1,
    'Autos & Vehicles': 2,
    'Music': 10,
    'Pets & Animals': 15,
    'Sports': 17,
    'Short Movies': 18,
    'Travel & Events': 19,
    'Gaming': 20,
    'Videoblogging': 21,
    'People & Blogs': 22,
    'Comedy': 23,
    'Entertainment': 24,
    'News & Politics': 25,
    'Howto & Style': 26,
    'Education': 27,
    'Science & Technology': 28,
    'Nonprofits & Activism': 29,
}
|
32 |
+
|
33 |
+
# Create the Streamlit web application
|
34 |
+
def main():
    """Render the Streamlit page and run a prediction on demand.

    An optional URL field pre-fills the title, duration and category inputs
    from the metadata returned by ``get_metadata``. If the lookup fails, or
    the video's category is not one of ``categories``, the user is told and
    the inputs keep their defaults so the details can be entered by hand.
    Pressing "Predict" calls ``predict_trend`` and shows the outcome.
    """
    st.title("YouTube Trend Prediction")
    st.write("Enter the video details below:")

    category_names = list(categories.keys())
    category_ids = list(categories.values())

    # Defaults shown when no URL is given or the lookup fails
    getTitle = ""
    getDuration = 0.00
    getCategory = 1

    # Input fields
    url = st.text_input("URL")
    if url:
        metadata = get_metadata(url)
        # get_metadata returns the integer 0 (not a DataFrame) on failure;
        # indexing that value would raise a TypeError.
        if isinstance(metadata, pd.DataFrame) and not metadata.empty:
            getTitle = metadata['title'].iloc[0]
            getDuration = float(metadata['duration'].iloc[0])
            fetched_category = int(metadata['category_id'].iloc[0])
            if fetched_category in category_ids:
                getCategory = fetched_category
            else:
                st.warning(
                    "The video's category is not in the supported list; "
                    "please choose the closest category."
                )
        else:
            st.error(
                "Could not retrieve metadata for this URL. "
                "Please enter the video details manually."
            )

    # Fall back to the first entry rather than raising ValueError from
    # list.index() when the category id is not in the mapping.
    default_index = category_ids.index(getCategory) if getCategory in category_ids else 0

    title = st.text_input("Title", value=getTitle)
    # NOTE(review): the label says minutes, but the value pre-filled from
    # get_metadata is a number of seconds -- confirm which unit the model
    # expects before changing either side.
    duration = st.number_input("Duration (in minutes)", min_value=0.0, value=getDuration)
    category = st.selectbox("Category", category_names, index=default_index)

    # Convert category to category ID
    categoryId = categories[category]

    # Predict button
    if st.button("Predict"):
        prediction = predict_trend(title, duration, categoryId)

        if prediction[0] == 1:
            st.success("This video is predicted to be a trend!")
        else:
            st.info("This video is predicted not to be a trend.")
|
66 |
+
|
67 |
+
# Function to make predictions
|
68 |
+
def predict_trend(title, duration, category_id):
    """Run the loaded model on one video's title, duration and category.

    Builds a single-row DataFrame with the columns ``cleanTitle``,
    ``titleLength``, ``categoryId`` (categorical dtype, built from the
    string form of the id) and ``duration`` (float64), prints the model's
    class probabilities, and returns its prediction.

    Args:
        title: Raw video title.
        duration: Video duration as a number.
        category_id: Numeric category id.

    Returns:
        Whatever ``model.predict`` returns for the single-row frame.
    """
    duration_text = str(duration)
    category_text = str(category_id)
    title_tokens = preprocess(title)

    features = pd.DataFrame({
        'cleanTitle': [' '.join(title_tokens)],
        'titleLength': [len(title)],
        'categoryId': [category_text],
        'duration': [duration_text],
    })
    features['categoryId'] = features['categoryId'].astype('category')
    features['duration'] = features['duration'].astype('float64')

    # Class probabilities are written to stdout, not returned
    print(model.predict_proba(features))
    return model.predict(features)
|
88 |
+
|
89 |
+
# Build the page only when this file is executed as the entry script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
preprocessText.py
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import nltk
|
3 |
+
from nltk.tokenize import word_tokenize
|
4 |
+
from nltk.stem import WordNetLemmatizer
|
5 |
+
from nltk.corpus import stopwords
|
6 |
+
# Fetch the NLTK resources used below at import time. 'corpus' was dropped
# because it is not a package in the NLTK index, so that download could only
# fail. 'punkt_tab' was added because recent NLTK releases load it (instead
# of 'punkt') for word_tokenize.
for _resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_resource)

stop_words = set(stopwords.words('english'))  # set of English stop words
lemmatizer = WordNetLemmatizer()
|
13 |
+
|
14 |
+
def preprocess(text, target_language='en'):
    """Normalise a piece of text into a list of lemmatised, non-stop-word tokens.

    Steps, in order: coerce to ``str``, lowercase, strip ``http...`` URLs,
    drop every character that is not an ASCII letter or whitespace, collapse
    runs of three or more identical characters to one, tokenise, lemmatise,
    and remove tokens found in ``stop_words``.

    Args:
        text: The text to clean. Non-string values are converted with
            ``str()``.
        target_language: Accepted for interface compatibility; this function
            does not read it.

    Returns:
        The remaining tokens as a list of strings, in their original order.

    Raises:
        TypeError: If ``text`` cannot be converted to a string.
    """
    if not isinstance(text, str):
        try:
            text = str(text)
        except Exception as exc:
            # A bare `except:` would also trap KeyboardInterrupt/SystemExit
            raise TypeError('Input must be a string or a float') from exc

    # convert to lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove special characters and punctuation (digits are dropped too)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Collapse runs of 3+ identical characters into a single one
    text = re.sub(r'(.)\1{2,}', r'\1', text)

    # Lemmatise before filtering, so stop words are matched on the lemma
    lemmas = [lemmatizer.lemmatize(token) for token in word_tokenize(text)]
    return [lemma for lemma in lemmas if lemma not in stop_words]
|