Spaces:

molokhovdmitry
/

social-stat

Sleeping

App Files Files Community

social-stat / src /app.py

molokhovdmitry

Update t-SNE plots to use column

5000d19 9 months ago

raw

history blame contribute delete

12.4 kB

	import os
	import urllib.parse as urlparse
	from dotenv import load_dotenv
	from transformers import pipeline
	from sentence_transformers import SentenceTransformer
	import streamlit as st
	import pandas as pd
	import plotly.express as px
	import plotly.graph_objects as go
	from plotly.subplots import make_subplots
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.decomposition import NMF
	from sklearn.manifold import TSNE

	from yt_api import YouTubeAPI
	from maps import lang_map


	# Load app settings
	load_dotenv()


	def _read_setting(name, cast):
	    """
	    Reads the environment variable `name` and converts it with `cast`.

	    Raises RuntimeError when the variable is not set and ValueError when
	    its value cannot be converted, naming the setting in both cases so a
	    misconfigured deployment is easy to diagnose.
	    """
	    raw = os.getenv(name)
	    if raw is None:
	        raise RuntimeError(
	            f"Required setting {name} is not set "
	            "(define it in the environment or in the .env file)")
	    try:
	        return cast(raw)
	    except ValueError as err:
	        raise ValueError(
	            f"Setting {name}={raw!r} is not a valid {cast.__name__}"
	        ) from err


	# The API key is passed through as-is (None when unset)
	YT_API_KEY = os.getenv('YT_API_KEY')
	MAX_COMMENT_SIZE = _read_setting('MAX_COMMENT_SIZE', int)
	PRED_BATCH_SIZE = _read_setting('PRED_BATCH_SIZE', int)
	LANG_DETECTION_CONF = _read_setting('LANG_DETECTION_CONF', float)


	@st.cache_resource
	def init_emotions_model():
	    """
	    Builds the text-classification pipeline used for emotion prediction.

	    Wrapped in `st.cache_resource`, so the pipeline object is created by
	    Streamlit's resource cache rather than on every script rerun.
	    `top_k=None` is forwarded to the pipeline; `predict_emotions` expects
	    a list of label/score entries per comment.
	    """
	    return pipeline(
	        task="text-classification",
	        model="SamLowe/roberta-base-go_emotions",
	        top_k=None,
	    )


	@st.cache_resource
	def init_embedding_model():
	    """
	    Creates the sentence encoder used to embed comments for t-SNE.

	    Wrapped in `st.cache_resource`, so the encoder is handed out by
	    Streamlit's resource cache.
	    """
	    return SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')


	@st.cache_resource
	def init_lang_model():
	    """
	    Builds the text-classification pipeline used for language detection.

	    Wrapped in `st.cache_resource`, so the pipeline is handed out by
	    Streamlit's resource cache.
	    """
	    return pipeline(
	        "text-classification",
	        model="papluca/xlm-roberta-base-language-detection",
	    )


	def predict_emotions(df, clf, batch_size=None):
	    """
	    Predicts emotions for every `text_original` in a DataFrame `df` with a
	    classifier `clf`.

	    `clf` is called with a list of texts and must return, for each text,
	    an iterable of dicts with `label` and `score` keys.
	    `batch_size` is the number of texts sent to `clf` per call; when
	    omitted, the module-level `PRED_BATCH_SIZE` is used.

	    Returns a new DataFrame: `df` plus one column per emotion label. Row
	    order and index of `df` are preserved.
	    """
	    if batch_size is None:
	        batch_size = PRED_BATCH_SIZE

	    # Predict emotions in batches
	    texts = df['text_original'].to_list()
	    preds = []
	    for start in range(0, len(texts), batch_size):
	        preds.extend(clf(texts[start:start + batch_size]))

	    # Predictions are in the positional order of `df`. Give them the index
	    # of `df` explicitly: `pd.concat(axis=1)` aligns rows on index labels,
	    # so a default RangeIndex would attach scores to the wrong comments
	    # whenever `df` has been sorted or filtered.
	    preds_df = pd.DataFrame(
	        [{emotion['label']: emotion['score'] for emotion in pred}
	         for pred in preds],
	        index=df.index,
	    )

	    return pd.concat([df, preds_df], axis=1)


	def detect_languages(df, clf, batch_size=None, min_confidence=None):
	    """
	    Detects languages for every `text_original` in a DataFrame `df` with a
	    classifier `clf`. Takes the language with the highest score; when that
	    score is not above `min_confidence` the language is recorded as None.

	    `clf` is called as `clf(texts, top_k=1, truncation=True)` and must
	    return, for each text, a sequence whose first item is a dict with
	    `label` and `score` keys.
	    `batch_size` and `min_confidence` default to the module-level
	    `PRED_BATCH_SIZE` and `LANG_DETECTION_CONF` when omitted.

	    Adds the `predicted_language` column to `df` in place and returns the
	    same DataFrame.
	    """
	    if batch_size is None:
	        batch_size = PRED_BATCH_SIZE
	    if min_confidence is None:
	        min_confidence = LANG_DETECTION_CONF

	    # Detect languages in batches
	    texts = df['text_original'].to_list()
	    languages = []
	    for start in range(0, len(texts), batch_size):
	        batch = texts[start:start + batch_size]
	        for comment_preds in clf(batch, top_k=1, truncation=True):
	            best = comment_preds[0]
	            confident = best['score'] > min_confidence
	            languages.append(best['label'] if confident else None)

	    # A list is assigned positionally, so the index of `df` is irrelevant
	    df['predicted_language'] = languages

	    return df


	def emotion_dist_plot(df, emotion_cols):
	    """
	    Builds a plotly bar chart of the summed `emotion_cols` of `df`,
	    ordered from the largest total to the smallest, and returns it.
	    """
	    totals = df[emotion_cols].sum()
	    ranked_totals = totals.sort_values(ascending=False)

	    fig = px.bar(ranked_totals)
	    fig.update_layout(title_text="Emotion Distribution", width=2000)
	    return fig


	def nmf_plots(df,
	              nmf_components,
	              tfidf_max_features,
	              tfidf_stop_words='english',
	              top_words=None,
	              ):
	    """
	    Converts all `text_original` values of `df` DataFrame to TF-IDF features
	    and performs Non-negative matrix factorization on them.

	    `nmf_components` is the number of topics, `tfidf_max_features` and
	    `tfidf_stop_words` are forwarded to the TF-IDF vectorizer.
	    `top_words` is the number of highest-weighted words plotted per topic;
	    when omitted, the module-level `top_words_in_topic` value is used.

	    `df` is modified in place: `topic_N`, `main_topic`, `topic_N_cumsum`
	    and `topic_N_percentage` columns are added. `df` must also contain a
	    `published_at` column, used as the x axis of the contribution plot.

	    Returns a tuple of the modified DataFrame with NMF values and a list of
	    plotly figures (`df`, [plotly figures]).
	    """
	    if top_words is None:
	        # Callers that do not pass `top_words` rely on the value of the
	        # "Top Topic Words" slider stored at module level.
	        top_words = top_words_in_topic

	    # Convert to TF-IDF features
	    vectorizer = TfidfVectorizer(max_features=tfidf_max_features,
	                                 stop_words=tfidf_stop_words)
	    tfidf = vectorizer.fit_transform(df['text_original'])

	    # Get feature_names (words) from the vectorizer
	    feature_names = vectorizer.get_feature_names_out()

	    # Perform NMF; the result has one row per comment, one column per topic
	    nmf = NMF(n_components=nmf_components)
	    topic_weights = nmf.fit_transform(tfidf)
	    topic_cols = [f'topic_{topic_num + 1}'
	                  for topic_num in range(nmf_components)]

	    # Add NMF values to the DataFrame
	    for i, col in enumerate(topic_cols):
	        df[col] = topic_weights[:, i]

	    # Create `main_topic` column with the highest value topic name
	    df['main_topic'] = df[topic_cols].idxmax(axis=1)

	    # Get word values for every topic
	    word_df = pd.DataFrame(
	        nmf.components_.T,
	        columns=topic_cols,
	        index=feature_names,
	    )

	    # Plot word distributions of each topic
	    topic_words_fig = make_subplots(
	        rows=1, cols=nmf_components,
	        subplot_titles=topic_cols)

	    for i, col in enumerate(topic_cols):
	        topic_words = word_df[col].sort_values(ascending=False)
	        top_topic_words = topic_words[:top_words]
	        topic_words_fig.add_trace(go.Bar(y=top_topic_words.index,
	                                         x=top_topic_words.values,
	                                         orientation='h',
	                                         base=0),
	                                  row=1, col=i + 1)
	    topic_words_fig.update_layout(title_text="Topic Word Distributions",
	                                  showlegend=False)

	    # Plot topic contribution for the dataset: running total per topic,
	    # divided by the running total over all topics
	    cumsum_cols = [col + '_cumsum' for col in topic_cols]
	    for col, cumsum_col in zip(topic_cols, cumsum_cols):
	        df[cumsum_col] = df[col].cumsum()

	    # The denominator is identical for every topic, so compute it once
	    cumsum_total = df[cumsum_cols].sum(axis=1)
	    for col, cumsum_col in zip(topic_cols, cumsum_cols):
	        df[col + '_percentage'] = df[cumsum_col] / cumsum_total

	    contributions_fig = stacked_area_plot(
	        x=df['published_at'],
	        y_list=[df[col + '_percentage'] for col in topic_cols],
	        names=topic_cols)

	    return df, [topic_words_fig, contributions_fig]


	def tsne_plots(df, encoder, emotion_cols, tsne_color, tsne_perplexity):
	    """
	    Encodes all `text_original` values of `df` DataFrame with `encoder`,
	    uses t-SNE algorithm for visualization on these embeddings and on
	    predicted emotions if they were predicted.

	    `emotion_cols` is the list of emotion score columns of `df` (empty
	    when emotions were not predicted; when non-empty, `df` must also have
	    `first_emotion` and `second_emotion` columns for the hover text).
	    `tsne_color` is the column used to color the points; it is ignored
	    when `df` has no such column. `tsne_perplexity` is forwarded to t-SNE.
	    `df` must contain a `published_at` column for the 3D plot.

	    Returns a tuple (`df`, [2D figure, 3D figure]) where `df` is a new
	    DataFrame extended with `embedding_N` and `tsne_N` columns.
	    """
	    TSNE_COMPONENTS = 2

	    # Encode and add embeddings to the DataFrame. A plain list is passed
	    # so the embeddings come back in the positional order of `df`, and
	    # they are labelled with the index of `df` because `pd.concat(axis=1)`
	    # aligns rows on index labels.
	    embeddings = encoder.encode(df['text_original'].to_list())
	    embedding_cols = [f'embedding_{i + 1}'
	                      for i in range(embeddings.shape[1])]
	    embeddings_df = pd.DataFrame(embeddings,
	                                 columns=embedding_cols,
	                                 index=df.index)
	    df = pd.concat([df, embeddings_df], axis=1)

	    # Also use predicted emotions
	    if emotion_cols:
	        tsne_cols = embedding_cols + emotion_cols
	        hover_data = ['first_emotion', 'second_emotion', 'text_original']
	    else:
	        tsne_cols = embedding_cols
	        hover_data = ['text_original']

	    if 'main_topic' in df.columns:
	        hover_data.append('main_topic')

	    # Color column: only usable when it actually exists in the data
	    color = tsne_color if tsne_color in df.columns else None

	    # t-SNE requires the perplexity to be smaller than the sample count,
	    # so it is capped for videos with very few comments
	    perplexity = min(tsne_perplexity, max(len(df) - 1, 1))
	    tsne = TSNE(
	        n_components=TSNE_COMPONENTS,
	        perplexity=perplexity,
	    )
	    tsne_results = pd.DataFrame(
	        tsne.fit_transform(df[tsne_cols]),
	        columns=[f'tsne_{i + 1}' for i in range(TSNE_COMPONENTS)],
	        index=df.index,
	    )

	    df = pd.concat([df, tsne_results], axis=1)

	    # 2D Visualization
	    fig2d = px.scatter(
	        df,
	        x='tsne_1',
	        y='tsne_2',
	        color=color,
	        hover_data=hover_data,
	    )
	    fig2d.update_layout(
	        title_text="t-SNE Visualization"
	    )

	    # 3D Visualization with date as the third axis
	    fig3d = px.scatter_3d(
	        df,
	        x='published_at',
	        y='tsne_1',
	        z='tsne_2',
	        color=color,
	        hover_data=hover_data,
	    )
	    fig3d.update_layout(
	        title_text="t-SNE Visualization Over Time",
	        height=800,
	    )

	    return df, [fig2d, fig3d]


	def stacked_area_plot(x, y_list, names):
	    """
	    Creates a plotly stacked area plot and returns its figure.

	    Each item of `y_list` is multiplied by 100 and drawn against `x` as
	    one stacked line trace, named after the matching item of `names`.
	    """
	    fig = go.Figure()

	    for series, label in zip(y_list, names):
	        trace = go.Scatter(
	            x=x,
	            y=series * 100,
	            mode='lines',
	            line={'width': 0.5},
	            stackgroup='one',
	            name=label,
	        )
	        fig.add_trace(trace)

	    # Percent-scaled y axis fixed to 0-100, categorical x axis
	    percent_axis = {'type': 'linear', 'range': [0, 100], 'ticksuffix': '%'}
	    fig.update_layout(
	        showlegend=True,
	        xaxis_type='category',
	        yaxis=percent_axis,
	        title_text="Topic Contribution",
	    )

	    return fig


	def add_top_2_emotions(row):
	    """
	    Adds `first_emotion` and `second_emotion` to `row`: the names of the
	    two highest-valued emotion columns. Returns the modified row.

	    Reads the module-level `emotion_cols` list, which must be defined
	    before this function is called.
	    """
	    ranked = row[emotion_cols].sort_values(ascending=False).index
	    row['first_emotion'] = ranked[0]
	    row['second_emotion'] = ranked[1]
	    return row


	# Page setup: wide layout and the app title
	st.set_page_config(layout='wide')
	st.title("Social-Stat")

	# Models, obtained through the cached init_* factories
	emotions_clf = init_emotions_model()
	sentence_encoder = init_embedding_model()
	lang_model = init_lang_model()

	# YouTube API client configured from the app settings
	yt_api = YouTubeAPI(api_key=YT_API_KEY, max_comment_size=MAX_COMMENT_SIZE)

	# Input form
	def _extract_video_id(url_input):
	    """
	    Returns the video ID contained in `url_input`.

	    Recognized forms: a `v=` query parameter, `youtu.be/<id>` short links
	    and `/shorts/<id>`, `/embed/<id>`, `/live/<id>`, `/v/<id>` paths on
	    youtube.com or youtube-nocookie.com hosts. Anything else is treated
	    as a bare video ID and returned with surrounding whitespace removed.
	    """
	    text = url_input.strip()
	    try:
	        # Scheme-less input gets a '//' prefix so the host is recognized
	        parsed = urlparse.urlparse(text if '//' in text else '//' + text)
	    except ValueError:
	        return text

	    params = urlparse.parse_qs(parsed.query)
	    if 'v' in params:
	        return params['v'][0]

	    host = (parsed.hostname or '').lower()
	    segments = [part for part in parsed.path.split('/') if part]
	    if host == 'youtu.be' and segments:
	        return segments[0]

	    youtube_domains = ('youtube.com', 'youtube-nocookie.com')
	    on_youtube = any(host == domain or host.endswith('.' + domain)
	                     for domain in youtube_domains)
	    id_prefixes = ('shorts', 'embed', 'live', 'v')
	    if on_youtube and len(segments) >= 2 and segments[0] in id_prefixes:
	        return segments[1]

	    return text


	with st.form(key='input'):
	    # Input
	    url_input = st.text_input("URL or ID")
	    # `query` stays defined at module level: it is displayed by the
	    # "Video not found." diagnostics after a failed lookup
	    url_data = urlparse.urlparse(url_input)
	    query = urlparse.parse_qs(url_data.query)
	    # Get ID from URL
	    video_id = _extract_video_id(url_input)

	    # Emotions
	    emotions_checkbox = st.checkbox(
	        "Predict Emotions",
	        value=True,
	    )

	    # NMF
	    nmf_checkbox = st.checkbox(
	        "Non-Negative Matrix Factorization",
	        value=True,
	    )

	    nmf_components = st.slider(
	        "Topics (NMF Components)",
	        min_value=2,
	        max_value=12,
	        value=8,
	        step=1,
	    )

	    tfidf_max_features = st.select_slider(
	        "Words (TF-IDF Vectorizer Max Features)",
	        options=list(range(10, 501)) + [None],
	        value=100,
	    )

	    top_words_in_topic = st.slider(
	        "Top Topic Words",
	        min_value=1,
	        max_value=50,
	        value=10,
	        step=1,
	    )

	    # t-SNE
	    tsne_checkbox = st.checkbox(
	        "t-SNE Visualization",
	        value=True,
	    )

	    tsne_perplexity = st.slider(
	        "t-SNE Perplexity",
	        min_value=5,
	        max_value=50,
	        value=10,
	        step=1,
	    )

	    tsne_color = st.selectbox(
	        "Plot Color",
	        options=['main_topic', 'first_emotion', 'second_emotion']
	    )

	    # Language Map
	    map_checkbox = st.checkbox(
	        "Language Map",
	        value=True,
	    )

	    submit = st.form_submit_button("Analyze")


	if submit:
	    # Get comments
	    try:
	        comments = yt_api.get_comments(video_id)
	    except KeyError:
	        bad_id = True
	        st.write("Video not found.")
	        st.write(query)
	        st.write('v' in query)
	        st.write(video_id)
	    else:
	        bad_id = False

	    if not bad_id:
	        df = pd.DataFrame(comments)

	        if df.empty:
	            # Nothing to sort or analyze (there is no `published_at`
	            # column to sort by when no comments were returned)
	            st.write("No comments found.")
	        else:
	            plots = []

	            # Sort by publishing date. The index is reset so that row
	            # labels match row positions; later steps concatenate
	            # model outputs column-wise, which aligns on index labels.
	            df = df.sort_values('published_at').reset_index(drop=True)

	            # NOTE: `emotion_cols` must remain a module-level name,
	            # `add_top_2_emotions` reads it as a global.
	            emotion_cols = []
	            if emotions_checkbox:
	                # Predict emotions; every column added by the prediction
	                # step is an emotion score column
	                n_base_cols = len(df.columns)
	                df = predict_emotions(df, emotions_clf)
	                emotion_cols = list(df.columns[n_base_cols:])

	                # Get emotion distribution figure
	                plots.append(emotion_dist_plot(df, emotion_cols))

	                # Get top 2 emotions
	                df = df.apply(add_top_2_emotions, axis=1)

	            if map_checkbox:
	                df = detect_languages(df, lang_model)
	                plots.append(lang_map(df))

	            if nmf_checkbox:
	                # NMF
	                df, nmf_figs = nmf_plots(df,
	                                         nmf_components,
	                                         tfidf_max_features)
	                plots.extend(nmf_figs)

	            if tsne_checkbox:
	                # t-SNE visualization; fall back to a color column that
	                # exists when the selected analysis was switched off
	                if not nmf_checkbox:
	                    tsne_color = 'first_emotion'
	                if not emotions_checkbox:
	                    tsne_color = 'main_topic'

	                df, tsne_figs = tsne_plots(df,
	                                           sentence_encoder,
	                                           emotion_cols,
	                                           tsne_color,
	                                           tsne_perplexity)
	                plots.extend(tsne_figs)

	            # Draw the plots
	            for plot in plots:
	                st.plotly_chart(
	                    plot, sharing='streamlit',
	                    theme='streamlit',
	                    use_container_width=True)

	            # Show the final DataFrame
	            st.dataframe(df)