# social-stat/src/app.py
import os
import urllib.parse as urlparse
from dotenv import load_dotenv
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.manifold import TSNE
from yt_api import YouTubeAPI
from maps import lang_map
# Load app settings
load_dotenv()
YT_API_KEY = os.getenv('YT_API_KEY')
MAX_COMMENT_SIZE = int(os.getenv('MAX_COMMENT_SIZE'))
PRED_BATCH_SIZE = int(os.getenv('PRED_BATCH_SIZE'))
LANG_DETECTION_CONF = float(os.getenv('LANG_DETECTION_CONF'))
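# Example `.env` (illustrative values only; the project's real settings may
# differ):
#   YT_API_KEY=<YouTube Data API v3 key>
#   MAX_COMMENT_SIZE=300
#   PRED_BATCH_SIZE=32
#   LANG_DETECTION_CONF=0.5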
@st.cache_resource
def init_emotions_model():
    """Loads the GoEmotions text-classification pipeline (cached)."""
    classifier = pipeline(
        task="text-classification",
        model="SamLowe/roberta-base-go_emotions",
        top_k=None)
    return classifier


@st.cache_resource
def init_embedding_model():
    """Loads the sentence embedding model (cached)."""
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    return model


@st.cache_resource
def init_lang_model():
    """Loads the language detection pipeline (cached)."""
    model_ckpt = "papluca/xlm-roberta-base-language-detection"
    pipe = pipeline("text-classification", model=model_ckpt)
    return pipe
def predict_emotions(df, clf):
"""
Predicts emotions for every `text_original` in a DataFrame `df` with a
classifier `clf`.
Returns a DataFrame with emotion columns.
"""
# Predict emotions in batches
text_list = df['text_original'].to_list()
batch_size = PRED_BATCH_SIZE
text_batches = [text_list[i:i + batch_size]
for i in range(0, len(text_list), batch_size)]
preds = [comment_emotions
for text_batch in text_batches
for comment_emotions in clf(text_batch)]
# Add predictions to DataFrame
preds_df = pd.DataFrame([{emotion['label']: emotion['score']
for emotion in pred} for pred in preds])
df = pd.concat([df, preds_df], axis=1)
return df
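# Illustrative usage sketch (assumes a `text_original` column; the score
# columns come from the GoEmotions label set, e.g. `joy`, `anger`, `neutral`):
#   sample = pd.DataFrame({'text_original': ["great video!", "this is awful"]})
#   sample = predict_emotions(sample, init_emotions_model())
#   sample[['text_original', 'joy', 'anger']]  # one score column per emotion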
def detect_languages(df, clf):
    """
    Detects the language of every `text_original` in a DataFrame `df` with a
    classifier `clf`, keeping the label with the highest score if it clears
    the `LANG_DETECTION_CONF` threshold.
    Returns a DataFrame with a `predicted_language` column.
    """
# Detect languages in batches
text_list = df['text_original'].to_list()
batch_size = PRED_BATCH_SIZE
text_batches = [text_list[i:i + batch_size]
for i in range(0, len(text_list), batch_size)]
preds = [batch_preds[0]['label']
if batch_preds[0]['score'] > LANG_DETECTION_CONF
else None
for text_batch in text_batches
for batch_preds in clf(text_batch, top_k=1, truncation=True)]
# Add predictions to DataFrame
df['predicted_language'] = preds
return df
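# Illustrative usage sketch (the checkpoint above returns ISO language codes
# such as `en` or `de`; rows below `LANG_DETECTION_CONF` are set to None):
#   sample = pd.DataFrame({'text_original': ["hello world", "guten Morgen"]})
#   sample = detect_languages(sample, init_lang_model())
#   sample['predicted_language']  # e.g. ['en', 'de']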
def emotion_dist_plot(df, emotion_cols):
    """
    Creates a plotly bar chart of the summed score of each emotion in the
    `emotion_cols` columns of `df` and returns the figure.
    """
fig = px.bar(df[emotion_cols].sum().sort_values(ascending=False))
fig.update_layout(title_text="Emotion Distribution",
width=2000)
return fig
def nmf_plots(df,
              nmf_components,
              tfidf_max_features,
              top_words_in_topic=10,
              tfidf_stop_words='english'
              ):
    """
    Converts all `text_original` values of the `df` DataFrame to TF-IDF
    features and performs Non-negative Matrix Factorization on them.
    Plots the `top_words_in_topic` highest-weighted words of each topic and
    the cumulative topic contributions over time.
    Returns a tuple of the modified DataFrame with NMF values and a list of
    plotly figures (`df`, [plotly figures]).
    """
# Convert to TF-IDF features
vectorizer = TfidfVectorizer(max_features=tfidf_max_features,
stop_words=tfidf_stop_words)
embeddings = vectorizer.fit_transform(df['text_original'])
# Get feature_names (words) from the vectorizer
feature_names = vectorizer.get_feature_names_out()
# Perform NMF
nmf = NMF(n_components=nmf_components)
nmf_embeddings = nmf.fit_transform(embeddings).T
topic_cols = [f'topic_{topic_num+1}'
for topic_num in range(nmf_components)]
# Add NMF values to the DataFrame
for i, col in enumerate(topic_cols):
df[col] = nmf_embeddings[i]
    # Create `main_topic` column with the name of the highest-valued topic
    df['main_topic'] = df[topic_cols].idxmax(axis=1)
# Get word values for every topic
word_df = pd.DataFrame(
nmf.components_.T,
columns=topic_cols,
index=feature_names
)
# Plot word distributions of each topic
topic_words_fig = make_subplots(
rows=1, cols=nmf_components,
subplot_titles=topic_cols)
for i, col in enumerate(topic_cols):
topic_words = word_df[col].sort_values(ascending=False)
top_topic_words = topic_words[:top_words_in_topic]
topic_words_fig.add_trace(go.Bar(y=top_topic_words.index,
x=top_topic_words.values,
orientation='h',
base=0),
row=1, col=i+1)
topic_words_fig.update_layout(title_text="Topic Word Distributions",
showlegend=False)
    # Plot topic contribution over time
    for col in topic_cols:
        df[col + '_cumsum'] = df[col].cumsum()
    cumsum_sum = df[[col + '_cumsum' for col in topic_cols]].sum(axis=1)
    for col in topic_cols:
        df[col + '_percentage'] = df[col + '_cumsum'] / cumsum_sum
contributions_fig = stacked_area_plot(
x=df['published_at'],
y_list=[df[f'topic_{i+1}_percentage'] for i in range(nmf_components)],
names=topic_cols)
return df, [topic_words_fig, contributions_fig]
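# Minimal sketch of the factorization used above (hypothetical data): the
# TF-IDF matrix V (documents x words) is approximated as V ~= W @ H, where
# W = nmf.fit_transform(V) holds the per-document topic weights stored in the
# `topic_*` columns and H = nmf.components_ holds the per-topic word weights
# plotted in `topic_words_fig`.
#   texts = ["cats purr", "dogs bark", "cats and dogs"]
#   V = TfidfVectorizer().fit_transform(texts)
#   nmf_demo = NMF(n_components=2)
#   W = nmf_demo.fit_transform(V)   # shape (3, 2): document-topic weights
#   H = nmf_demo.components_        # shape (2, n_words): topic-word weights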
def tsne_plots(df, encoder, emotion_cols, tsne_color, tsne_perplexity):
    """
    Encodes all `text_original` values of the `df` DataFrame with `encoder`
    and projects the embeddings (together with the predicted emotion scores,
    if any) to two dimensions with t-SNE.
    Returns a tuple of the modified DataFrame and a list of plotly figures
    (`df`, [2D figure, 3D figure]).
    """
# Encode and add embeddings to the DataFrame
embeddings = encoder.encode(df['text_original'])
embedding_cols = [f'embedding_{i+1}' for i in range(embeddings.shape[1])]
df = pd.concat([df, pd.DataFrame(embeddings, columns=embedding_cols)],
axis=1)
    # t-SNE
    TSNE_COMPONENTS = 2
    tsne = TSNE(
        n_components=TSNE_COMPONENTS,
        perplexity=tsne_perplexity,
    )
    # Also use the predicted emotion scores as t-SNE features
    if emotion_cols:
        tsne_cols = embedding_cols + emotion_cols
        hover_data = ['first_emotion', 'second_emotion', 'text_original']
    else:
        tsne_cols = embedding_cols
        hover_data = ['text_original']
    if 'main_topic' in df.columns:
        hover_data.append('main_topic')
    # Color column: only used when topic or emotion columns are available
    if 'main_topic' in df.columns or emotion_cols:
        color = tsne_color
    else:
        color = None
tsne_results = tsne.fit_transform(df[tsne_cols])
tsne_results = pd.DataFrame(
tsne_results,
columns=[f'tsne_{i+1}' for i in range(TSNE_COMPONENTS)]
)
df = pd.concat([df, tsne_results], axis=1)
# 2D Visualization
fig2d = px.scatter(
df,
x='tsne_1',
y='tsne_2',
color=color,
hover_data=hover_data
)
fig2d.update_layout(
title_text="t-SNE Visualization"
)
# 3D Visualization with date as the third axis
fig3d = px.scatter_3d(
df,
x='published_at',
y='tsne_1',
z='tsne_2',
color=color,
hover_data=hover_data
)
fig3d.update_layout(
title_text="t-SNE Visualization Over Time",
height=800
)
return df, [fig2d, fig3d]
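# Illustrative sketch of the embedding and projection steps (hypothetical
# data; all-MiniLM-L6-v2 produces 384-dimensional sentence embeddings):
#   enc = init_embedding_model()
#   vecs = enc.encode(df['text_original'])                   # (n_comments, 384)
#   TSNE(n_components=2, perplexity=10).fit_transform(vecs)  # (n_comments, 2)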
def stacked_area_plot(x, y_list, names):
"""Creates plotly stacked area plot. Returns a figure of that plot."""
fig = go.Figure()
for y, name in zip(y_list, names):
fig.add_trace(go.Scatter(
x=x, y=y*100,
mode='lines',
line=dict(width=0.5),
stackgroup='one',
name=name,
))
fig.update_layout(
showlegend=True,
xaxis_type='category',
yaxis=dict(
type='linear',
range=[0, 100],
ticksuffix='%')
)
fig.update_layout(title_text="Topic Contribution")
return fig
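# Illustrative usage sketch (hypothetical shares that sum to 1 per x value):
#   stacked_area_plot(
#       x=['2024-01', '2024-02'],
#       y_list=[pd.Series([0.6, 0.3]), pd.Series([0.4, 0.7])],
#       names=['topic_1', 'topic_2'])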
def add_top_2_emotions(row):
    """
    Adds `first_emotion` and `second_emotion` columns with the names of the
    two highest-scoring emotions of a DataFrame `row`.
    """
    emotions = row[emotion_cols].sort_values(ascending=False)
    row['first_emotion'] = emotions.index[0]
    row['second_emotion'] = emotions.index[1]
    return row
st.set_page_config(layout='wide')
st.title("Social-Stat")
# Load models
emotions_clf = init_emotions_model()
sentence_encoder = init_embedding_model()
lang_model = init_lang_model()
# Init YouTube API
yt_api = YouTubeAPI(
api_key=YT_API_KEY,
max_comment_size=MAX_COMMENT_SIZE
)
# Input form
with st.form(key='input'):
# Input
url_input = st.text_input("URL or ID")
# Get ID from URL
url_data = urlparse.urlparse(url_input)
query = urlparse.parse_qs(url_data.query)
if 'v' in query:
video_id = query['v'][0]
else:
video_id = url_input
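    # Examples of accepted inputs (VIDEO_ID is a placeholder):
    #   https://www.youtube.com/watch?v=VIDEO_ID  -> ID taken from `v` parameter
    #   VIDEO_ID                                  -> used as-is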
# Emotions
emotions_checkbox = st.checkbox(
"Predict Emotions",
value=True,
)
# NMF
nmf_checkbox = st.checkbox(
"Non-Negative Matrix Factorization",
value=True,
)
nmf_components = st.slider(
"Topics (NMF Components)",
min_value=2,
max_value=12,
value=8,
step=1,
)
tfidf_max_features = st.select_slider(
"Words (TF-IDF Vectorizer Max Features)",
options=list(range(10, 501)) + [None],
value=100,
)
top_words_in_topic = st.slider(
"Top Topic Words",
min_value=1,
max_value=50,
value=10,
step=1,
)
# t-SNE
tsne_checkbox = st.checkbox(
"t-SNE Visualization",
value=True,
)
tsne_perplexity = st.slider(
"t-SNE Perplexity",
min_value=5,
max_value=50,
value=10,
step=1,
)
tsne_color = st.selectbox(
"Plot Color",
options=['main_topic', 'first_emotion', 'second_emotion']
)
# Language Map
map_checkbox = st.checkbox(
"Language Map",
value=True,
)
submit = st.form_submit_button("Analyze")
if submit:
# Get comments
try:
bad_id = False
comments = yt_api.get_comments(video_id)
    except KeyError:
        st.write("Video not found.")
        bad_id = True
if not bad_id:
plots = []
        # Convert to a pandas DataFrame, sort by publishing date and reset the
        # index so that later `pd.concat(axis=1)` calls align row by row
        df = pd.DataFrame(comments).sort_values('published_at')
        df = df.reset_index(drop=True)
emotion_cols = []
        if emotions_checkbox:
            # Predict emotions
            comment_cols = list(df.columns)
            df = predict_emotions(df, emotions_clf)
            # Every column appended by `predict_emotions` is an emotion score
            emotion_cols = [col for col in df.columns
                            if col not in comment_cols]
# Get emotion distribution figure
plots.append(emotion_dist_plot(df, emotion_cols))
# Get top 2 emotions
df = df.apply(add_top_2_emotions, axis=1)
if map_checkbox:
df = detect_languages(df, lang_model)
plots.append(lang_map(df))
if nmf_checkbox:
# NMF
            df, nmf_figs = nmf_plots(df, nmf_components, tfidf_max_features,
                                     top_words_in_topic)
plots.extend(nmf_figs)
if tsne_checkbox:
# t-SNE visualization
if not nmf_checkbox:
tsne_color = 'first_emotion'
if not emotions_checkbox:
tsne_color = 'main_topic'
df, tsne_figs = tsne_plots(df,
sentence_encoder,
emotion_cols,
tsne_color,
tsne_perplexity)
plots.extend(tsne_figs)
# Draw the plots
for i, plot in enumerate(plots):
st.plotly_chart(
plot, sharing='streamlit',
theme='streamlit',
use_container_width=True)
# Show the final DataFrame
st.dataframe(df)