molokhovdmitry committed:
Merge pull request #2 from molokhovdmitry/web_app
Changed files:
- .env.example +4 -0
- .github/workflows/python-app.yml +0 -3
- Dockerfile +1 -1
- README.md +2 -50
- data/countries.geo.json +0 -0
- requirements.txt +6 -5
- src/__init__.py +0 -0
- src/app.py +413 -0
- src/main.py +0 -54
- src/maps.py +129 -0
- src/models.py +0 -10
- src/test_main.py +0 -27
- src/yt_api.py +4 -0
- vm_startup.sh +0 -6
.env.example ADDED
@@ -0,0 +1,4 @@
+YT_API_KEY=""
+PRED_BATCH_SIZE=512
+MAX_COMMENT_SIZE=300
+LANG_DETECTION_CONF=0.9
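
These four settings drive the new Streamlit app in src/app.py below. As a minimal sketch of how they are consumed (mirroring the startup code in src/app.py; the note on each variable's role is inferred from how the app uses it):

    import os
    from dotenv import load_dotenv

    load_dotenv()  # read key=value pairs from a local .env into the environment

    YT_API_KEY = os.getenv('YT_API_KEY')                    # YouTube Data API key
    PRED_BATCH_SIZE = int(os.getenv('PRED_BATCH_SIZE'))     # comments per inference batch
    MAX_COMMENT_SIZE = int(os.getenv('MAX_COMMENT_SIZE'))   # comment length cap passed to YouTubeAPI
    LANG_DETECTION_CONF = float(os.getenv('LANG_DETECTION_CONF'))  # min score to keep a language label
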
.github/workflows/python-app.yml CHANGED
@@ -36,6 +36,3 @@ jobs:
         flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
         flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
-    - name: Test with pytest
-      run: |
-        pytest
Dockerfile CHANGED
@@ -5,4 +5,4 @@ RUN python -m pip install --upgrade pip
 RUN pip install --no-cache-dir -r requirements.txt
 COPY . .
 EXPOSE 8000
-CMD ["
+CMD ["streamlit", "run", "src/app.py", "--server.port", "8000"]
README.md CHANGED
@@ -1,58 +1,10 @@
 # social-stat
-
-
-# Endpoints
-## GET `/predict/{video_id}`
-Returns `pandas` DataFrame with all `commentThreads` of a `YouTube` video with emotion scores estimated by [SamLowe/roberta-base-go_emotions](https://huggingface.co/SamLowe/roberta-base-go_emotions).
-<details>
-<summary>All DataFrame columns:</summary>
-
-- comment_id
-- video_id
-- channel_id
-- author_display_name
-- text_original
-- text_display
-- published_at
-- updated_at
-- like_count
-- can_reply
-- total_reply_count
-- neutral
-- approval
-- annoyance
-- disapproval
-- realization
-- admiration
-- disappointment
-- excitement
-- disgust
-- confusion
-- joy
-- anger
-- optimism
-- amusement
-- gratitude
-- surprise
-- sadness
-- fear
-- curiosity
-- love
-- embarrassment
-- desire
-- caring
-- pride
-- relief
-- grief
-- remorse
-- nervousness
-
-</details>
+Streamlit web application for social network analysis.
 
 # Installation and Running
 ```
 git clone https://github.com/molokhovdmitry/social-stat
 python -m pip install --upgrade pip
 pip install -r requirements.txt
-
+streamlit run src/app.py
 ```
data/countries.geo.json ADDED
The diff for this file is too large to render.
requirements.txt CHANGED
@@ -1,11 +1,12 @@
 requests
-
-uvicorn
-pydantic_settings
+python-dotenv
 torch
 torchvision
 torchaudio
 transformers
+sentence-transformers
 pandas
-
-
+seaborn
+plotly
+nbformat
+streamlit
src/__init__.py DELETED
File without changes
src/app.py ADDED
@@ -0,0 +1,413 @@
+import os
+from dotenv import load_dotenv
+from transformers import pipeline
+from sentence_transformers import SentenceTransformer
+import streamlit as st
+import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
+from plotly.subplots import make_subplots
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.decomposition import NMF
+from sklearn.manifold import TSNE
+
+from yt_api import YouTubeAPI
+from maps import lang_map
+
+
+# Load app settings
+load_dotenv()
+YT_API_KEY = os.getenv('YT_API_KEY')
+MAX_COMMENT_SIZE = int(os.getenv('MAX_COMMENT_SIZE'))
+PRED_BATCH_SIZE = int(os.getenv('PRED_BATCH_SIZE'))
+LANG_DETECTION_CONF = float(os.getenv('LANG_DETECTION_CONF'))
+
+
+@st.cache_resource
+def init_emotions_model():
+    classifier = pipeline(
+        task="text-classification",
+        model="SamLowe/roberta-base-go_emotions",
+        top_k=None)
+
+    return classifier
+
+
+@st.cache_resource
+def init_embedding_model():
+    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+    return model
+
+
+@st.cache_resource
+def init_lang_model():
+    model_ckpt = "papluca/xlm-roberta-base-language-detection"
+    pipe = pipeline("text-classification", model=model_ckpt)
+    return pipe
+
+
+def predict_emotions(df, clf):
+    """
+    Predicts emotions for every `text_original` in a DataFrame `df` with a
+    classifier `clf`.
+    Returns a DataFrame with emotion columns.
+    """
+    # Predict emotions in batches
+    text_list = df['text_original'].to_list()
+    batch_size = PRED_BATCH_SIZE
+    text_batches = [text_list[i:i + batch_size]
+                    for i in range(0, len(text_list), batch_size)]
+    preds = [comment_emotions
+             for text_batch in text_batches
+             for comment_emotions in clf(text_batch)]
+
+    # Add predictions to DataFrame
+    preds_df = pd.DataFrame([{emotion['label']: emotion['score']
+                              for emotion in pred} for pred in preds])
+    df = pd.concat([df, preds_df], axis=1)
+
+    return df
+
+
+def detect_languages(df, clf):
+    """
+    Detects languages for every `text_original` in a DataFrame `df` with a
+    classifier `clf`. Takes the language with the highest score.
+    Returns a DataFrame with `predicted_language` column.
+    """
+    # Detect languages in batches
+    text_list = df['text_original'].to_list()
+    batch_size = PRED_BATCH_SIZE
+    text_batches = [text_list[i:i + batch_size]
+                    for i in range(0, len(text_list), batch_size)]
+    preds = [batch_preds[0]['label']
+             if batch_preds[0]['score'] > LANG_DETECTION_CONF
+             else None
+             for text_batch in text_batches
+             for batch_preds in clf(text_batch, top_k=1, truncation=True)]
+
+    # Add predictions to DataFrame
+    df['predicted_language'] = preds
+
+    return df
+
+
+def emotion_dist_plot(df, emotion_cols):
+    """
+    Creates an emotion distribution plotly figure from `df` DataFrame
+    and `emotion_cols` and returns it.
+    """
+    fig = px.bar(df[emotion_cols].sum().sort_values(ascending=False))
+    fig.update_layout(title_text="Emotion Distribution",
+                      width=2000)
+
+    return fig
+
+
+def nmf_plots(df,
+              nmf_components,
+              tfidf_max_features,
+              tfidf_stop_words='english'
+              ):
+    """
+    Converts all `text_original` values of `df` DataFrame to TF-IDF features
+    and performs Non-negative matrix factorization on them.
+
+    Returns a tuple of the modified DataFrame with NMF values and a list of
+    plotly figures (`df`, [plotly figures]).
+    """
+    # Convert to TF-IDF features
+    vectorizer = TfidfVectorizer(max_features=tfidf_max_features,
+                                 stop_words=tfidf_stop_words)
+    embeddings = vectorizer.fit_transform(df['text_original'])
+
+    # Get feature_names (words) from the vectorizer
+    feature_names = vectorizer.get_feature_names_out()
+
+    # Perform NMF
+    nmf = NMF(n_components=nmf_components)
+    nmf_embeddings = nmf.fit_transform(embeddings).T
+    topic_cols = [f'topic_{topic_num+1}'
+                  for topic_num in range(nmf_components)]
+
+    # Add NMF values to the DataFrame
+    for i, col in enumerate(topic_cols):
+        df[col] = nmf_embeddings[i]
+
+    # Get word values for every topic
+    word_df = pd.DataFrame(
+        nmf.components_.T,
+        columns=topic_cols,
+        index=feature_names
+    )
+
+    # Plot word distributions of each topic
+    topic_words_fig = make_subplots(
+        rows=1, cols=nmf_components,
+        subplot_titles=topic_cols)
+
+    for i, col in enumerate(topic_cols):
+        topic_words = word_df[col].sort_values(ascending=False)
+        top_topic_words = topic_words[:top_words_in_topic]
+        topic_words_fig.add_trace(go.Bar(y=top_topic_words.index,
+                                         x=top_topic_words.values,
+                                         orientation='h',
+                                         base=0),
+                                  row=1, col=i+1)
+    topic_words_fig.update_layout(title_text="Topic Word Distributions",
+                                  showlegend=False)
+
+    # Plot topic contribution for the dataset
+    for col in topic_cols:
+        df[col + '_cumsum'] = df[col].cumsum()
+    for col in topic_cols:
+        cumsum_sum = df[[col + '_cumsum' for col in topic_cols]].sum(axis=1)
+        df[col + '_percentage'] = df[col + '_cumsum'] / cumsum_sum
+    contributions_fig = stacked_area_plot(
+        x=df['published_at'],
+        y_list=[df[f'topic_{i+1}_percentage'] for i in range(nmf_components)],
+        names=topic_cols)
+
+    return df, [topic_words_fig, contributions_fig]
+
+
+def tsne_plots(df, encoder, emotion_cols, color_emotion, tsne_perplexity):
+    """
+    Encodes all `text_original` values of `df` DataFrame with `encoder`,
+    uses t-SNE algorithm for visualization on these embeddings and on
+    predicted emotions if they were predicted.
+    """
+    # Encode and add embeddings to the DataFrame
+    embeddings = encoder.encode(df['text_original'])
+    embedding_cols = [f'embedding_{i+1}' for i in range(embeddings.shape[1])]
+    df = pd.concat([df, pd.DataFrame(embeddings, columns=embedding_cols)],
+                   axis=1)
+
+    # t-SNE
+    TSNE_COMPONENTS = 2
+    tsne = TSNE(
+        n_components=2,
+        perplexity=tsne_perplexity,
+    )
+
+    # Also use predicted emotions
+    if emotion_cols:
+        tsne_cols = embedding_cols + emotion_cols
+        color = color_emotion
+        hover_data = ['first_emotion', 'second_emotion', 'text_original']
+    else:
+        tsne_cols = embedding_cols
+        color = None
+        hover_data = 'text_original'
+
+    tsne_results = tsne.fit_transform(df[tsne_cols])
+    tsne_results = pd.DataFrame(
+        tsne_results,
+        columns=[f'tsne_{i+1}' for i in range(TSNE_COMPONENTS)]
+    )
+
+    df = pd.concat([df, tsne_results], axis=1)
+
+    # 2D Visualization
+    fig2d = px.scatter(
+        df,
+        x='tsne_1',
+        y='tsne_2',
+        color=color,
+        hover_data=hover_data
+    )
+    fig2d.update_layout(
+        title_text="t-SNE Visualization"
+    )
+
+    # 3D Visualization with date as the third axis
+    fig3d = px.scatter_3d(
+        df,
+        x='published_at',
+        y='tsne_1',
+        z='tsne_2',
+        color=color,
+        hover_data=hover_data
+    )
+    fig3d.update_layout(
+        title_text="t-SNE Visualization Over Time"
+    )
+
+    return df, [fig2d, fig3d]
+
+
+def stacked_area_plot(x, y_list, names):
+    """Creates plotly stacked area plot. Returns a figure of that plot."""
+    fig = go.Figure()
+    for y, name in zip(y_list, names):
+        fig.add_trace(go.Scatter(
+            x=x, y=y*100,
+            mode='lines',
+            line=dict(width=0.5),
+            stackgroup='one',
+            name=name,
+        ))
+
+    fig.update_layout(
+        showlegend=True,
+        xaxis_type='category',
+        yaxis=dict(
+            type='linear',
+            range=[0, 100],
+            ticksuffix='%')
+    )
+
+    fig.update_layout(title_text="Topic Contribution")
+
+    return fig
+
+
+def add_top_2_emotions(row):
+    emotions = row[emotion_cols].sort_values(ascending=False)
+    row['first_emotion'] = emotions.index[0]
+    row['second_emotion'] = emotions.index[1]
+    return row
+
+
+st.set_page_config(layout='wide')
+st.title("Social-Stat")
+
+# Load models
+emotions_clf = init_emotions_model()
+sentence_encoder = init_embedding_model()
+lang_model = init_lang_model()
+
+# Init YouTube API
+yt_api = YouTubeAPI(
+    api_key=YT_API_KEY,
+    max_comment_size=MAX_COMMENT_SIZE
+)
+
+# Input form
+with st.form(key='input'):
+    video_id = st.text_input("Video ID")
+
+    # Emotions
+    emotions_checkbox = st.checkbox(
+        "Predict Emotions",
+        value=True,
+    )
+
+    # NMF
+    nmf_checkbox = st.checkbox(
+        "Non-Negative Matrix Factorization",
+        value=True,
+    )
+
+    nmf_components = st.slider(
+        "Topics (NMF Components)",
+        min_value=2,
+        max_value=20,
+        value=10,
+        step=1,
+    )
+
+    tfidf_max_features = st.select_slider(
+        "Words (TF-IDF Vectorizer Max Features)",
+        options=list(range(10, 501)) + [None],
+        value=100,
+    )
+
+    top_words_in_topic = st.slider(
+        "Top Topic Words",
+        min_value=1,
+        max_value=50,
+        value=10,
+        step=1,
+    )
+
+    # t-SNE
+    tsne_checkbox = st.checkbox(
+        "t-SNE Visualization",
+        value=True,
+    )
+
+    tsne_perplexity = st.slider(
+        "t-SNE Perplexity",
+        min_value=5,
+        max_value=50,
+        value=10,
+        step=1,
+    )
+
+    tsne_color_emotion = st.selectbox(
+        "Emotion For The Plot Color",
+        options=['first_emotion', 'second_emotion']
+    )
+
+    # Language Map
+    map_checkbox = st.checkbox(
+        "Language Map",
+        value=True,
+    )
+
+    submit = st.form_submit_button("Analyze")
+
+
+if submit:
+    # Get comments
+    try:
+        bad_id = False
+        comments = yt_api.get_comments(video_id)
+    except KeyError:
+        st.write("Video not found.")
+        bad_id = True
+
+    if not bad_id:
+        plots = []
+
+        # Convert to pandas DataFrame and sort by publishing date
+        df = pd.DataFrame(comments).sort_values('published_at')
+
+        emotion_cols = []
+        if emotions_checkbox:
+            # Predict emotions
+            df = predict_emotions(df, emotions_clf)
+            emotion_cols = list(df.columns[11:])
+
+            # Get emotion distribution figure
+            emotion_fig = emotion_dist_plot(df, emotion_cols)
+
+            # TODO: Get emotion contribution figure
+
+            # Get top 2 emotions
+            df = df.apply(add_top_2_emotions, axis=1)
+
+        if nmf_checkbox:
+            # NMF
+            df, nmf_figs = nmf_plots(df, nmf_components, tfidf_max_features)
+            plots.extend(nmf_figs)
+
+        if tsne_checkbox:
+            # t-SNE visualization
+            df, tsne_figs = tsne_plots(df,
+                                       sentence_encoder,
+                                       emotion_cols,
+                                       tsne_color_emotion,
+                                       tsne_perplexity)
+            plots.extend(tsne_figs)
+
+        if map_checkbox:
+            df = detect_languages(df, lang_model)
+            map_figure = lang_map(df)
+
+        # Plot all figures
+        if emotions_checkbox:
+            st.plotly_chart(emotion_fig, use_container_width=True)
+
+        if map_checkbox:
+            st.plotly_chart(map_figure, use_container_width=True)
+
+        for i, plot in enumerate(plots):
+            st.plotly_chart(
+                plot, sharing='streamlit',
+                theme='streamlit',
+                use_container_width=True)
+
+        # Show the final DataFrame
+        st.dataframe(df)
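
To sanity-check the new emotion pipeline outside Streamlit, here is a minimal sketch of the batching logic from predict_emotions(); the two toy comments are invented for illustration, and running it assumes the transformers/torch entries from requirements.txt are installed:

    import pandas as pd
    from transformers import pipeline

    # Toy stand-in for the comments DataFrame built from the YouTube API
    df = pd.DataFrame({'text_original': ["Loved this video!",
                                         "This part was confusing."]})

    # Same checkpoint app.py loads in init_emotions_model()
    clf = pipeline(task="text-classification",
                   model="SamLowe/roberta-base-go_emotions",
                   top_k=None)

    # Score every comment and expand the label/score pairs into columns
    preds = clf(df['text_original'].to_list())
    preds_df = pd.DataFrame([{e['label']: e['score'] for e in pred}
                             for pred in preds])
    print(pd.concat([df, preds_df], axis=1).head())
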
src/main.py DELETED
@@ -1,54 +0,0 @@
-from fastapi import FastAPI, Response
-from pydantic_settings import BaseSettings, SettingsConfigDict
-import pandas as pd
-
-from src.yt_api import YouTubeAPI
-from src.models import init_emotions_model
-
-
-class Settings(BaseSettings):
-    YT_API_KEY: str
-    PRED_BATCH_SIZE: int = 512
-    MAX_COMMENT_SIZE: int = 300
-    model_config = SettingsConfigDict(env_file='.env')
-
-
-settings = Settings()
-app = FastAPI(title='social-stat')
-
-emotions_clf = init_emotions_model()
-yt_api = YouTubeAPI(
-    api_key=settings.YT_API_KEY,
-    max_comment_size=settings.MAX_COMMENT_SIZE
-)
-
-
-@app.get('/')
-def home():
-    return 'social-stat'
-
-
-@app.get('/predict')
-def predict(video_id):
-    # Get comments
-    comments = yt_api.get_comments(video_id)
-    comments_df = pd.DataFrame(comments)
-
-    # Predict emotions in batches
-    text_list = comments_df['text_display'].to_list()
-    batch_size = settings.PRED_BATCH_SIZE
-    text_batches = [text_list[i:i + batch_size]
-                    for i in range(0, len(text_list), batch_size)]
-    preds = [comment_emotions
-             for text_batch in text_batches
-             for comment_emotions in emotions_clf(text_batch)]
-
-    # Add predictions to DataFrame
-    preds_df = pd.DataFrame([{emotion['label']: emotion['score']
-                              for emotion in pred} for pred in preds])
-    comments_df = pd.concat([comments_df, preds_df], axis=1)
-
-    # Return DataFrame as a JSON file
-    return Response(
-        content=comments_df.to_json(orient='records'),
-        media_type='application/json')
src/maps.py ADDED
@@ -0,0 +1,129 @@
+import json
+import pandas as pd
+import plotly.express as px
+
+# Language codes predicted by language detection model
+LANG_CODES = ['ar', 'bg', 'de', 'el', 'en', 'es', 'fr', 'hi', 'it', 'ja',
+              'nl', 'pl', 'pt', 'ru', 'sw', 'th', 'tr', 'ur', 'vi', 'zh']
+
+COUNTRY_TO_LANG_CODE = {
+    'Algeria': 'ar',
+    'Chad': 'ar',
+    'Djibouti': 'ar',
+    'Egypt': 'ar',
+    'Iraq': 'ar',
+    'Jordan': 'ar',
+    'Kuwait': 'ar',
+    'Lebanon': 'ar',
+    'Libya': 'ar',
+    'Mali': 'ar',
+    'Mauritania': 'ar',
+    'Morocco': 'ar',
+    'Oman': 'ar',
+    'Palestine': 'ar',
+    'Qatar': 'ar',
+    'Saudi Arabia': 'ar',
+    'Somalia': 'ar',
+    'Sudan': 'ar',
+    'Syria': 'ar',
+    'Tunisia': 'ar',
+    'United Arab Emirates': 'ar',
+    'Yemen': 'ar',
+    'Bulgaria': 'bg',
+    'Germany': 'de',
+    'Greece': 'el',
+    'Cyprus': 'el',
+    'United States of America': 'en',
+    'Ireland': 'en',
+    'United Kingdom': 'en',
+    'Canada': 'en',
+    'Australia': 'en',
+    'Mexico': 'es',
+    'Mexico': 'es',
+    'Colombia': 'es',
+    'Spain': 'es',
+    'Argentina': 'es',
+    'Peru': 'es',
+    'Venezuela': 'es',
+    'Chile': 'es',
+    'Guatemala': 'es',
+    'Ecuador': 'es',
+    'Bolivia': 'es',
+    'Cuba': 'es',
+    'Dominican Rep.': 'es',
+    'Honduras': 'es',
+    'Paraguay': 'es',
+    'El Salvador': 'es',
+    'Nicaragua': 'es',
+    'Costa Rica': 'es',
+    'Panama': 'es',
+    'Uruguay': 'es',
+    'Guinea': 'es',
+    'France': 'fr',
+    'India': 'hi',
+    'Italy': 'it',
+    'Japan': 'ja',
+    'Netherlands': 'nl',
+    'Belgium': 'nl',
+    'Poland': 'pl',
+    'Portugal': 'pt',
+    'Russia': 'ru',
+    'Uganda': 'sw',
+    'Kenya': 'sw',
+    'Tanzania': 'sw',
+    'Thailand': 'th',
+    'Turkey': 'tr',
+    'Pakistan': 'ur',
+    'Vietnam': 'vi',
+    'China': 'zh'
+}
+
+
+def lang_map(df):
+    with open('data/countries.geo.json') as f:
+        countries = json.load(f)
+    country_list = [country['properties']['name']
+                    for country in dict(countries)['features']]
+    LANG_CODES = df.value_counts('predicted_language')
+
+    countries_data = []
+    lang_count_data = []
+    lang_code_data = []
+    for country in country_list:
+        if country in COUNTRY_TO_LANG_CODE:
+            country_lang = COUNTRY_TO_LANG_CODE[country]
+            if country_lang in LANG_CODES.index:
+                countries_data.append(country)
+                lang_count = LANG_CODES.loc[COUNTRY_TO_LANG_CODE[country]]
+                lang_count_data.append(lang_count)
+                lang_code_data.append(country_lang)
+    lang_df = pd.DataFrame({
+        'country': countries_data,
+        'count': lang_count_data,
+        'lang_code': lang_code_data
+    })
+
+    fig = px.choropleth(
+        lang_df,
+        geojson=countries,
+        locations='country',
+        locationmode='country names',
+        color='count',
+        color_continuous_scale=[
+            [0, "rgb(45,45,48)"],
+            [0.33, "rgb(116,173,209)"],
+            [0.66, "rgb(255,255,0)"],
+            [1, "rgb(255,94,5)"]
+        ],
+        scope='world',
+        hover_data=['lang_code'],
+        labels={'count': "Language Count"},
+        template='plotly_dark'
+    )
+    fig.update_geos(showcountries=True)
+    fig.update_layout(
+        title_text="Language Map",
+        margin={"r": 0, "t": 20, "l": 0, "b": 0}
+    )
+
+    return fig
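
A usage sketch for the new choropleth helper, assuming the repo root as the working directory (so data/countries.geo.json from this PR resolves) and src/ on the import path; the language codes below are invented sample values:

    import pandas as pd
    from maps import lang_map

    # Toy DataFrame with the `predicted_language` column that
    # detect_languages() in src/app.py normally fills in
    df = pd.DataFrame({'predicted_language': ['en', 'en', 'es', 'ru', None]})

    fig = lang_map(df)  # counts comments per language, shades matching countries
    fig.show()
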
src/models.py DELETED
@@ -1,10 -0,0 @@
-from transformers import pipeline
-
-
-def init_emotions_model():
-    classifier = pipeline(
-        task="text-classification",
-        model="SamLowe/roberta-base-go_emotions",
-        top_k=None)
-
-    return classifier
src/test_main.py DELETED
@@ -1,27 +0,0 @@
-from fastapi.testclient import TestClient
-from src.main import app
-import pandas as pd
-
-
-client = TestClient(app)
-
-
-def test_home():
-    """Test home page."""
-    response = client.get("/")
-    assert response.status_code == 200
-
-
-def test_predict():
-    """Test predict method on an example video."""
-    TEST_VIDEO_ID = "0peXnOnDgQ8"
-    response = client.get(
-        "/predict/",
-        params={"video_id": TEST_VIDEO_ID}
-    )
-    df = pd.read_json(response, orient='records')
-
-    # Ensure the DataFrame has the right amount of columns
-    assert df.shape[1] == 39
-    # Ensure there are no NaN values
-    assert df.isna().sum().sum() == 0
src/yt_api.py CHANGED
@@ -34,6 +34,10 @@ class YouTubeAPI():
             'pageToken': page_token,
         }
         response = requests.get(url, params=payload)
+
+        # Ensure it's not a bad request
+        assert response.status_code != 400
+
         return response.json()
 
     def response_to_comments(self, response):
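
The added assert surfaces a rejected request (HTTP 400, e.g. a malformed key or parameters) as an immediate AssertionError instead of a confusing failure later. An illustrative calling sketch, using the constructor arguments shown in src/app.py (the API key and video id are placeholders):

    from yt_api import YouTubeAPI

    yt_api = YouTubeAPI(api_key="YOUR_KEY", max_comment_size=300)

    try:
        comments = yt_api.get_comments("VIDEO_ID")  # placeholder id
    except AssertionError:
        # The new status-code guard fired: the API rejected the request
        print("Bad request to the YouTube API.")
    except KeyError:
        # src/app.py treats a KeyError here as "Video not found."
        print("Video not found.")
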
vm_startup.sh DELETED
@@ -1,6 +0,0 @@
-# Script for an automatic startup on a virtual machine.
-. /home/user/python_venv/social-stat/bin/activate
-cd /home/user/social-stat
-git pull
-pip install -r requirements.txt
-uvicorn src.main:app --host 0.0.0.0 --port 8000 > /home/user/log.txt 2>&1