Spaces:
Sleeping
Sleeping
molokhovdmitry
committed on
Commit
•
0d1ee8d
1
Parent(s):
47d6fb9
Add language map
Browse files- .env.example +2 -1
- data/countries.geo.json +0 -0
- src/app.py +50 -4
- src/maps.py +129 -0
.env.example
CHANGED
@@ -1,3 +1,4 @@
|
|
1 |
YT_API_KEY=""
|
2 |
PRED_BATCH_SIZE=512
|
3 |
-
MAX_COMMENT_SIZE=300
|
|
|
|
1 |
YT_API_KEY=""
|
2 |
PRED_BATCH_SIZE=512
|
3 |
+
MAX_COMMENT_SIZE=300
|
4 |
+
LANG_DETECTION_CONF=0.5
|
data/countries.geo.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
src/app.py
CHANGED
@@ -12,6 +12,7 @@ from sklearn.decomposition import NMF
|
|
12 |
from sklearn.manifold import TSNE
|
13 |
|
14 |
from yt_api import YouTubeAPI
|
|
|
15 |
|
16 |
|
17 |
# Load app settings
|
@@ -19,6 +20,7 @@ load_dotenv()
|
|
19 |
YT_API_KEY = os.getenv('YT_API_KEY')
|
20 |
MAX_COMMENT_SIZE = int(os.getenv('MAX_COMMENT_SIZE'))
|
21 |
PRED_BATCH_SIZE = int(os.getenv('PRED_BATCH_SIZE'))
|
|
|
22 |
|
23 |
|
24 |
@st.cache_resource
|
@@ -37,6 +39,13 @@ def init_embedding_model():
|
|
37 |
return model
|
38 |
|
39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
def predict_emotions(df, clf):
|
41 |
"""
|
42 |
Predicts emotions for every `text_original` in a DataFrame `df` with a
|
@@ -60,6 +69,29 @@ def predict_emotions(df, clf):
|
|
60 |
return df
|
61 |
|
62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
def emotion_dist_plot(df, emotion_cols):
|
64 |
"""
|
65 |
Creates an emotion distribution plotly figure from `df` DataFrame
|
@@ -78,8 +110,8 @@ def nmf_plots(df,
|
|
78 |
tfidf_stop_words='english'
|
79 |
):
|
80 |
"""
|
81 |
-
Converts all `text_original` values of `df` DataFrame to TF-IDF features
|
82 |
-
performs Non-negative matrix factorization on them.
|
83 |
|
84 |
Returns a tuple of the modified DataFrame with NMF values and a list of
|
85 |
plotly figures (`df`, [plotly figures]).
|
@@ -242,6 +274,7 @@ st.title("Social-Stat")
|
|
242 |
# Load models
|
243 |
emotions_clf = init_emotions_model()
|
244 |
sentence_encoder = init_embedding_model()
|
|
|
245 |
|
246 |
# Init YouTube API
|
247 |
yt_api = YouTubeAPI(
|
@@ -306,6 +339,12 @@ with st.form(key='input'):
|
|
306 |
options=['first_emotion', 'second_emotion']
|
307 |
)
|
308 |
|
|
|
|
|
|
|
|
|
|
|
|
|
309 |
submit = st.form_submit_button("Analyze")
|
310 |
|
311 |
|
@@ -352,16 +391,23 @@ if submit:
|
|
352 |
tsne_perplexity)
|
353 |
plots.extend(tsne_figs)
|
354 |
|
355 |
-
|
356 |
-
|
|
|
357 |
|
358 |
# Plot all figures
|
359 |
if emotions_checkbox:
|
360 |
st.plotly_chart(emotion_fig, use_container_width=True)
|
361 |
|
|
|
|
|
|
|
362 |
cols = st.columns(2)
|
363 |
for i, plot in enumerate(plots):
|
364 |
cols[i % 2].plotly_chart(
|
365 |
plot, sharing='streamlit',
|
366 |
theme='streamlit',
|
367 |
use_container_width=True)
|
|
|
|
|
|
|
|
12 |
from sklearn.manifold import TSNE
|
13 |
|
14 |
from yt_api import YouTubeAPI
|
15 |
+
from maps import lang_map
|
16 |
|
17 |
|
18 |
# Load app settings
|
|
|
20 |
YT_API_KEY = os.getenv('YT_API_KEY')
|
21 |
MAX_COMMENT_SIZE = int(os.getenv('MAX_COMMENT_SIZE'))
|
22 |
PRED_BATCH_SIZE = int(os.getenv('PRED_BATCH_SIZE'))
|
23 |
+
LANG_DETECTION_CONF = float(os.getenv('LANG_DETECTION_CONF'))
|
24 |
|
25 |
|
26 |
@st.cache_resource
|
|
|
39 |
return model
|
40 |
|
41 |
|
42 |
+
@st.cache_resource
|
43 |
+
def init_lang_model():
|
44 |
+
model_ckpt = "papluca/xlm-roberta-base-language-detection"
|
45 |
+
pipe = pipeline("text-classification", model=model_ckpt)
|
46 |
+
return pipe
|
47 |
+
|
48 |
+
|
49 |
def predict_emotions(df, clf):
|
50 |
"""
|
51 |
Predicts emotions for every `text_original` in a DataFrame `df` with a
|
|
|
69 |
return df
|
70 |
|
71 |
|
72 |
+
def detect_languages(df, clf):
    """
    Detect the language of every `text_original` in DataFrame `df` with
    classifier `clf`, keeping only the top-scoring label per text.

    A label is kept only when its score exceeds the module-level
    LANG_DETECTION_CONF threshold; otherwise None is stored.
    Returns `df` with a new `predicted_language` column.
    """
    texts = df['text_original'].to_list()

    # Run the classifier in fixed-size batches (PRED_BATCH_SIZE) to
    # bound memory use on long comment lists.
    predictions = []
    for start in range(0, len(texts), PRED_BATCH_SIZE):
        batch = texts[start:start + PRED_BATCH_SIZE]
        # top_k=1 -> one [{'label', 'score'}] list per input text.
        for result in clf(batch, top_k=1, truncation=True):
            top = result[0]
            predictions.append(
                top['label'] if top['score'] > LANG_DETECTION_CONF else None)

    # Add predictions to DataFrame
    df['predicted_language'] = predictions

    return df
|
93 |
+
|
94 |
+
|
95 |
def emotion_dist_plot(df, emotion_cols):
|
96 |
"""
|
97 |
Creates an emotion distribution plotly figure from `df` DataFrame
|
|
|
110 |
tfidf_stop_words='english'
|
111 |
):
|
112 |
"""
|
113 |
+
Converts all `text_original` values of `df` DataFrame to TF-IDF features
|
114 |
+
and performs Non-negative matrix factorization on them.
|
115 |
|
116 |
Returns a tuple of the modified DataFrame with NMF values and a list of
|
117 |
plotly figures (`df`, [plotly figures]).
|
|
|
274 |
# Load models
|
275 |
emotions_clf = init_emotions_model()
|
276 |
sentence_encoder = init_embedding_model()
|
277 |
+
lang_model = init_lang_model()
|
278 |
|
279 |
# Init YouTube API
|
280 |
yt_api = YouTubeAPI(
|
|
|
339 |
options=['first_emotion', 'second_emotion']
|
340 |
)
|
341 |
|
342 |
+
# Language Map
|
343 |
+
map_checkbox = st.checkbox(
|
344 |
+
"Language Map",
|
345 |
+
value=True,
|
346 |
+
)
|
347 |
+
|
348 |
submit = st.form_submit_button("Analyze")
|
349 |
|
350 |
|
|
|
391 |
tsne_perplexity)
|
392 |
plots.extend(tsne_figs)
|
393 |
|
394 |
+
if map_checkbox:
|
395 |
+
df = detect_languages(df, lang_model)
|
396 |
+
map_figure = lang_map(df)
|
397 |
|
398 |
# Plot all figures
|
399 |
if emotions_checkbox:
|
400 |
st.plotly_chart(emotion_fig, use_container_width=True)
|
401 |
|
402 |
+
if map_checkbox:
|
403 |
+
st.plotly_chart(map_figure, use_container_width=True)
|
404 |
+
|
405 |
cols = st.columns(2)
|
406 |
for i, plot in enumerate(plots):
|
407 |
cols[i % 2].plotly_chart(
|
408 |
plot, sharing='streamlit',
|
409 |
theme='streamlit',
|
410 |
use_container_width=True)
|
411 |
+
|
412 |
+
# Show the final DataFrame
|
413 |
+
st.dataframe(df)
|
src/maps.py
ADDED
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import pandas as pd
|
3 |
+
import plotly.express as px
|
4 |
+
|
5 |
+
# Language codes predicted by language detection model
LANG_CODES = ['ar', 'bg', 'de', 'el', 'en', 'es', 'fr', 'hi', 'it', 'ja',
              'nl', 'pl', 'pt', 'ru', 'sw', 'th', 'tr', 'ur', 'vi', 'zh']

# Maps GeoJSON country names to the code of a language spoken there.
# Fix: the original dict listed 'Mexico': 'es' twice; duplicate removed
# (same value, so behavior is unchanged).
COUNTRY_TO_LANG_CODE = {
    'Algeria': 'ar',
    'Chad': 'ar',
    'Djibouti': 'ar',
    'Egypt': 'ar',
    'Iraq': 'ar',
    'Jordan': 'ar',
    'Kuwait': 'ar',
    'Lebanon': 'ar',
    'Libya': 'ar',
    'Mali': 'ar',
    'Mauritania': 'ar',
    'Morocco': 'ar',
    'Oman': 'ar',
    'Palestine': 'ar',
    'Qatar': 'ar',
    'Saudi Arabia': 'ar',
    'Somalia': 'ar',
    'Sudan': 'ar',
    'Syria': 'ar',
    'Tunisia': 'ar',
    'United Arab Emirates': 'ar',
    'Yemen': 'ar',
    'Bulgaria': 'bg',
    'Germany': 'de',
    'Greece': 'el',
    'Cyprus': 'el',
    'United States of America': 'en',
    'Ireland': 'en',
    'United Kingdom': 'en',
    'Canada': 'en',
    'Australia': 'en',
    'Mexico': 'es',
    'Colombia': 'es',
    'Spain': 'es',
    'Argentina': 'es',
    'Peru': 'es',
    'Venezuela': 'es',
    'Chile': 'es',
    'Guatemala': 'es',
    'Ecuador': 'es',
    'Bolivia': 'es',
    'Cuba': 'es',
    'Dominican Rep.': 'es',
    'Honduras': 'es',
    'Paraguay': 'es',
    'El Salvador': 'es',
    'Nicaragua': 'es',
    'Costa Rica': 'es',
    'Panama': 'es',
    'Uruguay': 'es',
    # NOTE(review): Guinea is francophone; this was likely meant to be
    # 'Eq. Guinea' — verify against the GeoJSON country names.
    'Guinea': 'es',
    'France': 'fr',
    'India': 'hi',
    'Italy': 'it',
    'Japan': 'ja',
    'Netherlands': 'nl',
    'Belgium': 'nl',
    'Poland': 'pl',
    'Portugal': 'pt',
    'Russia': 'ru',
    'Uganda': 'sw',
    'Kenya': 'sw',
    'Tanzania': 'sw',
    'Thailand': 'th',
    'Turkey': 'tr',
    'Pakistan': 'ur',
    'Vietnam': 'vi',
    'China': 'zh'
}
|
80 |
+
|
81 |
+
|
82 |
+
def lang_map(df):
    """
    Build a world choropleth of detected comment languages.

    Counts the values of the `predicted_language` column of `df`, maps
    each country in the bundled GeoJSON to a language code via
    COUNTRY_TO_LANG_CODE, and colors each country by the number of
    comments detected in that country's language.

    Returns a plotly Figure.
    """
    with open('data/countries.geo.json') as f:
        countries = json.load(f)
    # `countries` is already a dict; no defensive dict() copy needed.
    country_list = [country['properties']['name']
                    for country in countries['features']]

    # Comments detected per language code. Renamed from the original
    # local `LANG_CODES`, which shadowed the module-level constant.
    lang_counts = df.value_counts('predicted_language')

    countries_data = []
    lang_count_data = []
    lang_code_data = []
    for country in country_list:
        country_lang = COUNTRY_TO_LANG_CODE.get(country)
        if country_lang is not None and country_lang in lang_counts.index:
            countries_data.append(country)
            lang_count_data.append(lang_counts.loc[country_lang])
            lang_code_data.append(country_lang)
    lang_df = pd.DataFrame({
        'country': countries_data,
        'count': lang_count_data,
        'lang_code': lang_code_data
    })

    fig = px.choropleth(
        lang_df,
        geojson=countries,
        locations='country',
        locationmode='country names',
        color='count',
        color_continuous_scale=[
            [0, "rgb(45,45,48)"],
            [0.33, "rgb(116,173,209)"],
            [0.66, "rgb(255,255,0)"],
            [1, "rgb(255,94,5)"]
        ],
        scope='world',
        hover_data=['lang_code'],
        labels={'count': "Language Count"},
        template='plotly_dark'
    )
    fig.update_geos(showcountries=True)
    fig.update_layout(
        title_text="Language Map",
        margin={"r": 0, "t": 20, "l": 0, "b": 0}
    )

    return fig
|