Spaces:
Sleeping
Sleeping
molokhovdmitry
committed on
Commit
•
0d1ee8d
1
Parent(s):
47d6fb9
Add language map
Browse files- .env.example +2 -1
- data/countries.geo.json +0 -0
- src/app.py +50 -4
- src/maps.py +129 -0
.env.example
CHANGED
@@ -1,3 +1,4 @@
|
|
1 |
YT_API_KEY=""
|
2 |
PRED_BATCH_SIZE=512
|
3 |
-
MAX_COMMENT_SIZE=300
|
|
|
|
1 |
YT_API_KEY=""
|
2 |
PRED_BATCH_SIZE=512
|
3 |
+
MAX_COMMENT_SIZE=300
|
4 |
+
LANG_DETECTION_CONF=0.5
|
data/countries.geo.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
src/app.py
CHANGED
@@ -12,6 +12,7 @@ from sklearn.decomposition import NMF
|
|
12 |
from sklearn.manifold import TSNE
|
13 |
|
14 |
from yt_api import YouTubeAPI
|
|
|
15 |
|
16 |
|
17 |
# Load app settings
|
@@ -19,6 +20,7 @@ load_dotenv()
|
|
19 |
YT_API_KEY = os.getenv('YT_API_KEY')
|
20 |
MAX_COMMENT_SIZE = int(os.getenv('MAX_COMMENT_SIZE'))
|
21 |
PRED_BATCH_SIZE = int(os.getenv('PRED_BATCH_SIZE'))
|
|
|
22 |
|
23 |
|
24 |
@st.cache_resource
|
@@ -37,6 +39,13 @@ def init_embedding_model():
|
|
37 |
return model
|
38 |
|
39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
def predict_emotions(df, clf):
|
41 |
"""
|
42 |
Predicts emotions for every `text_original` in a DataFrame `df` with a
|
@@ -60,6 +69,29 @@ def predict_emotions(df, clf):
|
|
60 |
return df
|
61 |
|
62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
def emotion_dist_plot(df, emotion_cols):
|
64 |
"""
|
65 |
Creates an emotion distribution plotly figure from `df` DataFrame
|
@@ -78,8 +110,8 @@ def nmf_plots(df,
|
|
78 |
tfidf_stop_words='english'
|
79 |
):
|
80 |
"""
|
81 |
-
Converts all `text_original` values of `df` DataFrame to TF-IDF features
|
82 |
-
performs Non-negative matrix factorization on them.
|
83 |
|
84 |
Returns a tuple of the modified DataFrame with NMF values and a list of
|
85 |
plotly figures (`df`, [plotly figures]).
|
@@ -242,6 +274,7 @@ st.title("Social-Stat")
|
|
242 |
# Load models
|
243 |
emotions_clf = init_emotions_model()
|
244 |
sentence_encoder = init_embedding_model()
|
|
|
245 |
|
246 |
# Init YouTube API
|
247 |
yt_api = YouTubeAPI(
|
@@ -306,6 +339,12 @@ with st.form(key='input'):
|
|
306 |
options=['first_emotion', 'second_emotion']
|
307 |
)
|
308 |
|
|
|
|
|
|
|
|
|
|
|
|
|
309 |
submit = st.form_submit_button("Analyze")
|
310 |
|
311 |
|
@@ -352,16 +391,23 @@ if submit:
|
|
352 |
tsne_perplexity)
|
353 |
plots.extend(tsne_figs)
|
354 |
|
355 |
-
|
356 |
-
|
|
|
357 |
|
358 |
# Plot all figures
|
359 |
if emotions_checkbox:
|
360 |
st.plotly_chart(emotion_fig, use_container_width=True)
|
361 |
|
|
|
|
|
|
|
362 |
cols = st.columns(2)
|
363 |
for i, plot in enumerate(plots):
|
364 |
cols[i % 2].plotly_chart(
|
365 |
plot, sharing='streamlit',
|
366 |
theme='streamlit',
|
367 |
use_container_width=True)
|
|
|
|
|
|
|
|
12 |
from sklearn.manifold import TSNE
|
13 |
|
14 |
from yt_api import YouTubeAPI
|
15 |
+
from maps import lang_map
|
16 |
|
17 |
|
18 |
# Load app settings
|
|
|
20 |
YT_API_KEY = os.getenv('YT_API_KEY')
|
21 |
MAX_COMMENT_SIZE = int(os.getenv('MAX_COMMENT_SIZE'))
|
22 |
PRED_BATCH_SIZE = int(os.getenv('PRED_BATCH_SIZE'))
|
23 |
+
LANG_DETECTION_CONF = float(os.getenv('LANG_DETECTION_CONF'))
|
24 |
|
25 |
|
26 |
@st.cache_resource
|
|
|
39 |
return model
|
40 |
|
41 |
|
42 |
+
@st.cache_resource
|
43 |
+
def init_lang_model():
|
44 |
+
model_ckpt = "papluca/xlm-roberta-base-language-detection"
|
45 |
+
pipe = pipeline("text-classification", model=model_ckpt)
|
46 |
+
return pipe
|
47 |
+
|
48 |
+
|
49 |
def predict_emotions(df, clf):
|
50 |
"""
|
51 |
Predicts emotions for every `text_original` in a DataFrame `df` with a
|
|
|
69 |
return df
|
70 |
|
71 |
|
72 |
+
def detect_languages(df, clf):
    """
    Detect the language of every `text_original` in DataFrame `df` with
    classifier `clf`, keeping only the top-scoring label per text.

    A label is kept only when its score exceeds the module-level
    LANG_DETECTION_CONF threshold; otherwise None is stored.
    Returns `df` with a new `predicted_language` column.
    """
    texts = df['text_original'].to_list()

    # Run the classifier in fixed-size batches (PRED_BATCH_SIZE) to
    # bound memory use on long comment lists.
    predictions = []
    for start in range(0, len(texts), PRED_BATCH_SIZE):
        batch = texts[start:start + PRED_BATCH_SIZE]
        # top_k=1 -> one [{'label', 'score'}] list per input text.
        for result in clf(batch, top_k=1, truncation=True):
            top = result[0]
            predictions.append(
                top['label'] if top['score'] > LANG_DETECTION_CONF else None)

    # Add predictions to DataFrame
    df['predicted_language'] = predictions

    return df
|
93 |
+
|
94 |
+
|
95 |
def emotion_dist_plot(df, emotion_cols):
|
96 |
"""
|
97 |
Creates an emotion distribution plotly figure from `df` DataFrame
|
|
|
110 |
tfidf_stop_words='english'
|
111 |
):
|
112 |
"""
|
113 |
+
Converts all `text_original` values of `df` DataFrame to TF-IDF features
|
114 |
+
and performs Non-negative matrix factorization on them.
|
115 |
|
116 |
Returns a tuple of the modified DataFrame with NMF values and a list of
|
117 |
plotly figures (`df`, [plotly figures]).
|
|
|
274 |
# Load models
|
275 |
emotions_clf = init_emotions_model()
|
276 |
sentence_encoder = init_embedding_model()
|
277 |
+
lang_model = init_lang_model()
|
278 |
|
279 |
# Init YouTube API
|
280 |
yt_api = YouTubeAPI(
|
|
|
339 |
options=['first_emotion', 'second_emotion']
|
340 |
)
|
341 |
|
342 |
+
# Language Map
|
343 |
+
map_checkbox = st.checkbox(
|
344 |
+
"Language Map",
|
345 |
+
value=True,
|
346 |
+
)
|
347 |
+
|
348 |
submit = st.form_submit_button("Analyze")
|
349 |
|
350 |
|
|
|
391 |
tsne_perplexity)
|
392 |
plots.extend(tsne_figs)
|
393 |
|
394 |
+
if map_checkbox:
|
395 |
+
df = detect_languages(df, lang_model)
|
396 |
+
map_figure = lang_map(df)
|
397 |
|
398 |
# Plot all figures
|
399 |
if emotions_checkbox:
|
400 |
st.plotly_chart(emotion_fig, use_container_width=True)
|
401 |
|
402 |
+
if map_checkbox:
|
403 |
+
st.plotly_chart(map_figure, use_container_width=True)
|
404 |
+
|
405 |
cols = st.columns(2)
|
406 |
for i, plot in enumerate(plots):
|
407 |
cols[i % 2].plotly_chart(
|
408 |
plot, sharing='streamlit',
|
409 |
theme='streamlit',
|
410 |
use_container_width=True)
|
411 |
+
|
412 |
+
# Show the final DataFrame
|
413 |
+
st.dataframe(df)
|
src/maps.py
ADDED
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import pandas as pd
|
3 |
+
import plotly.express as px
|
4 |
+
|
5 |
+
# Language codes predicted by language detection model
LANG_CODES = ['ar', 'bg', 'de', 'el', 'en', 'es', 'fr', 'hi', 'it', 'ja',
              'nl', 'pl', 'pt', 'ru', 'sw', 'th', 'tr', 'ur', 'vi', 'zh']

# Maps GeoJSON country names to the code of a language spoken there.
# Fix: the original dict listed 'Mexico': 'es' twice; duplicate removed
# (same value, so behavior is unchanged).
COUNTRY_TO_LANG_CODE = {
    'Algeria': 'ar',
    'Chad': 'ar',
    'Djibouti': 'ar',
    'Egypt': 'ar',
    'Iraq': 'ar',
    'Jordan': 'ar',
    'Kuwait': 'ar',
    'Lebanon': 'ar',
    'Libya': 'ar',
    'Mali': 'ar',
    'Mauritania': 'ar',
    'Morocco': 'ar',
    'Oman': 'ar',
    'Palestine': 'ar',
    'Qatar': 'ar',
    'Saudi Arabia': 'ar',
    'Somalia': 'ar',
    'Sudan': 'ar',
    'Syria': 'ar',
    'Tunisia': 'ar',
    'United Arab Emirates': 'ar',
    'Yemen': 'ar',
    'Bulgaria': 'bg',
    'Germany': 'de',
    'Greece': 'el',
    'Cyprus': 'el',
    'United States of America': 'en',
    'Ireland': 'en',
    'United Kingdom': 'en',
    'Canada': 'en',
    'Australia': 'en',
    'Mexico': 'es',
    'Colombia': 'es',
    'Spain': 'es',
    'Argentina': 'es',
    'Peru': 'es',
    'Venezuela': 'es',
    'Chile': 'es',
    'Guatemala': 'es',
    'Ecuador': 'es',
    'Bolivia': 'es',
    'Cuba': 'es',
    'Dominican Rep.': 'es',
    'Honduras': 'es',
    'Paraguay': 'es',
    'El Salvador': 'es',
    'Nicaragua': 'es',
    'Costa Rica': 'es',
    'Panama': 'es',
    'Uruguay': 'es',
    # NOTE(review): Guinea is francophone; this was likely meant to be
    # 'Eq. Guinea' — verify against the GeoJSON country names.
    'Guinea': 'es',
    'France': 'fr',
    'India': 'hi',
    'Italy': 'it',
    'Japan': 'ja',
    'Netherlands': 'nl',
    'Belgium': 'nl',
    'Poland': 'pl',
    'Portugal': 'pt',
    'Russia': 'ru',
    'Uganda': 'sw',
    'Kenya': 'sw',
    'Tanzania': 'sw',
    'Thailand': 'th',
    'Turkey': 'tr',
    'Pakistan': 'ur',
    'Vietnam': 'vi',
    'China': 'zh'
}
|
80 |
+
|
81 |
+
|
82 |
+
def lang_map(df):
    """
    Build a world choropleth of detected comment languages.

    Counts the values of the `predicted_language` column of `df`, maps
    each country in the bundled GeoJSON to a language code via
    COUNTRY_TO_LANG_CODE, and colors each country by the number of
    comments detected in that country's language.

    Returns a plotly Figure.
    """
    with open('data/countries.geo.json') as f:
        countries = json.load(f)
    # `countries` is already a dict; no defensive dict() copy needed.
    country_list = [country['properties']['name']
                    for country in countries['features']]

    # Comments detected per language code. Renamed from the original
    # local `LANG_CODES`, which shadowed the module-level constant.
    lang_counts = df.value_counts('predicted_language')

    countries_data = []
    lang_count_data = []
    lang_code_data = []
    for country in country_list:
        country_lang = COUNTRY_TO_LANG_CODE.get(country)
        if country_lang is not None and country_lang in lang_counts.index:
            countries_data.append(country)
            lang_count_data.append(lang_counts.loc[country_lang])
            lang_code_data.append(country_lang)
    lang_df = pd.DataFrame({
        'country': countries_data,
        'count': lang_count_data,
        'lang_code': lang_code_data
    })

    fig = px.choropleth(
        lang_df,
        geojson=countries,
        locations='country',
        locationmode='country names',
        color='count',
        color_continuous_scale=[
            [0, "rgb(45,45,48)"],
            [0.33, "rgb(116,173,209)"],
            [0.66, "rgb(255,255,0)"],
            [1, "rgb(255,94,5)"]
        ],
        scope='world',
        hover_data=['lang_code'],
        labels={'count': "Language Count"},
        template='plotly_dark'
    )
    fig.update_geos(showcountries=True)
    fig.update_layout(
        title_text="Language Map",
        margin={"r": 0, "t": 20, "l": 0, "b": 0}
    )

    return fig
|