molokhovdmitry committed on
Commit
3fd5909
2 Parent(s): ecaa557 bca4302

Merge pull request #2 from molokhovdmitry/web_app

Browse files
.env.example ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ YT_API_KEY=""
2
+ PRED_BATCH_SIZE=512
3
+ MAX_COMMENT_SIZE=300
4
+ LANG_DETECTION_CONF=0.9
.github/workflows/python-app.yml CHANGED
@@ -36,6 +36,3 @@ jobs:
36
  flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
37
  # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
38
  flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
39
- - name: Test with pytest
40
- run: |
41
- pytest
 
36
  flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
37
  # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
38
  flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
 
 
 
Dockerfile CHANGED
@@ -5,4 +5,4 @@ RUN python -m pip install --upgrade pip
5
  RUN pip install --no-cache-dir -r requirements.txt
6
  COPY . .
7
  EXPOSE 8000
8
- CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
 
5
  RUN pip install --no-cache-dir -r requirements.txt
6
  COPY . .
7
  EXPOSE 8000
8
+ CMD ["streamlit", "run", "src/app.py", "--server.port", "8000"]
README.md CHANGED
@@ -1,58 +1,10 @@
1
  # social-stat
2
- API application for social network analysis.
3
-
4
- # Endpoints
5
- ## GET `/predict/{video_id}`
6
- Returns `pandas` DataFrame with all `commentThreads` of a `YouTube` video with emotion scores estimated by [SamLowe/roberta-base-go_emotions](https://huggingface.co/SamLowe/roberta-base-go_emotions).
7
- <details>
8
- <summary>All DataFrame columns:</summary>
9
-
10
- - comment_id
11
- - video_id
12
- - channel_id
13
- - author_display_name
14
- - text_original
15
- - text_display
16
- - published_at
17
- - updated_at
18
- - like_count
19
- - can_reply
20
- - total_reply_count
21
- - neutral
22
- - approval
23
- - annoyance
24
- - disapproval
25
- - realization
26
- - admiration
27
- - disappointment
28
- - excitement
29
- - disgust
30
- - confusion
31
- - joy
32
- - anger
33
- - optimism
34
- - amusement
35
- - gratitude
36
- - surprise
37
- - sadness
38
- - fear
39
- - curiosity
40
- - love
41
- - embarrassment
42
- - desire
43
- - caring
44
- - pride
45
- - relief
46
- - grief
47
- - remorse
48
- - nervousness
49
-
50
- </details>
51
 
52
  # Installation and Running
53
  ```
54
  git clone https://github.com/molokhovdmitry/social-stat
55
  python -m pip install --upgrade pip
56
  pip install -r requirements.txt
57
- uvicorn main:app --reload
58
  ```
 
1
  # social-stat
2
+ Streamlit web application for social network analysis.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
  # Installation and Running
5
  ```
6
  git clone https://github.com/molokhovdmitry/social-stat
7
  python -m pip install --upgrade pip
8
  pip install -r requirements.txt
9
+ streamlit run src/app.py
10
  ```
data/countries.geo.json ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt CHANGED
@@ -1,11 +1,12 @@
1
  requests
2
- fastapi
3
- uvicorn
4
- pydantic_settings
5
  torch
6
  torchvision
7
  torchaudio
8
  transformers
 
9
  pandas
10
- pytest
11
- httpx
 
 
 
1
  requests
2
+ python-dotenv
 
 
3
  torch
4
  torchvision
5
  torchaudio
6
  transformers
7
+ sentence-transformers
8
  pandas
9
+ seaborn
10
+ plotly
11
+ nbformat
12
+ streamlit
src/__init__.py DELETED
File without changes
src/app.py ADDED
@@ -0,0 +1,413 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+ from transformers import pipeline
4
+ from sentence_transformers import SentenceTransformer
5
+ import streamlit as st
6
+ import pandas as pd
7
+ import plotly.express as px
8
+ import plotly.graph_objects as go
9
+ from plotly.subplots import make_subplots
10
+ from sklearn.feature_extraction.text import TfidfVectorizer
11
+ from sklearn.decomposition import NMF
12
+ from sklearn.manifold import TSNE
13
+
14
+ from yt_api import YouTubeAPI
15
+ from maps import lang_map
16
+
17
+
18
# Load app settings.
# The numeric settings fall back to the defaults listed in `.env.example`
# when a variable is unset or empty, so a missing entry no longer makes
# `int(None)` / `float(None)` raise a TypeError at startup.
# `YT_API_KEY` has no usable default and stays `None` when it is missing.
load_dotenv()
YT_API_KEY = os.getenv('YT_API_KEY')
MAX_COMMENT_SIZE = int(os.getenv('MAX_COMMENT_SIZE') or 300)
PRED_BATCH_SIZE = int(os.getenv('PRED_BATCH_SIZE') or 512)
LANG_DETECTION_CONF = float(os.getenv('LANG_DETECTION_CONF') or 0.9)
24
+
25
+
26
@st.cache_resource
def init_emotions_model():
    """
    Builds the `SamLowe/roberta-base-go_emotions` text-classification
    pipeline with `top_k=None` and returns it.
    """
    return pipeline(
        "text-classification",
        model="SamLowe/roberta-base-go_emotions",
        top_k=None,
    )
34
+
35
+
36
@st.cache_resource
def init_embedding_model():
    """
    Loads the `sentence-transformers/all-MiniLM-L6-v2` sentence encoder and
    returns it.
    """
    return SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
40
+
41
+
42
@st.cache_resource
def init_lang_model():
    """
    Builds the `papluca/xlm-roberta-base-language-detection`
    text-classification pipeline and returns it.
    """
    return pipeline(
        "text-classification",
        model="papluca/xlm-roberta-base-language-detection",
    )
47
+
48
+
49
def predict_emotions(df, clf, batch_size=None):
    """
    Predicts emotions for every `text_original` in a DataFrame `df` with a
    classifier `clf`.

    `clf` is called with a list of texts and must return, for every text,
    a list of `{'label': ..., 'score': ...}` dicts.
    `batch_size` is the number of texts sent to `clf` per call; when it is
    `None` the module-level `PRED_BATCH_SIZE` is used.

    Returns a new DataFrame: `df` plus one column per emotion label.
    The rows keep the order and the index of `df`.
    """
    if batch_size is None:
        batch_size = PRED_BATCH_SIZE

    # Predict emotions in batches
    text_list = df['text_original'].to_list()
    preds = [comment_emotions
             for start in range(0, len(text_list), batch_size)
             for comment_emotions in clf(text_list[start:start + batch_size])]

    # Add predictions to DataFrame.
    # The predictions are in the positional order of `df`, so they must
    # carry `df.index`: `concat(axis=1)` aligns rows by index label, and a
    # default RangeIndex would attach scores to the wrong comments whenever
    # `df` was sorted or filtered beforehand.
    preds_df = pd.DataFrame(
        [{emotion['label']: emotion['score'] for emotion in pred}
         for pred in preds],
        index=df.index)

    return pd.concat([df, preds_df], axis=1)
70
+
71
+
72
def detect_languages(df, clf):
    """
    Runs the language classifier `clf` over every `text_original` of `df`
    in batches of `PRED_BATCH_SIZE` texts, asking for the single best
    prediction per text.

    A text gets the predicted label only when its score is above
    `LANG_DETECTION_CONF`; otherwise it gets `None`.
    Writes the result into the `predicted_language` column of `df` and
    returns `df`.
    """
    texts = df['text_original'].to_list()

    # Cut the texts into consecutive batches
    chunks = []
    for start in range(0, len(texts), PRED_BATCH_SIZE):
        chunks.append(texts[start:start + PRED_BATCH_SIZE])

    languages = []
    for chunk in chunks:
        for candidates in clf(chunk, top_k=1, truncation=True):
            best = candidates[0]
            if best['score'] > LANG_DETECTION_CONF:
                languages.append(best['label'])
            else:
                languages.append(None)

    # Add predictions to DataFrame
    df['predicted_language'] = languages

    return df
93
+
94
+
95
def emotion_dist_plot(df, emotion_cols):
    """
    Sums every column of `emotion_cols` over the `df` DataFrame, orders the
    totals from largest to smallest and returns them as a plotly bar
    figure titled "Emotion Distribution".
    """
    emotion_totals = df[emotion_cols].sum()
    ranked_totals = emotion_totals.sort_values(ascending=False)

    figure = px.bar(ranked_totals)
    figure.update_layout(width=2000, title_text="Emotion Distribution")

    return figure
105
+
106
+
107
def nmf_plots(df,
              nmf_components,
              tfidf_max_features,
              tfidf_stop_words='english'
              ):
    """
    Converts all `text_original` values of `df` DataFrame to TF-IDF features
    and performs Non-negative matrix factorization on them.

    Adds these columns to `df` (in place), for every topic `topic_N`:
    `topic_N` (NMF value), `topic_N_cumsum` (running sum in row order) and
    `topic_N_percentage` (the running sum as a fraction of the running sums
    of all topics in that row).

    The number of words shown per topic is read from the module-level
    `top_words_in_topic` variable, which must be defined before the call.

    Returns a tuple of the modified DataFrame with NMF values and a list of
    plotly figures (`df`, [topic words figure, topic contribution figure]).
    """
    # Convert to TF-IDF features
    vectorizer = TfidfVectorizer(max_features=tfidf_max_features,
                                 stop_words=tfidf_stop_words)
    embeddings = vectorizer.fit_transform(df['text_original'])

    # Get feature_names (words) from the vectorizer
    feature_names = vectorizer.get_feature_names_out()

    # Perform NMF; transposed so that every row is one topic
    nmf = NMF(n_components=nmf_components)
    nmf_embeddings = nmf.fit_transform(embeddings).T
    topic_cols = [f'topic_{topic_num+1}'
                  for topic_num in range(nmf_components)]

    # Add NMF values to the DataFrame
    for col, topic_values in zip(topic_cols, nmf_embeddings):
        df[col] = topic_values

    # Get word values for every topic
    word_df = pd.DataFrame(
        nmf.components_.T,
        columns=topic_cols,
        index=feature_names
    )

    # Plot word distributions of each topic
    topic_words_fig = make_subplots(
        rows=1, cols=nmf_components,
        subplot_titles=topic_cols)

    for i, col in enumerate(topic_cols):
        topic_words = word_df[col].sort_values(ascending=False)
        top_topic_words = topic_words[:top_words_in_topic]
        topic_words_fig.add_trace(go.Bar(y=top_topic_words.index,
                                         x=top_topic_words.values,
                                         orientation='h',
                                         base=0),
                                  row=1, col=i+1)
    topic_words_fig.update_layout(title_text="Topic Word Distributions",
                                  showlegend=False)

    # Plot topic contribution for the dataset
    cumsum_cols = [col + '_cumsum' for col in topic_cols]
    for col, cumsum_col in zip(topic_cols, cumsum_cols):
        df[cumsum_col] = df[col].cumsum()

    # The per-row total over all topics is the same for every topic, so it
    # is computed once instead of once per topic.
    cumsum_total = df[cumsum_cols].sum(axis=1)
    for col, cumsum_col in zip(topic_cols, cumsum_cols):
        df[col + '_percentage'] = df[cumsum_col] / cumsum_total

    contributions_fig = stacked_area_plot(
        x=df['published_at'],
        y_list=[df[col + '_percentage'] for col in topic_cols],
        names=topic_cols)

    return df, [topic_words_fig, contributions_fig]
172
+
173
+
174
def tsne_plots(df, encoder, emotion_cols, color_emotion, tsne_perplexity):
    """
    Encodes all `text_original` values of `df` DataFrame with `encoder`,
    uses t-SNE algorithm for visualization on these embeddings and on
    predicted emotions if they were predicted.

    `emotion_cols` is the list of emotion column names; pass an empty list
    when emotions were not predicted. `color_emotion` is the column used to
    color the points when emotions are available. When `emotion_cols` is
    not empty, `df` must also contain the `first_emotion` and
    `second_emotion` columns, which are shown on hover.

    Returns a tuple (`df`, [2D figure, 3D figure]) where `df` is a new
    DataFrame with the `embedding_N` and `tsne_N` columns appended.
    """
    # Encode and add embeddings to the DataFrame.
    # A plain list is passed so the encoder works on positions only, and the
    # result is given `df.index` so that `concat(axis=1)`, which aligns rows
    # by index label, keeps every embedding next to its own comment even
    # when `df` was sorted beforehand.
    embeddings = encoder.encode(df['text_original'].to_list())
    embedding_cols = [f'embedding_{i+1}' for i in range(embeddings.shape[1])]
    embeddings_df = pd.DataFrame(embeddings,
                                 columns=embedding_cols,
                                 index=df.index)
    df = pd.concat([df, embeddings_df], axis=1)

    # t-SNE
    TSNE_COMPONENTS = 2
    tsne = TSNE(
        n_components=TSNE_COMPONENTS,
        perplexity=tsne_perplexity,
    )

    # Also use predicted emotions
    if emotion_cols:
        tsne_cols = embedding_cols + emotion_cols
        color = color_emotion
        hover_data = ['first_emotion', 'second_emotion', 'text_original']
    else:
        tsne_cols = embedding_cols
        color = None
        # A list, like the branch above: a bare string is not a documented
        # `hover_data` value for plotly express.
        hover_data = ['text_original']

    # Same index handling as for the embeddings above
    tsne_results = pd.DataFrame(
        tsne.fit_transform(df[tsne_cols]),
        columns=[f'tsne_{i+1}' for i in range(TSNE_COMPONENTS)],
        index=df.index
    )

    df = pd.concat([df, tsne_results], axis=1)

    # 2D Visualization
    fig2d = px.scatter(
        df,
        x='tsne_1',
        y='tsne_2',
        color=color,
        hover_data=hover_data
    )
    fig2d.update_layout(
        title_text="t-SNE Visualization"
    )

    # 3D Visualization with date as the third axis
    fig3d = px.scatter_3d(
        df,
        x='published_at',
        y='tsne_1',
        z='tsne_2',
        color=color,
        hover_data=hover_data
    )
    fig3d.update_layout(
        title_text="t-SNE Visualization Over Time"
    )

    return df, [fig2d, fig3d]
237
+
238
+
239
def stacked_area_plot(x, y_list, names):
    """
    Builds a plotly stacked area figure titled "Topic Contribution".

    Every element of `y_list` becomes one stacked line over `x`, scaled by
    100 and labelled with the matching entry of `names`. The y axis is
    fixed to 0-100 with a `%` suffix. Returns the figure.
    """
    figure = go.Figure()

    for series, label in zip(y_list, names):
        trace = go.Scatter(
            x=x,
            y=series*100,
            mode='lines',
            line={'width': 0.5},
            stackgroup='one',
            name=label,
        )
        figure.add_trace(trace)

    percent_axis = {'type': 'linear', 'range': [0, 100], 'ticksuffix': '%'}
    figure.update_layout(
        showlegend=True,
        xaxis_type='category',
        yaxis=percent_axis,
        title_text="Topic Contribution",
    )

    return figure
263
+
264
+
265
def add_top_2_emotions(row):
    """
    Stores the names of the two highest-scoring emotions of `row` in its
    `first_emotion` and `second_emotion` fields and returns the row.

    The emotion columns are taken from the module-level `emotion_cols`.
    """
    ranked_names = row[emotion_cols].sort_values(ascending=False).index
    row['first_emotion'] = ranked_names[0]
    row['second_emotion'] = ranked_names[1]
    return row
270
+
271
+
272
# Page setup: wide layout and the app title.
# NOTE(review): Streamlit expects `set_page_config` to run before other
# `st.*` page commands - keep it first.
st.set_page_config(layout='wide')
st.title("Social-Stat")

# Load models (each loader is wrapped in `st.cache_resource`)
emotions_clf = init_emotions_model()
sentence_encoder = init_embedding_model()
lang_model = init_lang_model()

# Init YouTube API with the key and comment size limit read from the
# environment at the top of the file
yt_api = YouTubeAPI(
    api_key=YT_API_KEY,
    max_comment_size=MAX_COMMENT_SIZE
)
285
+
286
# Input form: collects the video ID and the options of every analysis step.
# The values are consumed by the `if submit:` section below.
with st.form(key='input'):
    video_id = st.text_input("Video ID")

    # Emotions: toggles `predict_emotions` and the emotion distribution plot
    emotions_checkbox = st.checkbox(
        "Predict Emotions",
        value=True,
    )

    # NMF: toggles `nmf_plots`
    nmf_checkbox = st.checkbox(
        "Non-Negative Matrix Factorization",
        value=True,
    )

    # Number of topics, passed to `nmf_plots` as `nmf_components`
    nmf_components = st.slider(
        "Topics (NMF Components)",
        min_value=2,
        max_value=20,
        value=10,
        step=1,
    )

    # Vocabulary size, passed to `nmf_plots` as `tfidf_max_features`;
    # the extra `None` option is forwarded to `TfidfVectorizer` as-is
    tfidf_max_features = st.select_slider(
        "Words (TF-IDF Vectorizer Max Features)",
        options=list(range(10, 501)) + [None],
        value=100,
    )

    # Not passed as an argument: `nmf_plots` reads this module-level
    # variable directly
    top_words_in_topic = st.slider(
        "Top Topic Words",
        min_value=1,
        max_value=50,
        value=10,
        step=1,
    )

    # t-SNE: toggles `tsne_plots`
    tsne_checkbox = st.checkbox(
        "t-SNE Visualization",
        value=True,
    )

    # Passed to `tsne_plots` as `tsne_perplexity`
    tsne_perplexity = st.slider(
        "t-SNE Perplexity",
        min_value=5,
        max_value=50,
        value=10,
        step=1,
    )

    # Column used to color the t-SNE points when emotions were predicted
    tsne_color_emotion = st.selectbox(
        "Emotion For The Plot Color",
        options=['first_emotion', 'second_emotion']
    )

    # Language Map: toggles `detect_languages` and `lang_map`
    map_checkbox = st.checkbox(
        "Language Map",
        value=True,
    )

    submit = st.form_submit_button("Analyze")
350
+
351
+
352
if submit:
    # Get comments. A `KeyError` raised by `get_comments` is reported as an
    # unknown video; any other exception propagates unchanged.
    try:
        comments = yt_api.get_comments(video_id)
    except KeyError:
        st.write("Video not found.")
    else:
        plots = []

        # Convert to pandas DataFrame and sort by publishing date
        df = pd.DataFrame(comments).sort_values('published_at')

        emotion_cols = []
        if emotions_checkbox:
            # Predict emotions. The emotion columns are exactly the columns
            # that `predict_emotions` appended, so they are derived from
            # the column difference instead of a hard-coded offset that
            # breaks as soon as the comment schema changes.
            comment_cols = set(df.columns)
            df = predict_emotions(df, emotions_clf)
            emotion_cols = [col for col in df.columns
                            if col not in comment_cols]

            # Get emotion distribution figure
            emotion_fig = emotion_dist_plot(df, emotion_cols)

            # TODO: Get emotion contribution figure

            # Get top 2 emotions
            # (`add_top_2_emotions` reads the module-level `emotion_cols`)
            df = df.apply(add_top_2_emotions, axis=1)

        if nmf_checkbox:
            # NMF
            df, nmf_figs = nmf_plots(df, nmf_components, tfidf_max_features)
            plots.extend(nmf_figs)

        if tsne_checkbox:
            # t-SNE visualization
            df, tsne_figs = tsne_plots(df,
                                       sentence_encoder,
                                       emotion_cols,
                                       tsne_color_emotion,
                                       tsne_perplexity)
            plots.extend(tsne_figs)

        if map_checkbox:
            df = detect_languages(df, lang_model)
            map_figure = lang_map(df)

        # Plot all figures
        if emotions_checkbox:
            st.plotly_chart(emotion_fig, use_container_width=True)

        if map_checkbox:
            st.plotly_chart(map_figure, use_container_width=True)

        # `sharing='streamlit'` was dropped: it only restated the documented
        # default and the argument is deprecated in newer Streamlit releases.
        for plot in plots:
            st.plotly_chart(
                plot,
                theme='streamlit',
                use_container_width=True)

        # Show the final DataFrame
        st.dataframe(df)
src/main.py DELETED
@@ -1,54 +0,0 @@
1
- from fastapi import FastAPI, Response
2
- from pydantic_settings import BaseSettings, SettingsConfigDict
3
- import pandas as pd
4
-
5
- from src.yt_api import YouTubeAPI
6
- from src.models import init_emotions_model
7
-
8
-
9
- class Settings(BaseSettings):
10
- YT_API_KEY: str
11
- PRED_BATCH_SIZE: int = 512
12
- MAX_COMMENT_SIZE: int = 300
13
- model_config = SettingsConfigDict(env_file='.env')
14
-
15
-
16
- settings = Settings()
17
- app = FastAPI(title='social-stat')
18
-
19
- emotions_clf = init_emotions_model()
20
- yt_api = YouTubeAPI(
21
- api_key=settings.YT_API_KEY,
22
- max_comment_size=settings.MAX_COMMENT_SIZE
23
- )
24
-
25
-
26
- @app.get('/')
27
- def home():
28
- return 'social-stat'
29
-
30
-
31
- @app.get('/predict')
32
- def predict(video_id):
33
- # Get comments
34
- comments = yt_api.get_comments(video_id)
35
- comments_df = pd.DataFrame(comments)
36
-
37
- # Predict emotions in batches
38
- text_list = comments_df['text_display'].to_list()
39
- batch_size = settings.PRED_BATCH_SIZE
40
- text_batches = [text_list[i:i + batch_size]
41
- for i in range(0, len(text_list), batch_size)]
42
- preds = [comment_emotions
43
- for text_batch in text_batches
44
- for comment_emotions in emotions_clf(text_batch)]
45
-
46
- # Add predictions to DataFrame
47
- preds_df = pd.DataFrame([{emotion['label']: emotion['score']
48
- for emotion in pred} for pred in preds])
49
- comments_df = pd.concat([comments_df, preds_df], axis=1)
50
-
51
- # Return DataFrame as a JSON file
52
- return Response(
53
- content=comments_df.to_json(orient='records'),
54
- media_type='application/json')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/maps.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import pandas as pd
3
+ import plotly.express as px
4
+
5
# Language codes predicted by language detection model
LANG_CODES = ['ar', 'bg', 'de', 'el', 'en', 'es', 'fr', 'hi', 'it', 'ja',
              'nl', 'pl', 'pt', 'ru', 'sw', 'th', 'tr', 'ur', 'vi', 'zh']

# Maps a country name, as spelled in the `name` property of the GeoJSON
# features, to the single language code used to color that country.
# Every value must be one of `LANG_CODES`.
COUNTRY_TO_LANG_CODE = {
    'Algeria': 'ar',
    'Chad': 'ar',
    'Djibouti': 'ar',
    'Egypt': 'ar',
    'Iraq': 'ar',
    'Jordan': 'ar',
    'Kuwait': 'ar',
    'Lebanon': 'ar',
    'Libya': 'ar',
    'Mali': 'ar',
    'Mauritania': 'ar',
    'Morocco': 'ar',
    'Oman': 'ar',
    'Palestine': 'ar',
    'Qatar': 'ar',
    'Saudi Arabia': 'ar',
    'Somalia': 'ar',
    'Sudan': 'ar',
    'Syria': 'ar',
    'Tunisia': 'ar',
    'United Arab Emirates': 'ar',
    'Yemen': 'ar',
    'Bulgaria': 'bg',
    'Germany': 'de',
    'Greece': 'el',
    'Cyprus': 'el',
    'United States of America': 'en',
    'Ireland': 'en',
    'United Kingdom': 'en',
    'Canada': 'en',
    'Australia': 'en',
    'Mexico': 'es',
    'Colombia': 'es',
    'Spain': 'es',
    'Argentina': 'es',
    'Peru': 'es',
    'Venezuela': 'es',
    'Chile': 'es',
    'Guatemala': 'es',
    'Ecuador': 'es',
    'Bolivia': 'es',
    'Cuba': 'es',
    'Dominican Rep.': 'es',
    'Honduras': 'es',
    'Paraguay': 'es',
    'El Salvador': 'es',
    'Nicaragua': 'es',
    'Costa Rica': 'es',
    'Panama': 'es',
    'Uruguay': 'es',
    # Spanish is spoken in Equatorial Guinea, not in Guinea.
    # NOTE(review): both spellings are listed because the exact name used
    # by `data/countries.geo.json` is not visible here - confirm and drop
    # the unused one.
    'Eq. Guinea': 'es',
    'Equatorial Guinea': 'es',
    'France': 'fr',
    'India': 'hi',
    'Italy': 'it',
    'Japan': 'ja',
    'Netherlands': 'nl',
    'Belgium': 'nl',
    'Poland': 'pl',
    'Portugal': 'pt',
    'Russia': 'ru',
    'Uganda': 'sw',
    'Kenya': 'sw',
    'Tanzania': 'sw',
    'Thailand': 'th',
    'Turkey': 'tr',
    'Pakistan': 'ur',
    'Vietnam': 'vi',
    'China': 'zh',
}
80
+
81
+
82
def lang_map(df):
    """
    Creates a world choropleth plotly figure that colors countries by how
    many comments of `df` were written in the country's language.

    Reads the `predicted_language` column of `df` and the country shapes
    from `data/countries.geo.json` (path relative to the working directory).
    A country is drawn only if it is present in `COUNTRY_TO_LANG_CODE` and
    its language occurs in `df`; every country sharing a language gets the
    same count.

    Returns the figure.
    """
    # JSON files are UTF-8; an explicit encoding keeps the load independent
    # of the platform default.
    with open('data/countries.geo.json', encoding='utf-8') as f:
        countries = json.load(f)
    country_list = [country['properties']['name']
                    for country in countries['features']]

    # Comments per detected language, indexed by language code.
    # Named `lang_counts` so it does not shadow the module-level
    # `LANG_CODES` list.
    lang_counts = df.value_counts('predicted_language')

    countries_data = []
    lang_count_data = []
    lang_code_data = []
    for country in country_list:
        country_lang = COUNTRY_TO_LANG_CODE.get(country)
        if country_lang is None or country_lang not in lang_counts.index:
            continue
        countries_data.append(country)
        lang_count_data.append(lang_counts.loc[country_lang])
        lang_code_data.append(country_lang)
    lang_df = pd.DataFrame({
        'country': countries_data,
        'count': lang_count_data,
        'lang_code': lang_code_data
    })

    # NOTE(review): `geojson` is passed together with
    # `locationmode='country names'` - confirm which of the two plotly
    # actually uses to match `locations`.
    fig = px.choropleth(
        lang_df,
        geojson=countries,
        locations='country',
        locationmode='country names',
        color='count',
        color_continuous_scale=[
            [0, "rgb(45,45,48)"],
            [0.33, "rgb(116,173,209)"],
            [0.66, "rgb(255,255,0)"],
            [1, "rgb(255,94,5)"]
        ],
        scope='world',
        hover_data=['lang_code'],
        labels={'count': "Language Count"},
        template='plotly_dark'
    )
    fig.update_geos(showcountries=True)
    fig.update_layout(
        title_text="Language Map",
        margin={"r": 0, "t": 20, "l": 0, "b": 0}
    )

    return fig
src/models.py DELETED
@@ -1,10 +0,0 @@
1
- from transformers import pipeline
2
-
3
-
4
- def init_emotions_model():
5
- classifier = pipeline(
6
- task="text-classification",
7
- model="SamLowe/roberta-base-go_emotions",
8
- top_k=None)
9
-
10
- return classifier
 
 
 
 
 
 
 
 
 
 
 
src/test_main.py DELETED
@@ -1,27 +0,0 @@
1
- from fastapi.testclient import TestClient
2
- from src.main import app
3
- import pandas as pd
4
-
5
-
6
- client = TestClient(app)
7
-
8
-
9
- def test_home():
10
- """Test home page."""
11
- response = client.get("/")
12
- assert response.status_code == 200
13
-
14
-
15
- def test_predict():
16
- """Test predict method on an example video."""
17
- TEST_VIDEO_ID = "0peXnOnDgQ8"
18
- response = client.get(
19
- "/predict/",
20
- params={"video_id": TEST_VIDEO_ID}
21
- )
22
- df = pd.read_json(response, orient='records')
23
-
24
- # Ensure the DataFrame has the right amount of columns
25
- assert df.shape[1] == 39
26
- # Ensure there are no NaN values
27
- assert df.isna().sum().sum() == 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/yt_api.py CHANGED
@@ -34,6 +34,10 @@ class YouTubeAPI():
34
  'pageToken': page_token,
35
  }
36
  response = requests.get(url, params=payload)
 
 
 
 
37
  return response.json()
38
 
39
  def response_to_comments(self, response):
 
34
  'pageToken': page_token,
35
  }
36
  response = requests.get(url, params=payload)
37
+
38
+ # Ensure it's not a bad request
39
+ assert response.status_code != 400
40
+
41
  return response.json()
42
 
43
  def response_to_comments(self, response):
vm_startup.sh DELETED
@@ -1,6 +0,0 @@
1
- # Script for an automatic startup on a virtual machine.
2
- . /home/user/python_venv/social-stat/bin/activate
3
- cd /home/user/social-stat
4
- git pull
5
- pip install -r requirements.txt
6
- uvicorn src.main:app --host 0.0.0.0 --port 8000 > /home/user/log.txt 2>&1