madoss commited on
Commit
d139382
1 Parent(s): 7c454fe
Files changed (3) hide show
  1. app.py +160 -0
  2. gdiy_data.csv +0 -0
  3. requirements.txt +198 -0
app.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import spacy
4
+ import matplotlib.pyplot as plt
5
+ import seaborn as sns
6
+ from bertopic import BERTopic
7
+ from wordcloud import WordCloud
8
+ from nltk.corpus import stopwords
9
+ import pickle
10
+ import plotly.express as px
11
+
12
# French spaCy pipeline used for lemmatization and POS tagging.
# NOTE(review): requires the fr_core_news_sm model to be installed — it is
# pinned in requirements.txt.
nlp = spacy.load("fr_core_news_sm")
# Base French stopword list (NLTK); extended with domain-specific words later.
stopword = stopwords.words('french')
import warnings
# Silence library warnings so they do not clutter the Streamlit page.
warnings.filterwarnings('ignore')
from nltk import FreqDist
17
+
18
# Raw episode metadata; `release_date` is parsed as a datetime so it can
# later serve as the DatetimeIndex of the cleaned dataframe.
df = pd.read_csv("gdiy_data.csv", sep=',',
                 parse_dates=['release_date'])  # use `release_date` as date in pandas
20
+
21
+
22
def clean_data(df):
    """Tidy up the raw GDIY episode dataframe.

    - drops the CSV artifact column ``'Unnamed: 0'``
    - lower-cases the episode descriptions
    - uses ``release_date`` as a DatetimeIndex
    - removes excerpt/rebroadcast episodes (names starting with
      ``[EXTRAIT]`` or ``[REDIFF]``)
    - adds ``duration_min`` (duration in minutes), ``year`` and ``month``

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data with at least the columns 'Unnamed: 0', 'name',
        'description', 'duration_ms' and 'release_date'.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy indexed by release date.
    """
    df = df.drop('Unnamed: 0', axis=1)
    df['description'] = df['description'].str.lower()
    df = df.set_index('release_date')
    # Vectorized filter. The previous version looped over range(len(df)) and
    # read df['name'][i], which after set_index() falls back to positional
    # integer indexing on a DatetimeIndex Series — deprecated behavior that
    # raises in pandas >= 2.0.
    df = df[~df['name'].str.startswith(('[EXTRAIT]', '[REDIFF]'))]
    # Convert the duration from milliseconds to minutes (vectorized division
    # instead of a per-row apply).
    df['duration_min'] = df['duration_ms'] / (60 * 1000)
    df['year'] = df.index.year
    df['month'] = df.index.month
    return df
32
+
33
+
34
# Materialize the cleaned dataframe used by every section below.
df_clean = clean_data(df)
35
+
36
+
37
def clean_up1(row: str, stopword, pos=None):
    """Tokenize and lemmatize one episode description.

    Removes the non-breaking-space (``\\xa0``) and hair-space (``\\u200a``)
    characters, keeps only purely alphabetic words longer than two letters,
    lemmatizes the result with spaCy, and filters out stopword lemmas (and,
    optionally, whole part-of-speech categories).

    Parameters
    ----------
    row : str
        Raw description text.
    stopword : collection of str
        Lemmas to discard.
    pos : collection of str, optional
        spaCy POS tags to discard (e.g. 'ADV', 'VERB'). When None, only the
        stopword filter applies.

    Returns
    -------
    list of str
        Kept lemmas, in order of appearance.
    """
    cleaned = row.replace('\xa0', '').replace('\u200a', '')
    kept_words = (w for w in cleaned.split() if w.isalpha() and len(w) > 2)
    doc = nlp(" ".join(kept_words))
    if pos is None:
        return [tok.lemma_ for tok in doc if tok.lemma_ not in stopword]
    return [tok.lemma_ for tok in doc
            if tok.lemma_ not in stopword and tok.pos_ not in pos]
54
+
55
+
56
# POS categories dropped during lemmatization — keeps mostly nouns/names.
pos = ['ADV', 'PRON', 'CCONJ', 'PUNCT', 'DET', 'ADP', 'SPACE', 'ADJ', 'VERB']

# Domain words that appear in nearly every episode description (host names,
# show title, boilerplate) and would otherwise dominate every topic.
context = ['épisode', 'faire', 'morgan','prudhomme', 'lire', 'génération','podcast', 'gdiy',
           'recommande','deux','quand','the','livre', 'être','yourself', 'orso', 'doi', 'an',
           'merci', 'avoir','timeline','face','million','monde', 'vie','and','fait']
stopword = stopword + context  # add some frequent words in the documents

# Lemmatized tokens per episode, then one space-joined document per episode.
clean_text = df_clean['description'].apply(lambda x: clean_up1(x, stopword, pos))
docs = clean_text.apply(lambda x: " ".join(x)).tolist()

# NOTE(review): BERTopic is fitted at import time, so the whole model retrains
# on every Streamlit rerun — consider caching the fitted model. TODO confirm.
topic_model = BERTopic(language="multilingual",
                       nr_topics=6,
                       top_n_words=30,
                       low_memory=True,
                       n_gram_range=(1, 2))

topics, _ = topic_model.fit_transform(docs)

# Bar chart of the top-10 words of each topic.
topic_fig = topic_model.visualize_barchart(n_words=10)

# Topic prevalence across release dates, binned into 20 time slices.
timestamps = df_clean.index
topics_over_time = topic_model.topics_over_time(docs, topics, timestamps,
                                                global_tuning=True,
                                                evolution_tuning=True,
                                                nr_bins=20)

time_fig = topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10)

# Drop the BERTopic outlier topic (-1) and aggregate the topic words per
# calendar year; `topic_per_year` feeds the yearly word clouds below.
topics_over_time = topics_over_time[topics_over_time['Topic'] != -1]
topics_over_time.set_index('Timestamp', inplace=True)
topics_over_time['year'] = topics_over_time.index.year
topic_per_year = topics_over_time.groupby(['year'])['Words'].apply(lambda x: x.str.cat(sep=' '))
88
+
89
# Bar chart: number of episodes released per year (rendered further below).
fig1, ax = plt.subplots()
sns.countplot(ax=ax, x='year', data=df_clean, palette='viridis');


# plt.ylabel('Nombre de podcasts');
94
+
95
def wordscloud(text: str):
    """Render *text* as a word cloud inside the Streamlit app.

    Parameters
    ----------
    text : str
        Space-separated words to visualize.
    """
    # Fixes vs. the previous version: a stray `WordCloud()` instantiation
    # whose result was discarded, and a `plt.show()` call that is a no-op in
    # a Streamlit script. Axis toggling now targets the created `ax` instead
    # of relying on matplotlib's current-axes state.
    word_cloud = WordCloud(background_color='white').generate(text)
    fig, ax = plt.subplots()
    ax.imshow(word_cloud, interpolation='bilinear')
    ax.axis("off")
    st.pyplot(fig)
103
+
104
+
105
# ---------------------------------------------------------------------------
# Page layout: charts first, then one topic word cloud per year.
# ---------------------------------------------------------------------------

# Average episode duration per year, annotated line chart.
data = df_clean.resample('Y')['duration_min'].mean()
fig = px.line(x=data.index.year, y=data, text=data.astype('int'), markers=True)
fig.update_traces(textposition="bottom right")

st.write('''
# Nous sommes la moyenne des personnes que nous fréquentons.
Hello''')

st.header('Nombre de podcasts par année')

st.write(fig1)

st.header('Durée moyenne des podcasts par année')
st.plotly_chart(fig, use_container_width=False,
                sharing="streamlit")

st.header('Les mots fréquemment utilisés dans le podcast')
# Reuse wordscloud() instead of duplicating its body inline.
text_cloud = clean_text.apply(lambda x: " ".join(x)).str.cat(sep=' ')
wordscloud(text_cloud)

st.header('Sujets évoqués dans le podcast')
st.plotly_chart(topic_fig, use_container_width=False,
                sharing="streamlit")

st.header('Sujets évoqués au cours du temps dans le podcast')
st.plotly_chart(time_fig, use_container_width=False,
                sharing="streamlit")

# One topic word cloud per year. The six copy-pasted sections are collapsed
# into a loop; the old headers read "2O17" … "2O22" (letter O instead of the
# digit zero) — fixed here.
for year in range(2017, 2023):
    st.header(f'Sujets en {year}')
    wordscloud(topic_per_year[year].replace(',', ""))
gdiy_data.csv ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiohttp==3.8.1
2
+ aiosignal==1.2.0
3
+ altair==4.2.0
4
+ anyio==3.5.0
5
+ argon2-cffi==21.3.0
6
+ argon2-cffi-bindings==21.2.0
7
+ asttokens==2.0.5
8
+ async-timeout==4.0.2
9
+ attrs==21.4.0
10
+ Babel==2.9.1
11
+ backcall==0.2.0
12
+ bertopic==0.11.0
13
+ black==22.1.0
14
+ bleach==4.1.0
15
+ blinker==1.5
16
+ blis==0.7.6
17
+ bpemb==0.3.3
18
+ cachetools==5.2.0
19
+ catalogue==2.0.7
20
+ certifi==2021.10.8
21
+ cffi==1.15.0
22
+ charset-normalizer==2.0.11
23
+ click==8.0.3
24
+ cloudpickle==2.1.0
25
+ colorama==0.4.4
26
+ commonmark==0.9.1
27
+ conllu==4.5.1
28
+ cycler==0.11.0
29
+ cymem==2.0.6
30
+ Cython==0.29.23
31
+ datasets==2.2.2
32
+ debugpy==1.5.1
33
+ decorator==5.1.1
34
+ defusedxml==0.7.1
35
+ Deprecated==1.2.13
36
+ dill==0.3.4
37
+ docopt==0.6.2
38
+ entrypoints==0.4
39
+ executing==0.8.2
40
+ fastjsonschema==2.16.1
41
+ filelock==3.6.0
42
+ flair==0.11.3
43
+ fonttools==4.29.1
44
+ fr-core-news-sm @ https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.2.0/fr_core_news_sm-3.2.0-py3-none-any.whl
45
+ frozenlist==1.3.0
46
+ fsspec==2022.5.0
47
+ ftfy==6.1.1
48
+ funcy==1.17
49
+ future==0.18.2
50
+ gdown==4.4.0
51
+ gitdb==4.0.9
52
+ GitPython==3.1.27
53
+ hdbscan==0.8.28
54
+ huggingface-hub==0.4.0
55
+ hyperopt==0.2.7
56
+ idna==3.3
57
+ importlib-metadata==3.10.1
58
+ ipykernel==6.9.0
59
+ ipython==8.0.1
60
+ ipython-genutils==0.2.0
61
+ ipywidgets==7.7.0
62
+ Janome==0.4.2
63
+ jedi==0.18.1
64
+ Jinja2==3.0.3
65
+ joblib==1.1.0
66
+ Js2Py==0.71
67
+ json5==0.9.6
68
+ jsonschema==4.4.0
69
+ kiwisolver==1.3.2
70
+ konoha==4.6.5
71
+ langcodes==3.3.0
72
+ langdetect==1.0.9
73
+ llvmlite==0.38.1
74
+ lxml==4.8.0
75
+ MarkupSafe==2.0.1
76
+ matplotlib==3.5.1
77
+ matplotlib-inline==0.1.3
78
+ mistune==0.8.4
79
+ more-itertools==8.13.0
80
+ mpld3==0.3
81
+ multidict==6.0.2
82
+ multiprocess==0.70.12.2
83
+ murmurhash==1.0.6
84
+ mypy-extensions==0.4.3
85
+ nest-asyncio==1.5.4
86
+ networkx==2.8.4
87
+ nltk==3.7
88
+ numba==0.55.2
89
+ numexpr==2.8.1
90
+ numpy==1.22.4
91
+ overrides==3.1.0
92
+ packaging==21.3
93
+ pandas==1.4.0
94
+ pandoc==2.0.1
95
+ pandocfilters==1.5.0
96
+ parso==0.8.3
97
+ pathspec==0.9.0
98
+ pathy==0.6.1
99
+ pickleshare==0.7.5
100
+ Pillow==9.0.1
101
+ pipwin==0.5.2
102
+ platformdirs==2.4.1
103
+ plotly==5.9.0
104
+ plumbum==1.7.2
105
+ ply==3.11
106
+ pptree==3.1
107
+ preshed==3.0.6
108
+ prometheus-client==0.13.1
109
+ prompt-toolkit==3.0.27
110
+ protobuf==3.20.1
111
+ pure-eval==0.2.2
112
+ py4j==0.10.9.5
113
+ pyarrow==8.0.0
114
+ pycparser==2.21
115
+ pydantic==1.8.2
116
+ pydeck==0.7.1
117
+ Pygments==2.11.2
118
+ pyjsparser==2.7.1
119
+ pylibscrypt==2.0.0
120
+ pymongo==4.0.2
121
+ Pympler==1.0.1
122
+ pynndescent==0.5.7
123
+ pyparsing==3.0.7
124
+ PyPrind==2.11.3
125
+ pyrsistent==0.18.1
126
+ pySmartDL==1.3.4
127
+ PySocks==1.7.1
128
+ python-dateutil==2.8.2
129
+ pytz==2021.3
130
+ pytz-deprecation-shim==0.1.0.post0
131
+ PyYAML==5.4.1
132
+ pyzmq==22.3.0
133
+ redis==4.3.4
134
+ regex==2022.3.15
135
+ requests==2.27.1
136
+ responses==0.18.0
137
+ rich==12.5.1
138
+ sacremoses==0.0.49
139
+ scikit-learn==1.0.2
140
+ scipy==1.8.0
141
+ scrypt==0.8.20
142
+ seaborn==0.11.2
143
+ segtok==1.5.11
144
+ semver==2.13.0
145
+ Send2Trash==1.8.0
146
+ sentence-transformers==2.2.2
147
+ sentencepiece==0.1.95
148
+ seqeval==1.2.2
149
+ six==1.16.0
150
+ sklearn==0.0
151
+ smart-open==5.2.1
152
+ smmap==5.0.0
153
+ sniffio==1.2.0
154
+ soupsieve==2.3.1
155
+ spacy==3.2.3
156
+ spacy-legacy==3.0.9
157
+ spacy-loggers==1.0.1
158
+ spotipy==2.20.0
159
+ sqlitedict==2.0.0
160
+ srsly==2.4.2
161
+ stack-data==0.1.4
162
+ tabulate==0.8.10
163
+ tenacity==8.0.1
164
+ terminado==0.13.1
165
+ testpath==0.5.0
166
+ thinc==8.0.15
167
+ threadpoolctl==3.1.0
168
+ tinycss2==1.1.1
169
+ tokenizers==0.12.0
170
+ toml==0.10.2
171
+ tomli==2.0.1
172
+ toolz==0.12.0
173
+ torch==1.11.0
174
+ torchtext==0.12.0
175
+ torchvision==0.12.0
176
+ tornado==6.1
177
+ tqdm==4.63.0
178
+ traitlets==5.1.1
179
+ transformers==4.17.0
180
+ typer==0.4.0
181
+ typing_extensions==4.0.1
182
+ tzdata==2022.1
183
+ tzlocal==4.2
184
+ umap-learn==0.5.3
185
+ urllib3==1.26.8
186
+ validators==0.20.0
187
+ voluptuous==0.13.1
188
+ wasabi==0.9.0
189
+ watchdog==2.1.9
190
+ wcwidth==0.2.5
191
+ webencodings==0.5.1
192
+ websocket-client==1.2.3
193
+ widgetsnbextension==3.6.0
194
+ wordcloud==1.8.2.2
195
+ wrapt==1.14.1
196
+ xxhash==3.0.0
197
+ yarl==1.7.2
198
+ zipp==3.8.1