add application and requirements
- app.py +75 -0
- requirements.txt +190 -0
app.py
ADDED
@@ -0,0 +1,75 @@
import streamlit as st
import pandas as pd
import numpy as np
from fugashi import Tagger
import re
import random
import matplotlib.pyplot as plt
import japanize_matplotlib
import seaborn as sns
from gensim.corpora.dictionary import Dictionary
from gensim import models
from gensim.models.word2vec import Word2Vec
from gensim import similarities

sns.set(font='IPAexGothic')

# Load the models and data
pattern = r"[^\u3040-\u30ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\u20000-\u2ffff\sa-zA-Z]"  # regex for stripping symbols
df = pd.read_csv("./raw_corpus.csv")  # corpus
dictionary = Dictionary.load("./livedoor.dict")  # dictionary
lda = models.ldamodel.LdaModel.load("./lda.model")  # topic model (LDA)
index = similarities.MatrixSimilarity.load("./lda.index")  # similarity index of the corpus under the topic model
word_dist = lda.get_topics()  # topic-word distribution, shape (K, V)
w2v = Word2Vec.load('./word2vec.gensim.model')  # Shiroyagi word2vec: https://github.com/shiroyagicorp/japanese-word2vec-model-builder

# Turn the topic model's per-topic word distributions into a DataFrame
num_words = 30
topic_list = []
word_list = []
weight_list = []
for n, values in lda.show_topics(num_topics=5, num_words=num_words, formatted=False):
    for word, weight in values:
        topic_list.append(n)
        word_list.append(word)
        weight_list.append(round(float(weight) * 100, 2))

topic_df = pd.DataFrame()
topic_df["topic"] = topic_list
topic_df["word"] = word_list
topic_df["weight"] = weight_list


st.sidebar.markdown("Set Parameter")

#word_list = ["任天堂", "バイオハザード", "ポケモン"]
#atom = st.sidebar.radio("元になる単語", word_list)
#negative = st.sidebar.radio("ー引く単語", word_list)
#positive = st.sidebar.radio("+足す単語", word_list)



st.header("word2vecによるアナロジー")  # "Analogies with word2vec"
st.subheader("単語の演算")  # "Word arithmetic"
st.caption("演算対象の単語")  # "Words used in the calculation"
col1, col2, col3 = st.columns(3)

with col1:
    #atom = st.text_input("元になる単語", atom)
    atom = st.text_input("元になる単語")  # base word

with col2:
    #negative = st.text_input("ー引く単語", negative)
    negative = st.text_input("ー引く単語")  # word to subtract

with col3:
    #positive = st.text_input("+足す単語", positive)
    positive = st.text_input("+足す単語")  # word to add

button = st.button("演算する")  # "Compute"

if button:
    st.text(f"{atom} - {negative} + {positive}")
    x = w2v.wv.most_similar(positive=[atom, positive], negative=negative)  # gensim also accepts a single string for negative
    #st.text(f"{x}")
    st.dataframe(x)
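Outside Streamlit, the whole analogy feature reduces to one gensim call. A minimal sketch, assuming the same word2vec.gensim.model file sits in the working directory and reusing the example words from the commented-out list as placeholder inputs:

    from gensim.models.word2vec import Word2Vec

    w2v = Word2Vec.load("./word2vec.gensim.model")  # same Shiroyagi model as in app.py

    # "任天堂 - バイオハザード + ポケモン": most_similar returns (word, cosine similarity)
    # pairs, i.e. the list of tuples that st.dataframe renders in the app.
    results = w2v.wv.most_similar(positive=["任天堂", "ポケモン"], negative=["バイオハザード"])
    for word, score in results:
        print(word, round(score, 3))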
requirements.txt
ADDED
@@ -0,0 +1,190 @@
accelerate==0.15.0
aiohttp==3.8.3
aiosignal==1.3.1
altair==4.2.0
anyio==3.6.2
appnope==0.1.3
argon2-cffi==21.3.0
argon2-cffi-bindings==21.2.0
arrow==1.2.3
asttokens==2.2.1
async-timeout==4.0.2
attrs==22.2.0
backcall==0.2.0
beautifulsoup4==4.11.1
bleach==5.0.1
blinker==1.5
cachetools==5.3.0
certifi==2022.12.7
cffi==1.15.1
charset-normalizer==2.1.1
click==8.1.3
comm==0.1.2
contourpy==1.0.6
cycler==0.11.0
datasets==2.8.0
debugpy==1.6.4
decorator==5.1.1
defusedxml==0.7.1
dill==0.3.6
entrypoints==0.4
evaluate==0.4.0
executing==1.2.0
fastapi==0.89.0
fastjsonschema==2.16.2
ffmpy==0.3.0
filelock==3.9.0
fonttools==4.38.0
fqdn==1.5.1
frozenlist==1.3.3
fsspec==2022.11.0
fst-pso==1.8.1
fugashi==1.2.1
FuzzyTM==2.0.5
gensim==4.3.0
gitdb==4.0.10
GitPython==3.1.30
gradio==3.16.1
h11==0.14.0
httpcore==0.16.3
httpx==0.23.3
huggingface-hub==0.11.1
idna==3.4
importlib-metadata==5.2.0
ipadic==1.0.0
ipykernel==6.19.4
ipython==8.7.0
ipython-genutils==0.2.0
ipywidgets==8.0.4
isoduration==20.11.0
japanize-matplotlib==1.1.3
jedi==0.18.2
Jinja2==3.1.2
joblib==1.2.0
jsonpointer==2.3
jsonschema==4.17.3
jupyter==1.0.0
jupyter-console==6.4.4
jupyter-events==0.5.0
jupyter_client==7.4.8
jupyter_core==5.1.1
jupyter_server==2.0.6
jupyter_server_terminals==0.4.3
jupyterlab-pygments==0.2.2
jupyterlab-widgets==3.0.5
kiwisolver==1.4.4
linkify-it-py==1.0.3
markdown-it-py==2.1.0
MarkupSafe==2.1.1
matplotlib==3.6.2
matplotlib-inline==0.1.6
mdit-py-plugins==0.3.3
mdurl==0.1.2
miniful==0.0.6
mistune==2.0.4
multidict==6.0.4
multiprocess==0.70.14
nbclassic==0.4.8
nbclient==0.7.2
nbconvert==7.2.7
nbformat==5.7.1
nest-asyncio==1.5.6
notebook==6.5.2
notebook_shim==0.2.2
numpy==1.24.1
orjson==3.8.4
packaging==22.0
pandas==1.5.2
pandocfilters==1.5.0
parso==0.8.3
pexpect==4.8.0
pickleshare==0.7.5
Pillow==9.3.0
plac==1.3.5
platformdirs==2.6.2
polars==0.15.17
portalocker==2.6.0
prometheus-client==0.15.0
prompt-toolkit==3.0.36
protobuf==3.20.1
psutil==5.9.4
ptyprocess==0.7.0
pure-eval==0.2.2
pyarrow==10.0.1
pycparser==2.21
pycryptodome==3.16.0
pydantic==1.10.4
pydeck==0.8.0
pydub==0.25.1
pyFUME==0.2.25
Pygments==2.13.0
Pympler==1.0.1
pyparsing==3.0.9
pyrsistent==0.19.3
python-dateutil==2.8.2
python-json-logger==2.0.4
python-multipart==0.0.5
pytz==2022.7
pytz-deprecation-shim==0.1.0.post0
PyYAML==6.0
pyzmq==24.0.1
qtconsole==5.4.0
QtPy==2.3.0
regex==2022.10.31
requests==2.28.1
responses==0.18.0
rfc3339-validator==0.1.4
rfc3986==1.5.0
rfc3986-validator==0.1.1
rich==13.2.0
scikit-learn==1.2.0
scipy==1.10.0
seaborn==0.12.2
semver==2.13.0
Send2Trash==1.8.0
sentencepiece==0.1.97
simpful==2.9.0
six==1.16.0
smart-open==6.3.0
smmap==5.0.0
sniffio==1.3.0
soupsieve==2.3.2.post1
stack-data==0.6.2
starlette==0.22.0
streamlit==1.17.0
terminado==0.17.1
threadpoolctl==3.1.0
tinycss2==1.2.1
tokenizers==0.13.2
toml==0.10.2
toolz==0.12.0
torch==1.13.1
torchaudio==0.13.1
torchdata==0.5.1
torchtext==0.14.1
torchvision==0.14.1
tornado==6.2
tqdm==4.64.1
traitlets==5.8.0
transformers==4.25.1
typing_extensions==4.4.0
tzdata==2022.7
tzlocal==4.2
uc-micro-py==1.0.1
unidic==1.1.0
unidic-lite==1.0.8
uri-template==1.2.0
urllib3==1.26.13
uvicorn==0.20.0
validators==0.20.0
wasabi==0.10.1
wcwidth==0.2.5
webcolors==1.12
webencodings==0.5.1
websocket-client==1.4.2
websockets==10.4
widgetsnbextension==4.0.5
wordcloud==1.8.2.2
xxhash==3.2.0
yarl==1.8.2
zipp==3.11.0
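The pins cover far more than app.py actually imports (torch, transformers, gradio and friends are unused by this app); the stack it relies on is streamlit, gensim, pandas, fugashi, matplotlib/japanize-matplotlib and seaborn. A small sanity-check sketch after pip install -r requirements.txt, with expected versions copied from the pins above:

    # Sanity check: confirm the key packages resolved to the pinned versions.
    from importlib.metadata import version

    expected = {
        "streamlit": "1.17.0",
        "gensim": "4.3.0",
        "pandas": "1.5.2",
        "fugashi": "1.2.1",
    }
    for pkg, want in expected.items():
        got = version(pkg)
        status = "OK" if got == want else f"expected {want}"
        print(f"{pkg}: {got} ({status})")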