File size: 2,565 Bytes
453a744 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
import streamlit as st
import pandas as pd
import numpy as np
from fugashi import Tagger
import re
import random
import matplotlib.pyplot as plt
import japanize_matplotlib
import seaborn as sns
from gensim.corpora.dictionary import Dictionary
from gensim import models
from gensim.models.word2vec import Word2Vec
from gensim import similarities
# Initialise seaborn's plotting theme with the IPAexGothic font
# (presumably so Japanese text in figures renders correctly -- TODO confirm).
sns.set(font="IPAexGothic")
# Load models and other shared resources.
#
# Regular expression used to strip symbols: it matches every character that is
# NOT hiragana/katakana (U+3040-30FF), a CJK ideograph (U+3400-4DBF,
# U+4E00-9FFF, U+F900-FAFF, U+20000-2FFFF), whitespace, or an ASCII letter.
#
# The supplementary-plane range must be written with the eight-digit ``\U``
# escape. ``\u`` consumes exactly four hex digits, so ``\u20000-\u2ffff`` would
# be read as U+2000, the range '0'..U+2FFF and a literal 'f', which silently
# whitelists digits and most punctuation instead of the intended ideographs.
pattern = (
    r"[^\u3040-\u30ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff"
    r"\U00020000-\U0002ffff\sa-zA-Z]"
)
# Corpus.
df = pd.read_csv("./raw_corpus.csv")

# gensim dictionary.
dictionary = Dictionary.load("./livedoor.dict")

# Topic model (LDA).
lda = models.LdaModel.load("./lda.model")

# Similarity index of the corpus built with the topic model.
index = similarities.MatrixSimilarity.load("./lda.index")

# Word distribution of the topic model, shape (K, V).
word_dist = lda.get_topics()

# Shiroyagi word2vec model:
# https://github.com/shiroyagicorp/japanese-word2vec-model-builder
w2v = Word2Vec.load("./word2vec.gensim.model")
# Turn the topic model's per-topic word distributions into a DataFrame with
# one row per (topic, word) pair.
num_words = 30

# Flatten show_topics() output into (topic id, word, weight) rows; the weight
# is scaled to a percentage and rounded to two decimals.
_topic_rows = [
    (topic_id, term, round(float(prob) * 100, 2))
    for topic_id, term_probs in lda.show_topics(
        num_topics=5, num_words=num_words, formatted=False
    )
    for term, prob in term_probs
]

# Column-wise views of the rows, kept as module-level lists.
topic_list = [row[0] for row in _topic_rows]
word_list = [row[1] for row in _topic_rows]
weight_list = [row[2] for row in _topic_rows]

topic_df = pd.DataFrame(
    {
        "topic": topic_list,
        "word": word_list,
        "weight": weight_list,
    }
)
# Sidebar heading.
st.sidebar.markdown("Set Parameter")

# Page headings for the word2vec analogy section.
st.header("word2vecによるアナロジー")
st.subheader("単語の演算")
st.caption("演算対象の単語")

# Three side-by-side free-text inputs: the base word, the word to subtract
# and the word to add. Calling text_input on a column object places the
# widget in that column, just like a ``with column:`` block would.
col1, col2, col3 = st.columns(3)
atom = col1.text_input("元になる単語")
negative = col2.text_input("ー引く単語")
positive = col3.text_input("+足す単語")

# Button that triggers the calculation.
button = st.button("演算する")
# When the button is clicked, evaluate "atom - negative + positive" with
# word2vec and show the most similar words as a table.
if button:
    st.text(f"{atom} - {negative} + {positive}")

    # An untouched text input yields "", which is never a vocabulary entry;
    # drop blank fields (and surrounding whitespace) instead of letting the
    # lookup fail on them.
    positive_words = [w.strip() for w in (atom, positive) if w.strip()]
    negative_words = [w.strip() for w in (negative,) if w.strip()]

    if not (positive_words or negative_words):
        st.warning("単語を入力してください。")
    else:
        try:
            # Both arguments are passed as lists: gensim 3.x iterates a bare
            # string given as ``negative`` character by character, which
            # would subtract each character's vector instead of the word's.
            x = w2v.wv.most_similar(
                positive=positive_words, negative=negative_words
            )
        except KeyError as err:
            # Raised by gensim when a word is not in the model's vocabulary.
            st.error(f"語彙に含まれない単語があります: {err}")
        else:
            st.dataframe(x)
|