# testStGen / app.py
# kawayui's picture
# add application and requirements
# 453a744
import streamlit as st
import pandas as pd
import numpy as np
from fugashi import Tagger
import re
import random
import matplotlib.pyplot as plt
import japanize_matplotlib
import seaborn as sns
from gensim.corpora.dictionary import Dictionary
from gensim import models
from gensim.models.word2vec import Word2Vec
from gensim import similarities
# Apply seaborn's default theme with the IPAexGothic font selected
# (presumably so Japanese labels render in plots -- confirm).
sns.set(font="IPAexGothic")
# Load models and other resources.
# Regex used to strip symbols: it matches every character that is NOT
# hiragana/katakana, a CJK ideograph (Extension A, Unified, Compatibility,
# or the supplementary planes U+20000-U+2FFFF), whitespace, or an ASCII letter.
# The supplementary range must be written with the 8-digit \U escape: \u takes
# exactly four hex digits, so "\u20000-\u2ffff" would be parsed as U+2000, the
# range "0"-U+2FFF, and a literal "f" (keeping digits and most punctuation).
pattern = r"[^\u3040-\u30ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\U00020000-\U0002ffff\sa-zA-Z]"
# NOTE(review): these loads run at module top level; Streamlit re-executes the
# script on user interaction, so they are presumably repeated each time --
# consider caching if start-up cost matters.

# Raw corpus.
df = pd.read_csv("./raw_corpus.csv")
# gensim dictionary.
dictionary = Dictionary.load("./livedoor.dict")
# LDA topic model.
lda = models.LdaModel.load("./lda.model")
# Similarity index of the corpus built from the topic model.
index = similarities.MatrixSimilarity.load("./lda.index")
# Word distribution of the topic model, noted by the author as shape (K, V).
word_dist = lda.get_topics()
# Shiroyagi word2vec model:
# https://github.com/shiroyagicorp/japanese-word2vec-model-builder
w2v = Word2Vec.load("./word2vec.gensim.model")
# Flatten the topic model's per-topic word distributions into a DataFrame.
num_words = 30

# One (topic id, word, weight) row for every word of each requested topic;
# the weight is scaled by 100 and rounded to two decimals.
topic_rows = [
    (topic_id, term, round(float(prob) * 100, 2))
    for topic_id, terms in lda.show_topics(
        num_topics=5, num_words=num_words, formatted=False
    )
    for term, prob in terms
]

# Keep the three parallel lists available under their original names.
topic_list = [row[0] for row in topic_rows]
word_list = [row[1] for row in topic_rows]
weight_list = [row[2] for row in topic_rows]

topic_df = pd.DataFrame(
    {"topic": topic_list, "word": word_list, "weight": weight_list}
)
# Sidebar heading.
st.sidebar.markdown("Set Parameter")
# NOTE: an earlier prototype offered three preset words through sidebar radio
# buttons and used the selection as the default of each text input below; it
# is disabled, so the inputs start empty and take free text.

# Titles of the word2vec analogy section.
st.header("word2vecによるアナロジー")
st.subheader("単語の演算")
st.caption("演算対象の単語")

# Three side-by-side text inputs, one per column: the base word, the word to
# subtract, and the word to add.
col1, col2, col3 = st.columns(3)
atom = col1.text_input("元になる単語")
negative = col2.text_input("ー引く単語")
positive = col3.text_input("＋足す単語")
# Compute the analogy "atom - negative + positive" when the button is pressed
# and show the most similar words returned by the word2vec model.
button = st.button("演算する")
if button:
    st.text(f"{atom} - {negative} + {positive}")
    try:
        # `negative` is wrapped in a list: when handed a bare string, some
        # gensim versions iterate over it and treat every character as a
        # separate word to subtract.
        x = w2v.wv.most_similar(positive=[atom, positive], negative=[negative])
    except KeyError as err:
        # Raised when an input is empty or not in the model vocabulary; report
        # it in the page instead of letting the script fail with a traceback.
        st.error(f"モデルの語彙にない単語が含まれています: {err}")
    else:
        st.dataframe(x)