import re
import random

import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import japanize_matplotlib
import seaborn as sns
from fugashi import Tagger
from gensim.corpora.dictionary import Dictionary
from gensim import models, similarities
from gensim.models.word2vec import Word2Vec

sns.set(font='IPAexGothic')

# Regex matching anything that is NOT Japanese script, whitespace, or ASCII letters.
# The plane-2 CJK block needs the 8-digit \U escape; a 4-digit \u cannot express U+20000.
pattern = r"[^\u3040-\u30ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\U00020000-\U0002ffff\sa-zA-Z]"

# Load the corpus and the pretrained gensim artifacts built from it.
df = pd.read_csv("./raw_corpus.csv")
dictionary = Dictionary.load("./livedoor.dict")
lda = models.ldamodel.LdaModel.load("./lda.model")
index = similarities.MatrixSimilarity.load("./lda.index")
word_dist = lda.get_topics()
w2v = Word2Vec.load('./word2vec.gensim.model')
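
# Illustrative sketch (assumption, not in the original script): `pattern` and the
# imported fugashi Tagger are presumably used together to clean and tokenise input
# text before it is mapped onto `dictionary`. A minimal, hypothetical helper could
# look like this; nothing in this section calls it.
def tokenize(text):
    """Strip non-Japanese/non-ASCII characters, then return surface forms."""
    cleaned = re.sub(pattern, "", text)
    tagger = Tagger()  # hypothetical local instance; the real app may reuse one
    return [word.surface for word in tagger(cleaned)]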

# Collect the top `num_words` words and their weights (as percentages) for each of
# the five LDA topics into a long-format DataFrame: one row per topic/word pair.
num_words = 30
topic_list = []
word_list = []
weight_list = []
for n, values in lda.show_topics(num_topics=5, num_words=num_words, formatted=False):
    for word, weight in values:
        topic_list.append(n)
        word_list.append(word)
        weight_list.append(round(float(weight) * 100, 2))

topic_df = pd.DataFrame()
topic_df["topic"] = topic_list
topic_df["word"] = word_list
topic_df["weight"] = weight_list
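
# Illustrative sketch (assumption, not in the original script): with the
# seaborn/japanize_matplotlib setup above, the per-topic weights in topic_df could
# be drawn as a horizontal bar chart, e.g. via this hypothetical helper, which is
# not called anywhere in this section.
def plot_topic_words(topics, topic_id, top_n=10):
    """Return a matplotlib figure showing the top_n words of one LDA topic."""
    subset = topics[topics["topic"] == topic_id].nlargest(top_n, "weight")
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.barplot(data=subset, x="weight", y="word", ax=ax)
    ax.set_title(f"Topic {topic_id}")
    return fig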

st.sidebar.markdown("Set Parameter")

st.header("word2vecによるアナロジー")
st.subheader("単語の演算")
st.caption("演算対象の単語")
col1, col2, col3 = st.columns(3)

with col1:
    atom = st.text_input("元になる単語")

with col2:
    negative = st.text_input("ー引く単語")

with col3:
    positive = st.text_input("+足す単語")

button = st.button("演算する")

if button:
    st.text(f"{atom} - {negative} + {positive}")
    # Wrap `negative` in a list: most_similar expects word lists, and a bare string
    # would be iterated character by character on older gensim versions.
    x = w2v.wv.most_similar(positive=[atom, positive], negative=[negative])
    st.dataframe(x)