Unggi's picture
Update app.py
2a014dc
raw
history blame
2.92 kB
import numpy as np
#import itertools
from konlpy.tag import Okt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import gradio as gr
import pandas as pd
# Optional: install torch/transformers at runtime through pip (calls below are disabled)
import pip
#pip.main(['install', 'torch'])
#pip.main(['install', 'transformers'])
import torch
import transformers
from transformers import BertTokenizerFast
from transformers import AutoModel
def make_candiadte(prompt):
    """Build keyword candidates from the nouns found in *prompt*.

    The prompt is POS-tagged with Okt, only tokens tagged ``'Noun'`` are
    kept, and a ``CountVectorizer`` extracts every 2-gram and 3-gram from
    the space-joined noun sequence.

    Args:
        prompt: Text to extract candidate phrases from.

    Returns:
        Array of candidate n-gram strings as returned by
        ``CountVectorizer.get_feature_names_out()``. An empty array is
        returned when the prompt yields no n-grams at all.
    """
    okt = Okt()
    nouns = " ".join(token for token, tag in okt.pos(prompt) if tag == "Noun")

    vectorizer = CountVectorizer(ngram_range=(2, 3))
    try:
        vectorizer.fit([nouns])
    except ValueError as err:
        # scikit-learn signals "no usable tokens" with a ValueError whose
        # message mentions an empty vocabulary; treat that as "no
        # candidates" instead of crashing, and re-raise anything else.
        if "empty vocabulary" not in str(err):
            raise
        return np.array([], dtype=object)
    return vectorizer.get_feature_names_out()
# saved_model
# Cache for the (model, tokenizer) pair so it is loaded only once per process.
_MODEL_AND_TOKENIZER = None


def load_model():
    """Return the BERT encoder and its tokenizer, loading them on first use.

    The tokenizer is fetched by name (``"kykim/bert-kor-base"``) and the
    model is read from the local ``./bertmodel/`` directory. Both objects
    are cached at module level, so repeated calls return the same
    instances instead of reloading them.

    Returns:
        Tuple ``(model, tokenizer)``.
    """
    global _MODEL_AND_TOKENIZER
    if _MODEL_AND_TOKENIZER is None:
        pretrained_model_name = "kykim/bert-kor-base"
        tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name)
        model = AutoModel.from_pretrained("./bertmodel/")
        _MODEL_AND_TOKENIZER = (model, tokenizer)
    return _MODEL_AND_TOKENIZER
# main
def inference(prompt):
    """Return the top keyword phrases for *prompt* as a hashtag string.

    Candidate noun n-grams are produced by ``make_candiadte``. The prompt
    and every candidate are embedded with the model's ``pooler_output``,
    and candidates are ranked by cosine similarity to the prompt embedding
    (higher means more similar). The five best candidates are re-tagged
    with Okt, reduced to their nouns and each prefixed with ``#``.

    Args:
        prompt: Input text to extract keywords from.

    Returns:
        A single string of ``#``-prefixed phrases separated by spaces;
        empty when there are no candidates.
    """
    candidates = make_candiadte(prompt)
    model, tokenizer = load_model()
    top_n = 5

    doc_embedding = _embed_text(model, tokenizer, prompt)

    words = list(candidates)
    # Stored under the original column name "distance", although the value
    # is a cosine similarity, hence the descending sort below.
    distances = [
        torch.cosine_similarity(
            doc_embedding, _embed_text(model, tokenizer, word), dim=1
        ).item()
        for word in words
    ]

    cos_df = pd.DataFrame({"word": words, "distance": distances})
    cos_df = cos_df.sort_values(by="distance", ascending=False)
    top_words = cos_df[:top_n]["word"].values

    # Keep only the nouns of each selected phrase.
    okt = Okt()  # one tagger instance is enough for every phrase
    outputs = []
    for phrase in top_words:
        nouns = " ".join(token for token, tag in okt.pos(phrase) if tag == "Noun")
        outputs.append("#" + nouns)
    return " ".join(outputs)


def _embed_text(model, tokenizer, text):
    """Return the model's ``pooler_output`` for *text* as a batch of one."""
    input_ids = torch.tensor(tokenizer.encode(text)).unsqueeze(0)
    # Inference only: gradients are never used, so skip autograd tracking.
    with torch.no_grad():
        return model(input_ids)["pooler_output"]
# Build the Gradio UI: one text box in, the hashtag string from `inference` out.
# `demo` must be bound to the Interface itself (not to the result of
# `.launch()`), so that the single `demo.launch()` call below works.
# NOTE(review): the example text below looks mis-encoded (Korean UTF-8 decoded
# with a single-byte charset) - confirm against the original file.
demo = gr.Interface(
    fn=inference,
    inputs="text",
    outputs="text",  # the value returned by `inference`
    examples=[
        "์ง€๋‚œํ•ด ๊ตญ๋‚ด ํด๋ž˜์‹๊ณ„ ์ตœ๊ณ  ์Šคํƒ€๋กœ ๋– ์˜ค๋ฅธ ํ”ผ์•„๋‹ˆ์ŠคํŠธ ์ž„์œค์ฐฌ์ด ๋ฏธ๊ตญ ๋ฐด ํด๋ผ์ด๋ฒˆ ๊ตญ์ œ์ฝฉ์ฟ ๋ฅด ๊ฒฐ์„ ์—์„œ ์—ฐ์ฃผํ•œ ๋ผํ๋งˆ๋‹ˆ๋…ธํ”„ ํ”ผ์•„๋…ธ ํ˜‘์ฃผ๊ณก ์ œ3๋ฒˆ ์˜์ƒ์ด ์œ ํŠœ๋ธŒ์—์„œ ์กฐํšŒ์ˆ˜ 1000๋งŒํšŒ๋ฅผ ๋„˜๊ฒผ๋‹ค. ๋ผํ๋งˆ๋‹ˆ๋…ธํ”„ 3๋ฒˆ ์—ฐ์ฃผ ์˜์ƒ ์ค‘ ๋‹จ์—ฐ ์ตœ๊ณ  ์กฐํšŒ์ˆ˜๋‹ค."
    ],
)
# Passing share=True to launch() would create an externally reachable link.
demo.launch()