# comment_filter / app.py
# (scraped page header preserved: duwing — "Update app.py" — commit 4f2bcd0,
#  verified — raw / history / blame — 5.22 kB)
import streamlit as st
import tensorflow as tf
import numpy as np
import pandas as pd
import json
from transformers import *
from tqdm import tqdm
from tensorflow.python.client import device_lib
import requests
from bs4 import BeautifulSoup
import time
PATH_t = './checkpoint-7500/'
PATH = './checkpoint-18750/'
SEQ_LEN = 128
tokenizer_t = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")
def create_sentiment_bert():
# ๋ฒ„ํŠธ pretrained ๋ชจ๋ธ ๋กœ๋“œ
model = TFAutoModel.from_pretrained(PATH,local_files_only=True)
# ํ† ํฐ ์ธํ’‹, ๋งˆ์Šคํฌ ์ธํ’‹, ์„ธ๊ทธ๋จผํŠธ ์ธํ’‹ ์ •์˜
token_inputs = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_word_ids')
mask_inputs = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_masks')
segment_inputs = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_segment')
# ์ธํ’‹์ด [ํ† ํฐ, ๋งˆ์Šคํฌ, ์„ธ๊ทธ๋จผํŠธ]์ธ ๋ชจ๋ธ ์ •์˜
bert_outputs = model([token_inputs, mask_inputs, segment_inputs])
bert_outputs = bert_outputs[1]
sentiment_first = tf.keras.layers.Dense(1, activation='sigmoid', kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))(bert_outputs)
sentiment_model = tf.keras.Model([token_inputs, mask_inputs, segment_inputs], sentiment_first)
sentiment_model.compile(loss=tf.keras.losses.BinaryCrossentropy(), metrics = ['accuracy'])
return sentiment_model
def sentence_convert_data(data):
global tokenizer
tokens, masks, segments = [], [], []
token = tokenizer.encode(data, max_length=SEQ_LEN, truncation=True, padding='max_length')
num_zeros = token.count(0)
mask = [1]*(SEQ_LEN-num_zeros) + [0]*num_zeros
segment = [0]*SEQ_LEN
tokens.append(token)
segments.append(segment)
masks.append(mask)
tokens = np.array(tokens)
masks = np.array(masks)
segments = np.array(segments)
return [tokens, masks, segments]
def evaluation_predict(sentence):
data_x = sentence_convert_data(sentence)
predict = sentiment_model.predict(data_x)
predict_value = np.ravel(predict)
# 0:๋ถ€์ •, 1:๊ธ์ •
predict_answer = np.round(predict_value,0).item()
return predict_answer
def get_comments(news_url):
# oid, aid ์ถ”์ถœ
list = news_url.split("/")
oid = list[-2]
aid = list[-1]
if len(aid) > 10:
aid = aid[:10]
# API URL ๊ตฌ์„ฑ
api_url = "https://apis.naver.com/commentBox/cbox/web_naver_list_jsonp.json"
params = {
"ticket": "news",
"templateId": "default_society",
"pool": "cbox5",
"lang": "ko",
"country": "KR",
"objectId": f"news{oid},{aid}",
"pageSize": 100,
"indexSize": 10,
"page": 1,
"sort": "FAVORITE" # 'NEW'(์ตœ์‹ ์ˆœ), 'FAVORITE'(์ˆœ๊ณต๊ฐ์ˆœ)
}
headers = {
"User-Agent": "Mozilla/5.0",
"Referer": news_url
}
# API ํ˜ธ์ถœ ๋ฐ ๋ฐ์ดํ„ฐ ์ฒ˜๋ฆฌ
response = requests.get(api_url, params=params, headers=headers)
content = response.text.replace("_callback(", "").replace(");", "")
json_data = json.loads(content)
response = requests.get(news_url)
article_soup = BeautifulSoup(response.text, "html.parser")
# ์ œ๋ชฉ ์ถ”์ถœ
title = article_soup.select_one("#ct > div.media_end_head.go_trans > div.media_end_head_title > h2")
if title is None:
title = article_soup.select_one("#content > div.end_ct > div > h2")
# ๋ณธ๋ฌธ ์ถ”์ถœ
article = article_soup.select_one("#dic_area")
if article is None:
article = article_soup.select_one("#articeBody")
return title.text.strip(), article.text.strip(), processing_data(json_data['result']['commentList'])
def processing_data(comments):
comment_list = []
for comment in comments:
comment_list.append(comment['contents'])
comment_listR = [x for x in comment_list if x]
return comment_listR
def main():
global sentiment_model
sentiment_model = create_sentiment_bert()
st.title("๋Œ“๊ธ€ ํ•„ํ„ฐ๋ง ์„œ๋น„์Šค")
# URL ์ž…๋ ฅ ๋ฐ›๊ธฐ
value = st.query_params['q']
if value:
url = st.text_input("url์„ ์ž…๋ ฅํ•˜์„ธ์š”",value=st.query_params['q'])
else:
url = st.text_input("url์„ ์ž…๋ ฅํ•˜์„ธ์š”")
if url:
title, content, comments = get_comments(url)
# ๊ฒฐ๊ณผ ํ‘œ์‹œ
st.subheader("๊ธฐ์‚ฌ ์ œ๋ชฉ")
st.write(title)
st.subheader("๋ณธ๋ฌธ ๋‚ด์šฉ")
st.write(content)
st.subheader("๋Œ“๊ธ€")
for comment in comments:
if evaluation_predict(comment) == 1:
st.write(comment)
if st.button("์Šคํฌ๋žฉ ์‹œ์ž‘"):
if url:
title, content, comments = get_comments(url)
# ๊ฒฐ๊ณผ ํ‘œ์‹œ
st.subheader("๊ธฐ์‚ฌ ์ œ๋ชฉ")
st.write(title)
st.subheader("๋ณธ๋ฌธ ๋‚ด์šฉ")
st.write(content)
st.subheader("๋Œ“๊ธ€")
for comment in comments:
if evaluation_predict(comment) == 1:
st.write(comment)
return 0
if __name__ == "__main__":
main()