# comment_filter / app.py
# Author: duwing — commit 4693a2c (verified), "Update app.py"
import streamlit as st
import tensorflow as tf
import numpy as np
import pandas as pd
import json
from transformers import *
from tqdm import tqdm
from tensorflow.python.client import device_lib
import requests
from bs4 import BeautifulSoup
import time
import instaloader
from instaloader import Post
# Local directory containing the fine-tuned BERT checkpoint.
PATH = './checkpoint-18750/'
# Fixed token-sequence length used for every model input.
SEQ_LEN = 128
# Tokenizer for the Korean BERT the checkpoint was fine-tuned from
# (downloads from the Hub on first run).
tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")
def create_sentiment_bert():
    """Build and compile a binary sentiment classifier on top of a pretrained BERT.

    Returns a Keras model taking [token ids, attention mask, segment ids]
    (each shaped (SEQ_LEN,)) and producing a single sigmoid probability.
    """
    # Load the fine-tuned BERT encoder from the local checkpoint directory.
    encoder = TFAutoModel.from_pretrained(PATH, local_files_only=True)

    # Three parallel inputs: token ids, attention mask, segment ids.
    input_ids = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_word_ids')
    input_mask = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_masks')
    input_segment = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_segment')

    # Element [1] of the encoder output is the pooled representation.
    pooled = encoder([input_ids, input_mask, input_segment])[1]

    # One sigmoid unit: probability of the positive class (0: negative, 1: positive).
    score = tf.keras.layers.Dense(
        1,
        activation='sigmoid',
        kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
    )(pooled)

    classifier = tf.keras.Model([input_ids, input_mask, input_segment], score)
    classifier.compile(loss=tf.keras.losses.BinaryCrossentropy(), metrics=['accuracy'])
    return classifier
def sentence_convert_data(data):
    """Tokenize one sentence into BERT model inputs.

    Returns [token ids, attention mask, segment ids], each a numpy array of
    shape (1, SEQ_LEN).
    """
    global tokenizer
    # Encode with truncation/padding to the fixed sequence length.
    ids = tokenizer.encode(data, max_length=SEQ_LEN, truncation=True, padding='max_length')
    # Attention mask: 1 for real tokens, 0 for pad positions (pad token id is 0).
    pad_count = ids.count(0)
    attention = [1] * (SEQ_LEN - pad_count) + [0] * pad_count
    # Single-sentence input, so every segment id is 0.
    return [
        np.array([ids]),
        np.array([attention]),
        np.array([[0] * SEQ_LEN]),
    ]
def evaluation_predict(sentence):
    """Classify one sentence with the global model; 0: negative, 1: positive."""
    model_inputs = sentence_convert_data(sentence)
    probability = np.ravel(sentiment_model.predict(model_inputs))
    # Round the sigmoid output: effectively a 0.5 decision threshold.
    return np.round(probability, 0).item()
def get_comments(news_url):
    """Scrape title, body text, and comments for a Naver news article or Instagram post.

    Args:
        news_url: URL of a Naver news article or an Instagram post.

    Returns:
        (title, content, comments) where comments is a list of strings.
        Unsupported URLs return ('', '', []).
    """
    if 'naver' in news_url:
        # oid (press id) and aid (article id) are the last two path segments.
        # NOTE: do not shadow the builtin `list` here.
        parts = news_url.split("/")
        oid = parts[-2]
        aid = parts[-1]
        # Article ids are 10 digits; drop any trailing query string / fragment.
        if len(aid) > 10:
            aid = aid[:10]
        # Naver comment JSONP API.
        api_url = "https://apis.naver.com/commentBox/cbox/web_naver_list_jsonp.json"
        params = {
            "ticket": "news",
            "templateId": "default_society",
            "pool": "cbox5",
            "lang": "ko",
            "country": "KR",
            "objectId": f"news{oid},{aid}",
            "pageSize": 100,
            "indexSize": 10,
            "page": 1,
            "sort": "FAVORITE"  # 'NEW' (latest) or 'FAVORITE' (most upvoted)
        }
        headers = {
            "User-Agent": "Mozilla/5.0",
            "Referer": news_url
        }
        # Call the comment API; timeout so a stalled request cannot hang the app.
        response = requests.get(api_url, params=params, headers=headers, timeout=10)
        # Strip the JSONP wrapper to leave plain JSON.
        content = response.text.replace("_callback(", "").replace(");", "")
        json_data = json.loads(content)
        # Fetch the article page itself for title and body.
        response = requests.get(news_url, timeout=10)
        article_soup = BeautifulSoup(response.text, "html.parser")
        # Title: current layout selector first, then the older layout as fallback.
        title = article_soup.select_one("#ct > div.media_end_head.go_trans > div.media_end_head_title > h2")
        if title is None:
            title = article_soup.select_one("#content > div.end_ct > div > h2")
        # Body: current layout selector first, then the older layout.
        article = article_soup.select_one("#dic_area")
        if article is None:
            article = article_soup.select_one("#articeBody")
        # Guard against selector misses instead of raising AttributeError.
        title_text = title.text.strip() if title is not None else ''
        article_text = article.text.strip() if article is not None else ''
        return title_text, article_text, processing_data(json_data['result']['commentList'])
    elif 'insta' in news_url:
        # Instagram shortcodes are 11 characters; find one among the path segments.
        shortcode = ''
        for segment in news_url.split('/'):
            if len(segment) == 11:
                shortcode = segment
        loader = instaloader.Instaloader()
        post = Post.from_shortcode(loader.context, shortcode)
        try:
            comments = [c.text for c in post.get_comments()]
        except Exception:
            # Instagram frequently blocks anonymous access to comments.
            comments = ['๋กœ๊ทธ์ธ์ด ํ•„์š”ํ•ฉ๋‹ˆ๋‹ค']
        return '', post.caption, comments
    # Neither a Naver news article nor an Instagram post.
    return '', '', []
def processing_data(comments):
    """Extract non-empty comment bodies from the Naver comment API list.

    Args:
        comments: list of comment dicts, each carrying a 'contents' key.

    Returns:
        The truthy 'contents' values, in their original order.
    """
    # Single pass instead of the old append-loop plus a second filtering pass.
    return [comment['contents'] for comment in comments if comment['contents']]
def _display_results(title, content, comments):
    """Render the scraped article and only the comments classified as positive."""
    st.subheader("์ œ๋ชฉ")
    st.write(title)
    st.subheader("๋ณธ๋ฌธ ๋‚ด์šฉ")
    st.write(content)
    st.subheader("๋Œ“๊ธ€")
    for comment in comments:
        # Keep only comments the model scores as positive (1).
        if evaluation_predict(comment) == 1:
            st.write(comment)


def main():
    """Streamlit entry point: accept a URL, scrape it, and show filtered comments.

    The original rendered the result block three separate times (and scraped
    twice on the deep-link path); this version scrapes and renders once.
    """
    global sentiment_model
    sentiment_model = create_sentiment_bert()
    st.title("๋Œ“๊ธ€ ํ•„ํ„ฐ๋ง ์„œ๋น„์Šค")
    # Pre-fill the input from the ?q= query parameter so links can deep-link a URL.
    prefill = st.query_params['q'] if "q" in st.query_params else ""
    url = st.text_input("url์„ ์ž…๋ ฅํ•˜์„ธ์š”", value=prefill)
    # Scrape when the user clicks, or immediately for a deep-linked URL.
    if st.button("์Šคํฌ๋žฉ ์‹œ์ž‘") or prefill:
        if url:
            title, content, comments = get_comments(url)
            _display_results(title, content, comments)
    return 0
# Script entry point: run the Streamlit app.
if __name__ == "__main__":
    main()