|
|
|
|
|
import functools
import os
import re

import gensim
import gradio as gr
import nltk
import pandas as pd
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from transformers import pipeline
|
|
|
@functools.lru_cache(maxsize=1)
def summarizer_func():
    """Return the shared article-summarisation pipeline.

    The pipeline is built on the first call and memoised afterwards, so the
    model is not set up again for every dashboard request.

    Returns:
        A ``transformers`` pipeline for the
        ``Majon911/pegasus_multi_news_ep1`` model with the
        ``google/pegasus-xsum`` tokenizer. No task name is passed, so the
        task is left for ``transformers`` to resolve from the model.
    """
    return pipeline(
        model="Majon911/pegasus_multi_news_ep1",
        tokenizer="google/pegasus-xsum",
        # Output length bounds and input truncation, set once for all calls.
        min_length=100,
        max_length=200,
        truncation=True,
    )
|
|
|
@functools.lru_cache(maxsize=1)
def sentiment_func():
    """Return the shared financial-news sentiment classifier.

    The pipeline is built on the first call and memoised afterwards, so the
    model is not set up again for every dashboard request.

    Returns:
        A ``transformers`` ``text-classification`` pipeline for the
        ``kbaumgartner/DeBERTa_Finetuned_Financial_News`` model with the
        ``microsoft/deberta-v3-base`` tokenizer.
    """
    return pipeline(
        "text-classification",
        model="kbaumgartner/DeBERTa_Finetuned_Financial_News",
        tokenizer="microsoft/deberta-v3-base",
    )
|
|
|
def source_outlet(choise):
    """Scrape the latest headlines for the chosen news outlet.

    Args:
        choise: Name of the news source. Only ``'CNBC'`` has a scraper;
            ``'Reuters'`` is a placeholder and, like any other value,
            yields an empty table.

    Returns:
        A ``pandas.DataFrame`` with at most five rows and the columns
        ``headline``, ``url``, ``text``, ``summary``, ``sentiment`` and
        ``topic``. Only ``headline`` and ``url`` are filled in here; the
        remaining columns hold empty strings.

    Raises:
        requests.RequestException: If the CNBC page cannot be fetched.
    """
    # Start empty so a source without a scraper produces an empty table
    # instead of failing on an unbound name further down.
    headlines = {}

    if choise == 'CNBC':
        url = "https://www.cnbc.com/finance/"
        # Bound the wait and surface HTTP errors instead of parsing an
        # error page as if it were the headline listing.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Keyed by headline text, so a repeated title keeps its last link.
        # href=True skips title anchors that carry no link at all.
        for headline_element in soup.find_all('a', class_='Card-title', href=True):
            headlines[headline_element.text.strip()] = headline_element['href']
    elif choise == "Reuters":
        pass  # Scraper not implemented yet.

    df = pd.DataFrame({'headline': list(headlines.keys()),
                       'url': list(headlines.values())})

    # Keep the first five articles and add the columns filled in later.
    return df.head(5).assign(text='', summary='', sentiment='', topic='')
|
|
|
def sentiment_translation(curr_sentiment):
    """Translate a raw classifier label into a readable sentiment name.

    Args:
        curr_sentiment: Label string such as ``"LABEL_0"``.

    Returns:
        ``"NEGATIVE"``, ``"NEUTRAL"`` or ``"POSITIVE"`` for ``LABEL_0``,
        ``LABEL_1`` and ``LABEL_2`` respectively. Any other label is
        returned unchanged rather than failing on an unassigned variable.
    """
    label_names = {
        "LABEL_0": "NEGATIVE",
        "LABEL_1": "NEUTRAL",
        "LABEL_2": "POSITIVE",
    }
    return label_names.get(curr_sentiment, curr_sentiment)
|
|
|
def preprocess(text):
    """Turn raw text into a list of lemmatised tokens.

    The text is lower-cased, every run of digits and non-word characters is
    replaced by a single space, and the result is split on whitespace.
    English stop words and tokens of three characters or fewer are dropped;
    the remaining tokens are lemmatised with WordNet.

    Args:
        text: The raw text to tokenise.

    Returns:
        The list of lemmatised tokens, in their original order.
    """
    cleaned = re.sub("(\\d|\\W)+", " ", text.lower())
    ignored = set(stopwords.words('english'))
    lemmatize = WordNetLemmatizer().lemmatize

    kept = []
    for candidate in cleaned.split():
        # Skip stop words and very short tokens.
        if candidate in ignored or len(candidate) <= 3:
            continue
        kept.append(lemmatize(candidate))
    return kept
|
|
|
@functools.lru_cache(maxsize=1)
def _load_lda_artifacts():
    """Load the trained LDA model and its dictionary from disk once."""
    lda_model = gensim.models.LdaModel.load("lda_gensim_5t/lda_model5.gensim")
    dictionary = gensim.corpora.Dictionary.load("lda_gensim_5t/dictionary5.gensim")
    return lda_model, dictionary


def lda_topic_modeling(text):
    """Assign the most probable LDA topic to a piece of text.

    The model and dictionary are read from the relative ``lda_gensim_5t``
    directory on first use and reused afterwards.

    Args:
        text: Raw article text; it is tokenised with ``preprocess``.

    Returns:
        A ``(topic_name, probability)`` tuple for the highest-probability
        topic. The name is ``"Unknown Topic"`` when the topic id has no
        entry in the name table, and the result is
        ``("No Topic Found", 0.0)`` when the model returns no topics.
    """
    lda_model, dictionary = _load_lda_artifacts()

    processed_text = preprocess(text)
    bow = dictionary.doc2bow(processed_text)
    topic_distribution = lda_model.get_document_topics(bow, minimum_probability=0.0)

    topic_names = {
        '0': "Corporate Valuation & Performance",
        '1': "Quarterly Financial Reports",
        '2': "Stock Market & Investment Funds",
        '3': "Corporate Affairs & Products",
        '4': "Investment Research",
    }

    if not topic_distribution:
        return ("No Topic Found", 0.0)

    # Only the top entry is needed, so take the maximum instead of sorting;
    # on ties the earliest topic wins, as with the previous stable sort.
    dominant_topic, probability = max(topic_distribution, key=lambda pair: pair[1])
    topic_name = topic_names.get(str(dominant_topic), "Unknown Topic")
    return (topic_name, probability)
|
|
|
def gradio_stocknews(source_ch, art_number):
    """Summarise, classify and topic-label one of the latest articles.

    Args:
        source_ch: News source name, passed on to ``source_outlet``.
        art_number: 1-based position of the article in the scraped list.

    Returns:
        A 5-tuple ``(headline, url, summary, sentiment, topic)``.

    Raises:
        gr.Error: If no usable article number was given, the number is
            outside the scraped list, or the page has no article body.
        requests.RequestException: If the article page cannot be fetched.
    """
    # Scrape and validate first: a bad selection should fail with a clear
    # message before any model is set up.
    articles = source_outlet(source_ch)

    try:
        position = int(art_number) - 1
    except (TypeError, ValueError):
        raise gr.Error("Please select an article number.") from None
    if not 0 <= position < len(articles):
        raise gr.Error(
            f"Article {art_number} is not available: "
            f"{len(articles)} article(s) found for {source_ch}."
        )

    article = articles.iloc[position]
    headline = article['headline']
    url = article['url']

    # Bound the wait and surface HTTP errors instead of parsing an error page.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    sub_soup = BeautifulSoup(response.content, 'html.parser')
    article_body_element = sub_soup.find('div', class_='ArticleBody-articleBody')
    if article_body_element is None:
        # The page has no element with the expected article-body class.
        raise gr.Error(f"No article text found at {url}.")
    article_text = article_body_element.get_text()

    summarizer = summarizer_func()
    summary = summarizer(article_text)[0]['generated_text']

    pipe_sentiment = sentiment_func()
    label_sentiment = pipe_sentiment(article_text)[0]['label']
    sentiment = sentiment_translation(label_sentiment)

    # lda_topic_modeling returns (name, probability); only the name is shown.
    topic = lda_topic_modeling(article_text)[0]

    return headline, url, summary, sentiment, topic
|
|
|
def main():
    """Prepare NLTK data, build the Gradio dashboard and launch it.

    The working directory is switched to the directory containing this
    script before anything else runs (presumably so the relative LDA model
    paths used elsewhere in the file resolve; confirm against deployment).
    """
    script_dir = os.path.dirname(os.path.realpath(__file__))
    os.chdir(script_dir)

    for resource in ('stopwords', 'wordnet'):
        nltk.download(resource)

    source_picker = gr.Dropdown(choices=["CNBC"], label="Select Source")
    article_picker = gr.Dropdown(choices=[1, 2, 3, 4, 5], label="Select Article Number")

    # One single-line text box per value returned by gradio_stocknews.
    result_boxes = [
        gr.Textbox(lines=1, label="Article Title"),
        gr.Textbox(lines=1, label="Article Link"),
        gr.Textbox(lines=1, label="Article Summary"),
        gr.Textbox(lines=1, label="Article Sentiment"),
        gr.Textbox(lines=1, label="Article Topic"),
    ]

    dashboard = gr.Interface(
        fn=gradio_stocknews,
        inputs=[source_picker, article_picker],
        outputs=result_boxes,
        title="Latest 5 Stock News Dashboard",
        description="Click the button to refresh the news summary.",
    )
    dashboard.launch()
|
|
|
# Start the dashboard only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
|