# Gradio Application Interface
import gradio as gr
from transformers import pipeline
from bs4 import BeautifulSoup
import requests
import pandas as pd
import gensim
import re
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import os

def summarizer_func():
    return pipeline(
        model="Majon911/pegasus_multi_news_ep1",
        tokenizer="google/pegasus-xsum",
        min_length=100, max_length=200,
        truncation=True
    )
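# Illustrative usage (a sketch; the sample call and text are hypothetical):
#   summarizer = summarizer_func()
#   out = summarizer("Full article text ...")
# out is a one-element list; gradio_stocknews() below reads
# out[0]['generated_text'] for the summary string.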

def sentiment_func():
    return pipeline("text-classification",
                    model="kbaumgartner/DeBERTa_Finetuned_Financial_News",
                    tokenizer="microsoft/deberta-v3-base")

def source_outlet(choice):
    # Initialize before branching so an unimplemented source still
    # yields an (empty) DataFrame instead of a NameError
    headlines = {}
    if choice == 'CNBC':
        url = "https://www.cnbc.com/finance/"
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        headline_elements = soup.find_all('a', class_='Card-title')
        for headline_element in headline_elements:
            headlines[headline_element.text.strip()] = headline_element['href']
    elif choice == "Reuters":
        pass  # Reuters scraping not implemented yet
    df = pd.DataFrame({'headline': list(headlines.keys()),
                       'url': list(headlines.values())})
    first_5_articles = df.head()
    first_5_articles = first_5_articles.assign(text='')
    first_5_articles = first_5_articles.assign(summary='')
    first_5_articles = first_5_articles.assign(sentiment='')
    first_5_articles = first_5_articles.assign(topic='')
    return first_5_articles
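# Example (illustrative): source_outlet("CNBC") yields a DataFrame of the five
# most recent headlines with columns ['headline', 'url', 'text', 'summary',
# 'sentiment', 'topic']; the last four start empty and are filled per article
# in gradio_stocknews() below.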

def sentiment_translation(curr_sentiment):
    # Map the model's generic labels to readable sentiments
    if curr_sentiment == "LABEL_0":
        trans_lbl = "NEGATIVE"
    elif curr_sentiment == "LABEL_1":
        trans_lbl = "NEUTRAL"
    elif curr_sentiment == "LABEL_2":
        trans_lbl = "POSITIVE"
    else:
        trans_lbl = "UNKNOWN"  # guard against unexpected labels
    return trans_lbl

def preprocess(text):
    # Lowercase, then remove special characters and digits
    text = text.lower()
    text = re.sub(r"(\d|\W)+", " ", text)
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    # Keep lemmatized tokens longer than 3 characters that are not stopwords
    tokens = [lemmatizer.lemmatize(word) for word in text.split()
              if word not in stop_words and len(word) > 3]
    return tokens
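# Worked example (a sketch; exact tokens depend on the installed NLTK data):
#   preprocess("Stocks rallied 5% on Tuesday!")
# lowercases the text, strips digits and punctuation, drops stopwords and
# tokens of length <= 3, and lemmatizes, returning roughly
# ['stock', 'rallied', 'tuesday'].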

def lda_topic_modeling(text):
    # Load the pre-trained 5-topic LDA model and its dictionary
    lda_model = gensim.models.LdaModel.load("lda_gensim_5t/lda_model5.gensim")
    dictionary = gensim.corpora.Dictionary.load("lda_gensim_5t/dictionary5.gensim")
    processed_text = preprocess(text)
    bow = dictionary.doc2bow(processed_text)
    topic_distribution = lda_model.get_document_topics(bow, minimum_probability=0.0)
    topic_distribution = sorted(topic_distribution, key=lambda x: x[1], reverse=True)
    topic_names = {
        '0': "Corporate Valuation & Performance",
        '1': "Quarterly Financial Reports",
        '2': "Stock Market & Investment Funds",
        '3': "Corporate Affairs & Products",
        '4': "Investment Research"
    }
    # Extract the most probable topic and its probability
    if topic_distribution:
        dominant_topic, probability = topic_distribution[0]
        topic_name = topic_names.get(str(dominant_topic), "Unknown Topic")
        return (topic_name, probability)
    else:
        # If no topic is found, return a placeholder and zero probability
        return ("No Topic Found", 0.0)

def gradio_stocknews(source_ch, art_number):
    # Defining the summarizer
    summarizer = summarizer_func()
    # Defining the sentiment analysis
    pipe_sentiment = sentiment_func()
    # Identifying the articles
    first_5_articles = source_outlet(source_ch)
    # Scraping text for the chosen article
    response = requests.get(first_5_articles.loc[art_number-1, 'url'])
    sub_soup = BeautifulSoup(response.content, 'html.parser')
    article_body_element = sub_soup.find('div', class_='ArticleBody-articleBody')
    article_text = article_body_element.get_text()  # Extracting only the text
    first_5_articles.loc[art_number-1, 'text'] = article_text
    first_5_articles.loc[art_number-1, 'summary'] = summarizer(article_text)[0]['generated_text']
    label_sentiment = pipe_sentiment(article_text)[0]['label']
    first_5_articles.loc[art_number-1, 'sentiment'] = sentiment_translation(label_sentiment)
    # Store the human-readable topic name from the LDA model
    first_5_articles.loc[art_number-1, 'topic'] = lda_topic_modeling(article_text)[0]
    return (first_5_articles.loc[art_number-1, 'headline'],
            first_5_articles.loc[art_number-1, 'url'],
            first_5_articles.loc[art_number-1, 'summary'],
            first_5_articles.loc[art_number-1, 'sentiment'],
            first_5_articles.loc[art_number-1, 'topic'])
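# Smoke test (a sketch, mirroring the commented-out call in main() below):
#   gradio_stocknews("CNBC", 2)
# returns a 5-tuple (headline, url, summary, sentiment, topic) for the
# second scraped article.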

def main():
    # Run relative to this file so the LDA model paths resolve correctly
    os.chdir(os.path.dirname(os.path.realpath(__file__)))
    nltk.download('stopwords')
    nltk.download('wordnet')
    #print(gradio_stocknews("CNBC", 2))
    iface = gr.Interface(fn=gradio_stocknews,
                         inputs=[gr.Dropdown(choices=["CNBC"], label="Select Source"),
                                 gr.Dropdown(choices=[1, 2, 3, 4, 5], label="Select Article Number")],
                         outputs=[gr.Textbox(lines=1, label="Article Title"),
                                  gr.Textbox(lines=1, label="Article Link"),
                                  gr.Textbox(lines=1, label="Article Summary"),
                                  gr.Textbox(lines=1, label="Article Sentiment"),
                                  gr.Textbox(lines=1, label="Article Topic")],
                         title="Latest 5 Stock News Dashboard",
                         description="Click the button to refresh the news summary.")
    iface.launch()

if __name__ == "__main__":
    main()