Spaces:
Sleeping
Sleeping
File size: 8,090 Bytes
35ad6e4 052eebe a2a36cb bc766e3 a2a36cb ff461da a2a36cb bc766e3 35ad6e4 14390d1 35ad6e4 bc766e3 24fc46f a9d660f 763f466 a9d660f ab7581e a9d660f 3d5c250 012fb93 ab7581e c2d6542 ab7581e 012fb93 ab7581e a9d660f 24fc46f 3d5c250 ff461da ab7581e ff461da ab7581e ff461da b6d15a2 ff461da 3d5c250 012fb93 b6d15a2 3d5c250 012fb93 3d5c250 0a71874 3d5c250 23f32f5 b6d15a2 c88e626 23f32f5 c1cb18a 012fb93 3d5c250 c1cb18a 3d5c250 1ab75eb 3d5c250 1ab75eb a2a36cb b6d15a2 a2a36cb b6d15a2 3d5c250 b6d15a2 3d5c250 b6d15a2 3d5c250 b6d15a2 3d5c250 b6d15a2 3d5c250 b6d15a2 3d5c250 b6d15a2 3d5c250 b6d15a2 3d5c250 b6d15a2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 |
from collections import Counter
from concurrent.futures import ThreadPoolExecutor # palarell processing
import matplotlib.pyplot as plt
import pandas as pd
import praw # Reddit's API
import re # Regular expression module
import streamlit as st
import time
import numpy as np
from wordcloud import WordCloud
from transformers import (
pipeline,
AutoTokenizer,
AutoModelForSequenceClassification,
AutoModelForTokenClassification,
TokenClassificationPipeline
)
from transformers.pipelines import AggregationStrategy
from functions import (
scrape_reddit_data,
safe_sentiment,
analyze_detail,
preprocess_text
)
# ---------- Cached function for loading the model pipelines ----------
@st.cache_resource
def load_sentiment_pipeline():
    """Load the cardiffnlp RoBERTa sentiment pipeline (cached across Streamlit reruns).

    Returns:
        tuple: (sentiment_pipeline, tokenizer, max_tokens) where max_tokens is a
        safe truncation length for chunked analysis.
    """
    model_id = "cardiffnlp/twitter-roberta-base-sentiment-latest"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_id,
        use_auth_token=st.secrets["hugging_face_token"]
    )
    # device=0 pins the pipeline to the first GPU.
    sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=0)
    # Some tokenizers report a huge sentinel for model_max_length; cap it to a
    # usable chunk size in that case.
    max_tokens = tokenizer.model_max_length
    if max_tokens > 10000:
        max_tokens = 200
    return sentiment_pipeline, tokenizer, max_tokens
@st.cache_resource
def load_summarize_pipeline():
    """Load and cache the Falconsai summarization pipeline on the first GPU."""
    return pipeline("summarization", model="Falconsai/text_summarization", device=0)
def summarize_txt(summarize_pipeline, texts, length):
    """Summarize one text and report progress via session state.

    BUG FIX: removed the `@st.cache_resource` decorator. The pipeline argument is
    unhashable for Streamlit's cache hasher (raises UnhashableParamError on the
    first call), and caching a per-row worker would also skip the progress-counter
    side effects below on cache hits.

    Args:
        summarize_pipeline: a transformers summarization pipeline.
        texts: the text to summarize.
        length: total number of rows, used only for the progress readout.

    Returns:
        str: the generated summary text.
    """
    if "count" not in st.session_state:
        st.session_state.count = 0
    # NOTE(review): max_length=10 yields extremely short summaries — confirm intended.
    summary = summarize_pipeline(texts, max_length=10, num_return_sequences=1)
    result = summary[0]["summary_text"]
    st.session_state.count += 1
    st.write(f"Phase: {st.session_state.count / length}")
    return result
# class for keyword extraction
# BUG FIX: removed `@st.cache_resource` from the class. Decorating a *class* with
# cache_resource replaces the class object with a cached-callable wrapper, which
# breaks isinstance checks and subclassing; caching is already handled by the
# `keyword_extractor()` factory below.
class KeyphraseExtractionPipeline(TokenClassificationPipeline):
    """Token-classification pipeline that post-processes outputs into unique keyphrases."""

    def __init__(self, model, *args, **kwargs):
        # `model` is a hub model id; load both model and tokenizer from it.
        super().__init__(
            model=AutoModelForTokenClassification.from_pretrained(model),
            tokenizer=AutoTokenizer.from_pretrained(model),
            *args,
            **kwargs
        )

    def postprocess(self, all_outputs):
        # Aggregate sub-token predictions into whole words, then deduplicate.
        results = super().postprocess(
            all_outputs=all_outputs,
            aggregation_strategy=AggregationStrategy.SIMPLE,
        )
        return np.unique([result.get("word").strip() for result in results])
@st.cache_resource
def keyword_extractor():
    """Build (once per session) the KBIR keyphrase-extraction pipeline."""
    return KeyphraseExtractionPipeline(model="ml6team/keyphrase-extraction-kbir-inspec")
st.title("Scraping & Analysis of Reddit")

# --- User Input ---
# Search both the quoted phrase and its space-free variant (e.g. "Monster Hunter
# Wilds" OR "MonsterHunterWilds").
user_query = st.text_input("Enter search keyword:", value="Monster Hunter Wilds")
if not user_query:
    search_query = ""
else:
    compact_query = user_query.replace(" ", "")
    search_query = f'"{user_query}" OR "{compact_query}"'
st.write("Search Query:", search_query)
# Button to trigger scraping and summarize
if st.button("Scrape & Summarize"):
    # Phase 1: collect submissions from Reddit.
    with st.spinner("Scraping..."):
        progress_text = st.empty()
        total_limit = 5000  # Maximum number of submissions to check
        df = scrape_reddit_data(search_query, total_limit)
        length = len(df)
        progress_text.text(f"Collected {length} valid posts.")
    # Phase 2: load the (cached) summarization model.
    with st.spinner("Loading Summarizing Pipeline"):
        summarize_pipeline = load_summarize_pipeline()
    # Phase 3: summarize each post body; empty/None bodies stay None.
    with st.spinner("Summarizing txt data..."):
        df["Detail_Summary"] = df["Detail"].apply(
            lambda x: summarize_txt(summarize_pipeline, x, length) if x else None
        )
    # Persist across reruns so the other buttons can pick it up.
    st.session_state["df"] = df
# button to trigger sentiment analysis
if st.button("Sentiment Analysis"):
    df = st.session_state.get("df")
    # Guard: this branch runs on its own rerun, so the scrape step may not have
    # populated session state yet (matches the "Draw Graph" guard below).
    if df is None or df.empty:
        st.write("Please run 'Scrape & Summarize' first.")
    else:
        with st.spinner("Loading Sentiment Pipeline..."):
            sentiment_pipeline, tokenizer, max_tokens = load_sentiment_pipeline()
        st.write("Sentiment pipeline loaded...")
        with st.spinner("Doing Sentiment Analysis..."):
            # BUG FIX: original called safe_sentiment(pipeline, text=..., length)
            # — a positional argument after a keyword argument is a SyntaxError;
            # `length` was also undefined on this rerun (it was only set inside
            # the scrape branch). Pass all args positionally and recompute length.
            length = len(df)
            # title is short, so don't have to use batch processing
            df['Title_Sentiment'] = df['Title'].apply(
                lambda x: safe_sentiment(sentiment_pipeline, preprocess_text(x), length) if x else None
            )
            df['Detail_Sentiment'] = df['Detail_Summary'].apply(
                lambda x: safe_sentiment(sentiment_pipeline, preprocess_text(x), length) if x else None
            )
            # Unpack the pipeline result dicts into flat label/score columns.
            df["Title_Sentiment_Label"] = df["Title_Sentiment"].apply(lambda x: x["label"] if x else None)
            df["Title_Sentiment_Score"] = df["Title_Sentiment"].apply(lambda x: x["score"] if x else None)
            df["Detail_Sentiment_Label"] = df["Detail_Sentiment"].apply(lambda x: x["label"] if x else None)
            df["Detail_Sentiment_Score"] = df["Detail_Sentiment"].apply(lambda x: x["score"] if x else None)
            df = df.drop(["Title_Sentiment", "Detail_Sentiment"], axis=1)
            cols = ["Title", "Title_Sentiment_Label", "Title_Sentiment_Score",
                    "Detail", "Detail_Sentiment_Label", "Detail_Sentiment_Score", "Date"]
            df = df[cols]
        st.session_state["df"] = df
# Button to draw graphs
if st.button("Draw Graph"):
    df = st.session_state.get("df")
    if df is None or df.empty:
        st.write("Please run 'Scrape and Sentiment Analysis' first.")
    else:
        # Both figures share the same structure; drive them from a spec of
        # (column prefix, marker, per-sentiment color).
        plot_specs = [
            ("Title", "o", {"positive": "orange", "negative": "blue", "neutral": "yellowgreen"}),
            ("Detail", "+", {"positive": "darkorange", "negative": "navy", "neutral": "forestgreen"}),
        ]
        for prefix, marker, colors in plot_specs:
            fig, ax = plt.subplots(figsize=(10, 5))
            labels = df[f"{prefix}_Sentiment_Label"].str.lower()
            # One line per sentiment class, scores plotted over post date.
            for sentiment in ("positive", "negative", "neutral"):
                subset = df[labels == sentiment]
                ax.plot(
                    subset["Date"],
                    subset[f"{prefix}_Sentiment_Score"],
                    marker=marker,
                    label=f"{prefix} {sentiment.capitalize()}",
                    color=colors[sentiment],
                )
            ax.set_title(f"{prefix} Sentiment Scores Over Time")
            ax.set_xlabel("Time")
            ax.set_ylabel("Sentiment Score")
            ax.legend()
            plt.xticks(rotation=45)
            st.pyplot(fig)