kusa04 committed on
Commit b6d15a2 · verified · 1 Parent(s): cca9ecc

Update app.py

Files changed (1)
  1. app.py +97 -128
app.py CHANGED
@@ -10,85 +10,76 @@ st.title("Reddit Scraping & Sentiment Analysis")
 
 # --- User Input ---
 user_query = st.text_input("Enter search keyword:", value="Monster Hunter Wilds")
-# Build the search query by quoting the user input, plus a variant without spaces
 if user_query:
     search_query = f'"{user_query}" OR "{user_query.replace(" ", "")}"'
 else:
     search_query = ""
-
 st.write("Search Query:", search_query)
 
-# --- Scraping Section ---
-if st.button("Scrape and Sentiment Analysis"):
-    # Set up a progress bar and status text
-    progress_bar = st.progress(0)
-    progress_text = st.empty()
-
-    # API information
-    CLIENT_ID = st.secrets["reddit_client_id"]
-    CLIENT_SECRET = st.secrets["reddit_client_secret"]
-    USER_AGENT = st.secrets["reddit_user_agent"]
-
-    # Set up PRAW
+# ---------- Cached function for loading the sentiment model pipeline ----------
+@st.cache_resource
+def load_sentiment_pipeline():
+    tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
+    model = AutoModelForSequenceClassification.from_pretrained(
+        "cardiffnlp/twitter-roberta-base-sentiment-latest",
+        use_auth_token=st.secrets["hugging_face_token"]
+    )
+    sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=-1)
+    max_tokens = tokenizer.model_max_length
+    if max_tokens > 10000:
+        max_tokens = 512
+    return sentiment_pipeline, tokenizer, max_tokens
+
+# ---------- Cached function for scraping Reddit data ----------
+@st.cache_data(show_spinner=False)
+def scrape_reddit_data(search_query, total_limit):
+    # Retrieve API credentials from st.secrets
     reddit = praw.Reddit(
-        client_id=CLIENT_ID,
-        client_secret=CLIENT_SECRET,
-        user_agent=USER_AGENT
+        client_id=st.secrets["reddit_client_id"],
+        client_secret=st.secrets["reddit_client_secret"],
+        user_agent=st.secrets["reddit_user_agent"]
     )
-
     subreddit = reddit.subreddit("all")
-
     posts_data = []
-    total_limit = 5000  # maximum number of submissions to check
-
-    # Loop through submissions, updating the progress bar each iteration
+    # Iterate over submissions based on the search query and limit
     for i, submission in enumerate(subreddit.search(search_query, sort="relevance", limit=total_limit)):
-        # Update progress
-        progress = (i + 1) / total_limit
-        progress_bar.progress(progress)
-        progress_text.text(f"Scraping... {progress*100:.2f}%")
-
-        # Filter out posts that appear to be image/video only by checking that title and selftext exist
+        # No UI updates here as caching does not allow live progress updates
         if submission.title and submission.selftext:
-            title = submission.title
-            created_utc = submission.created_utc
-            submission_text = submission.selftext
-
-
             posts_data.append([
-                title,            # Original Title
-                submission.url,   # Original URL (case preserved)
-                created_utc,      # Date (UTC)
-                submission_text,  # Detail (main text)
+                submission.title,
+                submission.url,
+                submission.created_utc,
+                submission.selftext,
             ])
-
-        time.sleep(0.25)
-
-    progress_text.text("Scraping complete.")
-
-    # Convert the collected posts into a DataFrame
+        time.sleep(0.25)
     df = pd.DataFrame(posts_data, columns=["Title", "URL", "Date", "Detail"])
-
-    # Define a text normalization function that replaces multiple spaces/newlines with a single space
+
+    # Function to normalize text by replacing multiple spaces/newlines with a single space
     def normalize_text(text):
         if not isinstance(text, str):
             return ""
         return re.sub(r'\s+', ' ', text).strip()
 
-    # Apply normalization to the "Title" and "Detail" columns
     for col in ["Title", "Detail"]:
         df[col] = df[col].apply(normalize_text)
 
-    # Filter DataFrame to include only rows where Title and Detail are non-empty
+    # Filter out rows with empty Title or Detail
     df = df[(df["Title"] != "") & (df["Detail"] != "")]
-
-    # Convert Date to datetime, sort ascending, and reset the index
     df['Date'] = pd.to_datetime(df['Date'], unit='s')
     df = df.sort_values(by="Date", ascending=True).reset_index(drop=True)
+    return df
 
+# Button to trigger scraping and sentiment analysis
+if st.button("Scrape and Sentiment Analysis"):
+    progress_bar = st.progress(0)
+    progress_text = st.empty()
+
+    total_limit = 5000  # Maximum number of submissions to check
+    # Cached scraping; if the same search query is used, cached results are returned
+    df = scrape_reddit_data(search_query, total_limit)
     progress_text.text(f"Collected {len(df)} valid posts.")
     st.session_state["df"] = df
-
+
     # ------------------ Sentiment Analysis Functions ------------------------#
     def split_text_by_token_limit(text, tokenizer, max_tokens):
         tokens = tokenizer.encode(text, add_special_tokens=False)
@@ -99,7 +90,6 @@ if st.button("Scrape and Sentiment Analysis"):
             chunks.append(chunk_text)
         return chunks
 
-
     def safe_sentiment(text):
         try:
             result = sentiment_pipeline(text)[0]
@@ -107,13 +97,12 @@ if st.button("Scrape and Sentiment Analysis"):
            result = None
        return result
 
-
    def analyze_detail(text, tokenizer, sentiment_pipeline, max_tokens):
        text = preprocess_text(text)
        chunks = split_text_by_token_limit(text, tokenizer, max_tokens)
        if not chunks:
            return None
-        # Initialize score accumulation for each sentiment category
+        # Initialize accumulated scores for each sentiment category
        scores = {"POSITIVE": 0, "NEGATIVE": 0, "NEUTRAL": 0}
        for chunk in chunks:
            result = safe_sentiment(chunk)
@@ -124,36 +113,24 @@ if st.button("Scrape and Sentiment Analysis"):
        final_label = max(scores, key=lambda k: scores[k])
        final_score = scores[final_label]
        return {"label": final_label, "score": final_score}
-
-
+
    def preprocess_text(text):
-        text = re.sub(r'http\S+', 'http', text)  # Replace URLs with 'http'
-        text = re.sub(r'@\w+', '@user', text)    # Replace user mentions with '@user'
+        # Replace URLs and user mentions
+        text = re.sub(r'http\S+', 'http', text)
+        text = re.sub(r'@\w+', '@user', text)
        return text
-
    #-----------------------------------------------------------------------#
-
-
 
-    # --- Sentiment Analysis Section ---
+    # Load the sentiment pipeline (cached)
    with st.spinner("Loading Sentiment Pipeline..."):
-        tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
-        model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest",
-                                                                   use_auth_token=st.secrets["hugging_face_token"])
-
-        sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=-1)
-        st.write("sentiment pipeline loaded...")
-
-        max_tokens = tokenizer.model_max_length
-        if max_tokens > 10000:
-            max_tokens = 512
+        sentiment_pipeline, tokenizer, max_tokens = load_sentiment_pipeline()
+        st.write("Sentiment pipeline loaded...")
 
-
+    # Perform sentiment analysis
    with st.spinner("Doing Sentiment Analysis..."):
-        # Apply sentiment analysis to Title directly (assuming Title is short)
+        # Analyze Title sentiment directly (assuming the title is short)
        df['title_sentiment'] = df['Title'].apply(lambda x: safe_sentiment(preprocess_text(x)) if x else None)
-
-        # Apply sentiment analysis to Detail by splitting into token-limited chunks and accumulating scores
+        # Analyze Detail sentiment by splitting into token-limited chunks and accumulating scores
        df['detail_sentiment'] = df['Detail'].apply(lambda x: analyze_detail(x, tokenizer, sentiment_pipeline, max_tokens) if x else None)
 
        df["Title_sentiment_label"] = df["title_sentiment"].apply(lambda x: x["label"] if x else None)
@@ -161,62 +138,54 @@ if st.button("Scrape and Sentiment Analysis"):
        df["Detail_sentiment_label"] = df["detail_sentiment"].apply(lambda x: x["label"] if x else None)
        df["Detail_sentiment_score"] = df["detail_sentiment"].apply(lambda x: x["score"] if x else None)
 
+        # Drop intermediate columns
        df = df.drop(["title_sentiment", "detail_sentiment"], axis=1)
-        cols = ["Title", "Title_sentiment_label", "Title_sentiment_score", \
-                "Detail", "Detail_sentiment_label", "Detail_sentiment_score", "Date"]
+        cols = ["Title", "Title_sentiment_label", "Title_sentiment_score",
+                "Detail", "Detail_sentiment_label", "Detail_sentiment_score", "Date"]
        df = df[cols]
-
        st.session_state["df"] = df
 
-
-
+# Button to draw graphs
 if st.button("Draw Graph"):
    df = st.session_state.get("df")
-
-    # ------------------- Plot Title's Sentiment Score -------------------#
-    fig1, ax1 = plt.subplots(figsize=(10, 5))
-
-    # Filter for positive, negative, and neutral labels and plot each
-    positive_title = df[df["Title_sentiment_label"].str.lower() == "positive"]
-    negative_title = df[df["Title_sentiment_label"].str.lower() == "negative"]
-    neutral_title = df[df["Title_sentiment_label"].str.lower() == "neutral"]
-
-    ax1.plot(positive_title["Date"], positive_title["Title_sentiment_score"],
-             marker="o", label="Title Positive", color="orange")
-
-    ax1.plot(negative_title["Date"], negative_title["Title_sentiment_score"],
-             marker="o", label="Title Negative", color="blue")
-
-    ax1.plot(neutral_title["Date"], neutral_title["Title_sentiment_score"],
-             marker="o", label="Title Neutral", color="yellowgreen")
-
-    ax1.set_title("Title Sentiment Scores Over Time")
-    ax1.set_xlabel("Time")
-    ax1.set_ylabel("Sentiment Score")
-    ax1.legend()
-    plt.xticks(rotation=45)
-    st.pyplot(fig1)
-
-
-    # ------------------- Plot Detail's Sentiment Score -------------------#
-    fig2, ax2 = plt.subplots(figsize=(10, 5))
-
-    positive_detail = df[df["Detail_sentiment_label"].str.lower() == "positive"]
-    negative_detail = df[df["Detail_sentiment_label"].str.lower() == "negative"]
-    neutral_detail = df[df["Detail_sentiment_label"].str.lower() == "neutral"]
-
-    ax2.plot(positive_detail["Date"], positive_detail["Detail_sentiment_score"],
-             marker="+", label="Detail Positive", color="darkorange")
-
-    ax2.plot(negative_detail["Date"], negative_detail["Detail_sentiment_score"],
-             marker="+", label="Detail Negative", color="navy")
-
-    ax2.plot(neutral_detail["Date"], neutral_detail["Detail_sentiment_score"],
-             marker="+", label="Detail Neutral", color="forestgreen")
-
-    ax2.set_title("Detail Sentiment Scores Over Time")
-    ax2.set_xlabel("Time")
-    ax2.set_ylabel("Sentiment Score")
-    ax2.legend()
-    plt.xticks(rotation=45)
-    st.pyplot(fig2)
+    if df is None or df.empty:
+        st.write("Please run 'Scrape and Sentiment Analysis' first.")
+    else:
+        # ------------------- Plot Title's Sentiment Score -------------------#
+        fig1, ax1 = plt.subplots(figsize=(10, 5))
+        # Filter and plot for each sentiment category
+        positive_title = df[df["Title_sentiment_label"].str.lower() == "positive"]
+        negative_title = df[df["Title_sentiment_label"].str.lower() == "negative"]
+        neutral_title = df[df["Title_sentiment_label"].str.lower() == "neutral"]
+
+        ax1.plot(positive_title["Date"], positive_title["Title_sentiment_score"],
+                 marker="o", label="Title Positive", color="orange")
+        ax1.plot(negative_title["Date"], negative_title["Title_sentiment_score"],
+                 marker="o", label="Title Negative", color="blue")
+        ax1.plot(neutral_title["Date"], neutral_title["Title_sentiment_score"],
+                 marker="o", label="Title Neutral", color="yellowgreen")
+        ax1.set_title("Title Sentiment Scores Over Time")
+        ax1.set_xlabel("Time")
+        ax1.set_ylabel("Sentiment Score")
+        ax1.legend()
+        plt.xticks(rotation=45)
+        st.pyplot(fig1)
+
+        # ------------------- Plot Detail's Sentiment Score -------------------#
+        fig2, ax2 = plt.subplots(figsize=(10, 5))
+        positive_detail = df[df["Detail_sentiment_label"].str.lower() == "positive"]
+        negative_detail = df[df["Detail_sentiment_label"].str.lower() == "negative"]
+        neutral_detail = df[df["Detail_sentiment_label"].str.lower() == "neutral"]
+
+        ax2.plot(positive_detail["Date"], positive_detail["Detail_sentiment_score"],
+                 marker="+", label="Detail Positive", color="darkorange")
+        ax2.plot(negative_detail["Date"], negative_detail["Detail_sentiment_score"],
+                 marker="+", label="Detail Negative", color="navy")
+        ax2.plot(neutral_detail["Date"], neutral_detail["Detail_sentiment_score"],
+                 marker="+", label="Detail Neutral", color="forestgreen")
+        ax2.set_title("Detail Sentiment Scores Over Time")
+        ax2.set_xlabel("Time")
+        ax2.set_ylabel("Sentiment Score")
+        ax2.legend()
+        plt.xticks(rotation=45)
+        st.pyplot(fig2)
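The substance of this commit: model loading and scraping move behind Streamlit's caching decorators, so the script reruns that Streamlit triggers on every widget interaction no longer reload the model or re-query the Reddit API. A minimal sketch of the same pattern, using the model named above (the helper names here are illustrative, not from app.py):

```python
import streamlit as st
from transformers import pipeline

@st.cache_resource  # one shared instance per process: models, clients, connections
def load_model():
    return pipeline("sentiment-analysis",
                    model="cardiffnlp/twitter-roberta-base-sentiment-latest")

@st.cache_data  # memoized by arguments: DataFrames and other serializable results
def fetch_rows(query: str):
    # Stand-in for an expensive fetch keyed on `query`
    return [f"placeholder result for {query}"]

clf = load_model()         # loaded once, reused across reruns
rows = fetch_rows("demo")  # recomputed only when the argument changes
st.write(clf(rows[0]))
```

`st.cache_data` keys its cache on the function arguments, which is why `scrape_reddit_data(search_query, total_limit)` returns instantly for a repeated query. It is also why the new code creates `progress_bar` but never updates it: a cached function cannot emit live progress, as its own comment notes.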
 
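The body of `split_text_by_token_limit` falls between hunks, so only its first and last lines are visible here. A hedged reconstruction of what such a function typically looks like, consistent with those visible lines (the windowing loop is an assumption, not the file's actual code):

```python
def split_text_by_token_limit(text, tokenizer, max_tokens):
    # Encode once, then slice the token ids into windows of at most max_tokens
    tokens = tokenizer.encode(text, add_special_tokens=False)
    chunks = []
    for start in range(0, len(tokens), max_tokens):  # assumed fixed-size windows
        window = tokens[start:start + max_tokens]
        # Decode each window back to text so the pipeline can re-tokenize it
        chunk_text = tokenizer.decode(window, skip_special_tokens=True)
        chunks.append(chunk_text)
    return chunks
```

This chunking is also why `load_sentiment_pipeline` clamps `max_tokens`: some tokenizers report `model_max_length` as a very large sentinel value rather than a real limit, so anything implausibly large is reset to the 512-token RoBERTa window.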
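The accumulation loop inside `analyze_detail` is likewise elided. Given the visible pieces, a `scores` dict keyed by label and a final `max` over it, the elided step plausibly adds each chunk's confidence to its predicted label's running total. A self-contained sketch of that aggregation under that assumption (the chunk results are made up):

```python
# Stand-ins for safe_sentiment() outputs on successive chunks
chunk_results = [
    {"label": "positive", "score": 0.91},
    {"label": "negative", "score": 0.55},
    {"label": "positive", "score": 0.62},
]

scores = {"POSITIVE": 0, "NEGATIVE": 0, "NEUTRAL": 0}
for result in chunk_results:
    if result:  # safe_sentiment returns None on failure
        # Assumed elided step: add the chunk's confidence to its label's total
        scores[result["label"].upper()] += result["score"]

final_label = max(scores, key=lambda k: scores[k])
print(final_label, round(scores[final_label], 2))  # POSITIVE 1.53
```

Note the result is an accumulated sum rather than an average, so the final score grows with the number of chunks. The `.upper()` reflects that this checkpoint emits lowercase labels, which is presumably also why the plotting code compares labels with `.str.lower()`.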
168
+ ax1.set_xlabel("Time")
169
+ ax1.set_ylabel("Sentiment Score")
170
+ ax1.legend()
171
+ plt.xticks(rotation=45)
172
+ st.pyplot(fig1)
173
+
174
+ # ------------------- Plot Detail's Sentiment Score -------------------#
175
+ fig2, ax2 = plt.subplots(figsize=(10, 5))
176
+ positive_detail = df[df["Detail_sentiment_label"].str.lower() == "positive"]
177
+ negative_detail = df[df["Detail_sentiment_label"].str.lower() == "negative"]
178
+ neutral_detail = df[df["Detail_sentiment_label"].str.lower() == "neutral"]
179
+
180
+ ax2.plot(positive_detail["Date"], positive_detail["Detail_sentiment_score"],
181
+ marker="+", label="Detail Positive", color="darkorange")
182
+ ax2.plot(negative_detail["Date"], negative_detail["Detail_sentiment_score"],
183
+ marker="+", label="Detail Negative", color="navy")
184
+ ax2.plot(neutral_detail["Date"], neutral_detail["Detail_sentiment_score"],
185
+ marker="+", label="Detail Neutral", color="forestgreen")
186
+ ax2.set_title("Detail Sentiment Scores Over Time")
187
+ ax2.set_xlabel("Time")
188
+ ax2.set_ylabel("Sentiment Score")
189
+ ax2.legend()
190
+ plt.xticks(rotation=45)
191
+ st.pyplot(fig2)
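Finally, the graphing section draws one series per sentiment label by filtering the frame three times per figure. An equivalent shape with a single `groupby`, shown on a tiny illustrative frame (the column names match the app; the data does not):

```python
import pandas as pd
import matplotlib.pyplot as plt

df = pd.DataFrame({
    "Date": pd.to_datetime([1700000000, 1700086400, 1700172800], unit='s'),
    "Title_sentiment_label": ["positive", "negative", "positive"],
    "Title_sentiment_score": [0.91, 0.55, 0.62],
})

fig, ax = plt.subplots(figsize=(10, 5))
# One line per label, equivalent to the three explicit filters above
for label, group in df.groupby(df["Title_sentiment_label"].str.lower()):
    ax.plot(group["Date"], group["Title_sentiment_score"],
            marker="o", label=f"Title {label.title()}")
ax.set_xlabel("Time")
ax.set_ylabel("Sentiment Score")
ax.legend()
plt.xticks(rotation=45)
plt.show()
```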