Update app.py
app.py CHANGED
@@ -1,11 +1,31 @@
+from collections import Counter
 import matplotlib.pyplot as plt
 import pandas as pd
 import praw # Reddit's API
 import re # Regular expression module
 import streamlit as st
 import time
-
-from
+import numpy as np
+from wordcloud import WordCloud
+from transformers import (
+    pipeline,
+    AutoTokenizer,
+    AutoModelForSequenceClassification,
+    AutoModelForTokenClassification,
+    TokenClassificationPipeline
+)
+from transformers.pipelines import AggregationStrategy
+
+from functions import (
+    load_sentiment_pipeline,
+    KeyphraseExtractionPipeline,
+    keyword_extractor,
+    scrape_reddit_data,
+    split_text_by_token_limit,
+    safe_sentiment,
+    analyze_detail,
+    preprocess_text
+)

 st.title("Reddit Scraping & Sentiment Analysis")

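Note: the new transformers imports (AutoModelForTokenClassification, TokenClassificationPipeline, AggregationStrategy) together with numpy, plus the KeyphraseExtractionPipeline and keyword_extractor names pulled in from functions, point at a token-classification keyphrase extractor. functions.py itself is not part of this diff, so the snippet below is only a sketch of what that class might look like, following the common Hugging Face keyphrase-extraction pattern; the checkpoint name and the keyword_extractor instantiation are assumptions.

import numpy as np
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    TokenClassificationPipeline,
)
from transformers.pipelines import AggregationStrategy


class KeyphraseExtractionPipeline(TokenClassificationPipeline):
    """Token-classification pipeline whose output is post-processed into unique keyphrases."""

    def __init__(self, model, *args, **kwargs):
        super().__init__(
            model=AutoModelForTokenClassification.from_pretrained(model),
            tokenizer=AutoTokenizer.from_pretrained(model),
            *args,
            **kwargs,
        )

    def postprocess(self, all_outputs):
        # Merge sub-word tokens into whole words, then keep each keyphrase only once
        results = super().postprocess(
            all_outputs=all_outputs,
            aggregation_strategy=AggregationStrategy.FIRST,
        )
        return np.unique([result.get("word").strip() for result in results])


# Hypothetical instantiation; the checkpoint actually used in functions.py is not shown in this diff
keyword_extractor = KeyphraseExtractionPipeline(model="ml6team/keyphrase-extraction-kbir-inspec")

Calling keyword_extractor("some post text") would then return a de-duplicated array of keyphrases for that text.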
@@ -18,57 +38,57 @@ else:
     st.write("Search Query:", search_query)

 # ---------- Cached function for loading the sentiment model pipeline ----------
-@st.cache_resource
-def load_sentiment_pipeline():
-    tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
-    model = AutoModelForSequenceClassification.from_pretrained(
-        "cardiffnlp/twitter-roberta-base-sentiment-latest",
-        use_auth_token=st.secrets["hugging_face_token"]
-    )
-    sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=0) # -1 to 0
-    max_tokens = tokenizer.model_max_length
-    if max_tokens > 10000:
-        max_tokens = 512
-    return sentiment_pipeline, tokenizer, max_tokens
+# @st.cache_resource
+# def load_sentiment_pipeline():
+#     tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
+#     model = AutoModelForSequenceClassification.from_pretrained(
+#         "cardiffnlp/twitter-roberta-base-sentiment-latest",
+#         use_auth_token=st.secrets["hugging_face_token"]
+#     )
+#     sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=0) # -1 to 0
+#     max_tokens = tokenizer.model_max_length
+#     if max_tokens > 10000:
+#         max_tokens = 512
+#     return sentiment_pipeline, tokenizer, max_tokens

 # ---------- Cached function for scraping Reddit data ----------
-@st.cache_data(show_spinner=False)
-def scrape_reddit_data(search_query, total_limit):
-    # Retrieve API credentials from st.secrets
-    reddit = praw.Reddit(
-        client_id=st.secrets["reddit_client_id"],
-        client_secret=st.secrets["reddit_client_secret"],
-        user_agent=st.secrets["reddit_user_agent"]
-    )
-    subreddit = reddit.subreddit("all")
-    posts_data = []
-    # Iterate over submissions based on the search query and limit
-    for i, submission in enumerate(subreddit.search(search_query, sort="relevance", limit=total_limit)):
-        # No UI updates here as caching does not allow live progress updates
-        if submission.title and submission.selftext:
-            posts_data.append([
-                submission.title,
-                submission.url,
-                submission.created_utc,
-                submission.selftext,
-            ])
-        time.sleep(0.25)
-    df = pd.DataFrame(posts_data, columns=["Title", "URL", "Date", "Detail"])
+# @st.cache_data(show_spinner=False)
+# def scrape_reddit_data(search_query, total_limit):
+#     # Retrieve API credentials from st.secrets
+#     reddit = praw.Reddit(
+#         client_id=st.secrets["reddit_client_id"],
+#         client_secret=st.secrets["reddit_client_secret"],
+#         user_agent=st.secrets["reddit_user_agent"]
+#     )
+#     subreddit = reddit.subreddit("all")
+#     posts_data = []
+#     # Iterate over submissions based on the search query and limit
+#     for i, submission in enumerate(subreddit.search(search_query, sort="relevance", limit=total_limit)):
+#         # No UI updates here as caching does not allow live progress updates
+#         if submission.title and submission.selftext:
+#             posts_data.append([
+#                 submission.title,
+#                 submission.url,
+#                 submission.created_utc,
+#                 submission.selftext,
+#             ])
+#         time.sleep(0.25)
+#     df = pd.DataFrame(posts_data, columns=["Title", "URL", "Date", "Detail"])

 # Function to normalize text by replacing multiple spaces/newlines with a single space
-    def normalize_text(text):
-        if not isinstance(text, str):
-            return ""
-        return re.sub(r'\s+', ' ', text).strip()
+#     def normalize_text(text):
+#         if not isinstance(text, str):
+#             return ""
+#         return re.sub(r'\s+', ' ', text).strip()

-    for col in ["Title", "Detail"]:
-        df[col] = df[col].apply(normalize_text)
+#     for col in ["Title", "Detail"]:
+#         df[col] = df[col].apply(normalize_text)

-    # Filter out rows with empty Title or Detail
-    df = df[(df["Title"] != "") & (df["Detail"] != "")]
-    df['Date'] = pd.to_datetime(df['Date'], unit='s')
-    df = df.sort_values(by="Date", ascending=True).reset_index(drop=True)
-    return df
+#     # Filter out rows with empty Title or Detail
+#     df = df[(df["Title"] != "") & (df["Detail"] != "")]
+#     df['Date'] = pd.to_datetime(df['Date'], unit='s')
+#     df = df.sort_values(by="Date", ascending=True).reset_index(drop=True)
+#     return df

 # Button to trigger scraping and sentiment analysis
 if st.button("Scrape and Sentiment Analysis"):
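Note: with the definitions above commented out, load_sentiment_pipeline and scrape_reddit_data now have to come from functions.py, which is not shown in this commit. The calling code also sits outside the visible hunks, so the following is only a rough wiring sketch, assuming functions.py keeps the same signatures and caching behaviour as the commented-out versions; the search_query widget and the total_limit value are placeholders.

import streamlit as st

from functions import load_sentiment_pipeline, scrape_reddit_data, analyze_detail

search_query = st.text_input("Search Query", "data science")  # placeholder widget

# Presumably cached inside functions.py (st.cache_resource / st.cache_data)
sentiment_pipeline, tokenizer, max_tokens = load_sentiment_pipeline()

if st.button("Scrape and Sentiment Analysis"):
    df = scrape_reddit_data(search_query, total_limit=100)  # total_limit is a placeholder
    # Chunk long posts to the model's token limit and aggregate chunk scores per post
    df["Detail_Sentiment"] = df["Detail"].apply(
        lambda text: analyze_detail(text, tokenizer, sentiment_pipeline, max_tokens)
    )
    st.session_state["df"] = df
    st.dataframe(df)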
@@ -82,44 +102,44 @@ if st.button("Scrape and Sentiment Analysis"):
     st.session_state["df"] = df

 # ------------------ Sentiment Analysis Functions ------------------------#
-def split_text_by_token_limit(text, tokenizer, max_tokens):
-    tokens = tokenizer.encode(text, add_special_tokens=False)
-    chunks = []
-    for i in range(0, len(tokens), max_tokens):
-        chunk_tokens = tokens[i:i+max_tokens]
-        chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
-        chunks.append(chunk_text)
-    return chunks
+# def split_text_by_token_limit(text, tokenizer, max_tokens):
+#     tokens = tokenizer.encode(text, add_special_tokens=False)
+#     chunks = []
+#     for i in range(0, len(tokens), max_tokens):
+#         chunk_tokens = tokens[i:i+max_tokens]
+#         chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
+#         chunks.append(chunk_text)
+#     return chunks

-def safe_sentiment(text):
-    try:
-        result = sentiment_pipeline(text)[0]
-    except Exception as e:
-        result = None
-    return result
+# def safe_sentiment(text):
+#     try:
+#         result = sentiment_pipeline(text)[0]
+#     except Exception as e:
+#         result = None
+#     return result

-def analyze_detail(text, tokenizer, sentiment_pipeline, max_tokens):
-    text = preprocess_text(text)
-    chunks = split_text_by_token_limit(text, tokenizer, max_tokens)
-    if not chunks:
-        return None
-    # Initialize accumulated scores for each sentiment category
-    scores = {"POSITIVE": 0, "NEGATIVE": 0, "NEUTRAL": 0}
-    for chunk in chunks:
-        result = safe_sentiment(chunk)
-        if result is not None:
-            label = result['label'].upper()
-            if label in scores:
-                scores[label] += result['score']
-    final_label = max(scores, key=lambda k: scores[k])
-    final_score = scores[final_label]
-    return {"label": final_label, "score": final_score}
+# def analyze_detail(text, tokenizer, sentiment_pipeline, max_tokens):
+#     text = preprocess_text(text)
+#     chunks = split_text_by_token_limit(text, tokenizer, max_tokens)
+#     if not chunks:
+#         return None
+#     # Initialize accumulated scores for each sentiment category
+#     scores = {"POSITIVE": 0, "NEGATIVE": 0, "NEUTRAL": 0}
+#     for chunk in chunks:
+#         result = safe_sentiment(chunk)
+#         if result is not None:
+#             label = result['label'].upper()
+#             if label in scores:
+#                 scores[label] += result['score']
+#     final_label = max(scores, key=lambda k: scores[k])
+#     final_score = scores[final_label]
+#     return {"label": final_label, "score": final_score}

-def preprocess_text(text):
-    # Replace URLs and user mentions
-    text = re.sub(r'http\S+', 'http', text)
-    text = re.sub(r'@\w+', '@user', text)
-    return text
+# def preprocess_text(text):
+#     # Replace URLs and user mentions
+#     text = re.sub(r'http\S+', 'http', text)
+#     text = re.sub(r'@\w+', '@user', text)
+#     return text
 #-----------------------------------------------------------------------#

 # Load the sentiment pipeline (cached)
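Note: the remaining new imports (Counter, WordCloud, numpy) alongside the existing matplotlib suggest that a keyword-frequency visualization is added further down in app.py, outside the visible hunks. The following is only an illustrative sketch of how extracted keyphrases could be rendered as a word cloud in Streamlit; the keyword_extractor call and the use of the "Detail" column are assumptions.

from collections import Counter

import matplotlib.pyplot as plt
import streamlit as st
from wordcloud import WordCloud

from functions import keyword_extractor  # hypothetical: returns an array of keyphrases per text

# Extract keyphrases from every scraped post body and count how often each appears
df = st.session_state["df"]
keyphrases = [phrase for detail in df["Detail"] for phrase in keyword_extractor(detail)]
frequencies = Counter(keyphrases)

# Render the frequencies as a word cloud inside the Streamlit app
wordcloud = WordCloud(width=800, height=400, background_color="white").generate_from_frequencies(frequencies)
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis("off")
st.pyplot(fig)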