Update app.py
app.py CHANGED
@@ -37,58 +37,7 @@ else:
 search_query = ""
 st.write("Search Query:", search_query)

-# ---------- Cached function for loading the sentiment model pipeline ----------
-# @st.cache_resource
-# def load_sentiment_pipeline():
-# tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
-# model = AutoModelForSequenceClassification.from_pretrained(
-# "cardiffnlp/twitter-roberta-base-sentiment-latest",
-# use_auth_token=st.secrets["hugging_face_token"]
-# )
-# sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=0)  # -1 to 0
-# max_tokens = tokenizer.model_max_length
-# if max_tokens > 10000:
-# max_tokens = 512
-# return sentiment_pipeline, tokenizer, max_tokens

-# ---------- Cached function for scraping Reddit data ----------
-# @st.cache_data(show_spinner=False)
-# def scrape_reddit_data(search_query, total_limit):
-# # Retrieve API credentials from st.secrets
-# reddit = praw.Reddit(
-# client_id=st.secrets["reddit_client_id"],
-# client_secret=st.secrets["reddit_client_secret"],
-# user_agent=st.secrets["reddit_user_agent"]
-# )
-# subreddit = reddit.subreddit("all")
-# posts_data = []
-# # Iterate over submissions based on the search query and limit
-# for i, submission in enumerate(subreddit.search(search_query, sort="relevance", limit=total_limit)):
-# # No UI updates here as caching does not allow live progress updates
-# if submission.title and submission.selftext:
-# posts_data.append([
-# submission.title,
-# submission.url,
-# submission.created_utc,
-# submission.selftext,
-# ])
-# time.sleep(0.25)
-# df = pd.DataFrame(posts_data, columns=["Title", "URL", "Date", "Detail"])
-
-# Function to normalize text by replacing multiple spaces/newlines with a single space
-# def normalize_text(text):
-# if not isinstance(text, str):
-# return ""
-# return re.sub(r'\s+', ' ', text).strip()
-
-# for col in ["Title", "Detail"]:
-# df[col] = df[col].apply(normalize_text)
-
-# # Filter out rows with empty Title or Detail
-# df = df[(df["Title"] != "") & (df["Detail"] != "")]
-# df['Date'] = pd.to_datetime(df['Date'], unit='s')
-# df = df.sort_values(by="Date", ascending=True).reset_index(drop=True)
-# return df

 # Button to trigger scraping and sentiment analysis
 if st.button("Scrape and Sentiment Analysis"):
@@ -100,47 +49,6 @@ if st.button("Scrape and Sentiment Analysis"):
 df = scrape_reddit_data(search_query, total_limit)
 progress_text.text(f"Collected {len(df)} valid posts.")
 st.session_state["df"] = df
-
-# ------------------ Sentiment Analysis Functions ------------------------#
-# def split_text_by_token_limit(text, tokenizer, max_tokens):
-# tokens = tokenizer.encode(text, add_special_tokens=False)
-# chunks = []
-# for i in range(0, len(tokens), max_tokens):
-# chunk_tokens = tokens[i:i+max_tokens]
-# chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
-# chunks.append(chunk_text)
-# return chunks
-
-# def safe_sentiment(text):
-# try:
-# result = sentiment_pipeline(text)[0]
-# except Exception as e:
-# result = None
-# return result
-
-# def analyze_detail(text, tokenizer, sentiment_pipeline, max_tokens):
-# text = preprocess_text(text)
-# chunks = split_text_by_token_limit(text, tokenizer, max_tokens)
-# if not chunks:
-# return None
-# # Initialize accumulated scores for each sentiment category
-# scores = {"POSITIVE": 0, "NEGATIVE": 0, "NEUTRAL": 0}
-# for chunk in chunks:
-# result = safe_sentiment(chunk)
-# if result is not None:
-# label = result['label'].upper()
-# if label in scores:
-# scores[label] += result['score']
-# final_label = max(scores, key=lambda k: scores[k])
-# final_score = scores[final_label]
-# return {"label": final_label, "score": final_score}
-
-# def preprocess_text(text):
-# # Replace URLs and user mentions
-# text = re.sub(r'http\S+', 'http', text)
-# text = re.sub(r'@\w+', '@user', text)
-# return text
-#-----------------------------------------------------------------------#

 # Load the sentiment pipeline (cached)
 with st.spinner("Loading Sentiment Pipeline..."):
@@ -150,7 +58,7 @@ if st.button("Scrape and Sentiment Analysis"):
 # Perform sentiment analysis
 with st.spinner("Doing Sentiment Analysis..."):
 # Analyze Title sentiment directly (assuming the title is short)
-df['title_sentiment'] = df['Title'].apply(lambda x: safe_sentiment(preprocess_text(x)) if x else None)
+df['title_sentiment'] = df['Title'].apply(lambda x: safe_sentiment(sentiment_pipeline, preprocess_text(x)) if x else None)
 # Analyze Detail sentiment by splitting into token-limited chunks and accumulating scores
 df['detail_sentiment'] = df['Detail'].apply(lambda x: analyze_detail(x, tokenizer, sentiment_pipeline, max_tokens) if x else None)

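Note on the change: the only functional edit in this commit is the safe_sentiment call site, which now passes the pipeline in explicitly instead of relying on a module-level sentiment_pipeline (the old commented-out definition removed above took only text). The updated definition of safe_sentiment is not part of this diff, so what follows is only a minimal sketch of what it presumably looks like, inferred from the new call safe_sentiment(sentiment_pipeline, preprocess_text(x)):

# Hypothetical sketch only -- the real definition lives outside this diff.
# Assumes safe_sentiment now receives the pipeline as its first argument,
# matching the new call site added above.
def safe_sentiment(sentiment_pipeline, text):
    try:
        # a transformers "sentiment-analysis" pipeline returns a list of
        # {"label": ..., "score": ...} dicts; take the first result
        return sentiment_pipeline(text)[0]
    except Exception:
        return None

# Example usage mirroring the new line in the diff:
# df['title_sentiment'] = df['Title'].apply(
#     lambda x: safe_sentiment(sentiment_pipeline, preprocess_text(x)) if x else None
# )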