kusa04 committed
Commit d6d3d88 · verified · 1 Parent(s): e81fa54

Update app.py

Files changed (1)
  1. app.py +1 -93
app.py CHANGED
@@ -37,58 +37,7 @@ else:
     search_query = ""
 st.write("Search Query:", search_query)
 
-# ---------- Cached function for loading the sentiment model pipeline ----------
-# @st.cache_resource
-# def load_sentiment_pipeline():
-#     tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
-#     model = AutoModelForSequenceClassification.from_pretrained(
-#         "cardiffnlp/twitter-roberta-base-sentiment-latest",
-#         use_auth_token=st.secrets["hugging_face_token"]
-#     )
-#     sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=0) # -1 to 0
-#     max_tokens = tokenizer.model_max_length
-#     if max_tokens > 10000:
-#         max_tokens = 512
-#     return sentiment_pipeline, tokenizer, max_tokens
 
-# ---------- Cached function for scraping Reddit data ----------
-# @st.cache_data(show_spinner=False)
-# def scrape_reddit_data(search_query, total_limit):
-#     # Retrieve API credentials from st.secrets
-#     reddit = praw.Reddit(
-#         client_id=st.secrets["reddit_client_id"],
-#         client_secret=st.secrets["reddit_client_secret"],
-#         user_agent=st.secrets["reddit_user_agent"]
-#     )
-#     subreddit = reddit.subreddit("all")
-#     posts_data = []
-#     # Iterate over submissions based on the search query and limit
-#     for i, submission in enumerate(subreddit.search(search_query, sort="relevance", limit=total_limit)):
-#         # No UI updates here as caching does not allow live progress updates
-#         if submission.title and submission.selftext:
-#             posts_data.append([
-#                 submission.title,
-#                 submission.url,
-#                 submission.created_utc,
-#                 submission.selftext,
-#             ])
-#         time.sleep(0.25)
-#     df = pd.DataFrame(posts_data, columns=["Title", "URL", "Date", "Detail"])
-
-# Function to normalize text by replacing multiple spaces/newlines with a single space
-#     def normalize_text(text):
-#         if not isinstance(text, str):
-#             return ""
-#         return re.sub(r'\s+', ' ', text).strip()
-
-#     for col in ["Title", "Detail"]:
-#         df[col] = df[col].apply(normalize_text)
-
-#     # Filter out rows with empty Title or Detail
-#     df = df[(df["Title"] != "") & (df["Detail"] != "")]
-#     df['Date'] = pd.to_datetime(df['Date'], unit='s')
-#     df = df.sort_values(by="Date", ascending=True).reset_index(drop=True)
-#     return df
 
 # Button to trigger scraping and sentiment analysis
 if st.button("Scrape and Sentiment Analysis"):
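Note: the loader removed above was already dead code, commented out in its entirety. For reference only, a runnable sketch of the same cached loader might look as follows; this is a reconstruction rather than code from this commit, and it assumes a recent transformers release (where token= replaces the deprecated use_auth_token= argument) and an optional GPU.

import torch
import streamlit as st
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

@st.cache_resource
def load_sentiment_pipeline():
    # Cache the model across Streamlit reruns so it is downloaded and loaded only once
    model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        token=st.secrets["hugging_face_token"],  # assumption: token= on current transformers; older releases used use_auth_token=
    )
    device = 0 if torch.cuda.is_available() else -1  # GPU if available, otherwise CPU
    sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=device)
    # Some tokenizers report a huge sentinel for model_max_length; clamp it to a usable chunk size
    max_tokens = tokenizer.model_max_length
    if max_tokens > 10000:
        max_tokens = 512
    return sentiment_pipeline, tokenizer, max_tokens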
@@ -100,47 +49,6 @@ if st.button("Scrape and Sentiment Analysis"):
         df = scrape_reddit_data(search_query, total_limit)
         progress_text.text(f"Collected {len(df)} valid posts.")
         st.session_state["df"] = df
-
-    # ------------------ Sentiment Analysis Functions ------------------------#
-    # def split_text_by_token_limit(text, tokenizer, max_tokens):
-    #     tokens = tokenizer.encode(text, add_special_tokens=False)
-    #     chunks = []
-    #     for i in range(0, len(tokens), max_tokens):
-    #         chunk_tokens = tokens[i:i+max_tokens]
-    #         chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
-    #         chunks.append(chunk_text)
-    #     return chunks
-
-    # def safe_sentiment(text):
-    #     try:
-    #         result = sentiment_pipeline(text)[0]
-    #     except Exception as e:
-    #         result = None
-    #     return result
-
-    # def analyze_detail(text, tokenizer, sentiment_pipeline, max_tokens):
-    #     text = preprocess_text(text)
-    #     chunks = split_text_by_token_limit(text, tokenizer, max_tokens)
-    #     if not chunks:
-    #         return None
-    #     # Initialize accumulated scores for each sentiment category
-    #     scores = {"POSITIVE": 0, "NEGATIVE": 0, "NEUTRAL": 0}
-    #     for chunk in chunks:
-    #         result = safe_sentiment(chunk)
-    #         if result is not None:
-    #             label = result['label'].upper()
-    #             if label in scores:
-    #                 scores[label] += result['score']
-    #     final_label = max(scores, key=lambda k: scores[k])
-    #     final_score = scores[final_label]
-    #     return {"label": final_label, "score": final_score}
-
-    # def preprocess_text(text):
-    #     # Replace URLs and user mentions
-    #     text = re.sub(r'http\S+', 'http', text)
-    #     text = re.sub(r'@\w+', '@user', text)
-    #     return text
-    #-----------------------------------------------------------------------#
 
     # Load the sentiment pipeline (cached)
    with st.spinner("Loading Sentiment Pipeline..."):
@@ -150,7 +58,7 @@ if st.button("Scrape and Sentiment Analysis"):
     # Perform sentiment analysis
     with st.spinner("Doing Sentiment Analysis..."):
         # Analyze Title sentiment directly (assuming the title is short)
-        df['title_sentiment'] = df['Title'].apply(lambda x: safe_sentiment(preprocess_text(x)) if x else None)
+        df['title_sentiment'] = df['Title'].apply(lambda x: safe_sentiment(sentiment_pipeline, preprocess_text(x)) if x else None)
         # Analyze Detail sentiment by splitting into token-limited chunks and accumulating scores
         df['detail_sentiment'] = df['Detail'].apply(lambda x: analyze_detail(x, tokenizer, sentiment_pipeline, max_tokens) if x else None)
 
 
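The one functional change in this commit is the call site above: safe_sentiment now receives the pipeline explicitly instead of relying on a module-level global. The updated helper itself is not shown in this diff, but to match the new call safe_sentiment(sentiment_pipeline, preprocess_text(x)) it would presumably look roughly like the sketch below (the signature is inferred from the call site; the try/except mirrors the old commented-out helper).

def safe_sentiment(sentiment_pipeline, text):
    # Classify one piece of text; return None instead of raising so that a single
    # bad post does not abort the whole DataFrame.apply pass
    try:
        return sentiment_pipeline(text)[0]
    except Exception:
        return None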