Update app.py
app.py CHANGED
@@ -37,58 +37,7 @@ else:
 search_query = ""
 st.write("Search Query:", search_query)

-# ---------- Cached function for loading the sentiment model pipeline ----------
-# @st.cache_resource
-# def load_sentiment_pipeline():
-# tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
-# model = AutoModelForSequenceClassification.from_pretrained(
-# "cardiffnlp/twitter-roberta-base-sentiment-latest",
-# use_auth_token=st.secrets["hugging_face_token"]
-# )
-# sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=0)  # -1 to 0
-# max_tokens = tokenizer.model_max_length
-# if max_tokens > 10000:
-# max_tokens = 512
-# return sentiment_pipeline, tokenizer, max_tokens

-# ---------- Cached function for scraping Reddit data ----------
-# @st.cache_data(show_spinner=False)
-# def scrape_reddit_data(search_query, total_limit):
-# # Retrieve API credentials from st.secrets
-# reddit = praw.Reddit(
-# client_id=st.secrets["reddit_client_id"],
-# client_secret=st.secrets["reddit_client_secret"],
-# user_agent=st.secrets["reddit_user_agent"]
-# )
-# subreddit = reddit.subreddit("all")
-# posts_data = []
-# # Iterate over submissions based on the search query and limit
-# for i, submission in enumerate(subreddit.search(search_query, sort="relevance", limit=total_limit)):
-# # No UI updates here as caching does not allow live progress updates
-# if submission.title and submission.selftext:
-# posts_data.append([
-# submission.title,
-# submission.url,
-# submission.created_utc,
-# submission.selftext,
-# ])
-# time.sleep(0.25)
-# df = pd.DataFrame(posts_data, columns=["Title", "URL", "Date", "Detail"])
-
-# Function to normalize text by replacing multiple spaces/newlines with a single space
-# def normalize_text(text):
-# if not isinstance(text, str):
-# return ""
-# return re.sub(r'\s+', ' ', text).strip()
-
-# for col in ["Title", "Detail"]:
-# df[col] = df[col].apply(normalize_text)
-
-# # Filter out rows with empty Title or Detail
-# df = df[(df["Title"] != "") & (df["Detail"] != "")]
-# df['Date'] = pd.to_datetime(df['Date'], unit='s')
-# df = df.sort_values(by="Date", ascending=True).reset_index(drop=True)
-# return df

 # Button to trigger scraping and sentiment analysis
 if st.button("Scrape and Sentiment Analysis"):
@@ -100,47 +49,6 @@ if st.button("Scrape and Sentiment Analysis"):
 df = scrape_reddit_data(search_query, total_limit)
 progress_text.text(f"Collected {len(df)} valid posts.")
 st.session_state["df"] = df
-
-# ------------------ Sentiment Analysis Functions ------------------------#
-# def split_text_by_token_limit(text, tokenizer, max_tokens):
-# tokens = tokenizer.encode(text, add_special_tokens=False)
-# chunks = []
-# for i in range(0, len(tokens), max_tokens):
-# chunk_tokens = tokens[i:i+max_tokens]
-# chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
-# chunks.append(chunk_text)
-# return chunks
-
-# def safe_sentiment(text):
-# try:
-# result = sentiment_pipeline(text)[0]
-# except Exception as e:
-# result = None
-# return result
-
-# def analyze_detail(text, tokenizer, sentiment_pipeline, max_tokens):
-# text = preprocess_text(text)
-# chunks = split_text_by_token_limit(text, tokenizer, max_tokens)
-# if not chunks:
-# return None
-# # Initialize accumulated scores for each sentiment category
-# scores = {"POSITIVE": 0, "NEGATIVE": 0, "NEUTRAL": 0}
-# for chunk in chunks:
-# result = safe_sentiment(chunk)
-# if result is not None:
-# label = result['label'].upper()
-# if label in scores:
-# scores[label] += result['score']
-# final_label = max(scores, key=lambda k: scores[k])
-# final_score = scores[final_label]
-# return {"label": final_label, "score": final_score}
-
-# def preprocess_text(text):
-# # Replace URLs and user mentions
-# text = re.sub(r'http\S+', 'http', text)
-# text = re.sub(r'@\w+', '@user', text)
-# return text
-#-----------------------------------------------------------------------#

 # Load the sentiment pipeline (cached)
 with st.spinner("Loading Sentiment Pipeline..."):
@@ -150,7 +58,7 @@ if st.button("Scrape and Sentiment Analysis"):
 # Perform sentiment analysis
 with st.spinner("Doing Sentiment Analysis..."):
 # Analyze Title sentiment directly (assuming the title is short)
-df['title_sentiment'] = df['Title'].apply(lambda x: safe_sentiment(preprocess_text(x)) if x else None)
+df['title_sentiment'] = df['Title'].apply(lambda x: safe_sentiment(sentiment_pipeline, preprocess_text(x)) if x else None)
 # Analyze Detail sentiment by splitting into token-limited chunks and accumulating scores
 df['detail_sentiment'] = df['Detail'].apply(lambda x: analyze_detail(x, tokenizer, sentiment_pipeline, max_tokens) if x else None)

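Note on the change: the only functional edit in this commit is the safe_sentiment call site, which now passes the pipeline in explicitly instead of relying on a module-level sentiment_pipeline (the old commented-out definition removed above took only text). The updated definition of safe_sentiment is not part of this diff, so what follows is only a minimal sketch of what it presumably looks like, inferred from the new call safe_sentiment(sentiment_pipeline, preprocess_text(x)):

# Hypothetical sketch only -- the real definition lives outside this diff.
# Assumes safe_sentiment now receives the pipeline as its first argument,
# matching the new call site added above.
def safe_sentiment(sentiment_pipeline, text):
    try:
        # a transformers "sentiment-analysis" pipeline returns a list of
        # {"label": ..., "score": ...} dicts; take the first result
        return sentiment_pipeline(text)[0]
    except Exception:
        return None

# Example usage mirroring the new line in the diff:
# df['title_sentiment'] = df['Title'].apply(
#     lambda x: safe_sentiment(sentiment_pipeline, preprocess_text(x)) if x else None
# )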