Update app.py
app.py CHANGED
@@ -1,11 +1,31 @@
+from collections import Counter
 import matplotlib.pyplot as plt
 import pandas as pd
 import praw # Reddit's API
 import re # Regular expression module
 import streamlit as st
 import time
-
-from
+import numpy as np
+from wordcloud import WordCloud
+from transformers import (
+    pipeline,
+    AutoTokenizer,
+    AutoModelForSequenceClassification,
+    AutoModelForTokenClassification,
+    TokenClassificationPipeline
+)
+from transformers.pipelines import AggregationStrategy
+
+from functions import (
+    load_sentiment_pipeline,
+    KeyphraseExtractionPipeline,
+    keyword_extractor,
+    scrape_reddit_data,
+    split_text_by_token_limit,
+    safe_sentiment,
+    analyze_detail,
+    preprocess_text
+)

 st.title("Reddit Scraping & Sentiment Analysis")

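Note: the new transformers imports (AutoModelForTokenClassification, TokenClassificationPipeline, AggregationStrategy) together with numpy, plus the KeyphraseExtractionPipeline and keyword_extractor names pulled in from functions, point at a token-classification keyphrase extractor. functions.py itself is not part of this diff, so the snippet below is only a sketch of what that class might look like, following the common Hugging Face keyphrase-extraction pattern; the checkpoint name and the keyword_extractor instantiation are assumptions.

import numpy as np
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    TokenClassificationPipeline,
)
from transformers.pipelines import AggregationStrategy


class KeyphraseExtractionPipeline(TokenClassificationPipeline):
    """Token-classification pipeline whose output is post-processed into unique keyphrases."""

    def __init__(self, model, *args, **kwargs):
        super().__init__(
            model=AutoModelForTokenClassification.from_pretrained(model),
            tokenizer=AutoTokenizer.from_pretrained(model),
            *args,
            **kwargs,
        )

    def postprocess(self, all_outputs):
        # Merge sub-word tokens into whole words, then keep each keyphrase only once
        results = super().postprocess(
            all_outputs=all_outputs,
            aggregation_strategy=AggregationStrategy.FIRST,
        )
        return np.unique([result.get("word").strip() for result in results])


# Hypothetical instantiation; the checkpoint actually used in functions.py is not shown in this diff
keyword_extractor = KeyphraseExtractionPipeline(model="ml6team/keyphrase-extraction-kbir-inspec")

Calling keyword_extractor("some post text") would then return a de-duplicated array of keyphrases for that text.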
@@ -18,57 +38,57 @@ else:
     st.write("Search Query:", search_query)

 # ---------- Cached function for loading the sentiment model pipeline ----------
-@st.cache_resource
-def load_sentiment_pipeline():
-    tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
-    model = AutoModelForSequenceClassification.from_pretrained(
-        "cardiffnlp/twitter-roberta-base-sentiment-latest",
-        use_auth_token=st.secrets["hugging_face_token"]
-    )
-    sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=0) # -1 to 0
-    max_tokens = tokenizer.model_max_length
-    if max_tokens > 10000:
-        max_tokens = 512
-    return sentiment_pipeline, tokenizer, max_tokens
+# @st.cache_resource
+# def load_sentiment_pipeline():
+#     tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
+#     model = AutoModelForSequenceClassification.from_pretrained(
+#         "cardiffnlp/twitter-roberta-base-sentiment-latest",
+#         use_auth_token=st.secrets["hugging_face_token"]
+#     )
+#     sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=0) # -1 to 0
+#     max_tokens = tokenizer.model_max_length
+#     if max_tokens > 10000:
+#         max_tokens = 512
+#     return sentiment_pipeline, tokenizer, max_tokens

 # ---------- Cached function for scraping Reddit data ----------
-@st.cache_data(show_spinner=False)
-def scrape_reddit_data(search_query, total_limit):
-    # Retrieve API credentials from st.secrets
-    reddit = praw.Reddit(
-        client_id=st.secrets["reddit_client_id"],
-        client_secret=st.secrets["reddit_client_secret"],
-        user_agent=st.secrets["reddit_user_agent"]
-    )
-    subreddit = reddit.subreddit("all")
-    posts_data = []
-    # Iterate over submissions based on the search query and limit
-    for i, submission in enumerate(subreddit.search(search_query, sort="relevance", limit=total_limit)):
-        # No UI updates here as caching does not allow live progress updates
-        if submission.title and submission.selftext:
-            posts_data.append([
-                submission.title,
-                submission.url,
-                submission.created_utc,
-                submission.selftext,
-            ])
-        time.sleep(0.25)
-    df = pd.DataFrame(posts_data, columns=["Title", "URL", "Date", "Detail"])
+# @st.cache_data(show_spinner=False)
+# def scrape_reddit_data(search_query, total_limit):
+#     # Retrieve API credentials from st.secrets
+#     reddit = praw.Reddit(
+#         client_id=st.secrets["reddit_client_id"],
+#         client_secret=st.secrets["reddit_client_secret"],
+#         user_agent=st.secrets["reddit_user_agent"]
+#     )
+#     subreddit = reddit.subreddit("all")
+#     posts_data = []
+#     # Iterate over submissions based on the search query and limit
+#     for i, submission in enumerate(subreddit.search(search_query, sort="relevance", limit=total_limit)):
+#         # No UI updates here as caching does not allow live progress updates
+#         if submission.title and submission.selftext:
+#             posts_data.append([
+#                 submission.title,
+#                 submission.url,
+#                 submission.created_utc,
+#                 submission.selftext,
+#             ])
+#         time.sleep(0.25)
+#     df = pd.DataFrame(posts_data, columns=["Title", "URL", "Date", "Detail"])

 # Function to normalize text by replacing multiple spaces/newlines with a single space
-    def normalize_text(text):
-        if not isinstance(text, str):
-            return ""
-        return re.sub(r'\s+', ' ', text).strip()
+#     def normalize_text(text):
+#         if not isinstance(text, str):
+#             return ""
+#         return re.sub(r'\s+', ' ', text).strip()

-    for col in ["Title", "Detail"]:
-        df[col] = df[col].apply(normalize_text)
+#     for col in ["Title", "Detail"]:
+#         df[col] = df[col].apply(normalize_text)

-    # Filter out rows with empty Title or Detail
-    df = df[(df["Title"] != "") & (df["Detail"] != "")]
-    df['Date'] = pd.to_datetime(df['Date'], unit='s')
-    df = df.sort_values(by="Date", ascending=True).reset_index(drop=True)
-    return df
+#     # Filter out rows with empty Title or Detail
+#     df = df[(df["Title"] != "") & (df["Detail"] != "")]
+#     df['Date'] = pd.to_datetime(df['Date'], unit='s')
+#     df = df.sort_values(by="Date", ascending=True).reset_index(drop=True)
+#     return df

 # Button to trigger scraping and sentiment analysis
 if st.button("Scrape and Sentiment Analysis"):
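Note: with the definitions above commented out, load_sentiment_pipeline and scrape_reddit_data now have to come from functions.py, which is not shown in this commit. The calling code also sits outside the visible hunks, so the following is only a rough wiring sketch, assuming functions.py keeps the same signatures and caching behaviour as the commented-out versions; the search_query widget and the total_limit value are placeholders.

import streamlit as st

from functions import load_sentiment_pipeline, scrape_reddit_data, analyze_detail

search_query = st.text_input("Search Query", "data science")  # placeholder widget

# Presumably cached inside functions.py (st.cache_resource / st.cache_data)
sentiment_pipeline, tokenizer, max_tokens = load_sentiment_pipeline()

if st.button("Scrape and Sentiment Analysis"):
    df = scrape_reddit_data(search_query, total_limit=100)  # total_limit is a placeholder
    # Chunk long posts to the model's token limit and aggregate chunk scores per post
    df["Detail_Sentiment"] = df["Detail"].apply(
        lambda text: analyze_detail(text, tokenizer, sentiment_pipeline, max_tokens)
    )
    st.session_state["df"] = df
    st.dataframe(df)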
@@ -82,44 +102,44 @@ if st.button("Scrape and Sentiment Analysis"):
     st.session_state["df"] = df

 # ------------------ Sentiment Analysis Functions ------------------------#
-def split_text_by_token_limit(text, tokenizer, max_tokens):
-    tokens = tokenizer.encode(text, add_special_tokens=False)
-    chunks = []
-    for i in range(0, len(tokens), max_tokens):
-        chunk_tokens = tokens[i:i+max_tokens]
-        chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
-        chunks.append(chunk_text)
-    return chunks
+# def split_text_by_token_limit(text, tokenizer, max_tokens):
+#     tokens = tokenizer.encode(text, add_special_tokens=False)
+#     chunks = []
+#     for i in range(0, len(tokens), max_tokens):
+#         chunk_tokens = tokens[i:i+max_tokens]
+#         chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
+#         chunks.append(chunk_text)
+#     return chunks

-def safe_sentiment(text):
-    try:
-        result = sentiment_pipeline(text)[0]
-    except Exception as e:
-        result = None
-    return result
+# def safe_sentiment(text):
+#     try:
+#         result = sentiment_pipeline(text)[0]
+#     except Exception as e:
+#         result = None
+#     return result

-def analyze_detail(text, tokenizer, sentiment_pipeline, max_tokens):
-    text = preprocess_text(text)
-    chunks = split_text_by_token_limit(text, tokenizer, max_tokens)
-    if not chunks:
-        return None
-    # Initialize accumulated scores for each sentiment category
-    scores = {"POSITIVE": 0, "NEGATIVE": 0, "NEUTRAL": 0}
-    for chunk in chunks:
-        result = safe_sentiment(chunk)
-        if result is not None:
-            label = result['label'].upper()
-            if label in scores:
-                scores[label] += result['score']
-    final_label = max(scores, key=lambda k: scores[k])
-    final_score = scores[final_label]
-    return {"label": final_label, "score": final_score}
+# def analyze_detail(text, tokenizer, sentiment_pipeline, max_tokens):
+#     text = preprocess_text(text)
+#     chunks = split_text_by_token_limit(text, tokenizer, max_tokens)
+#     if not chunks:
+#         return None
+#     # Initialize accumulated scores for each sentiment category
+#     scores = {"POSITIVE": 0, "NEGATIVE": 0, "NEUTRAL": 0}
+#     for chunk in chunks:
+#         result = safe_sentiment(chunk)
+#         if result is not None:
+#             label = result['label'].upper()
+#             if label in scores:
+#                 scores[label] += result['score']
+#     final_label = max(scores, key=lambda k: scores[k])
+#     final_score = scores[final_label]
+#     return {"label": final_label, "score": final_score}

-def preprocess_text(text):
-    # Replace URLs and user mentions
-    text = re.sub(r'http\S+', 'http', text)
-    text = re.sub(r'@\w+', '@user', text)
-    return text
+# def preprocess_text(text):
+#     # Replace URLs and user mentions
+#     text = re.sub(r'http\S+', 'http', text)
+#     text = re.sub(r'@\w+', '@user', text)
+#     return text
 #-----------------------------------------------------------------------#

 # Load the sentiment pipeline (cached)
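Note: the remaining new imports (Counter, WordCloud, numpy) alongside the existing matplotlib suggest that a keyword-frequency visualization is added further down in app.py, outside the visible hunks. The following is only an illustrative sketch of how extracted keyphrases could be rendered as a word cloud in Streamlit; the keyword_extractor call and the use of the "Detail" column are assumptions.

from collections import Counter

import matplotlib.pyplot as plt
import streamlit as st
from wordcloud import WordCloud

from functions import keyword_extractor  # hypothetical: returns an array of keyphrases per text

# Extract keyphrases from every scraped post body and count how often each appears
df = st.session_state["df"]
keyphrases = [phrase for detail in df["Detail"] for phrase in keyword_extractor(detail)]
frequencies = Counter(keyphrases)

# Render the frequencies as a word cloud inside the Streamlit app
wordcloud = WordCloud(width=800, height=400, background_color="white").generate_from_frequencies(frequencies)
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis("off")
st.pyplot(fig)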