Spaces:

kusa04
/

g13_DL_project

Sleeping

App Files Files Community

kusa04 committed on Mar 15

Commit

b6d15a2

verified ·

1 Parent(s): cca9ecc

Update app.py

Browse files

Files changed (1) hide show

app.py +97 -128

app.py CHANGED Viewed

@@ -10,85 +10,76 @@ st.title("Reddit Scraping & Sentiment Analysis")
 # --- User Input ---
 user_query = st.text_input("Enter search keyword:", value="Monster Hunter Wilds")
-# Build search query by inserting the user input into quotes and also a variant without spaces
 if user_query:
     search_query = f'"{user_query}" OR "{user_query.replace(" ", "")}"'
 else:
     search_query = ""
 st.write("Search Query:", search_query)
-# --- Scraping Section ---
-if st.button("Scrape and Sentiment Analysis"):
-    # Set up a progress bar and status text
-    progress_bar = st.progress(0)
-    progress_text = st.empty()
-    # API Information
-    CLIENT_ID = st.secrets["reddit_client_id"]
-    CLIENT_SECRET = st.secrets["reddit_client_secret"]
-    USER_AGENT = st.secrets["reddit_user_agent"]
-    # Setting up PRAW
     reddit = praw.Reddit(
-        client_id=CLIENT_ID,
-        client_secret=CLIENT_SECRET,
-        user_agent=USER_AGENT
     )
     subreddit = reddit.subreddit("all")
     posts_data = []
-    total_limit = 5000  # maximum number of submissions to check
-    # Loop through submissions, update progress bar based on iteration count
     for i, submission in enumerate(subreddit.search(search_query, sort="relevance", limit=total_limit)):
-        # Update progress
-        progress = (i + 1) / total_limit
-        progress_bar.progress(progress)
-        progress_text.text(f"Scraping... {progress*100:.2f}%")
-        # Filtering out posts that appear to be image/video only by checking if title and selftext exist.
         if submission.title and submission.selftext:
-            title = submission.title
-            created_utc = submission.created_utc
-            submission_text = submission.selftext
             posts_data.append([
-                title,             # Original Title
-                submission.url,    # Original URL (case preserved)
-                created_utc,       # Date (UTC)
-                submission_text,   # Detail (main text)
             ])
-        time.sleep(0.25)
-    progress_text.text("Scraping complete.")
-    # Convert the collected posts into a DataFrame
     df = pd.DataFrame(posts_data, columns=["Title", "URL", "Date", "Detail"])
-    # Define a text normalization function that replaces multiple spaces/newlines with a single space
     def normalize_text(text):
         if not isinstance(text, str):
             return ""
         return re.sub(r'\s+', ' ', text).strip()
-    # Apply normalization to the "Title" and "Detail" columns
     for col in ["Title", "Detail"]:
         df[col] = df[col].apply(normalize_text)
-    # Filter DataFrame to include only rows where Title and Detail are non-empty
     df = df[(df["Title"] != "") & (df["Detail"] != "")]
-    # Convert Date to datetime, sort descending and reset index
     df['Date'] = pd.to_datetime(df['Date'], unit='s')
     df = df.sort_values(by="Date", ascending=True).reset_index(drop=True)
     progress_text.text(f"Collected {len(df)} valid posts.")
     st.session_state["df"] = df
     # ------------------ Sentiment Analysis Functions ------------------------#
     def split_text_by_token_limit(text, tokenizer, max_tokens):
         tokens = tokenizer.encode(text, add_special_tokens=False)
@@ -99,7 +90,6 @@ if st.button("Scrape and Sentiment Analysis"):
             chunks.append(chunk_text)
         return chunks
     def safe_sentiment(text):
         try:
             result = sentiment_pipeline(text)[0]
@@ -107,13 +97,12 @@ if st.button("Scrape and Sentiment Analysis"):
             result = None
         return result
     def analyze_detail(text, tokenizer, sentiment_pipeline, max_tokens):
         text = preprocess_text(text)
         chunks = split_text_by_token_limit(text, tokenizer, max_tokens)
         if not chunks:
             return None
-        # Initialize score accumulation for each sentiment category
         scores = {"POSITIVE": 0, "NEGATIVE": 0, "NEUTRAL": 0}
         for chunk in chunks:
             result = safe_sentiment(chunk)
@@ -124,36 +113,24 @@ if st.button("Scrape and Sentiment Analysis"):
         final_label = max(scores, key=lambda k: scores[k])
         final_score = scores[final_label]
         return {"label": final_label, "score": final_score}
     def preprocess_text(text):
-        text = re.sub(r'http\S+', 'http', text)  # Replace URLs with 'http'
-        text = re.sub(r'@\w+', '@user', text)     # Replace user mentions with '@user'
         return text
     #-----------------------------------------------------------------------#
-    # --- Sentiment Analysis Section ---
     with st.spinner("Loading Sentiment Pipeline..."):
-        tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
-        model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest",
-                                                                  use_auth_token=st.secrets["hugging_face_token"])
-        sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=-1)
-        st.write("sentiment pipeline loaded...")
-        max_tokens = tokenizer.model_max_length
-        if max_tokens > 10000:
-            max_tokens = 512
     with st.spinner("Doing Sentiment Analysis..."):
-        # Apply sentiment analysis to Title directly (assuming Title is short)
         df['title_sentiment'] = df['Title'].apply(lambda x: safe_sentiment(preprocess_text(x)) if x else None)
-        # Apply sentiment analysis to Detail by splitting into token-limited chunks and accumulating scores
         df['detail_sentiment'] = df['Detail'].apply(lambda x: analyze_detail(x, tokenizer, sentiment_pipeline, max_tokens) if x else None)
         df["Title_sentiment_label"] = df["title_sentiment"].apply(lambda x: x["label"] if x else None)
@@ -161,62 +138,54 @@ if st.button("Scrape and Sentiment Analysis"):
         df["Detail_sentiment_label"] = df["detail_sentiment"].apply(lambda x: x["label"] if x else None)
         df["Detail_sentiment_score"] = df["detail_sentiment"].apply(lambda x: x["score"] if x else None)
         df = df.drop(["title_sentiment", "detail_sentiment"], axis=1)
-        cols = ["Title", "Title_sentiment_label", "Title_sentiment_score", \
-                            "Detail", "Detail_sentiment_label", "Detail_sentiment_score", "Date"]
         df = df[cols]
         st.session_state["df"] = df
 if st.button("Draw Graph"):
     df = st.session_state.get("df")
-# ------------------- Plot Title's Sentiment Score -------------------#
-    fig1, ax1 = plt.subplots(figsize=(10, 5))
-    # Filter by positive, negative, and neutral labels and plot each
-    positive_title = df[df["Title_sentiment_label"].str.lower() == "positive"]
-    negative_title = df[df["Title_sentiment_label"].str.lower() == "negative"]
-    neutral_title  = df[df["Title_sentiment_label"].str.lower() == "neutral"]
-    ax1.plot(positive_title["Date"], positive_title["Title_sentiment_score"],
-                                            marker="o", label="Title Positive", color="orange")
-    ax1.plot(negative_title["Date"], negative_title["Title_sentiment_score"],
-                                            marker="o", label="Title Negative", color="blue")
-    ax1.plot(neutral_title["Date"], neutral_title["Title_sentiment_score"],
-                                            marker="o", label="Title Neutral", color="yellowgreen")
-    ax1.set_title("Title Sentiment Scores Over Time")
-    ax1.set_xlabel("Time")
-    ax1.set_ylabel("Sentiment Score")
-    ax1.legend()
-    plt.xticks(rotation=45)
-    st.pyplot(fig1)
-# ------------------- Plot Detail's Sentiment Score -------------------#
-    fig2, ax2 = plt.subplots(figsize=(10, 5))
-    positive_detail = df[df["Detail_sentiment_label"].str.lower() == "positive"]
-    negative_detail = df[df["Detail_sentiment_label"].str.lower() == "negative"]
-    neutral_detail  = df[df["Detail_sentiment_label"].str.lower() == "neutral"]
-    ax2.plot(positive_detail["Date"], positive_detail["Detail_sentiment_score"],
-                                            marker="+", label="Detail Positive", color="darkorange")
-    ax2.plot(negative_detail["Date"], negative_detail["Detail_sentiment_score"],
-                                            marker="+", label="Detail Negative", color="navy")
-    ax2.plot(neutral_detail["Date"], neutral_detail["Detail_sentiment_score"],
-                                            marker="+", label="Detail Neutral", color="forestgreen")
-    ax2.set_title("Detail Sentiment Scores Over Time")
-    ax2.set_xlabel("Time")
-    ax2.set_ylabel("Sentiment Score")
-    ax2.legend()
-    plt.xticks(rotation=45)
-    st.pyplot(fig2)

 # --- User Input ---
 user_query = st.text_input("Enter search keyword:", value="Monster Hunter Wilds")
 if user_query:
     search_query = f'"{user_query}" OR "{user_query.replace(" ", "")}"'
 else:
     search_query = ""
 st.write("Search Query:", search_query)
+# ---------- Cached function for loading the sentiment model pipeline ----------
+@st.cache_resource
+def load_sentiment_pipeline():
+    """Build the sentiment-analysis pipeline and the token limit used for chunking.
+
+    Loads the tokenizer and the sequence-classification model for
+    "cardiffnlp/twitter-roberta-base-sentiment-latest" and wraps them in a
+    "sentiment-analysis" pipeline. The function is decorated with
+    ``st.cache_resource``.
+
+    Returns:
+        tuple: ``(sentiment_pipeline, tokenizer, max_tokens)`` where
+        ``max_tokens`` is ``tokenizer.model_max_length``, replaced by 512
+        when the reported value is greater than 10000.
+    """
+    tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
+    # Only the model load passes the Hugging Face token read from st.secrets.
+    # NOTE(review): newer transformers releases reportedly prefer `token=` over
+    # `use_auth_token=` — confirm against the version this app pins.
+    model = AutoModelForSequenceClassification.from_pretrained(
+        "cardiffnlp/twitter-roberta-base-sentiment-latest",
+        use_auth_token=st.secrets["hugging_face_token"]
+    )
+    sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=-1)
+    max_tokens = tokenizer.model_max_length
+    # NOTE(review): presumably guards against tokenizers that report a huge
+    # placeholder value when no real limit is configured — confirm.
+    if max_tokens > 10000:
+        max_tokens = 512
+    return sentiment_pipeline, tokenizer, max_tokens
+# ---------- Cached function for scraping Reddit data ----------
+@st.cache_data(show_spinner=False)
+def scrape_reddit_data(search_query, total_limit):
+    # Retrieve API credentials from st.secrets
     reddit = praw.Reddit(
+        client_id=st.secrets["reddit_client_id"],
+        client_secret=st.secrets["reddit_client_secret"],
+        user_agent=st.secrets["reddit_user_agent"]
     )
     subreddit = reddit.subreddit("all")
     posts_data = []
+    # Iterate over submissions based on the search query and limit
     for i, submission in enumerate(subreddit.search(search_query, sort="relevance", limit=total_limit)):
+        # No UI updates here as caching does not allow live progress updates
         if submission.title and submission.selftext:
             posts_data.append([
+                submission.title,
+                submission.url,
+                submission.created_utc,
+                submission.selftext,
             ])
+            time.sleep(0.25)
     df = pd.DataFrame(posts_data, columns=["Title", "URL", "Date", "Detail"])
+    # Function to normalize text by replacing multiple spaces/newlines with a single space
     def normalize_text(text):
         if not isinstance(text, str):
             return ""
         return re.sub(r'\s+', ' ', text).strip()
     for col in ["Title", "Detail"]:
         df[col] = df[col].apply(normalize_text)
+    # Filter out rows with empty Title or Detail
     df = df[(df["Title"] != "") & (df["Detail"] != "")]
     df['Date'] = pd.to_datetime(df['Date'], unit='s')
     df = df.sort_values(by="Date", ascending=True).reset_index(drop=True)
+    return df
+# Button to trigger scraping and sentiment analysis
+if st.button("Scrape and Sentiment Analysis"):
+    progress_bar = st.progress(0)
+    progress_text = st.empty()
+    total_limit = 5000  # Maximum number of submissions to check
+    # Cached scraping; if the same search query is used, cached results are returned
+    df = scrape_reddit_data(search_query, total_limit)
     progress_text.text(f"Collected {len(df)} valid posts.")
     st.session_state["df"] = df
     # ------------------ Sentiment Analysis Functions ------------------------#
     def split_text_by_token_limit(text, tokenizer, max_tokens):
         tokens = tokenizer.encode(text, add_special_tokens=False)
             chunks.append(chunk_text)
         return chunks
     def safe_sentiment(text):
         try:
             result = sentiment_pipeline(text)[0]
             result = None
         return result
     def analyze_detail(text, tokenizer, sentiment_pipeline, max_tokens):
         text = preprocess_text(text)
         chunks = split_text_by_token_limit(text, tokenizer, max_tokens)
         if not chunks:
             return None
+        # Initialize accumulated scores for each sentiment category
         scores = {"POSITIVE": 0, "NEGATIVE": 0, "NEUTRAL": 0}
         for chunk in chunks:
             result = safe_sentiment(chunk)
         final_label = max(scores, key=lambda k: scores[k])
         final_score = scores[final_label]
         return {"label": final_label, "score": final_score}
     def preprocess_text(text):
+        # Replace URLs and user mentions
+        text = re.sub(r'http\S+', 'http', text)
+        text = re.sub(r'@\w+', '@user', text)
         return text
     #-----------------------------------------------------------------------#
+    # Load the sentiment pipeline (cached)
     with st.spinner("Loading Sentiment Pipeline..."):
+        sentiment_pipeline, tokenizer, max_tokens = load_sentiment_pipeline()
+        st.write("Sentiment pipeline loaded...")
+    # Perform sentiment analysis
     with st.spinner("Doing Sentiment Analysis..."):
+        # Analyze Title sentiment directly (assuming the title is short)
         df['title_sentiment'] = df['Title'].apply(lambda x: safe_sentiment(preprocess_text(x)) if x else None)
+        # Analyze Detail sentiment by splitting into token-limited chunks and accumulating scores
         df['detail_sentiment'] = df['Detail'].apply(lambda x: analyze_detail(x, tokenizer, sentiment_pipeline, max_tokens) if x else None)
         df["Title_sentiment_label"] = df["title_sentiment"].apply(lambda x: x["label"] if x else None)
         df["Detail_sentiment_label"] = df["detail_sentiment"].apply(lambda x: x["label"] if x else None)
         df["Detail_sentiment_score"] = df["detail_sentiment"].apply(lambda x: x["score"] if x else None)
+        # Drop intermediate columns
         df = df.drop(["title_sentiment", "detail_sentiment"], axis=1)
+        cols = ["Title", "Title_sentiment_label", "Title_sentiment_score",
+                "Detail", "Detail_sentiment_label", "Detail_sentiment_score", "Date"]
         df = df[cols]
         st.session_state["df"] = df
+# Button to draw graphs
 if st.button("Draw Graph"):
     df = st.session_state.get("df")
+    if df is None or df.empty:
+        st.write("Please run 'Scrape and Sentiment Analysis' first.")
+    else:
+        # ------------------- Plot Title's Sentiment Score -------------------#
+        fig1, ax1 = plt.subplots(figsize=(10, 5))
+        # Filter and plot for each sentiment category
+        positive_title = df[df["Title_sentiment_label"].str.lower() == "positive"]
+        negative_title = df[df["Title_sentiment_label"].str.lower() == "negative"]
+        neutral_title  = df[df["Title_sentiment_label"].str.lower() == "neutral"]
+        ax1.plot(positive_title["Date"], positive_title["Title_sentiment_score"],
+                 marker="o", label="Title Positive", color="orange")
+        ax1.plot(negative_title["Date"], negative_title["Title_sentiment_score"],
+                 marker="o", label="Title Negative", color="blue")
+        ax1.plot(neutral_title["Date"], neutral_title["Title_sentiment_score"],
+                 marker="o", label="Title Neutral", color="yellowgreen")
+        ax1.set_title("Title Sentiment Scores Over Time")
+        ax1.set_xlabel("Time")
+        ax1.set_ylabel("Sentiment Score")
+        ax1.legend()
+        plt.xticks(rotation=45)
+        st.pyplot(fig1)
+        # ------------------- Plot Detail's Sentiment Score -------------------#
+        fig2, ax2 = plt.subplots(figsize=(10, 5))
+        positive_detail = df[df["Detail_sentiment_label"].str.lower() == "positive"]
+        negative_detail = df[df["Detail_sentiment_label"].str.lower() == "negative"]
+        neutral_detail  = df[df["Detail_sentiment_label"].str.lower() == "neutral"]
+        ax2.plot(positive_detail["Date"], positive_detail["Detail_sentiment_score"],
+                 marker="+", label="Detail Positive", color="darkorange")
+        ax2.plot(negative_detail["Date"], negative_detail["Detail_sentiment_score"],
+                 marker="+", label="Detail Negative", color="navy")
+        ax2.plot(neutral_detail["Date"], neutral_detail["Detail_sentiment_score"],
+                 marker="+", label="Detail Neutral", color="forestgreen")
+        ax2.set_title("Detail Sentiment Scores Over Time")
+        ax2.set_xlabel("Time")
+        ax2.set_ylabel("Sentiment Score")
+        ax2.legend()
+        plt.xticks(rotation=45)
+        st.pyplot(fig2)