Spaces:

kusa04
/

g13_DL_project

Sleeping

App Files Files Community

kusa04 committed on Mar 14

Commit

ff461da

verified ·

1 Parent(s): 9f96692

Update app.py

Browse files

Files changed (1) hide show

app.py +165 -71

app.py CHANGED Viewed

@@ -1,75 +1,169 @@
 import streamlit as st
-import praw
 import pandas as pd
 import time
-# ----- Reddit APIの認証情報 -----
-CLIENT_ID = "Yo1i-hMOZshiGY3whJpHhA"         # personal use script
-CLIENT_SECRET = "K6TsiBiNzg-RV8akXNLaXwLyQkMo8A"  # secret
-USER_AGENT = "sotaro_ism"        # ユニークなuser_agent（例: "my-reddit-script v1.0 by /u/YourRedditUsername"）
-# PRAWの設定
-reddit = praw.Reddit(
-    client_id=CLIENT_ID,
-    client_secret=CLIENT_SECRET,
-    user_agent=USER_AGENT
-)
-# Streamlitのタイトル・説明
-st.title("Reddit 検索: Monster Hunter Wilds")
-st.write("Redditから『Monster Hunter Wilds OR MHWs OR MHWS』に関する投稿を取得しています。")
-# 検索クエリとサブレディットの指定
-search_query = "Monster Hunter Wilds OR MHWs OR MHWS"
-subreddit = reddit.subreddit("all")
-# 取得件数の設定（例として100件）
-limit = 50
-# 結果を格納するリスト
-posts_data = []
-# Streamlitのプログレスバーと状態表示
-progress_bar = st.progress(0)
-status_text = st.empty()
-# 検索結果を1件ずつ処理しながら進捗更新
-for i, submission in enumerate(subreddit.search(search_query, sort="new", limit=limit)):
-    # 投稿情報の取得
-    title = submission.title
-    url = submission.url
-    score = submission.score
-    author = str(submission.author)  # 投稿者（Noneの場合もあるため文字列化）
-    created_utc = submission.created_utc
-    submission_text = submission.selftext
-    # コメントの取得（replace_moreで全コメント展開）
-    submission.comments.replace_more(limit=None)
-    comments_list = [comment.body for comment in submission.comments.list()]
-    # コメントは区切り文字で連結
-    delimiter = "|||END|||"
-    comments_text = delimiter.join(comments_list)
-    # 取得した投稿情報をリストに追加
-    posts_data.append([
-        title,       # タイトル
-        url,         # URL
-        score,       # スコア
-        author,      # 投稿者
-        created_utc, # 投稿日時(UTC)
-        submission_text,  # 投稿本文
-        comments_text     # コメント一覧
-    ])
-    # プログレスバーの更新（i+1件目を処理したら進捗率を更新）
-    progress_bar.progress((i + 1) / limit)
-    status_text.text(f"Processed {i + 1} / {limit} submissions")
-    # サーバー負荷対策として少し待機
-    time.sleep(2)
-# DataFrameに変換して結果を表示
-df = pd.DataFrame(posts_data, columns=["Title", "URL", "Score", "Poster", "Date", "Detail", "Comment"])
-st.write("### 上位5件の結果")
-st.dataframe(df.head())

 import streamlit as st
+import praw  # Reddit's API
 import pandas as pd
+import re  # Regular expression module
 import time
+from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
+st.title("Reddit Scraping & Sentiment Analysis")
+# --- User Input ---
+user_query = st.text_input("Enter search keyword:", value="Monster Hunter Wilds")
+# Build the search query from the user input:
+#   * embedded double quotes are dropped, because the phrase itself is wrapped in quotes;
+#   * surrounding whitespace is stripped, so a blank entry gives an empty query instead
+#     of an empty quoted phrase;
+#   * the variant without spaces is only added when it differs from the phrase, so a
+#     one-word keyword does not produce `"X" OR "X"`.
+phrase = user_query.replace('"', "").strip()
+compact = phrase.replace(" ", "")
+if not phrase:
+    search_query = ""
+elif compact == phrase:
+    search_query = f'"{phrase}"'
+else:
+    search_query = f'"{phrase}" OR "{compact}"'
+st.write("Search Query:", search_query)
+# --- Scraping Section ---
+if st.button("Scrape Reddit"):
+    import os  # local import: only the credential lookup below needs it
+    # Stop this run early rather than sending an empty query to Reddit.
+    if not search_query:
+        st.warning("Please enter a search keyword before scraping.")
+        st.stop()
+    # API Information
+    # Credentials are read from environment variables (configure them as secrets of the
+    # deployment) so that they are never committed to the repository.
+    # NOTE(review): the client secret that used to be hard-coded here is public in the
+    # repository history and should be revoked and regenerated.
+    CLIENT_ID = os.environ.get("REDDIT_CLIENT_ID")
+    CLIENT_SECRET = os.environ.get("REDDIT_CLIENT_SECRET")
+    USER_AGENT = os.environ.get("REDDIT_USER_AGENT", "sotaro_ism")
+    if not (CLIENT_ID and CLIENT_SECRET):
+        st.error("Reddit API credentials are missing. Set REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET.")
+        st.stop()
+    # Set up a progress bar and status text
+    progress_bar = st.progress(0)
+    progress_text = st.empty()
+    # Setting up PRAW
+    reddit = praw.Reddit(
+        client_id=CLIENT_ID,
+        client_secret=CLIENT_SECRET,
+        user_agent=USER_AGENT
+    )
+    subreddit = reddit.subreddit("all")
+    posts_data = []
+    total_limit = 5000  # maximum number of submissions to check
+    # Loop through submissions, update progress bar based on iteration count
+    for i, submission in enumerate(subreddit.search(search_query, sort="relevance", limit=total_limit)):
+        # Update progress (measured against the requested maximum, not the real result count)
+        progress = (i + 1) / total_limit
+        progress_bar.progress(progress)
+        progress_text.text(f"Scraping... {progress*100:.2f}%")
+        # Filtering out posts that appear to be image/video only by checking if title and selftext exist.
+        if submission.title and submission.selftext:
+            title = submission.title
+            created_utc = submission.created_utc
+            submission_text = submission.selftext
+            posts_data.append([
+                title,             # Original Title
+                submission.url,    # Original URL (case preserved)
+                created_utc,       # Date (UTC)
+                submission_text,   # Detail (main text)
+            ])
+        # Short pause between submissions to throttle the loop
+        time.sleep(0.25)
+    # The search may end before total_limit items were seen, so finish the bar explicitly.
+    progress_bar.progress(1.0)
+    progress_text.text("Scraping complete.")
+    # Convert the collected posts into a DataFrame
+    df = pd.DataFrame(posts_data, columns=["Title", "URL", "Date", "Detail"])
+    progress_text.text(f"Collected {len(df)} valid posts.")
+    # Define a text normalization function that replaces multiple spaces/newlines with a single space
+    def normalize_text(text):
+        """Collapse every run of whitespace in `text` into one space and trim the ends.
+
+        Anything that is not a string is mapped to the empty string.
+        """
+        if isinstance(text, str):
+            collapsed = re.sub(r'\s+', ' ', text)
+            return collapsed.strip()
+        return ""
+    # Apply normalization to the "Title" and "Detail" columns
+    for col in ["Title", "Detail"]:
+        df[col] = df[col].apply(normalize_text)
+    # Filter DataFrame to include only rows where Title and Detail are non-empty.
+    # .copy() makes the result an independent frame, so the column assignment below
+    # does not trigger pandas' chained-assignment warning.
+    df = df[(df["Title"] != "") & (df["Detail"] != "")].copy()
+    st.write(f"Collected {len(df)} valid posts.")
+    # Nothing left to analyse: end this run before the sentiment model is loaded.
+    if df.empty:
+        st.info("No text posts were found for this query.")
+        st.stop()
+    # Convert Date (epoch seconds) to datetime, sort descending and reset index
+    df['Date'] = pd.to_datetime(df['Date'], unit='s')
+    df = df.sort_values(by="Date", ascending=False).reset_index(drop=True)
+    # --- Sentiment Analysis Section ---
+    with st.spinner("Loading sentiment pipeline..."):
+        model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        model = AutoModelForSequenceClassification.from_pretrained(model_name)
+        # device=-1 runs the pipeline on CPU
+        sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=-1)
+    # Maximum number of tokens per chunk of text.
+    max_tokens = tokenizer.model_max_length
+    # Presumably guards against tokenizers that report a huge placeholder value when no
+    # real limit is configured; fall back to 512 in that case.
+    if max_tokens > 10000:
+        max_tokens = 512
+    # Chunks are cut from token ids encoded WITHOUT special tokens, but the pipeline adds
+    # the special tokens back when it tokenizes each chunk. Reserve room for them so that
+    # a full-size chunk still fits within the model limit.
+    max_tokens = max(1, max_tokens - tokenizer.num_special_tokens_to_add(pair=False))
+    def preprocess_text(text):
+        """Mask URLs and user mentions in `text` with fixed placeholder tokens.
+
+        Every run starting with "http" up to the next whitespace becomes 'http', and
+        every "@name" mention becomes '@user'.
+        """
+        without_urls = re.sub(r'http\S+', 'http', text)
+        return re.sub(r'@\w+', '@user', without_urls)
+    def split_text_by_token_limit(text, tokenizer, max_tokens):
+        """Split `text` into consecutive pieces of at most `max_tokens` tokens each.
+
+        The text is encoded once without special tokens, the token ids are sliced into
+        windows of `max_tokens`, and each window is decoded back to a string.
+        Returns the list of decoded pieces (empty when the text encodes to no tokens).
+        """
+        token_ids = tokenizer.encode(text, add_special_tokens=False)
+        window_starts = range(0, len(token_ids), max_tokens)
+        return [
+            tokenizer.decode(token_ids[start:start + max_tokens], skip_special_tokens=True)
+            for start in window_starts
+        ]
+    def safe_sentiment(text):
+        """Classify `text` with the sentiment pipeline on a best-effort basis.
+
+        Returns the first result of the pipeline (a dict with 'label' and 'score'), or
+        None when the pipeline raises. Failures are logged instead of being dropped
+        silently, so missing sentiment values can be traced afterwards.
+        """
+        import logging  # local import: only the failure path below needs it
+        try:
+            # Truncate over-long input to the token limit instead of letting the model
+            # fail on it.
+            return sentiment_pipeline(text, truncation=True, max_length=max_tokens)[0]
+        except Exception:
+            # Deliberately broad: one bad text must not abort the whole analysis.
+            logging.getLogger(__name__).exception("Sentiment analysis failed; returning None")
+            return None
+    def analyze_detail(text, tokenizer, sentiment_pipeline, max_tokens):
+        """Estimate the overall sentiment of a long text by scoring it chunk by chunk.
+
+        The text is preprocessed, split into chunks of at most `max_tokens` tokens, and
+        every chunk is classified. The scores are accumulated per label and the label
+        with the highest total wins.
+
+        Parameters:
+            text: the text to analyse.
+            tokenizer: tokenizer used to split the text into chunks.
+            sentiment_pipeline: kept for interface compatibility; classification goes
+                through `safe_sentiment`, which uses the pipeline from the enclosing scope.
+            max_tokens: maximum number of tokens per chunk.
+
+        Returns:
+            {"label": ..., "score": ...}, where the score is the winning label's
+            accumulated score divided by the number of chunks that were scored (so it
+            stays between 0 and 1, and equals the raw score for a single chunk).
+            None when the text yields no chunks or no chunk could be scored.
+        """
+        text = preprocess_text(text)
+        chunks = split_text_by_token_limit(text, tokenizer, max_tokens)
+        if not chunks:
+            return None
+        # Initialize score accumulation for each sentiment category
+        scores = {"POSITIVE": 0.0, "NEGATIVE": 0.0, "NEUTRAL": 0.0}
+        scored_chunks = 0
+        for chunk in chunks:
+            result = safe_sentiment(chunk)
+            if result is None:
+                continue
+            label = result['label'].upper()
+            if label in scores:
+                scores[label] += result['score']
+                scored_chunks += 1
+        # Without a single scored chunk there is no evidence for any label; reporting
+        # the first key with a score of 0 would be misleading.
+        if scored_chunks == 0:
+            return None
+        final_label = max(scores, key=scores.get)
+        final_score = scores[final_label] / scored_chunks
+        return {"label": final_label, "score": final_score}
+    # Score each title in one call (titles are assumed to be short enough to need no chunking)
+    def _score_title(title):
+        return safe_sentiment(preprocess_text(title)) if title else None
+    # Score each detail text chunk by chunk, accumulating the scores per label
+    def _score_detail(detail):
+        if not detail:
+            return None
+        return analyze_detail(detail, tokenizer, sentiment_pipeline, max_tokens)
+    df['title_sentiment'] = df['Title'].apply(_score_title)
+    df['detail_sentiment'] = df['Detail'].apply(_score_detail)
+    # Unpack every result dict into flat "<Prefix>_sentiment_label" / "<Prefix>_sentiment_score" columns
+    for prefix, raw_col in (("Title", "title_sentiment"), ("Detail", "detail_sentiment")):
+        for field in ("label", "score"):
+            # field=field binds the current value; a plain closure would only see the last one
+            df[f"{prefix}_sentiment_{field}"] = df[raw_col].apply(lambda res, field=field: res[field] if res else None)
+    # The raw result dicts are no longer needed once they are unpacked
+    df = df.drop(columns=["title_sentiment", "detail_sentiment"])
+    # Final column order for display
+    cols = ["Title", "Title_sentiment_label", "Title_sentiment_score", "Detail", "Detail_sentiment_label", "Detail_sentiment_score", "Date"]
+    df = df[cols]
+    st.write("Sentiment analysis complete. Top 5 results:")
+    st.dataframe(df.head())