kusa04 committed
Commit ff461da · verified · 1 Parent(s): 9f96692

Update app.py

Files changed (1):
  1. app.py +165 -71
app.py CHANGED
@@ -1,75 +1,169 @@
  import streamlit as st
- import praw
  import pandas as pd
  import time

- # ----- Reddit API credentials -----
- CLIENT_ID = "Yo1i-hMOZshiGY3whJpHhA"  # personal use script
- CLIENT_SECRET = "K6TsiBiNzg-RV8akXNLaXwLyQkMo8A"  # secret
- USER_AGENT = "sotaro_ism"  # unique user_agent (e.g., "my-reddit-script v1.0 by /u/YourRedditUsername")
-
- # PRAW setup
- reddit = praw.Reddit(
-     client_id=CLIENT_ID,
-     client_secret=CLIENT_SECRET,
-     user_agent=USER_AGENT
- )
-
- # Streamlit title and description
- st.title("Reddit Search: Monster Hunter Wilds")
- st.write("Fetching posts about 'Monster Hunter Wilds OR MHWs OR MHWS' from Reddit.")
-
- # Search query and target subreddit
- search_query = "Monster Hunter Wilds OR MHWs OR MHWS"
- subreddit = reddit.subreddit("all")
-
- # Number of submissions to fetch (50 here)
- limit = 50
-
- # List to store the results
- posts_data = []
-
- # Streamlit progress bar and status display
- progress_bar = st.progress(0)
- status_text = st.empty()
-
- # Process search results one at a time, updating progress as we go
- for i, submission in enumerate(subreddit.search(search_query, sort="new", limit=limit)):
-     # Collect submission information
-     title = submission.title
-     url = submission.url
-     score = submission.score
-     author = str(submission.author)  # Author (stringified because it can be None)
-     created_utc = submission.created_utc
-     submission_text = submission.selftext
-
-     # Fetch comments (replace_more expands all "load more" comment stubs)
-     submission.comments.replace_more(limit=None)
-     comments_list = [comment.body for comment in submission.comments.list()]
-
-     # Join the comments with a delimiter
-     delimiter = "|||END|||"
-     comments_text = delimiter.join(comments_list)
-
-     # Append the collected submission info to the list
-     posts_data.append([
-         title,            # Title
-         url,              # URL
-         score,            # Score
-         author,           # Author
-         created_utc,      # Posted date (UTC)
-         submission_text,  # Post body
-         comments_text     # Comments
-     ])
-
-     # Update the progress bar after processing the (i + 1)-th submission
-     progress_bar.progress((i + 1) / limit)
-     status_text.text(f"Processed {i + 1} / {limit} submissions")
-
-     # Wait briefly to reduce server load
-     time.sleep(2)
-
- # Convert to a DataFrame and display the results
- df = pd.DataFrame(posts_data, columns=["Title", "URL", "Score", "Poster", "Date", "Detail", "Comment"])
- st.write("### Top 5 results")
- st.dataframe(df.head())
  import streamlit as st
+ import praw  # Reddit API wrapper (PRAW)
  import pandas as pd
+ import re  # Regular expression module
  import time
+ from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

+ st.title("Reddit Scraping & Sentiment Analysis")
+
+ # --- User Input ---
+ user_query = st.text_input("Enter search keyword:", value="Monster Hunter Wilds")
+ # Build the search query: quote the user input and add a variant with the spaces removed
+ if user_query:
+     search_query = f'"{user_query}" OR "{user_query.replace(" ", "")}"'
+ else:
+     search_query = ""
+
+ st.write("Search Query:", search_query)
+
+ # --- Scraping Section ---
+ if st.button("Scrape Reddit"):
+     # Set up a progress bar and status text
+     progress_bar = st.progress(0)
+     progress_text = st.empty()
+
+     # API credentials
+     CLIENT_ID = "Yo1i-hMOZshiGY3whJpHhA"
+     CLIENT_SECRET = "K6TsiBiNzg-RV8akXNLaXwLyQkMo8A"
+     USER_AGENT = "sotaro_ism"
+
+     # Set up PRAW
+     reddit = praw.Reddit(
+         client_id=CLIENT_ID,
+         client_secret=CLIENT_SECRET,
+         user_agent=USER_AGENT
+     )
+
+     subreddit = reddit.subreddit("all")
+
+     posts_data = []
+     total_limit = 5000  # maximum number of submissions to check
+
+     # Loop through submissions, updating the progress bar from the iteration count
+     for i, submission in enumerate(subreddit.search(search_query, sort="relevance", limit=total_limit)):
+         # Update progress
+         progress = (i + 1) / total_limit
+         progress_bar.progress(progress)
+         progress_text.text(f"Scraping... {progress*100:.2f}%")
+
+         # Filter out posts that appear to be image/video only by requiring both a title and selftext
+         if submission.title and submission.selftext:
+             title = submission.title
+             created_utc = submission.created_utc
+             submission_text = submission.selftext
+
+             posts_data.append([
+                 title,            # Original title
+                 submission.url,   # Original URL (case preserved)
+                 created_utc,      # Date (UTC)
+                 submission_text,  # Detail (main text)
+             ])
+
+         time.sleep(0.25)
+
+     progress_text.text("Scraping complete.")
+
+     # Convert the collected posts into a DataFrame
+     df = pd.DataFrame(posts_data, columns=["Title", "URL", "Date", "Detail"])
+     progress_text.text(f"Collected {len(df)} valid posts.")
+
+     # Define a text normalization function that replaces runs of spaces/newlines with a single space
+     def normalize_text(text):
+         if not isinstance(text, str):
+             return ""
+         return re.sub(r'\s+', ' ', text).strip()
+
+     # Apply normalization to the "Title" and "Detail" columns
+     for col in ["Title", "Detail"]:
+         df[col] = df[col].apply(normalize_text)
+
+     # Keep only rows where Title and Detail are non-empty
+     df = df[(df["Title"] != "") & (df["Detail"] != "")]
+
+     st.write(f"Collected {len(df)} valid posts.")
+
+     # Convert Date to datetime, sort descending, and reset the index
+     df['Date'] = pd.to_datetime(df['Date'], unit='s')
+     df = df.sort_values(by="Date", ascending=False).reset_index(drop=True)
+
+     # --- Sentiment Analysis Section ---
+     with st.spinner("Loading sentiment pipeline..."):
+         tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
+         model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
+         sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=-1)
+
+     max_tokens = tokenizer.model_max_length
+     if max_tokens > 10000:  # guard against tokenizers that report a huge sentinel value
+         max_tokens = 512
+
+     def preprocess_text(text):
+         text = re.sub(r'http\S+', 'http', text)  # Replace URLs with 'http'
+         text = re.sub(r'@\w+', '@user', text)    # Replace user mentions with '@user'
+         return text
+
+     def split_text_by_token_limit(text, tokenizer, max_tokens):
+         tokens = tokenizer.encode(text, add_special_tokens=False)
+         chunks = []
+         for i in range(0, len(tokens), max_tokens):
+             chunk_tokens = tokens[i:i+max_tokens]
+             chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
+             chunks.append(chunk_text)
+         return chunks
+
+     def safe_sentiment(text):
+         try:
+             result = sentiment_pipeline(text)[0]
+         except Exception:
+             result = None
+         return result
+
+     def analyze_detail(text, tokenizer, sentiment_pipeline, max_tokens):
+         text = preprocess_text(text)
+         chunks = split_text_by_token_limit(text, tokenizer, max_tokens)
+         if not chunks:
+             return None
+         # Accumulate a score for each sentiment category across chunks
+         scores = {"POSITIVE": 0, "NEGATIVE": 0, "NEUTRAL": 0}
+         for chunk in chunks:
+             result = safe_sentiment(chunk)
+             if result is not None:
+                 label = result['label'].upper()
+                 if label in scores:
+                     scores[label] += result['score']
+         final_label = max(scores, key=lambda k: scores[k])
+         final_score = scores[final_label]
+         return {"label": final_label, "score": final_score}
+
+     # Apply sentiment analysis to Title directly (assuming Title is short)
+     df['title_sentiment'] = df['Title'].apply(lambda x: safe_sentiment(preprocess_text(x)) if x else None)
+
+     # Apply sentiment analysis to Detail by splitting it into token-limited chunks and accumulating scores
+     df['detail_sentiment'] = df['Detail'].apply(lambda x: analyze_detail(x, tokenizer, sentiment_pipeline, max_tokens) if x else None)
+
+     df["Title_sentiment_label"] = df["title_sentiment"].apply(lambda x: x["label"] if x else None)
+     df["Title_sentiment_score"] = df["title_sentiment"].apply(lambda x: x["score"] if x else None)
+     df["Detail_sentiment_label"] = df["detail_sentiment"].apply(lambda x: x["label"] if x else None)
+     df["Detail_sentiment_score"] = df["detail_sentiment"].apply(lambda x: x["score"] if x else None)
+
+     df = df.drop(["title_sentiment", "detail_sentiment"], axis=1)
+     cols = ["Title", "Title_sentiment_label", "Title_sentiment_score", "Detail", "Detail_sentiment_label", "Detail_sentiment_score", "Date"]
+     df = df[cols]
+
+     st.write("Sentiment analysis complete. Top 5 results:")
+     st.dataframe(df.head())
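
For reference, a minimal illustrative sketch of the output shape that safe_sentiment() relies on, assuming the same cardiffnlp/twitter-roberta-base-sentiment-latest checkpoint and the standard transformers pipeline API; the label casing comes from the model's id2label config, which is why analyze_detail() uppercases result['label'] before matching it against the POSITIVE/NEGATIVE/NEUTRAL keys:

from transformers import pipeline

# Illustrative sketch only; assumes the checkpoint used in app.py above.
clf = pipeline(
    "sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment-latest",
    device=-1,  # CPU
)

# The pipeline returns a list with one {'label': ..., 'score': ...} dict per input string.
print(clf("Monster Hunter Wilds looks great")[0])
# Expected shape: {'label': <str>, 'score': <float>}; the exact label string and its
# casing depend on the model config, hence the .upper() call in analyze_detail().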