kusa04 committed
Commit 35ad6e4 · verified · 1 Parent(s): 188fc65

Update app.py

Files changed (1)
  1. app.py +102 -82
app.py CHANGED
@@ -1,11 +1,31 @@
  import matplotlib.pyplot as plt
  import pandas as pd
  import praw # Reddit's API
  import re # Regular expression module
  import streamlit as st
  import time
- from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
- from function import split_text_by_token_limit, safe_sentiment, analyze_detail, preprocess_text

  st.title("Reddit Scraping & Sentiment Analysis")

@@ -18,57 +38,57 @@ else:
  st.write("Search Query:", search_query)

  # ---------- Cached function for loading the sentiment model pipeline ----------
- @st.cache_resource
- def load_sentiment_pipeline():
-     tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
-     model = AutoModelForSequenceClassification.from_pretrained(
-         "cardiffnlp/twitter-roberta-base-sentiment-latest",
-         use_auth_token=st.secrets["hugging_face_token"]
-     )
-     sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=0) # -1 to 0
-     max_tokens = tokenizer.model_max_length
-     if max_tokens > 10000:
-         max_tokens = 512
-     return sentiment_pipeline, tokenizer, max_tokens

  # ---------- Cached function for scraping Reddit data ----------
- @st.cache_data(show_spinner=False)
- def scrape_reddit_data(search_query, total_limit):
-     # Retrieve API credentials from st.secrets
-     reddit = praw.Reddit(
-         client_id=st.secrets["reddit_client_id"],
-         client_secret=st.secrets["reddit_client_secret"],
-         user_agent=st.secrets["reddit_user_agent"]
-     )
-     subreddit = reddit.subreddit("all")
-     posts_data = []
-     # Iterate over submissions based on the search query and limit
-     for i, submission in enumerate(subreddit.search(search_query, sort="relevance", limit=total_limit)):
-         # No UI updates here as caching does not allow live progress updates
-         if submission.title and submission.selftext:
-             posts_data.append([
-                 submission.title,
-                 submission.url,
-                 submission.created_utc,
-                 submission.selftext,
-             ])
-         time.sleep(0.25)
-     df = pd.DataFrame(posts_data, columns=["Title", "URL", "Date", "Detail"])

      # Function to normalize text by replacing multiple spaces/newlines with a single space
-     def normalize_text(text):
-         if not isinstance(text, str):
-             return ""
-         return re.sub(r'\s+', ' ', text).strip()

-     for col in ["Title", "Detail"]:
-         df[col] = df[col].apply(normalize_text)

-     # Filter out rows with empty Title or Detail
-     df = df[(df["Title"] != "") & (df["Detail"] != "")]
-     df['Date'] = pd.to_datetime(df['Date'], unit='s')
-     df = df.sort_values(by="Date", ascending=True).reset_index(drop=True)
-     return df

  # Button to trigger scraping and sentiment analysis
  if st.button("Scrape and Sentiment Analysis"):
@@ -82,44 +102,44 @@ if st.button("Scrape and Sentiment Analysis"):
      st.session_state["df"] = df

  # ------------------ Sentiment Analysis Functions ------------------------#
- def split_text_by_token_limit(text, tokenizer, max_tokens):
-     tokens = tokenizer.encode(text, add_special_tokens=False)
-     chunks = []
-     for i in range(0, len(tokens), max_tokens):
-         chunk_tokens = tokens[i:i+max_tokens]
-         chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
-         chunks.append(chunk_text)
-     return chunks

- def safe_sentiment(text):
-     try:
-         result = sentiment_pipeline(text)[0]
-     except Exception as e:
-         result = None
-     return result

- def analyze_detail(text, tokenizer, sentiment_pipeline, max_tokens):
-     text = preprocess_text(text)
-     chunks = split_text_by_token_limit(text, tokenizer, max_tokens)
-     if not chunks:
-         return None
-     # Initialize accumulated scores for each sentiment category
-     scores = {"POSITIVE": 0, "NEGATIVE": 0, "NEUTRAL": 0}
-     for chunk in chunks:
-         result = safe_sentiment(chunk)
-         if result is not None:
-             label = result['label'].upper()
-             if label in scores:
-                 scores[label] += result['score']
-     final_label = max(scores, key=lambda k: scores[k])
-     final_score = scores[final_label]
-     return {"label": final_label, "score": final_score}

- def preprocess_text(text):
-     # Replace URLs and user mentions
-     text = re.sub(r'http\S+', 'http', text)
-     text = re.sub(r'@\w+', '@user', text)
-     return text
  #-----------------------------------------------------------------------#

  # Load the sentiment pipeline (cached)
 
+ from collections import Counter
  import matplotlib.pyplot as plt
  import pandas as pd
  import praw # Reddit's API
  import re # Regular expression module
  import streamlit as st
  import time
+ import numpy as np
+ from wordcloud import WordCloud
+ from transformers import (
+     pipeline,
+     AutoTokenizer,
+     AutoModelForSequenceClassification,
+     AutoModelForTokenClassification,
+     TokenClassificationPipeline
+ )
+ from transformers.pipelines import AggregationStrategy
+
+ from functions import (
+     load_sentiment_pipeline,
+     KeyphraseExtractionPipeline,
+     keyword_extractor,
+     scrape_reddit_data,
+     split_text_by_token_limit,
+     safe_sentiment,
+     analyze_detail,
+     preprocess_text
+ )
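KeyphraseExtractionPipeline, keyword_extractor, and the other helpers now live in functions.py, which is not included in this diff. Given the newly added TokenClassificationPipeline, AggregationStrategy, and numpy imports, a minimal sketch of what the keyphrase extractor likely looks like is shown below; the checkpoint name is an assumption, not taken from the repository.

```python
# Hypothetical sketch of the keyphrase extractor in functions.py (file not shown
# in this commit). It follows the common pattern of subclassing
# TokenClassificationPipeline and aggregating sub-tokens into whole phrases.
import numpy as np
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    TokenClassificationPipeline,
)
from transformers.pipelines import AggregationStrategy


class KeyphraseExtractionPipeline(TokenClassificationPipeline):
    def __init__(self, model, **kwargs):
        super().__init__(
            model=AutoModelForTokenClassification.from_pretrained(model),
            tokenizer=AutoTokenizer.from_pretrained(model),
            **kwargs,
        )

    def postprocess(self, all_outputs):
        # Merge sub-token predictions into whole keyphrases and deduplicate them
        results = super().postprocess(
            all_outputs=all_outputs,
            aggregation_strategy=AggregationStrategy.FIRST,
        )
        return np.unique([r.get("word").strip() for r in results])


# Module-level instance so app.py can simply `from functions import keyword_extractor`;
# the checkpoint below is an assumed keyphrase-extraction model, not confirmed by the repo.
keyword_extractor = KeyphraseExtractionPipeline(model="ml6team/keyphrase-extraction-distilbert-inspec")
```

app.py would then presumably call keyword_extractor(text) on each post to get an array of phrases.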

  st.title("Reddit Scraping & Sentiment Analysis")


  st.write("Search Query:", search_query)

  # ---------- Cached function for loading the sentiment model pipeline ----------
+ # @st.cache_resource
+ # def load_sentiment_pipeline():
+ #     tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
+ #     model = AutoModelForSequenceClassification.from_pretrained(
+ #         "cardiffnlp/twitter-roberta-base-sentiment-latest",
+ #         use_auth_token=st.secrets["hugging_face_token"]
+ #     )
+ #     sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=0) # -1 to 0
+ #     max_tokens = tokenizer.model_max_length
+ #     if max_tokens > 10000:
+ #         max_tokens = 512
+ #     return sentiment_pipeline, tokenizer, max_tokens

  # ---------- Cached function for scraping Reddit data ----------
+ # @st.cache_data(show_spinner=False)
+ # def scrape_reddit_data(search_query, total_limit):
+ #     # Retrieve API credentials from st.secrets
+ #     reddit = praw.Reddit(
+ #         client_id=st.secrets["reddit_client_id"],
+ #         client_secret=st.secrets["reddit_client_secret"],
+ #         user_agent=st.secrets["reddit_user_agent"]
+ #     )
+ #     subreddit = reddit.subreddit("all")
+ #     posts_data = []
+ #     # Iterate over submissions based on the search query and limit
+ #     for i, submission in enumerate(subreddit.search(search_query, sort="relevance", limit=total_limit)):
+ #         # No UI updates here as caching does not allow live progress updates
+ #         if submission.title and submission.selftext:
+ #             posts_data.append([
+ #                 submission.title,
+ #                 submission.url,
+ #                 submission.created_utc,
+ #                 submission.selftext,
+ #             ])
+ #         time.sleep(0.25)
+ #     df = pd.DataFrame(posts_data, columns=["Title", "URL", "Date", "Detail"])

      # Function to normalize text by replacing multiple spaces/newlines with a single space
+ #     def normalize_text(text):
+ #         if not isinstance(text, str):
+ #             return ""
+ #         return re.sub(r'\s+', ' ', text).strip()

+ #     for col in ["Title", "Detail"]:
+ #         df[col] = df[col].apply(normalize_text)

+ #     # Filter out rows with empty Title or Detail
+ #     df = df[(df["Title"] != "") & (df["Detail"] != "")]
+ #     df['Date'] = pd.to_datetime(df['Date'], unit='s')
+ #     df = df.sort_values(by="Date", ascending=True).reset_index(drop=True)
+ #     return df

  # Button to trigger scraping and sentiment analysis
  if st.button("Scrape and Sentiment Analysis"):

      st.session_state["df"] = df

  # ------------------ Sentiment Analysis Functions ------------------------#
+ # def split_text_by_token_limit(text, tokenizer, max_tokens):
+ #     tokens = tokenizer.encode(text, add_special_tokens=False)
+ #     chunks = []
+ #     for i in range(0, len(tokens), max_tokens):
+ #         chunk_tokens = tokens[i:i+max_tokens]
+ #         chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
+ #         chunks.append(chunk_text)
+ #     return chunks

+ # def safe_sentiment(text):
+ #     try:
+ #         result = sentiment_pipeline(text)[0]
+ #     except Exception as e:
+ #         result = None
+ #     return result

+ # def analyze_detail(text, tokenizer, sentiment_pipeline, max_tokens):
+ #     text = preprocess_text(text)
+ #     chunks = split_text_by_token_limit(text, tokenizer, max_tokens)
+ #     if not chunks:
+ #         return None
+ #     # Initialize accumulated scores for each sentiment category
+ #     scores = {"POSITIVE": 0, "NEGATIVE": 0, "NEUTRAL": 0}
+ #     for chunk in chunks:
+ #         result = safe_sentiment(chunk)
+ #         if result is not None:
+ #             label = result['label'].upper()
+ #             if label in scores:
+ #                 scores[label] += result['score']
+ #     final_label = max(scores, key=lambda k: scores[k])
+ #     final_score = scores[final_label]
+ #     return {"label": final_label, "score": final_score}

+ # def preprocess_text(text):
+ #     # Replace URLs and user mentions
+ #     text = re.sub(r'http\S+', 'http', text)
+ #     text = re.sub(r'@\w+', '@user', text)
+ #     return text
  #-----------------------------------------------------------------------#

  # Load the sentiment pipeline (cached)
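The added Counter, numpy, and WordCloud imports point to a keyword-frequency visualization further down in app.py, outside the hunks shown in this commit. A minimal, purely illustrative sketch of what such a Streamlit block could look like, assuming the phrases come from keyword_extractor above (the function name render_keyword_cloud and its argument are hypothetical):

```python
# Illustrative only: aggregate keyphrases across posts and render a word cloud.
from collections import Counter

import matplotlib.pyplot as plt
import streamlit as st
from wordcloud import WordCloud


def render_keyword_cloud(keyphrases_per_post):
    # keyphrases_per_post: list of lists of strings, one list per Reddit post
    counts = Counter(kp.lower() for kps in keyphrases_per_post for kp in kps)
    if not counts:
        st.info("No keyphrases extracted.")
        return
    wc = WordCloud(width=800, height=400, background_color="white")
    wc.generate_from_frequencies(counts)
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(wc, interpolation="bilinear")
    ax.axis("off")
    st.pyplot(fig)
```

generate_from_frequencies accepts any word-to-count mapping, so a Counter can be passed to WordCloud directly.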