Update function.py

function.py  CHANGED  +117 -2

@@ -1,3 +1,103 @@
+from collections import Counter
+import matplotlib.pyplot as plt
+import pandas as pd
+import praw  # Reddit's API
+import re  # Regular expression module
+import streamlit as st
+import time
+import numpy as np
+from wordcloud import WordCloud
+from transformers import (
+    pipeline,
+    AutoTokenizer,
+    AutoModelForSequenceClassification,
+    AutoModelForTokenClassification,
+    TokenClassificationPipeline
+)
+from transformers.pipelines import AggregationStrategy
+
+
+# ---------- Cached function for loading the model pipelines ----------
+@st.cache_resource
+def load_sentiment_pipeline():  # sentiment pipeline
+    tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
+    model = AutoModelForSequenceClassification.from_pretrained(
+        "cardiffnlp/twitter-roberta-base-sentiment-latest",
+        use_auth_token=st.secrets["hugging_face_token"]
+    )
+    sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=0)  # -1 to 0
+    max_tokens = tokenizer.model_max_length
+
+    if max_tokens > 10000:
+        max_tokens = 512
+    return sentiment_pipeline, tokenizer, max_tokens
+
+
+# class for keyword extraction
+@st.cache_resource
+class KeyphraseExtractionPipeline(TokenClassificationPipeline):
+    def __init__(self, model, *args, **kwargs):
+        super().__init__(
+            model=AutoModelForTokenClassification.from_pretrained(model),
+            tokenizer=AutoTokenizer.from_pretrained(model),
+            *args,
+            **kwargs
+        )
+
+    def postprocess(self, all_outputs):
+        results = super().postprocess(
+            all_outputs=all_outputs,
+            aggregation_strategy=AggregationStrategy.SIMPLE,
+        )
+        return np.unique([result.get("word").strip() for result in results])
+
+def keyword_extractor():
+    model_name = "ml6team/keyphrase-extraction-kbir-inspec"
+    extractor = KeyphraseExtractionPipeline(model=model_name)
+    return extractor
+
+
+# Function to normalize text by replacing multiple spaces/newlines with a single space
+def normalize_text(text):
+    if not isinstance(text, str):
+        return ""
+    return re.sub(r'\s+', ' ', text).strip()
+
+# ---------- Cached function for scraping Reddit data ----------
+@st.cache_data(show_spinner=False)
+def scrape_reddit_data(search_query, total_limit):
+    # Retrieve API credentials from st.secrets
+    reddit = praw.Reddit(
+        client_id=st.secrets["reddit_client_id"],
+        client_secret=st.secrets["reddit_client_secret"],
+        user_agent=st.secrets["reddit_user_agent"]
+    )
+    subreddit = reddit.subreddit("all")
+    posts_data = []
+    # Iterate over submissions based on the search query and limit
+    for i, submission in enumerate(subreddit.search(search_query, sort="relevance", limit=total_limit)):
+        # No UI updates here as caching does not allow live progress updates
+        if submission.title and submission.selftext:
+            posts_data.append([
+                submission.title,
+                submission.url,
+                submission.created_utc,
+                submission.selftext,
+            ])
+        time.sleep(0.25)
+
+    df = pd.DataFrame(posts_data, columns=["Title", "URL", "Date", "Detail"])
+
+    for col in ["Title", "Detail"]:
+        df[col] = df[col].apply(normalize_text)
+
+    # Filter out rows with empty Title or Detail
+    df = df[(df["Title"] != "") & (df["Detail"] != "")]
+    df['Date'] = pd.to_datetime(df['Date'], unit='s')
+    df = df.sort_values(by="Date", ascending=True).reset_index(drop=True)
+    return df
+
+
# ------------------ Sentiment Analysis Functions ------------------------#
def split_text_by_token_limit(text, tokenizer, max_tokens):
    tokens = tokenizer.encode(text, add_special_tokens=False)
@@ -8,13 +108,15 @@ def split_text_by_token_limit(text, tokenizer, max_tokens):
        chunks.append(chunk_text)
    return chunks

-
+
+def safe_sentiment(sentiment_pipeline, text):
    try:
        result = sentiment_pipeline(text)[0]
    except Exception as e:
        result = None
    return result

+
def analyze_detail(text, tokenizer, sentiment_pipeline, max_tokens):
    text = preprocess_text(text)
    chunks = split_text_by_token_limit(text, tokenizer, max_tokens)
@@ -32,8 +134,21 @@ def analyze_detail(text, tokenizer, sentiment_pipeline, max_tokens):
    final_score = scores[final_label]
    return {"label": final_label, "score": final_score}

+
+
def preprocess_text(text):
    # Replace URLs and user mentions
    text = re.sub(r'http\S+', 'http', text)
    text = re.sub(r'@\w+', '@user', text)
-    return text
+    return text
+
+
+# def keyword_extraction(text):
+#     try:
+#         extractor = keyword_extractor()
+#         result = extractor(text)
+#     except Exception as e:
+#         # Optionally, log the error: print(f"Error processing text: {e}")
+#         result = None
+
+#     return result