import os
import nest_asyncio
nest_asyncio.apply()
import streamlit as st
from transformers import pipeline
from huggingface_hub import login
from streamlit.components.v1 import html
import pandas as pd
import torch
import random
import gc
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
# st.set_page_config must be the first Streamlit call in the script
st.set_page_config(page_title="Twitter/X Tweets Scorer & Report Generator", page_icon="📝")

# Retrieve the token from environment variables for Hugging Face login
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    st.error("Hugging Face token not found. Please set the HF_TOKEN environment variable.")
    st.stop()

# Log in to Hugging Face in case any of the models require authorization
login(token=hf_token)
# Timer component using HTML and JavaScript
def timer():
    return """
    <div id="timer" style="font-size:16px;color:#666;margin-bottom:10px;">⏱️ Elapsed: 00:00</div>
    <script>
    (function() {
        var start = Date.now();
        var timerElement = document.getElementById('timer');
        localStorage.removeItem("freezeTimer");
        var interval = setInterval(function() {
            if (localStorage.getItem("freezeTimer") === "true") {
                clearInterval(interval);
                timerElement.style.color = '#00cc00';
                return;
            }
            var elapsed = Date.now() - start;
            var minutes = Math.floor(elapsed / 60000);
            var seconds = Math.floor((elapsed % 60000) / 1000);
            timerElement.innerHTML = '⏱️ Elapsed: ' +
                (minutes < 10 ? '0' : '') + minutes + ':' +
                (seconds < 10 ? '0' : '') + seconds;
        }, 1000);
    })();
    </script>
    """
# Display the title and a concise introduction
st.header("𝕏/Twitter Tweets Sentiment Report Generator")
st.write("This model🎰 will score the tweets in your CSV file🗄️ based on their sentiment😀 and generate a report🗟 answering your query question❔ based on those results.")
# Display VRAM status for debugging
def print_gpu_status(label):
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated() / 1024**3
        reserved = torch.cuda.memory_reserved() / 1024**3
        st.info(f"{label}: Allocated {allocated:.2f} GB, Reserved {reserved:.2f} GB")
# Cache the model-loading functions so each model is initialized only once per session
@st.cache_resource
def get_sentiment_model():
    return pipeline("text-classification",
                    model="cardiffnlp/twitter-roberta-base-sentiment-latest",
                    device=0 if torch.cuda.is_available() else -1)
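# Illustrative only (output shape assumed from the standard text-classification
# pipeline; the exact label strings depend on the model card):
#   get_sentiment_model()("great launch!")  ->  [{"label": "positive", "score": 0.98}]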
@st.cache_resource
def get_summary_model():
    # T5 is an encoder-decoder model, so it needs the "text2text-generation" task
    # rather than "text-generation", which expects a causal (decoder-only) LM
    return pipeline("text2text-generation",
                    model="frankai98/T5FinetunedCommentSummary",
                    device=0 if torch.cuda.is_available() else -1)
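# Illustrative only (assuming a standard T5 summarization fine-tune that expects
# the "summarize: " prefix, as used later in main()):
#   get_summary_model()("summarize: " + long_text)  ->  [{"generated_text": "short summary"}]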
# Function to clear GPU memory between pipeline stages
def clear_gpu_memory():
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
# Build the prompt for the text generation model using only the sentiment label
def build_prompt(query_input, sampled_docs):
    docs_text = ""
    # Use the sentiment label directly from each document (converted to lowercase)
    for idx, doc in enumerate(sampled_docs):
        sentiment_word = doc['sentiment'].lower() if doc.get('sentiment') else "unknown"
        docs_text += f"Tweet {idx+1} (Sentiment: {sentiment_word}): {doc['comment']}\n"

    system_message = """You are a helpful assistant. Read the tweets with their sentiment (Negative, Neutral, Positive) provided and produce a well-structured report that answers the query question.
Your task:
- Summarize both positive and negative aspects, highlighting any trends in user sentiment.
- Include an introduction, key insights, and a conclusion, reaching about 400 words.
- DO NOT repeat these instructions or the user's query in the final report. Only provide the final text."""

    user_content = f"""**Tweets**:
{docs_text}
**Query Question**: "{query_input}"

Now produce the final report only, without reiterating these instructions or the query."""

    # This is the chat format required by the Llama-3.2-1B-Instruct model; choose an
    # instruct model for better performance
    messages = [
        [
            {
                "role": "system",
                "content": [{"type": "text", "text": system_message}]
            },
            {
                "role": "user",
                "content": [{"type": "text", "text": user_content}]
            }
        ]
    ]
    return messages
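# Hypothetical shape of the value build_prompt returns (a batch holding one conversation):
#   [[{"role": "system", "content": [{"type": "text", "text": "You are a helpful..."}]},
#     {"role": "user",   "content": [{"type": "text", "text": "**Tweets**:\nTweet 1 ..."}]}]]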
# Main function:
def main():
    # Let the user specify the column name for the tweet text (defaulting to "content")
    tweets_column = st.text_input("Enter the column name for Tweets🐦:", value="content")

    # Input: query question for analysis and CSV file upload for candidate tweets
    query_input = st.text_area("Enter your query question❓for analysis (Format: How do these people feel about ...?) (this does not need to be part of the CSV):")
    uploaded_file = st.file_uploader(f"Upload Tweets CSV File < 1MB🗄️ (must contain a '{tweets_column}' column with preferably <1000 tweets)", type=["csv"])

    # Error checks to ensure that the uploaded file meets the requirements
    candidate_docs = []
    if uploaded_file is not None:
        if uploaded_file.size > 1 * 1024 * 1024:
            st.error("The file is too large! Please upload a file smaller than 1MB.")
        else:
            try:
                df = pd.read_csv(uploaded_file)
                if tweets_column not in df.columns:
                    st.error(f"CSV must contain a '{tweets_column}' column.")
                else:
                    candidate_docs = df[tweets_column].dropna().astype(str).tolist()
                    st.write("File uploaded successfully!🎆")
            except Exception as e:
                st.error(f"Error reading CSV file: {e}")
    # Clicking the button starts the pipelines in sequence along with the timer
    if st.button("Generate Report"):
        st.session_state.setdefault("timer_started", False)
        st.session_state.setdefault("timer_frozen", False)
        if uploaded_file is None:
            st.error("Please upload a CSV file🗄️.")
        elif not tweets_column.strip():
            st.error("Please enter your column name.")
        elif not candidate_docs:
            st.error(f"CSV must contain a '{tweets_column}' column.")
        elif not query_input.strip():
            st.error("Please enter a query question❔!")
        else:
            if not st.session_state.timer_started and not st.session_state.timer_frozen:
                st.session_state.timer_started = True
                html(timer(), height=50)

            status_text = st.empty()
            progress_bar = st.progress(0)
            processed_docs = []
            scored_results = []

            # Check which documents need summarization (tweets longer than 280 characters)
            docs_to_summarize = []
            docs_indices = []
            for i, doc in enumerate(candidate_docs):
                if len(doc) > 280:
                    docs_to_summarize.append(doc)
                    docs_indices.append(i)
            # Summarize long tweets if needed
            if docs_to_summarize:
                status_text.markdown("**📝 Loading summarization model...**")
                t5_pipe = get_summary_model()
                status_text.markdown("**📝 Summarizing long tweets...**")
                # Display the progress
                for idx, (i, doc) in enumerate(zip(docs_indices, docs_to_summarize)):
                    progress = int((idx / len(docs_to_summarize)) * 25)
                    progress_bar.progress(progress)
                    input_text = "summarize: " + doc
                    try:
                        summary_result = t5_pipe(
                            input_text,
                            max_length=128,
                            min_length=10,
                            no_repeat_ngram_size=2,
                            num_beams=4,
                            early_stopping=True,
                            truncation=True
                        )
                        candidate_docs[i] = summary_result[0]['generated_text']
                    except Exception as e:
                        st.warning(f"Error summarizing document {i}: {str(e)}")

                # Delete the summarization model from VRAM for optimization
                del t5_pipe
                gc.collect()
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
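            # Note (assumption about caching behavior): st.cache_resource keeps its own
            # reference to the pipeline, so `del t5_pipe` only drops the local handle;
            # call get_summary_model.clear() if the cached copy itself must be evicted.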
            # Load sentiment analysis model
            status_text.markdown("**🔍 Loading sentiment analysis model...**")
            progress_bar.progress(25)
            score_pipe = get_sentiment_model()

            status_text.markdown("**🔍 Scoring documents...**")
            for i, doc in enumerate(candidate_docs):
                progress_offset = 25 if docs_to_summarize else 0
                progress = progress_offset + int((i / len(candidate_docs)) * (50 - progress_offset))
                progress_bar.progress(progress)
                try:
                    result = score_pipe(doc, truncation=True, max_length=512)
                    if isinstance(result, list):
                        result = result[0]
                    processed_docs.append(doc)
                    # Store only the sentiment label (e.g., "Negative", "Neutral", "Positive")
                    scored_results.append(result)
                except Exception as e:
                    st.warning(f"Error scoring document {i}: {str(e)}")
                    processed_docs.append("Error processing this document")
                    scored_results.append({"label": "Neutral"})
                if i % max(1, len(candidate_docs) // 10) == 0:
                    status_text.markdown(f"**🔍 Scoring documents... ({i}/{len(candidate_docs)})**")

            # Pair documents with sentiment labels using the key "comment"
            scored_docs = [
                {"comment": doc, "sentiment": result.get("label", "Neutral")}
                for doc, result in zip(processed_docs, scored_results)
            ]
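            # Hypothetical example of a single scored_docs entry at this point:
            #   {"comment": "Loving the new update!", "sentiment": "positive"}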
            # Delete the sentiment analysis model from VRAM for optimization
            del score_pipe
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
status_text.markdown("**📊 Loading report generation model...**")
progress_bar.progress(67)
# Clear the VRAM to prepare for Llama so it wouldn't encounter OOM errors
clear_gpu_memory()
status_text.markdown("**📝 Preparing data for report generation...**")
progress_bar.progress(75)
# Set the maximum examples text generation model can take
max_tweets = 1000
if len(scored_docs) > max_tweets:
sampled_docs = random.sample(scored_docs, max_tweets)
st.info(f"Sampling {max_tweets} out of {len(scored_docs)} tweets for report generation")
else:
sampled_docs = scored_docs
prompt = build_prompt(query_input, sampled_docs)
            # Define the text generation pipeline
            def process_with_llama(prompt):
                try:
                    pipe = pipeline(
                        "text-generation",
                        model="unsloth/Llama-3.2-1B-Instruct",
                        device=0 if torch.cuda.is_available() else -1,
                        torch_dtype=torch.bfloat16,
                    )
                    result = pipe(prompt, max_new_tokens=400, return_full_text=False)
                    return result, None
                except Exception as e:
                    return None, str(e)

            status_text.markdown("**📝 Generating report with Llama...**")
            progress_bar.progress(80)
            raw_result, error = process_with_llama(prompt)

            # Process the result to get the report or display the error
            if error:
                st.error(f"Llama processing failed: {error}")
                report = "Error generating report. Please try again with fewer tweets."
            else:
                report = raw_result[0][0]['generated_text']
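            # raw_result is indexed [0][0] because the prompt was a batch holding one
            # conversation: the outer index selects that conversation, the inner index
            # selects its first generated sequence.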
            # Clear the VRAM at the end so it won't affect the next app run
            clear_gpu_memory()

            progress_bar.progress(100)
            status_text.success("**✅ Generation complete!**")
            html("<script>localStorage.setItem('freezeTimer', 'true');</script>", height=0)
            st.session_state.timer_frozen = True

            # Replace newlines so the report renders correctly inside the HTML block
            formatted_report = report.replace('\n', '<br>')
            st.subheader("Generated Report:")
            st.markdown(f"<div style='font-size: medium; font-weight: normal;'>{formatted_report}</div>", unsafe_allow_html=True)
# Run the main function
if __name__ == '__main__':
    main()