import os
import nest_asyncio
nest_asyncio.apply()
import streamlit as st
from transformers import pipeline
from huggingface_hub import login
from streamlit.components.v1 import html
import pandas as pd
import torch
import random
import gc

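# Allow the CUDA caching allocator to grow segments on demand, which reduces
# fragmentation-related OOM errors when several models are loaded in one session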
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Retrieve the token from environment variables for Hugging Face login
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    st.error("Hugging Face token not found. Please set the HF_TOKEN environment variable.")
    st.stop()

# Hugging Face login with the token, in case any of the models require authorization
login(token=hf_token)

# Timer component using HTML and JavaScript
def timer():
    return """
    <div id="timer" style="font-size:16px;color:#666;margin-bottom:10px;">⏱️ Elapsed: 00:00</div>
    <script>
    (function() {
        var start = Date.now();
        var timerElement = document.getElementById('timer');
        localStorage.removeItem("freezeTimer");
        var interval = setInterval(function() {
            if(localStorage.getItem("freezeTimer") === "true"){
                clearInterval(interval);
                timerElement.style.color = '#00cc00';
                return;
            }
            var elapsed = Date.now() - start;
            var minutes = Math.floor(elapsed / 60000);
            var seconds = Math.floor((elapsed % 60000) / 1000);
            timerElement.innerHTML = '⏱️ Elapsed: ' +
            (minutes < 10 ? '0' : '') + minutes + ':' +
            (seconds < 10 ? '0' : '') + seconds;
        }, 1000);
    })();
    </script>
    """
# Configure the page and display the title
st.set_page_config(page_title="Twitter/X Tweets Scorer & Report Generator", page_icon="📝")
st.header("𝕏/Twitter Tweets Sentiment Report Generator")

# Concise introduction
st.write("This model🎰 will score your tweets in your CSV file🗄️ based on their sentiment😀 and generate a report🗟 answering your query question❔ based on those results.")

# Display VRAM status for debugging
def print_gpu_status(label):
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated() / 1024**3
        reserved = torch.cuda.memory_reserved() / 1024**3
        st.info(f"{label}: Allocated {allocated:.2f} GB, Reserved {reserved:.2f} GB")

# Cache the model loading functions
@st.cache_resource
def get_sentiment_model():
    return pipeline("text-classification", 
                    model="cardiffnlp/twitter-roberta-base-sentiment-latest", 
                    device=0 if torch.cuda.is_available() else -1)

@st.cache_resource
def get_summary_model():
    return pipeline("text-generation", 
                   model="frankai98/T5FinetunedCommentSummary",
                   device=0 if torch.cuda.is_available() else -1)

# Function to clear GPU memory
def clear_gpu_memory():
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
        
# Function to build the prompt for the text generation model using only the sentiment label
def build_prompt(query_input, sampled_docs):
    docs_text = ""
    # Use the sentiment label directly from each document (converted to lowercase)
    for idx, doc in enumerate(sampled_docs):
        sentiment_word = doc['sentiment'].lower() if doc.get('sentiment') else "unknown"
        docs_text += f"Tweet {idx+1} (Sentiment: {sentiment_word}): {doc['comment']}\n"

    system_message = """You are an helpful assistant. Read the Tweets with their sentiment (Negative, Neutral, Positive) provided and produce a well-structured report that answers the query question.
Your task:
- Summarize both positive and negative aspects, highlighting any trends in user sentiment.
- Include an introduction, key insights, and a conclusion, reaching about 400 words.
- DO NOT repeat these instructions or the user's query in the final report. Only provide the final text."""

    user_content = f"""**Tweets**:
{docs_text}

**Query Question**: "{query_input}"

Now produce the final report only, without reiterating these instructions or the query."""
    # This is the chat format expected by the Llama-3.2-1B-Instruct model; an instruct-tuned model is used for better instruction following
    messages = [
        [
            {
                "role": "system",
                "content": [{"type": "text", "text": system_message}]
            },
            {
                "role": "user",
                "content": [{"type": "text", "text": user_content}]
            }
        ]
    ]
    return messages

# Main Function Part:
def main():
    # Let the user specify the column name for the tweet text (defaulting to "content")
    tweets_column = st.text_input("Enter the column name for Tweets🐦:", value="content")
    
    # Input: Query question for analysis and CSV file upload for candidate tweets
    query_input = st.text_area("Enter your query question❓for analysis (Format: How do these people feel about ...?) (this does not need to be part of the CSV):")
    uploaded_file = st.file_uploader(f"Upload Tweets CSV File < 1MB🗄️(must contain a '{tweets_column}' column with preferably <1000 tweets)", type=["csv"])
    # Error check steps to ensure that the uploaded file meets the requirements
    candidate_docs = []
    if uploaded_file is not None:
        if uploaded_file.size > 1 * 1024 * 1024:
            st.error("The file is too large! Please upload a file smaller than 1MB.")
        else:
            try:
                df = pd.read_csv(uploaded_file)
                if tweets_column not in df.columns:
                    st.error(f"CSV must contain a '{tweets_column}' column.")
                else:
                    candidate_docs = df[tweets_column].dropna().astype(str).tolist()
                    st.write("File uploaded successfully!🎆")
            except Exception as e:
                st.error(f"Error reading CSV file: {e}")
    # Clicking the button starts running the pipelines in sequence along with the timer
    if st.button("Generate Report"):
        st.session_state.setdefault("timer_started", False)
        st.session_state.setdefault("timer_frozen", False)
        if uploaded_file is None:
            st.error("Please upload a CSV file🗄️.")
        elif not tweets_column.strip():
            st.error("Please enter your column name")
        elif not candidate_docs:
            st.error(f"CSV must contain a '{tweets_column}' column.")
        elif not query_input.strip():
            st.error("Please enter a query question❔!")
        else:
            if not st.session_state.timer_started and not st.session_state.timer_frozen:
                st.session_state.timer_started = True
                html(timer(), height=50)
            status_text = st.empty()
            progress_bar = st.progress(0)
            
            processed_docs = []
            scored_results = []
            
            # Check which documents need summarization (tweets longer than 280 characters)
            docs_to_summarize = []
            docs_indices = []
            for i, doc in enumerate(candidate_docs):
                if len(doc) > 280:
                    docs_to_summarize.append(doc)
                    docs_indices.append(i)
            
            # Summarize long tweets if needed
            if docs_to_summarize:
                status_text.markdown("**📝 Loading summarization model...**")
                t5_pipe = get_summary_model()
                status_text.markdown("**📝 Summarizing long tweets...**")
                # Display the summarization progress
                for idx, (i, doc) in enumerate(zip(docs_indices, docs_to_summarize)):
                    progress = int((idx / len(docs_to_summarize)) * 25)
                    progress_bar.progress(progress)
                    input_text = "summarize: " + doc
                    try:
                        summary_result = t5_pipe(
                            input_text, 
                            max_length=128,
                            min_length=10,
                            no_repeat_ngram_size=2,
                            num_beams=4,
                            early_stopping=True,
                            truncation=True
                        )
                        candidate_docs[i] = summary_result[0]['generated_text']
                    except Exception as e:
                        st.warning(f"Error summarizing document {i}: {str(e)}")
                # Delete summarization model from VRAM for optimization
                del t5_pipe
                gc.collect()
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
            
            # Load sentiment analysis model
            status_text.markdown("**🔍 Loading sentiment analysis model...**")
            progress_bar.progress(25)
            score_pipe = get_sentiment_model()
            status_text.markdown("**🔍 Scoring documents...**")
            
            for i, doc in enumerate(candidate_docs):
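                # Map scoring progress into the 25-50% range of the bar (0-25% is used by summarization, if any)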
                progress_offset = 25 if docs_to_summarize else 0
                progress = progress_offset + int((i / len(candidate_docs)) * (50 - progress_offset))
                progress_bar.progress(progress)
                try:
                    result = score_pipe(doc, truncation=True, max_length=512)
                    if isinstance(result, list):
                        result = result[0]
                    processed_docs.append(doc)
                    # Store only the sentiment label (e.g., "Negative", "Neutral", "Positive")
                    scored_results.append(result)
                except Exception as e:
                    st.warning(f"Error scoring document {i}: {str(e)}")
                    processed_docs.append("Error processing this document")
                    scored_results.append({"label": "Neutral"})
                
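                # Refresh the status text roughly every 10% of documents to limit UI updates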
                if i % max(1, len(candidate_docs) // 10) == 0:
                    status_text.markdown(f"**🔍 Scoring documents... ({i}/{len(candidate_docs)})**")
            
            # Pair documents with sentiment labels using key "comment"
            scored_docs = [
                {"comment": doc, "sentiment": result.get("label", "Neutral")}
                for doc, result in zip(processed_docs, scored_results)
            ]
            # Delete sentiment analysis model from VRAM for optimization
            del score_pipe
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            
            status_text.markdown("**📊 Loading report generation model...**")
            progress_bar.progress(67)
            # Clear the VRAM to prepare for Llama so it doesn't encounter OOM errors
            clear_gpu_memory()
            
            status_text.markdown("**📝 Preparing data for report generation...**")
            progress_bar.progress(75)
            # Set the maximum number of examples the text generation model can take
            max_tweets = 1000
            if len(scored_docs) > max_tweets:
                sampled_docs = random.sample(scored_docs, max_tweets)
                st.info(f"Sampling {max_tweets} out of {len(scored_docs)} tweets for report generation")
            else:
                sampled_docs = scored_docs
            
            prompt = build_prompt(query_input, sampled_docs)
            # Define the text generation pipeline
            def process_with_llama(prompt):
                try:
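                    # Load the 1B Llama instruct model; bfloat16 roughly halves VRAM use compared to float32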
                    pipe = pipeline(
                        "text-generation",
                        model="unsloth/Llama-3.2-1B-Instruct",
                        device="cuda" if torch.cuda.is_available() else -1,
                        torch_dtype=torch.bfloat16,
                    )
                    result = pipe(prompt, max_new_tokens=400, return_full_text=False)
                    return result, None
                except Exception as e:
                    return None, str(e)

            status_text.markdown("**📝 Generating report with Llama...**")
            progress_bar.progress(80)
            
            raw_result, error = process_with_llama(prompt)
            # Process the result to get the report or display the error
            if error:
                st.error(f"Gemma processing failed: {str(error)}")
                report = "Error generating report. Please try again with fewer tweets."
            else:
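                # The prompt is a batch containing one chat, so the output is nested: [batch][generation]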
                report = raw_result[0][0]['generated_text']
            # Clear the VRAM at the end so it won't affect the next app run
            clear_gpu_memory()
            progress_bar.progress(100)
            status_text.success("**✅ Generation complete!**")
            html("<script>localStorage.setItem('freezeTimer', 'true');</script>", height=0)
            st.session_state.timer_frozen = True
            # Replace newlines with <br> tags so the report renders correctly as HTML
            formatted_report = report.replace('\n', '<br>')
            
            st.subheader("Generated Report:")
            st.markdown(f"<div style='font-size: normal; font-weight: normal;'>{formatted_report}</div>", unsafe_allow_html=True)
# Running the main function            
if __name__ == '__main__':
    main()