File size: 9,621 Bytes
91eb9f9
 
 
 
503f042
91eb9f9
 
 
7fc6144
4e6ae40
8b9bf01
6c1e788
d9a35fb
 
 
 
8b9bf01
6c1e788
d9a35fb
5ab1c08
9c4bbfa
9ba919a
 
 
 
8b9bf01
6c1e788
91eb9f9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2382fa5
9c4bbfa
 
91eb9f9
9c4bbfa
503f042
2382fa5
6c1e788
a4c7cd3
91eb9f9
549832e
 
 
9ba919a
549832e
 
0c389c7
503f042
549832e
9ba919a
 
549832e
9ba919a
549832e
503f042
 
549832e
503f042
 
549832e
 
 
9ba919a
 
549832e
 
9c4bbfa
2382fa5
503f042
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9ba919a
9c4bbfa
2382fa5
503f042
9c4bbfa
 
78f714f
2382fa5
9ba919a
 
 
 
 
91eb9f9
9ba919a
 
 
 
 
91eb9f9
9ba919a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1d9c203
5e679fe
1d9c203
 
4e6ae40
 
 
 
 
 
 
ee46d7b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4e6ae40
 
 
 
 
d47f4f6
 
1d9c203
d47f4f6
1d9c203
5e679fe
1d9c203
4e6ae40
 
 
 
 
 
5e679fe
 
52b9eb2
baf5aeb
 
 
 
 
 
 
 
 
5e679fe
503f042
 
1d9c203
 
 
 
fd87044
503f042
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
import os
import nest_asyncio
nest_asyncio.apply()
import streamlit as st
from transformers import pipeline, AutoTokenizer
from huggingface_hub import login
from streamlit.components.v1 import html
import pandas as pd
import torch
import random

# Read the Hugging Face access token from the process environment.
# A missing or empty value is reported in the UI and halts the script run.
hf_token = os.getenv("HF_TOKEN")
if hf_token is None or hf_token == "":
    st.error("Hugging Face token not found. Please set the HF_TOKEN environment variable.")
    st.stop()

# Authenticate against the Hugging Face Hub with that token.
login(token=hf_token)

# Initialize session state for timer
#if 'timer_started' not in st.session_state:
    #st.session_state.timer_started = False
#if 'timer_frozen' not in st.session_state:
    #st.session_state.timer_frozen = False

# Timer component using HTML and JavaScript
def timer():
    """Return the HTML/JavaScript snippet that renders the elapsed-time counter.

    The snippet shows a ``#timer`` element and updates it once per second with
    the time elapsed since the script started running in the browser. On start
    it clears the ``freezeTimer`` key in ``localStorage``; once that key is set
    to ``"true"`` the interval is cancelled and the text turns green.

    Returns:
        str: Markup suitable for embedding in the page as an HTML component.
    """
    markup = """
    <div id="timer" style="font-size:16px;color:#666;margin-bottom:10px;">⏱️ Elapsed: 00:00</div>
    <script>
    (function() {
        var start = Date.now();
        var timerElement = document.getElementById('timer');
        localStorage.removeItem("freezeTimer");
        var interval = setInterval(function() {
            if(localStorage.getItem("freezeTimer") === "true"){
                clearInterval(interval);
                timerElement.style.color = '#00cc00';
                return;
            }
            var elapsed = Date.now() - start;
            var minutes = Math.floor(elapsed / 60000);
            var seconds = Math.floor((elapsed % 60000) / 1000);
            timerElement.innerHTML = '⏱️ Elapsed: ' +
            (minutes < 10 ? '0' : '') + minutes + ':' +
            (seconds < 10 ? '0' : '') + seconds;
        }, 1000);
    })();
    </script>
    """
    return markup

# Page chrome: browser-tab title/icon and the on-page header.
# The icon is the "memo" emoji; the previous value was the same character
# with its UTF-8 bytes mis-decoded, which is not a valid emoji string.
st.set_page_config(page_title="Review Scorer & Report Generator", page_icon="📝")
st.header("Review Scorer & Report Generator")

# Concise introduction shown under the header.
st.write("This model will score your reviews in your CSV file and generate a report based on your query and those results.")

# Load models with caching to avoid reloading on every run
@st.cache_resource
def load_models():
    """Load the sentiment-scoring and report-generation pipelines.

    The function is wrapped in ``st.cache_resource`` so the models are not
    reloaded on every script run. Each pipeline is loaded independently; a
    failure is reported in the UI and leaves that pipeline as ``None`` so the
    caller can decide how to proceed.

    Returns:
        tuple: ``(score_pipe, gemma_pipe)`` where either element is ``None``
        if the corresponding model could not be loaded.
    """
    sentiment_model_id = "cardiffnlp/twitter-roberta-base-sentiment-latest"
    gemma_model_id = "google/gemma-3-1b-it"

    # Choose the device once so both pipelines agree: GPU 0 when CUDA is
    # available, otherwise CPU (-1). Previously Gemma was pinned to device 0
    # and therefore always failed to load on CPU-only hosts.
    device = 0 if torch.cuda.is_available() else -1

    score_pipe = None
    gemma_pipe = None

    try:
        st.info("Loading sentiment analysis model...")
        score_pipe = pipeline(
            "text-classification",
            model=sentiment_model_id,
            device=device,
        )
        st.success("Sentiment analysis model loaded successfully!")
    except Exception as e:
        # Top-level boundary: surface the problem in the UI instead of crashing.
        st.error(f"Error loading score model: {e}")

    try:
        st.info("Loading Gemma model...")
        # Load the tokenizer separately so its chat template is passed to the pipeline.
        tokenizer = AutoTokenizer.from_pretrained(gemma_model_id)
        gemma_pipe = pipeline(
            "text-generation",
            model=gemma_model_id,
            tokenizer=tokenizer,
            device=device,
            torch_dtype=torch.bfloat16,
        )
        st.success("Gemma model loaded successfully!")
    except Exception as e:
        st.error(f"Error loading Gemma model: {e}")
        st.error(f"Detailed error: {type(e).__name__}: {str(e)}")

    return score_pipe, gemma_pipe

def extract_assistant_content(raw_response):
    """Extract only the assistant's reply text from a chat-style pipeline response.

    The structured form is handled first: a list (or single dict) of items
    whose ``"generated_text"`` is a list of ``{"role": ..., "content": ...}``
    messages. The content of the last message with role ``"assistant"`` is
    returned unchanged, so apostrophes and newlines in the reply survive.

    If the response does not have that shape (for example it is already a
    string), the function falls back to scanning its string form for the
    ``'role': 'assistant', 'content': '`` marker, and finally to returning
    the string form as-is.

    Args:
        raw_response: Pipeline output, or any object whose ``str()`` may
            contain the assistant marker.

    Returns:
        str: The assistant's content, or ``str(raw_response)`` when no
        assistant content can be located.
    """
    # Preferred path: read the structured output directly rather than its repr.
    if isinstance(raw_response, (list, tuple)):
        items = raw_response
    else:
        items = [raw_response]
    for item in items:
        if not isinstance(item, dict):
            continue
        messages = item.get("generated_text")
        if not isinstance(messages, (list, tuple)):
            continue
        # The newly generated turn is appended last, so search from the end.
        for message in reversed(messages):
            if isinstance(message, dict) and message.get("role") == "assistant":
                content = message.get("content")
                if isinstance(content, str):
                    return content

    # Fallback: string-based extraction for inputs that are not structured.
    response_str = str(raw_response)

    assistant_marker = "'role': 'assistant', 'content': '"
    if assistant_marker in response_str:
        start_idx = response_str.find(assistant_marker) + len(assistant_marker)
        content = response_str[start_idx:]

        # Cut at the earliest of the last "'}" / "'}]" occurrences, i.e. the
        # quote that closes the content value in the repr.
        end_idx = len(content)
        for marker in ("'}", "'}]"):
            pos = content.rfind(marker)
            if pos != -1 and pos < end_idx:
                end_idx = pos

        return content[:end_idx]

    # Nothing recognisable: return the string form of the original response.
    return response_str

score_pipe, gemma_pipe = load_models()


# Inputs: free-form query text plus a CSV upload holding the candidate reviews.
query_input = st.text_area("Enter your query text for analysis (this does not need to be part of the CSV):")
uploaded_file = st.file_uploader("Upload Reviews CSV File (must contain a 'reviewText' column)", type=["csv"])

if score_pipe is None or gemma_pipe is None:
    st.error("Model loading failed. Please check your model names, token permissions, and GPU configuration.")
else:
    # Reviews to score; stays empty until a valid CSV has been parsed.
    candidate_docs = []
    if uploaded_file is not None:
        try:
            df = pd.read_csv(uploaded_file)
            if 'reviewText' not in df.columns:
                st.error("CSV must contain a 'reviewText' column.")
            else:
                candidate_docs = df['reviewText'].dropna().astype(str).tolist()
        except Exception as e:
            # UI boundary: report unreadable/malformed uploads instead of crashing.
            st.error(f"Error reading CSV file: {e}")

    if st.button("Generate Report"):
        # Reset timer state so that the timer always shows up
        st.session_state.timer_started = False
        st.session_state.timer_frozen = False
        if uploaded_file is None:
            st.error("Please upload a CSV file.")
        elif not candidate_docs:
            # Reached both when the column is missing and when it holds no
            # usable rows, so the message covers both cases.
            st.error("No reviews found. The CSV must contain a non-empty 'reviewText' column.")
        elif not query_input.strip():
            st.error("Please enter a query text!")
        else:
            if not st.session_state.timer_started and not st.session_state.timer_frozen:
                st.session_state.timer_started = True
                html(timer(), height=50)
            status_text = st.empty()
            progress_bar = st.progress(0)

            # Stage 1: Score each candidate review with the sentiment pipeline.
            status_text.markdown("**🔍 Scoring candidate documents...**")

            # Loop invariants, computed once.
            total_docs = len(candidate_docs)
            status_every = max(1, total_docs // 10)
            # Cap the truncation length at 512 tokens (the usual RoBERTa limit).
            # NOTE(review): the cap guards against tokenizers that report a very
            # large model_max_length, which would disable truncation - confirm
            # against this model's tokenizer config.
            max_length = min(score_pipe.tokenizer.model_max_length, 512)

            # Process reviews one at a time to keep memory usage bounded.
            scored_results = []
            for i, doc in enumerate(candidate_docs):
                # Scoring occupies the first half of the progress bar (0-50%).
                progress_bar.progress(int((i / total_docs) * 50))

                try:
                    # Let the pipeline's tokenizer truncate over-long reviews
                    # directly, instead of decoding and re-encoding the text.
                    result = score_pipe(doc, truncation=True, max_length=max_length)
                    scored_results.append(result[0])  # Single input -> first result
                except Exception as e:
                    st.warning(f"Error processing document {i}: {str(e)}")
                    # Placeholder keeps results aligned with candidate_docs.
                    scored_results.append({"label": "ERROR", "score": 0})

                # Occasional status updates for large datasets.
                if i % status_every == 0:
                    status_text.markdown(f"**🔍 Scoring documents... ({i}/{total_docs})**")

            # Pair each review with its score; results are in input order.
            scored_docs = list(zip(candidate_docs, [result["score"] for result in scored_results]))

            progress_bar.progress(67)

            # Stage 2: Generate the report with Gemma using the chat messages format.
            status_text.markdown("**📝 Generating report with Gemma...**")

            # For very large datasets, send only a random sample to Gemma.
            sampled_docs = scored_docs
            if len(scored_docs) > 10000:  # Arbitrary threshold for what's "too large"
                sampled_docs = random.sample(scored_docs, 1000)

            # Build the user message from the query and the (possibly sampled)
            # scored reviews. The sample, not the full list, goes in the prompt.
            messages = [
                {"role": "user", "content": f"""
Generate a concise 300-word report based on the following analysis without repeating what's in the analysis.
Query:
"{query_input}"
Candidate Reviews with their scores:
{sampled_docs}
        """}
            ]

            # A 300-word report needs far more than 50 new tokens; allow enough
            # room for the requested length.
            raw_result = gemma_pipe(messages, max_new_tokens=512)
            report = extract_assistant_content(raw_result)
            progress_bar.progress(100)
            status_text.success("**✅ Generation complete!**")
            # Signal the timer script (via localStorage) to stop counting.
            html("<script>localStorage.setItem('freezeTimer', 'true');</script>", height=0)
            st.session_state.timer_frozen = True
            #st.write("**Scored Candidate Reviews:**", scored_docs)
            st.write("**Generated Report:**", report)