Update app.py
app.py
CHANGED
@@ -54,76 +54,35 @@ st.write("This model will score your reviews in your CSV file and generate a rep
Before (old lines 54-129; removed lines are marked "-"):

 54 |
 55 |   # Cache the model loading functions
 56 |   @st.cache_resource
 57 | - def load_llama_model():
 58 | -     """Load and cache the Llama 3.2 model"""
 59 | -     return pipeline("text-generation",
 60 | -                     model="meta-llama/Llama-3.2-1B-Instruct",
 61 | -                     device=0,  # Use GPU if available
 62 | -                     torch_dtype=torch.bfloat16)  # Use FP16 for efficiency
 63 | -
 64 | - @st.cache_resource
 65 | - def load_sentiment_model():
 66 | -     """Load and cache the sentiment analysis model"""
 67 |       return pipeline("text-classification",
 68 |                       model="cardiffnlp/twitter-roberta-base-sentiment-latest",
 69 |                       device=0 if torch.cuda.is_available() else -1)
 70 |
 71-73 | -
 74 |
 75-82 | -
 83 | -     success_llama_placeholder.success("Llama 3.2 summarization model loaded successfully!")
 84 | -
 85 | -     # Use st.session_state to track when to clear the message
 86 | -     if "clear_llama_success_time" not in st.session_state:
 87 | -         st.session_state.clear_llama_success_time = time.time() + 5
 88 | -
 89 | -     # Check if it's time to clear the message
 90 | -     if time.time() > st.session_state.clear_llama_success_time:
 91 | -         success_llama_placeholder.empty()
 92 | -
 93 | - except Exception as e:
 94 | -     # Clear loading message
 95 | -     loading_llama_placeholder.empty()
 96 | -
 97 | -     st.error(f"Error loading Llama 3.2 summarization model: {e}")
 98 | -     st.error(f"Detailed error: {type(e).__name__}: {str(e)}")
 99 |
100 | - #
101-102 | -
103 |
104 | - try:
105 | -     score_pipe = load_sentiment_model()
106 | -
107 | -     # Clear loading message
108 | -     loading_sentiment_placeholder.empty()
109 | -
110 | -     # Display success message in a placeholder
111 | -     success_sentiment_placeholder = st.empty()
112 | -     success_sentiment_placeholder.success("Sentiment analysis model loaded successfully!")
113 | -
114 | -     # Use st.session_state to track when to clear the message
115 | -     if "clear_sentiment_success_time" not in st.session_state:
116 | -         st.session_state.clear_sentiment_success_time = time.time() + 5
117 | -
118 | -     # Check if it's time to clear the message
119 | -     if time.time() > st.session_state.clear_sentiment_success_time:
120 | -         success_sentiment_placeholder.empty()
121 | -
122 | - except Exception as e:
123 | -     # Clear loading message
124 | -     loading_sentiment_placeholder.empty()
125 | -
126 | -     st.error(f"Error loading sentiment analysis model: {e}")
127 |
128 |   def extract_assistant_content(raw_response):
129 |       """Extract only the assistant's content from the Gemma-3 response."""
@@ -186,94 +145,111 @@ else:
Before (old lines 186-279; removed lines are marked "-"):

186 |       progress_bar = st.progress(0)
187 |
188 |
189-190 | -
191 |
192 | -     # Process each review individually with summarization for long documents
193 | -     processed_docs = []  # Store processed (original or summarized) documents
194 | -     scored_results = []  # Store sentiment scores
195 | -
196 |       for i, doc in enumerate(candidate_docs):
197-199 | -
200 |
201-208 | -
209 |               summary_result = llama_pipe(
210 |                   summary_prompt,
211 | -                 max_new_tokens=30,
212 |                   do_sample=True,
213 | -                 temperature=0.3,
214 | -                 return_full_text=False
215 |               )
216 |
217 | -             #
218-228 | -
229 |
230 |               # If it's a list, get the first element
231 |               if isinstance(result, list):
232 |                   result = result[0]
233 |
234 |               scored_results.append(result)
235 | -
236 | -             # Free memory
237 | -             torch.cuda.empty_cache()
238 |
239 |           except Exception as e:
240 | -             st.warning(f"Error
241 | -             # Add a placeholder result to maintain indexing
242 |               processed_docs.append("Error processing this document")
243 |               scored_results.append({"label": "NEUTRAL", "score": 0.5})
244 | -
245 |
246 | -         # Display occasional status updates
247 |           if i % max(1, len(candidate_docs) // 10) == 0:
248 |               status_text.markdown(f"**π Scoring documents... ({i}/{len(candidate_docs)})**")
249 | -
250 | -     # Pair each review with its score assuming the output order matches the input order.
251 | -     scored_docs = list(zip(processed_docs, [result.get("score", 0.5) for result in scored_results]))
252 |
253 | -
254 |
255 | -     #
256 | -     status_text.markdown("**π Generating report with Gemma...**")
257 | -
258 | -     # After using score_pipe
259 |       del score_pipe
260 |       gc.collect()
261 |       torch.cuda.empty_cache()
262 |
263-266 | -
267 |
268 | -
269 |       tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-1b-it")
270 |       gemma_pipe = pipeline("text-generation",
271-274 | -
275 |
276 |       # Sample or summarize the data for Gemma to avoid memory issues
277 |       import random
278 |       max_reviews = 50  # Adjust based on your GPU memory
279 |       if len(scored_docs) > max_reviews:
After (new lines 54-88, replacing old lines 54-129 above; added lines are marked "+"):

 54 |
 55 |   # Cache the model loading functions
 56 |   @st.cache_resource
 57 | + def get_sentiment_model():
 58 |       return pipeline("text-classification",
 59 |                       model="cardiffnlp/twitter-roberta-base-sentiment-latest",
 60 |                       device=0 if torch.cuda.is_available() else -1)
 61 |
 62 | + @st.cache_resource
 63 | + def get_llama_model():
 64 | +     return pipeline("text-generation",
 65 | +                     model="meta-llama/Llama-3.2-1B-Instruct",
 66 | +                     device=0,
 67 | +                     torch_dtype=torch.bfloat16)
 68 |
 69 | + @st.cache_resource
 70 | + def get_gemma_model():
 71 | +     tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-1b-it")
 72 | +     return pipeline("text-generation",
 73 | +                     model="google/gemma-3-1b-it",
 74 | +                     tokenizer=tokenizer,
 75 | +                     device=0,
 76 | +                     torch_dtype=torch.bfloat16)
 77 |
 78 | + # Function to clear GPU memory
 79 | + def clear_gpu_memory():
 80 | +     import gc
 81 | +     gc.collect()
 82 | +     if torch.cuda.is_available():
 83 | +         torch.cuda.empty_cache()
 84 | +         torch.cuda.ipc_collect()
 85 |
 86 |
 87 |   def extract_assistant_content(raw_response):
 88 |       """Extract only the assistant's content from the Gemma-3 response."""
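The loaders above are wrapped in @st.cache_resource, so each pipeline is constructed once per server process and then reused on every Streamlit rerun. A minimal sketch of that behavior (not part of this commit; it only reuses the get_sentiment_model() loader added in new lines 56-60):

import streamlit as st
import torch
from transformers import pipeline

@st.cache_resource
def get_sentiment_model():
    # Built on the first call; later calls are served from Streamlit's resource cache.
    return pipeline("text-classification",
                    model="cardiffnlp/twitter-roberta-base-sentiment-latest",
                    device=0 if torch.cuda.is_available() else -1)

pipe_a = get_sentiment_model()  # first call downloads/loads the model
pipe_b = get_sentiment_model()  # subsequent calls return the same cached object
assert pipe_a is pipe_b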
After (new lines 145-255, replacing old lines 186-279 above; added lines are marked "+"):

145 |       progress_bar = st.progress(0)
146 |
147 |
148 | +     processed_docs = []
149 | +     scored_results = []
150 | +
151 | +     # First, check which documents need summarization
152 | +     docs_to_summarize = []
153 | +     docs_indices = []
154 |
155 |       for i, doc in enumerate(candidate_docs):
156 | +         if len(doc) > 1500:
157 | +             docs_to_summarize.append(doc)
158 | +             docs_indices.append(i)
159 |
160 | +     # If we have documents to summarize, load Llama model first
161 | +     if docs_to_summarize:
162 | +         status_text.markdown("**π Loading summarization model...**")
163 | +         llama_pipe = load_llama_model()
164 | +
165 | +         status_text.markdown("**π Summarizing long documents...**")
166 | +
167 | +         # Process documents that need summarization
168 | +         for idx, (i, doc) in enumerate(zip(docs_indices, docs_to_summarize)):
169 | +             progress = int((idx / len(docs_to_summarize)) * 25)  # First quarter of progress
170 | +             progress_bar.progress(progress)
171 | +
172 | +             summary_prompt = [
173 | +                 {"role": "user", "content": f"Summarize the following text into a shorter version that preserves the sentiment and key points: {doc[:2000]}..."}
174 | +             ]
175 | +
176 | +             try:
177 |                   summary_result = llama_pipe(
178 |                       summary_prompt,
179 | +                     max_new_tokens=30,
180 |                       do_sample=True,
181 | +                     temperature=0.3,
182 | +                     return_full_text=False
183 |                   )
184 |
185 | +                 # Store the summary in place of the original text
186 | +                 candidate_docs[i] = summary_result[0]['generated_text']
187 | +
188 | +             except Exception as e:
189 | +                 st.warning(f"Error summarizing document {i}: {str(e)}")
190 | +
191 | +         # Clear Llama model from memory
192 | +         del llama_pipe
193 | +         gc.collect()
194 | +         torch.cuda.empty_cache()
195 | +
196 | +     # Now load sentiment model
197 | +     status_text.markdown("**π Loading sentiment analysis model...**")
198 | +     score_pipe = load_sentiment_model()
199 | +
200 | +     status_text.markdown("**π Scoring documents...**")
201 | +
202 | +     # Process each document with sentiment analysis
203 | +     for i, doc in enumerate(candidate_docs):
204 | +         progress_offset = 25 if docs_to_summarize else 0
205 | +         progress = progress_offset + int((i / len(candidate_docs)) * (50 - progress_offset))
206 | +         progress_bar.progress(progress)
207 | +
208 | +         try:
209 | +             # Process with sentiment analysis
210 | +             result = score_pipe(doc)
211 |
212 |               # If it's a list, get the first element
213 |               if isinstance(result, list):
214 |                   result = result[0]
215 |
216 | +             processed_docs.append(doc)
217 |               scored_results.append(result)
218 |
219 |           except Exception as e:
220 | +             st.warning(f"Error scoring document {i}: {str(e)}")
221 |               processed_docs.append("Error processing this document")
222 |               scored_results.append({"label": "NEUTRAL", "score": 0.5})
223 |
224 | +         # Display occasional status updates
225 |           if i % max(1, len(candidate_docs) // 10) == 0:
226 |               status_text.markdown(f"**π Scoring documents... ({i}/{len(candidate_docs)})**")
227 |
228 | +     # Pair documents with scores
229 | +     scored_docs = list(zip(processed_docs, [result.get("score", 0.5) for result in scored_results]))
230 |
231 | +     # Clear sentiment model from memory
232 |       del score_pipe
233 |       gc.collect()
234 |       torch.cuda.empty_cache()
235 |
236 | +     progress_bar.progress(67)
237 | +
238 | +     # Load Gemma for final report generation
239 | +     status_text.markdown("**π Loading report generation model...**")
240 | +     progress_bar.progress(67)
241 |
242 | +
243 |       tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-1b-it")
244 |       gemma_pipe = pipeline("text-generation",
245 | +                           model="google/gemma-3-1b-it",
246 | +                           tokenizer=tokenizer,
247 | +                           device=0,
248 | +                           torch_dtype=torch.bfloat16)
249 |
250 |       # Sample or summarize the data for Gemma to avoid memory issues
251 | +     status_text.markdown("**π Generating report...**")
252 | +     progress_bar.progress(80)
253 |       import random
254 |       max_reviews = 50  # Adjust based on your GPU memory
255 |       if len(scored_docs) > max_reviews: