frankai98 commited on
Commit
4e6ae40
·
verified ·
1 Parent(s): 0c389c7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -3
app.py CHANGED
@@ -7,6 +7,7 @@ from huggingface_hub import login
7
  from streamlit.components.v1 import html
8
  import pandas as pd
9
  import torch
 
10
 
11
  # Retrieve the token from environment variables
12
  hf_token = os.environ.get("HF_TOKEN")
@@ -151,10 +152,22 @@ else:
151
 
152
  # Stage 1: Score candidate documents using the provided query.
153
  status_text.markdown("**🔍 Scoring candidate documents...**")
154
- progress_bar.progress(33)
155
 
156
- # Assuming score_pipe can take a list of texts directly:
157
- scored_results = score_pipe(candidate_docs)
 
 
 
 
 
 
 
 
 
 
 
 
 
158
  # Pair each review with its score assuming the output order matches the input order.
159
  scored_docs = list(zip(candidate_docs, [result["score"] for result in scored_results]))
160
 
@@ -162,6 +175,12 @@ else:
162
 
163
  # Stage 2: Generate Report using Gemma in the new messages format.
164
  status_text.markdown("**📝 Generating report with Gemma...**")
 
 
 
 
 
 
165
 
166
  # Build the user content with query, sentiment results, and original review data.
167
  # Format the prompt as chat for Gemma
 
7
  from streamlit.components.v1 import html
8
  import pandas as pd
9
  import torch
10
+ import random
11
 
12
  # Retrieve the token from environment variables
13
  hf_token = os.environ.get("HF_TOKEN")
 
152
 
153
  # Stage 1: Score candidate documents using the provided query.
154
  status_text.markdown("**🔍 Scoring candidate documents...**")
 
155
 
156
+ # Process each review individually to avoid memory issues
157
+ scored_results = []
158
+ for i, doc in enumerate(candidate_docs):
159
+ # Update progress based on current document
160
+ progress = int((i / len(candidate_docs)) * 50) # First half of progress bar (0-50%)
161
+ progress_bar.progress(progress)
162
+
163
+ # Process single document
164
+ result = score_pipe([doc])[0]
165
+ scored_results.append(result)
166
+
167
+ # Display occasional status updates for large datasets
168
+ if i % max(1, len(candidate_docs) // 10) == 0:
169
+ status_text.markdown(f"**🔍 Scoring documents... ({i}/{len(candidate_docs)})**")
170
+
171
  # Pair each review with its score assuming the output order matches the input order.
172
  scored_docs = list(zip(candidate_docs, [result["score"] for result in scored_results]))
173
 
 
175
 
176
  # Stage 2: Generate Report using Gemma in the new messages format.
177
  status_text.markdown("**📝 Generating report with Gemma...**")
178
+
179
+ # For very large datasets, summarize or sample the scored_docs before sending to Gemma
180
+ sampled_docs = scored_docs
181
+ if len(scored_docs) > 10000: # Arbitrary threshold for what's "too large"
182
+ # Option 1: Random sampling
183
+ sampled_docs = random.sample(scored_docs, 1000)
184
 
185
  # Build the user content with query, sentiment results, and original review data.
186
  # Format the prompt as chat for Gemma