Update app.py
app.py
CHANGED
@@ -7,6 +7,7 @@ from huggingface_hub import login
 from streamlit.components.v1 import html
 import pandas as pd
 import torch
+import random
 
 # Retrieve the token from environment variables
 hf_token = os.environ.get("HF_TOKEN")
@@ -151,10 +152,22 @@ else:
 
     # Stage 1: Score candidate documents using the provided query.
     status_text.markdown("**π Scoring candidate documents...**")
-    progress_bar.progress(33)
 
-
-    scored_results =
+    # Process each review individually to avoid memory issues
+    scored_results = []
+    for i, doc in enumerate(candidate_docs):
+        # Update progress based on current document
+        progress = int((i / len(candidate_docs)) * 50)  # First half of progress bar (0-50%)
+        progress_bar.progress(progress)
+
+        # Process single document
+        result = score_pipe([doc])[0]
+        scored_results.append(result)
+
+        # Display occasional status updates for large datasets
+        if i % max(1, len(candidate_docs) // 10) == 0:
+            status_text.markdown(f"**π Scoring documents... ({i}/{len(candidate_docs)})**")
+
     # Pair each review with its score assuming the output order matches the input order.
     scored_docs = list(zip(candidate_docs, [result["score"] for result in scored_results]))
 
@@ -162,6 +175,12 @@ else:
 
     # Stage 2: Generate Report using Gemma in the new messages format.
     status_text.markdown("**π Generating report with Gemma...**")
+
+    # For very large datasets, summarize or sample the scored_docs before sending to Gemma
+    sampled_docs = scored_docs
+    if len(scored_docs) > 10000:  # Arbitrary threshold for what's "too large"
+        # Option 1: Random sampling
+        sampled_docs = random.sample(scored_docs, 1000)
 
     # Build the user content with query, sentiment results, and original review data.
     # Format the prompt as chat for Gemma