frankai98 commited on
Commit
4e6ae40
·
verified ·
1 Parent(s): 0c389c7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -3
app.py CHANGED
@@ -7,6 +7,7 @@ from huggingface_hub import login
7
  from streamlit.components.v1 import html
8
  import pandas as pd
9
  import torch
 
10
 
11
  # Retrieve the token from environment variables
12
  hf_token = os.environ.get("HF_TOKEN")
@@ -151,10 +152,22 @@ else:
151
 
152
  # Stage 1: Score candidate documents using the provided query.
153
  status_text.markdown("**🔍 Scoring candidate documents...**")
154
- progress_bar.progress(33)
155
 
156
- # Assuming score_pipe can take a list of texts directly:
157
- scored_results = score_pipe(candidate_docs)
 
 
 
 
 
 
 
 
 
 
 
 
 
158
  # Pair each review with its score assuming the output order matches the input order.
159
  scored_docs = list(zip(candidate_docs, [result["score"] for result in scored_results]))
160
 
@@ -162,6 +175,12 @@ else:
162
 
163
  # Stage 2: Generate Report using Gemma in the new messages format.
164
  status_text.markdown("**📝 Generating report with Gemma...**")
 
 
 
 
 
 
165
 
166
  # Build the user content with query, sentiment results, and original review data.
167
  # Format the prompt as chat for Gemma
 
7
  from streamlit.components.v1 import html
8
  import pandas as pd
9
  import torch
10
+ import random
11
 
12
  # Retrieve the token from environment variables
13
  hf_token = os.environ.get("HF_TOKEN")
 
152
 
153
  # Stage 1: Score candidate documents using the provided query.
154
  status_text.markdown("**🔍 Scoring candidate documents...**")
 
155
 
156
+ # Process each review individually to avoid memory issues
157
+ scored_results = []
158
+ for i, doc in enumerate(candidate_docs):
159
+ # Update progress based on current document
160
+ progress = int((i / len(candidate_docs)) * 50) # First half of progress bar (0-50%)
161
+ progress_bar.progress(progress)
162
+
163
+ # Process single document
164
+ result = score_pipe([doc])[0]
165
+ scored_results.append(result)
166
+
167
+ # Display occasional status updates for large datasets
168
+ if i % max(1, len(candidate_docs) // 10) == 0:
169
+ status_text.markdown(f"**🔍 Scoring documents... ({i}/{len(candidate_docs)})**")
170
+
171
  # Pair each review with its score assuming the output order matches the input order.
172
  scored_docs = list(zip(candidate_docs, [result["score"] for result in scored_results]))
173
 
 
175
 
176
  # Stage 2: Generate Report using Gemma in the new messages format.
177
  status_text.markdown("**📝 Generating report with Gemma...**")
178
+
179
+ # For very large datasets, summarize or sample the scored_docs before sending to Gemma
180
+ sampled_docs = scored_docs
181
+ if len(scored_docs) > 10000: # Arbitrary threshold for what's "too large"
182
+ # Option 1: Random sampling
183
+ sampled_docs = random.sample(scored_docs, 1000)
184
 
185
  # Build the user content with query, sentiment results, and original review data.
186
  # Format the prompt as chat for Gemma