frankai98 committed on
Commit
ee46d7b
·
verified ·
1 Parent(s): 2553077

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -3
app.py CHANGED
@@ -160,9 +160,24 @@ else:
160
  progress = int((i / len(candidate_docs)) * 50) # First half of progress bar (0-50%)
161
  progress_bar.progress(progress)
162
 
163
- # Process single document
164
- result = score_pipe(doc)
165
- scored_results.append(result[0])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
 
167
  # Display occasional status updates for large datasets
168
  if i % max(1, len(candidate_docs) // 10) == 0:
 
160
  progress = int((i / len(candidate_docs)) * 50) # First half of progress bar (0-50%)
161
  progress_bar.progress(progress)
162
 
163
+ # Process single document with truncation to avoid tensor size mismatch
164
+ try:
165
+ # Use the tokenizer to properly truncate the input
166
+ tokenizer = score_pipe.tokenizer
167
+ max_length = tokenizer.model_max_length # Usually 512 for RoBERTa
168
+
169
+ # Truncate the text using the tokenizer to ensure it fits
170
+ encoded_input = tokenizer(doc, truncation=True, max_length=max_length, return_tensors="pt")
171
+ # Decode back to text to get the truncated version
172
+ truncated_doc = tokenizer.decode(encoded_input["input_ids"][0], skip_special_tokens=True)
173
+
174
+ # Now process the truncated document
175
+ result = score_pipe(truncated_doc)
176
+ scored_results.append(result[0]) # Get the first result
177
+ except Exception as e:
178
+ st.warning(f"Error processing document {i}: {str(e)}")
179
+ # Add a placeholder result to maintain indexing
180
+ scored_results.append({"label": "ERROR", "score": 0})
181
 
182
  # Display occasional status updates for large datasets
183
  if i % max(1, len(candidate_docs) // 10) == 0: