aliasgerovs committed on
Commit
4451e36
·
2 Parent(s): f53c349 4df475b

Merge branch 'demo'

Browse files
Files changed (1) hide show
  1. app.py +3 -3
app.py CHANGED
@@ -213,7 +213,7 @@ def update_character_count(text):
213
  return f"{len(text)} characters"
214
 
215
 
216
- def split_text_allow_complete_sentences_nltk(text, max_length=256, tolerance=40, min_last_segment_length=170, type_det='bc'):
217
  sentences = nltk.sent_tokenize(text)
218
  segments = []
219
  current_segment = []
@@ -269,7 +269,7 @@ def split_text_allow_complete_sentences_nltk(text, max_length=256, tolerance=40,
269
 
270
  def predict_bc(model, tokenizer, text):
271
  tokens = text_bc_tokenizer(
272
- text, padding='max_length', truncation=True, max_length=256, return_tensors="pt"
273
  ).to(device)["input_ids"]
274
 
275
  output = model(tokens)
@@ -279,7 +279,7 @@ def predict_bc(model, tokenizer, text):
279
 
280
  def predict_mc(model, tokenizer, text):
281
  tokens = text_mc_tokenizer(
282
- text, padding='max_length', truncation=True, return_tensors="pt", max_length=512
283
  ).to(device)["input_ids"]
284
  output = model(tokens)
285
  output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
 
213
  return f"{len(text)} characters"
214
 
215
 
216
+ def split_text_allow_complete_sentences_nltk(text, max_length=256, tolerance=30, min_last_segment_length=100, type_det='bc'):
217
  sentences = nltk.sent_tokenize(text)
218
  segments = []
219
  current_segment = []
 
269
 
270
  def predict_bc(model, tokenizer, text):
271
  tokens = text_bc_tokenizer(
272
+ text, padding='max_length', truncation=True, max_length=512, return_tensors="pt"
273
  ).to(device)["input_ids"]
274
 
275
  output = model(tokens)
 
279
 
280
  def predict_mc(model, tokenizer, text):
281
  tokens = text_mc_tokenizer(
282
+ text, padding='max_length', truncation=True, return_tensors="pt", max_length=256
283
  ).to(device)["input_ids"]
284
  output = model(tokens)
285
  output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]