eljanmahammadli committed on
Commit
9d6deff
·
1 Parent(s): 3539fee

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +3 -3
app.py CHANGED
@@ -214,7 +214,7 @@ def update_character_count(text):
214
  return f"{len(text)} characters"
215
 
216
 
217
- def split_text_allow_complete_sentences_nltk(text, max_length=300, tolerance=30, min_last_segment_length=180):
218
  sentences = nltk.sent_tokenize(text)
219
  segments = []
220
  current_segment = []
@@ -264,7 +264,7 @@ def split_text_allow_complete_sentences_nltk(text, max_length=300, tolerance=30,
264
 
265
  def predict_bc(model, tokenizer, text):
266
  tokens = tokenizer(
267
- text, padding='max_length', truncation=True, max_length=300, return_tensors="pt"
268
  ).to(device)["input_ids"]
269
 
270
  output = model(tokens)
@@ -274,7 +274,7 @@ def predict_bc(model, tokenizer, text):
274
 
275
  def predict_mc(model, tokenizer, text):
276
  tokens = tokenizer(
277
- text, padding='max_length', truncation=True, return_tensors="pt", max_length=300
278
  ).to(device)["input_ids"]
279
  output = model(tokens)
280
  output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
 
214
  return f"{len(text)} characters"
215
 
216
 
217
+ def split_text_allow_complete_sentences_nltk(text, max_length=256, tolerance=40, min_last_segment_length=150):
218
  sentences = nltk.sent_tokenize(text)
219
  segments = []
220
  current_segment = []
 
264
 
265
  def predict_bc(model, tokenizer, text):
266
  tokens = tokenizer(
267
+ text, padding='max_length', truncation=True, max_length=256, return_tensors="pt"
268
  ).to(device)["input_ids"]
269
 
270
  output = model(tokens)
 
274
 
275
  def predict_mc(model, tokenizer, text):
276
  tokens = tokenizer(
277
+ text, padding='max_length', truncation=True, return_tensors="pt", max_length=256
278
  ).to(device)["input_ids"]
279
  output = model(tokens)
280
  output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]