aliasgerovs committed on
Commit
7e7ab8f
·
2 Parent(s): b105c66 7c28462

Merge branch 'main' into demo

Browse files
isotonic_regression_model.joblib ADDED
Binary file (2 kB). View file
 
predictors.py CHANGED
@@ -19,6 +19,7 @@ from scipy.special import softmax
19
  import yaml
20
  import os
21
  from utils import *
 
22
 
23
  with open("config.yaml", "r") as file:
24
  params = yaml.safe_load(file)
@@ -55,11 +56,19 @@ for model_name, model in zip(mc_label_map, text_1on1_models):
55
  ).to(device)
56
 
57
  # proxy models for explainability
58
- mini_model_name = "polygraf-ai/bc-model-bert-mini"
59
- bc_tokenizer_mini = AutoTokenizer.from_pretrained(mini_model_name)
60
- bc_model_mini = AutoModelForSequenceClassification.from_pretrained(mini_model_name).to(
61
- device
62
- )
 
 
 
 
 
 
 
 
63
 
64
 
65
  def split_text_allow_complete_sentences_nltk(
@@ -164,8 +173,8 @@ def predict_for_explainanility(text, model_type=None):
164
  if model_type == "quillbot":
165
  cleaning = False
166
  max_length = 256
167
- model = quillbot_model
168
- tokenizer = quillbot_tokenizer
169
  elif model_type == "bc":
170
  cleaning = True
171
  max_length = 512
@@ -260,13 +269,23 @@ def predict_bc_scores(input):
260
  samples_len_bc = len(split_text_allow_complete_sentences_nltk(input, type_det="bc"))
261
  segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
262
  for i in range(samples_len_bc):
263
- cleaned_text_bc = remove_special_characters(segments_bc[i])
 
 
 
 
 
264
  bc_score = predict_bc(text_bc_model, text_bc_tokenizer, cleaned_text_bc)
265
  bc_scores.append(bc_score)
266
  bc_scores_array = np.array(bc_scores)
267
  average_bc_scores = np.mean(bc_scores_array, axis=0)
268
  bc_score_list = average_bc_scores.tolist()
269
- bc_score = {"AI": bc_score_list[1], "HUMAN": bc_score_list[0]}
 
 
 
 
 
270
  return bc_score
271
 
272
 
 
19
  import yaml
20
  import os
21
  from utils import *
22
+ import joblib
23
 
24
  with open("config.yaml", "r") as file:
25
  params = yaml.safe_load(file)
 
56
  ).to(device)
57
 
58
  # proxy models for explainability
59
+ mini_bc_model_name = "polygraf-ai/bc-model-bert-mini"
60
+ bc_tokenizer_mini = AutoTokenizer.from_pretrained(mini_bc_model_name)
61
+ bc_model_mini = AutoModelForSequenceClassification.from_pretrained(
62
+ mini_bc_model_name
63
+ ).to(device)
64
+ mini_humanizer_model_name = "polygraf-ai/quillbot-detector-bert-mini-9K"
65
+ humanizer_tokenizer_mini = AutoTokenizer.from_pretrained(mini_humanizer_model_name)
66
+ humanizer_model_mini = AutoModelForSequenceClassification.from_pretrained(
67
+ mini_humanizer_model_name
68
+ ).to(device)
69
+
70
+ # model score calibration
71
+ iso_reg = joblib.load("isotonic_regression_model.joblib")
72
 
73
 
74
  def split_text_allow_complete_sentences_nltk(
 
173
  if model_type == "quillbot":
174
  cleaning = False
175
  max_length = 256
176
+ model = humanizer_model_mini
177
+ tokenizer = humanizer_tokenizer_mini
178
  elif model_type == "bc":
179
  cleaning = True
180
  max_length = 512
 
269
  samples_len_bc = len(split_text_allow_complete_sentences_nltk(input, type_det="bc"))
270
  segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
271
  for i in range(samples_len_bc):
272
+
273
+ cleaned_text_bc = (
274
+ segments_bc[i].replace("<s>", "").replace("</s>", "")
275
+ ) # this is caused by above
276
+
277
+ cleaned_text_bc = remove_special_characters(cleaned_text_bc)
278
  bc_score = predict_bc(text_bc_model, text_bc_tokenizer, cleaned_text_bc)
279
  bc_scores.append(bc_score)
280
  bc_scores_array = np.array(bc_scores)
281
  average_bc_scores = np.mean(bc_scores_array, axis=0)
282
  bc_score_list = average_bc_scores.tolist()
283
+ print(f"Original BC scores: AI: {bc_score_list[1]}, HUMAN: {bc_score_list[0]}")
284
+ # isotonic regression calibration
285
+ ai_score = iso_reg.predict([bc_score_list[1]])[0]
286
+ human_score = 1 - ai_score
287
+ bc_score = {"AI": ai_score, "HUMAN": human_score}
288
+ print(f"Calibration BC scores: AI: {ai_score}, HUMAN: {human_score}")
289
  return bc_score
290
 
291
 
requirements.txt CHANGED
@@ -24,4 +24,5 @@ pymupdf
24
  sentence-transformers
25
  Unidecode
26
  python-dotenv
27
- lime
 
 
24
  sentence-transformers
25
  Unidecode
26
  python-dotenv
27
+ lime
28
+ joblib