Spaces:
Running
Running
Merge branch 'main' into demo
Browse files
- isotonic_regression_model.joblib +0 -0
- predictors.py +28 -9
- requirements.txt +2 -1
isotonic_regression_model.joblib
ADDED
Binary file (2 kB). View file
|
|
predictors.py
CHANGED
@@ -19,6 +19,7 @@ from scipy.special import softmax
|
|
19 |
import yaml
|
20 |
import os
|
21 |
from utils import *
|
|
|
22 |
|
23 |
with open("config.yaml", "r") as file:
|
24 |
params = yaml.safe_load(file)
|
@@ -55,11 +56,19 @@ for model_name, model in zip(mc_label_map, text_1on1_models):
|
|
55 |
).to(device)
|
56 |
|
57 |
# proxy models for explainability
|
58 |
-
|
59 |
-
bc_tokenizer_mini = AutoTokenizer.from_pretrained(
|
60 |
-
bc_model_mini = AutoModelForSequenceClassification.from_pretrained(
|
61 |
-
|
62 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
|
64 |
|
65 |
def split_text_allow_complete_sentences_nltk(
|
@@ -164,8 +173,8 @@ def predict_for_explainanility(text, model_type=None):
|
|
164 |
if model_type == "quillbot":
|
165 |
cleaning = False
|
166 |
max_length = 256
|
167 |
-
model =
|
168 |
-
tokenizer =
|
169 |
elif model_type == "bc":
|
170 |
cleaning = True
|
171 |
max_length = 512
|
@@ -260,13 +269,23 @@ def predict_bc_scores(input):
|
|
260 |
samples_len_bc = len(split_text_allow_complete_sentences_nltk(input, type_det="bc"))
|
261 |
segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
|
262 |
for i in range(samples_len_bc):
|
263 |
-
|
|
|
|
|
|
|
|
|
|
|
264 |
bc_score = predict_bc(text_bc_model, text_bc_tokenizer, cleaned_text_bc)
|
265 |
bc_scores.append(bc_score)
|
266 |
bc_scores_array = np.array(bc_scores)
|
267 |
average_bc_scores = np.mean(bc_scores_array, axis=0)
|
268 |
bc_score_list = average_bc_scores.tolist()
|
269 |
-
|
|
|
|
|
|
|
|
|
|
|
270 |
return bc_score
|
271 |
|
272 |
|
|
|
19 |
import yaml
|
20 |
import os
|
21 |
from utils import *
|
22 |
+
import joblib
|
23 |
|
24 |
with open("config.yaml", "r") as file:
|
25 |
params = yaml.safe_load(file)
|
|
|
56 |
).to(device)
|
57 |
|
58 |
# proxy models for explainability
|
59 |
+
mini_bc_model_name = "polygraf-ai/bc-model-bert-mini"
|
60 |
+
bc_tokenizer_mini = AutoTokenizer.from_pretrained(mini_bc_model_name)
|
61 |
+
bc_model_mini = AutoModelForSequenceClassification.from_pretrained(
|
62 |
+
mini_bc_model_name
|
63 |
+
).to(device)
|
64 |
+
mini_humanizer_model_name = "polygraf-ai/quillbot-detector-bert-mini-9K"
|
65 |
+
humanizer_tokenizer_mini = AutoTokenizer.from_pretrained(mini_humanizer_model_name)
|
66 |
+
humanizer_model_mini = AutoModelForSequenceClassification.from_pretrained(
|
67 |
+
mini_humanizer_model_name
|
68 |
+
).to(device)
|
69 |
+
|
70 |
+
# model score calibration
|
71 |
+
iso_reg = joblib.load("isotonic_regression_model.joblib")
|
72 |
|
73 |
|
74 |
def split_text_allow_complete_sentences_nltk(
|
|
|
173 |
if model_type == "quillbot":
|
174 |
cleaning = False
|
175 |
max_length = 256
|
176 |
+
model = humanizer_model_mini
|
177 |
+
tokenizer = humanizer_tokenizer_mini
|
178 |
elif model_type == "bc":
|
179 |
cleaning = True
|
180 |
max_length = 512
|
|
|
269 |
samples_len_bc = len(split_text_allow_complete_sentences_nltk(input, type_det="bc"))
|
270 |
segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
|
271 |
for i in range(samples_len_bc):
|
272 |
+
|
273 |
+
cleaned_text_bc = (
|
274 |
+
segments_bc[i].replace("<s>", "").replace("</s>", "")
|
275 |
+
) # this is caused by above
|
276 |
+
|
277 |
+
cleaned_text_bc = remove_special_characters(cleaned_text_bc)
|
278 |
bc_score = predict_bc(text_bc_model, text_bc_tokenizer, cleaned_text_bc)
|
279 |
bc_scores.append(bc_score)
|
280 |
bc_scores_array = np.array(bc_scores)
|
281 |
average_bc_scores = np.mean(bc_scores_array, axis=0)
|
282 |
bc_score_list = average_bc_scores.tolist()
|
283 |
+
print(f"Original BC scores: AI: {bc_score_list[1]}, HUMAN: {bc_score_list[0]}")
|
284 |
+
# isotonic regression calibration
|
285 |
+
ai_score = iso_reg.predict([bc_score_list[1]])[0]
|
286 |
+
human_score = 1 - ai_score
|
287 |
+
bc_score = {"AI": ai_score, "HUMAN": human_score}
|
288 |
+
print(f"Calibration BC scores: AI: {ai_score}, HUMAN: {human_score}")
|
289 |
return bc_score
|
290 |
|
291 |
|
requirements.txt
CHANGED
@@ -24,4 +24,5 @@ pymupdf
|
|
24 |
sentence-transformers
|
25 |
Unidecode
|
26 |
python-dotenv
|
27 |
-
lime
|
|
|
|
24 |
sentence-transformers
|
25 |
Unidecode
|
26 |
python-dotenv
|
27 |
+
lime
|
28 |
+
joblib
|