aliasgerovs committed on
Commit
7e7ab8f
·
2 Parent(s): b105c66 7c28462

Merge branch 'main' into demo

Browse files
isotonic_regression_model.joblib ADDED
Binary file (2 kB). View file
 
predictors.py CHANGED
@@ -19,6 +19,7 @@ from scipy.special import softmax
19
  import yaml
20
  import os
21
  from utils import *
 
22
 
23
  with open("config.yaml", "r") as file:
24
  params = yaml.safe_load(file)
@@ -55,11 +56,19 @@ for model_name, model in zip(mc_label_map, text_1on1_models):
55
  ).to(device)
56
 
57
  # proxy models for explainability
58
- mini_model_name = "polygraf-ai/bc-model-bert-mini"
59
- bc_tokenizer_mini = AutoTokenizer.from_pretrained(mini_model_name)
60
- bc_model_mini = AutoModelForSequenceClassification.from_pretrained(mini_model_name).to(
61
- device
62
- )
 
 
 
 
 
 
 
 
63
 
64
 
65
  def split_text_allow_complete_sentences_nltk(
@@ -164,8 +173,8 @@ def predict_for_explainanility(text, model_type=None):
164
  if model_type == "quillbot":
165
  cleaning = False
166
  max_length = 256
167
- model = quillbot_model
168
- tokenizer = quillbot_tokenizer
169
  elif model_type == "bc":
170
  cleaning = True
171
  max_length = 512
@@ -260,13 +269,23 @@ def predict_bc_scores(input):
260
  samples_len_bc = len(split_text_allow_complete_sentences_nltk(input, type_det="bc"))
261
  segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
262
  for i in range(samples_len_bc):
263
- cleaned_text_bc = remove_special_characters(segments_bc[i])
 
 
 
 
 
264
  bc_score = predict_bc(text_bc_model, text_bc_tokenizer, cleaned_text_bc)
265
  bc_scores.append(bc_score)
266
  bc_scores_array = np.array(bc_scores)
267
  average_bc_scores = np.mean(bc_scores_array, axis=0)
268
  bc_score_list = average_bc_scores.tolist()
269
- bc_score = {"AI": bc_score_list[1], "HUMAN": bc_score_list[0]}
 
 
 
 
 
270
  return bc_score
271
 
272
 
 
19
  import yaml
20
  import os
21
  from utils import *
22
+ import joblib
23
 
24
  with open("config.yaml", "r") as file:
25
  params = yaml.safe_load(file)
 
56
  ).to(device)
57
 
58
  # proxy models for explainability
59
+ mini_bc_model_name = "polygraf-ai/bc-model-bert-mini"
60
+ bc_tokenizer_mini = AutoTokenizer.from_pretrained(mini_bc_model_name)
61
+ bc_model_mini = AutoModelForSequenceClassification.from_pretrained(
62
+ mini_bc_model_name
63
+ ).to(device)
64
+ mini_humanizer_model_name = "polygraf-ai/quillbot-detector-bert-mini-9K"
65
+ humanizer_tokenizer_mini = AutoTokenizer.from_pretrained(mini_humanizer_model_name)
66
+ humanizer_model_mini = AutoModelForSequenceClassification.from_pretrained(
67
+ mini_humanizer_model_name
68
+ ).to(device)
69
+
70
+ # model score calibration
71
+ iso_reg = joblib.load("isotonic_regression_model.joblib")
72
 
73
 
74
  def split_text_allow_complete_sentences_nltk(
 
173
  if model_type == "quillbot":
174
  cleaning = False
175
  max_length = 256
176
+ model = humanizer_model_mini
177
+ tokenizer = humanizer_tokenizer_mini
178
  elif model_type == "bc":
179
  cleaning = True
180
  max_length = 512
 
269
  samples_len_bc = len(split_text_allow_complete_sentences_nltk(input, type_det="bc"))
270
  segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
271
  for i in range(samples_len_bc):
272
+
273
+ cleaned_text_bc = (
274
+ segments_bc[i].replace("<s>", "").replace("</s>", "")
275
+ ) # this is caused by above
276
+
277
+ cleaned_text_bc = remove_special_characters(cleaned_text_bc)
278
  bc_score = predict_bc(text_bc_model, text_bc_tokenizer, cleaned_text_bc)
279
  bc_scores.append(bc_score)
280
  bc_scores_array = np.array(bc_scores)
281
  average_bc_scores = np.mean(bc_scores_array, axis=0)
282
  bc_score_list = average_bc_scores.tolist()
283
+ print(f"Original BC scores: AI: {bc_score_list[1]}, HUMAN: {bc_score_list[0]}")
284
+ # isotonic regression calibration
285
+ ai_score = iso_reg.predict([bc_score_list[1]])[0]
286
+ human_score = 1 - ai_score
287
+ bc_score = {"AI": ai_score, "HUMAN": human_score}
288
+ print(f"Calibration BC scores: AI: {ai_score}, HUMAN: {human_score}")
289
  return bc_score
290
 
291
 
requirements.txt CHANGED
@@ -24,4 +24,5 @@ pymupdf
24
  sentence-transformers
25
  Unidecode
26
  python-dotenv
27
- lime
 
 
24
  sentence-transformers
25
  Unidecode
26
  python-dotenv
27
+ lime
28
+ joblib