File size: 4,265 Bytes
479eb9c f81b82b 479eb9c e77b318 479eb9c f81b82b 479eb9c dbb9b6d 479eb9c e77b318 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 |
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
import torch
from torch.nn import functional as F
import numpy as np
import re
tokenizer = AutoTokenizer.from_pretrained("gpt3_finetuned_model/checkpoint-30048")
tokenizer_v2 = AutoTokenizer.from_pretrained("gpt2-large")
model = AutoModelForSequenceClassification.from_pretrained("gpt3_finetuned_model/checkpoint-30048").to("cuda")
# probabilities = 1 / (1 + np.exp(-logit_score))
def logit2prob(logit):
# odds =np.exp(logit)
# prob = odds / (1 + odds)
prob= 1/(1+ np.exp(-logit))
return np.round(prob, 3)
def split_sentence(sentence:str):
# Create a regular expression pattern from the list of separators
sentence= sentence.replace('\n', '')
separators = ['. ', '.', ':']
pattern = '|'.join(map(re.escape, separators))
# Split the sentence using the pattern as a delimiter
parts = re.split(pattern, sentence)
return parts
def predict(sentence: str):
'''
Returns (probability_human, probability_AI, label)
'''
inputs = tokenizer(sentence, return_tensors="pt").to("cuda")
with torch.no_grad():
logits = model(**inputs).logits
print("logits: ", logits)
predicted_class_id = logits.argmax().item()
# get probabilities using softmax from logit score and convert it to numpy array
probabilities_scores = np.round(
F.softmax(logits.to("cpu"), dim = -1).numpy()[0],
3)
print("P(Human): ", probabilities_scores[0])
print("P(AI): ", probabilities_scores[1])
label= "Human Written" if model.config.id2label[predicted_class_id]=='NEGATIVE' else 'AI written'
print("Label: ", label)
print(model.config.id2label[predicted_class_id])
return probabilities_scores[0], probabilities_scores[1], label
def calculate_burstiness(sentence: str):
'''
Returns (variance, average_length)
'''
list_of_sentences= split_sentence(sentence)
arr= []
for i in list_of_sentences:
if len(i)==0:
continue
ei= tokenizer_v2(i, return_tensors="pt")
arr.append(ei.input_ids.size(1))
variance= np.var(np.array(arr))
std_deviation= np.std(np.array(arr))
avg_length= np.average(np.array(arr))
print(f"arr= {(arr)}")
print(f'variance: {variance}')
print(f'std: {std_deviation}')
print(f'average length: {avg_length}')
return variance, avg_length
def complete_sentence_analysis(sentence:str):
'''
Returns a dictionary
{
p_human : probablity that the text is written by the human
p_ai : probablity that the text is written by ai
label : label {ai/human}
variance : variance in the length of the sentences
avg_length: average tokens per sentence
}
'''
p_human, p_ai, label= predict(sentence)
variance, avg_length= calculate_burstiness(sentence)
return {
"p_human": p_human,
"p_ai": p_ai,
"label": label,
"variance": variance,
"avg_length": avg_length
}
def get_top_labels(keyword: str):
'''
Returns score list
'''
inputs = tokenizer(keyword, return_tensors="pt").to("cuda")
with torch.no_grad():
logits = model(**inputs).logits
# print("logits: ", logits)
# predicted_class_id = logits.argmax().item()
# get probabilities using softmax from logit score and convert it to numpy array
# probabilities_scores = F.softmax(logits.cpu(), dim = -1).numpy()[0]
individual_probabilities_scores = logit2prob(logits.cpu().numpy()[0])
score_list= []
for i in range(2):
label= "Human Written" if model.config.id2label[i]=='NEGATIVE' else 'AI written'
score= individual_probabilities_scores[i]
score_list.append(
(label, score)
)
# if score>=0.5:
# score_list.append(
# (id2label[i], score)
# )
score_list.sort(
key= lambda x: x[1], reverse=True
)
return score_list[:5] |