File size: 3,306 Bytes
53a13a1 243b95e 53a13a1 243b95e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 |
import re
from typing import Any, Dict, List

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
class EndpointHandler:
    """Endpoint handler that scores text as Human / Mixed / AI.

    A sequence-classification model is loaded once at construction time.
    Each call scores the input at three granularities: the whole document,
    each sentence, and each token (via a small window of surrounding
    tokens).
    """

    # Split after '.', '!' or '?' when followed by whitespace; the
    # punctuation stays attached to the preceding sentence.
    _SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+')

    def __init__(self, path=""):
        """Load tokenizer and model from *path* and move the model to the
        GPU when one is available, otherwise the CPU."""
        self.tokenizer = AutoTokenizer.from_pretrained(path)
        self.model = AutoModelForSequenceClassification.from_pretrained(path)
        self.model.eval()
        self.id2label = {0: "Human", 1: "Mixed", 2: "AI"}
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

    def split_into_sentences(self, text: str):
        """Return the non-empty, whitespace-stripped sentences of *text*."""
        pieces = self._SENTENCE_BOUNDARY.split(text)
        return [piece.strip() for piece in pieces if piece.strip()]

    def _class_probabilities(self, text: str):
        """Run the model on *text* and return the softmax of its logits.

        The input is truncated to at most 512 tokens. The returned tensor is
        indexed as ``probs[0][class_id]`` by the callers in this class.
        """
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = self.model(**inputs)
        return torch.softmax(outputs.logits, dim=1)

    def _classify(self, text: str) -> Dict[str, Any]:
        """Return the predicted label, its confidence and all class
        probabilities for *text*."""
        probs = self._class_probabilities(text)
        pred = torch.argmax(probs, dim=1).item()
        return {
            "prediction": self.id2label[pred],
            "confidence": probs[0][pred].item(),
            "probabilities": {self.id2label[i]: float(p) for i, p in enumerate(probs[0])},
        }

    def get_token_predictions(self, text: str, window: int = 10):
        """Return an "AI" probability for every non-empty token of *text*.

        Each token is scored by classifying the string rebuilt from the
        tokens in ``[i - window, i + window)`` around it, i.e. up to
        ``window`` tokens before and ``window - 1`` after.

        Args:
            text: The text to tokenize and score.
            window: Context width in tokens; the default of 10 matches the
                previously hard-coded value.

        Returns:
            A list of ``{"token": str, "ai_prob": float}`` dicts in token
            order. Tokens that are empty after marker removal are omitted.
        """
        tokens = self.tokenizer.tokenize(text)
        token_predictions = []
        for i, raw_token in enumerate(tokens):
            # "Ġ", "▁" and "Ċ" are presumably the tokenizer's space/newline
            # markers -- confirm against the tokenizer in use.
            token = raw_token.replace("Ġ", " ").replace("▁", " ").replace("Ċ", " ").strip()
            if not token:
                # The token would be dropped from the output anyway, so do
                # not spend a forward pass on it.
                continue
            start = max(0, i - window)
            end = min(len(tokens), i + window)
            context = self.tokenizer.convert_tokens_to_string(tokens[start:end])
            # Index 2 is the "AI" entry of id2label.
            ai_prob = self._class_probabilities(context)[0][2].item()
            token_predictions.append({"token": token, "ai_prob": ai_prob})
        return token_predictions

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Score ``data["inputs"]`` at document, sentence and token level.

        Args:
            data: Request payload; the text is read from the ``"inputs"``
                key and defaults to the empty string when the key is absent.

        Returns:
            A one-element list holding a dict with the keys ``"document"``
            (classification of the whole text), ``"sentences"`` (one
            classification per sentence, each including the sentence text)
            and ``"tokens"`` (per-token "AI" probabilities).
        """
        text = data.get("inputs", "")

        # Document level
        doc_result = self._classify(text)

        # Sentence level
        sent_results = [
            {"sentence": sent, **self._classify(sent)}
            for sent in self.split_into_sentences(text)
        ]

        # Token level
        token_results = self.get_token_predictions(text)

        return [{
            "document": doc_result,
            "sentences": sent_results,
            "tokens": token_results,
        }]