# Spaces: Runtime error
from flask import Flask, request | |
from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig | |
from transformers import AutoTokenizer, AutoModelForSequenceClassification | |
from transformers import RobertaConfig | |
from torch import cuda | |
import torch | |
import gradio as gr | |
import os | |
import re | |
import pdfplumber | |
# Flask application object. Nothing in the visible code registers routes on it.
app = Flask(__name__)

# Access token for the model hub, read from the environment.
# A missing variable raises KeyError at import time.
ACCESS_TOKEN = os.environ["ACCESS_TOKEN"]

# An earlier revision loaded "PirateXX/ChatGPT-Text-Detector" through the
# Roberta-specific classes together with a "roberta-base" tokenizer; the Auto*
# classes below load the tokenizer and the classifier from a single checkpoint.
_DETECTOR_CHECKPOINT = "PirateXX/AI-Content-Detector"

tokenizer = AutoTokenizer.from_pretrained(
    _DETECTOR_CHECKPOINT,
    use_auth_token=ACCESS_TOKEN,
)
model = AutoModelForSequenceClassification.from_pretrained(
    _DETECTOR_CHECKPOINT,
    use_auth_token=ACCESS_TOKEN,
)
def text_to_sentences(text):
    """Break ``text`` into a list of sentence strings.

    The text is cut at every run of one or more spaces that is

    * preceded by ``.`` or ``?``, where the character two positions before
      that punctuation mark is not an uppercase ASCII letter, and
    * followed by an uppercase ASCII letter.

    The separating spaces are dropped. Text without such a boundary
    (including the empty string) comes back as a one-element list.
    """
    sentence_boundary = re.compile(r'(?<=[^A-Z].[.?]) +(?=[A-Z])')
    return sentence_boundary.split(text)
def chunks_of_900(text, chunk_size=900):
    """Group the sentences of ``text`` into chunks of at most ``chunk_size`` characters.

    Sentences come from ``text_to_sentences`` and are joined with a single
    space. The joining space is counted against ``chunk_size``, so a chunk
    built from several sentences never exceeds the limit.

    A single sentence longer than ``chunk_size`` is not split; it becomes a
    chunk of its own. Empty chunks are never emitted, so empty input yields
    an empty list.

    Args:
        text: The text to split.
        chunk_size: Maximum number of characters per multi-sentence chunk.

    Returns:
        A list of non-empty chunk strings in document order.
    """
    chunks = []
    current_chunk = ""
    for sentence in text_to_sentences(text):
        if not current_chunk:
            # Start a new chunk without first flushing an empty one.
            current_chunk = sentence
        elif len(current_chunk) + 1 + len(sentence) <= chunk_size:
            # The "+ 1" accounts for the space inserted between sentences.
            current_chunk += " " + sentence
        else:
            chunks.append(current_chunk)
            current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk)
    return chunks
def predict(query, device="cpu"):
    """Return the classifier's probability that ``query`` is human-written.

    The text is tokenized with the module-level ``tokenizer``, truncated so
    that it fits ``tokenizer.model_max_length`` once BOS/EOS are added, and
    passed through the module-level ``model``.

    Args:
        query: The text to classify.
        device: Torch device the input tensors are moved to.
            NOTE(review): the visible code never moves ``model`` itself, so a
            non-default device presumably needs the model moved too - confirm.

    Returns:
        The second of the two softmax outputs, which this code treats as the
        "real" probability (the first being "fake").
    """
    # BOS/EOS are attached manually below, so the tokenizer must not add its
    # own special tokens as well; otherwise the sequence is wrapped twice and
    # the truncation can cut off the tokenizer's trailing EOS.
    token_ids = tokenizer.encode(query, add_special_tokens=False)
    # Reserve two positions for the BOS and EOS tokens.
    token_ids = token_ids[:tokenizer.model_max_length - 2]

    input_ids = torch.tensor(
        [tokenizer.bos_token_id] + token_ids + [tokenizer.eos_token_id]
    ).unsqueeze(0)
    attention_mask = torch.ones_like(input_ids)

    with torch.no_grad():
        logits = model(input_ids.to(device), attention_mask=attention_mask.to(device))[0]
        probs = logits.softmax(dim=-1)

    # Unpacking into exactly two names assumes a two-label classifier.
    fake, real = probs.detach().cpu().flatten().numpy().tolist()
    return real
def findRealProb(text):
    """Estimate how likely ``text`` is human-written, weighting chunks by length.

    The text is split with ``chunks_of_900``, every non-empty chunk is scored
    with ``predict``, and the scores are averaged with each chunk's character
    count as its weight.

    Args:
        text: The text to analyze.

    Returns:
        A dict with keys ``"Real"`` (weighted probability), ``"Fake"``
        (``1 - Real``), ``"results"`` (a list of ``[probability, length]``
        pairs, one per chunk) and ``"text"`` (the input).
        If there is nothing to score, a dict with a single ``"error"`` key is
        returned instead of dividing by a total length of zero.
    """
    # Zero-length chunks carry no weight, so they are not scored at all.
    results = [
        [predict(chunk), len(chunk)]
        for chunk in chunks_of_900(text)
        if chunk
    ]
    if not results:
        return {"error": "No text found to analyze"}

    weighted_sum = sum(prob * length for prob, length in results)
    total_length = sum(length for _, length in results)
    real_prob = weighted_sum / total_length
    return {"Real": real_prob, "Fake": 1 - real_prob, "results": results, "text": text}
def upload_file(file):
    """Extract text from an uploaded PDF and score it with ``findRealProb``.

    Only the first six pages are read, matching the page limit of the
    original loop.

    Args:
        file: The uploaded file object; its ``name`` attribute is passed to
            ``pdfplumber.open``. A falsy value means nothing was uploaded.

    Returns:
        The dict produced by ``findRealProb``, or a dict with a single
        ``"error"`` key when no file was supplied or no text could be
        extracted.
    """
    if not file:
        return {"error": 'No PDF file found in request'}

    max_pages = 6
    page_texts = []
    with pdfplumber.open(file.name) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            # extract_text may yield nothing for a page without a text layer;
            # guard against None so the concatenation cannot raise TypeError.
            page_text = page.extract_text(x_tolerance=1) or ""
            if page_text:
                page_texts.append(page_text)
            if page_number >= max_pages:
                break

    # Join pages with a space so the last word of one page does not fuse with
    # the first word of the next, then flatten line breaks as before.
    text = " ".join(page_texts).replace('\n', ' ')
    if not text.strip():
        return {"error": "No text could be extracted from the PDF"}
    return findRealProb(text)
# Gradio UI: a single file upload whose result is rendered as JSON.
demo = gr.Interface(
    fn=upload_file,
    inputs=gr.File(),
    # Use the top-level component, consistent with gr.File() above, instead
    # of the legacy gr.outputs namespace.
    outputs=gr.JSON(),
    article="Visit <a href = \"https://ai-content-detector.online/\">AI Content Detector</a> for better user experience!",
    # NOTE(review): newer Gradio releases reportedly no longer accept this
    # argument - confirm against the version pinned for this app.
    interpretation="default",
)
# Start the web server; the auto-generated API page is hidden.
demo.launch(show_api=False)