# Source: Hugging Face Space "PirateXX" — app.py (commit b98f10f)
from flask import Flask, request
from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import RobertaConfig
from torch import cuda
import torch
import gradio as gr
import os
import re
import pdfplumber
app = Flask(__name__)

# Hugging Face access token for the private model repo; os.environ[...] raises
# KeyError at import time if the variable is not set.
ACCESS_TOKEN = os.environ["ACCESS_TOKEN"]

# Earlier experiments kept for reference:
# config = RobertaConfig.from_pretrained("PirateXX/ChatGPT-Text-Detector", use_auth_token= ACCESS_TOKEN)
# model = RobertaForSequenceClassification.from_pretrained("PirateXX/ChatGPT-Text-Detector", use_auth_token= ACCESS_TOKEN, config = config)
# model_name = "roberta-base"
# tokenizer = RobertaTokenizer.from_pretrained(model_name, map_location=torch.device('cpu'))

# Download tokenizer + sequence-classification model at import time (network I/O).
# NOTE(review): `use_auth_token` is deprecated in newer transformers in favour of
# `token=` — confirm the pinned transformers version before changing.
tokenizer = AutoTokenizer.from_pretrained("PirateXX/AI-Content-Detector", use_auth_token= ACCESS_TOKEN)
model = AutoModelForSequenceClassification.from_pretrained("PirateXX/AI-Content-Detector", use_auth_token= ACCESS_TOKEN)
# function to break text into an array of sentences
def text_to_sentences(text):
    """Split *text* into sentences.

    A boundary is one or more spaces preceded by a sentence ending
    (a non-uppercase char, any char, then '.' or '?') and followed by an
    uppercase letter; the lookbehind keeps abbreviations like "Dr." intact.
    """
    sentence_boundary = re.compile(r'(?<=[^A-Z].[.?]) +(?=[A-Z])')
    return sentence_boundary.split(text)
# function to concatenate sentences into chunks of size 900 or less
def chunks_of_900(text, chunk_size=900):
sentences = text_to_sentences(text)
chunks = []
current_chunk = ""
for sentence in sentences:
if len(current_chunk + sentence) <= chunk_size:
if len(current_chunk)!=0:
current_chunk += " "+sentence
else:
current_chunk += sentence
else:
chunks.append(current_chunk)
current_chunk = sentence
chunks.append(current_chunk)
return chunks
def predict(query, device="cpu"):
    """Run the detector model on *query* and return the probability it is real (human-written).

    The token sequence is truncated so that, with BOS/EOS added, it fits the
    model's maximum context length.

    Fixes vs. original: removed dead locals (`all_tokens`, `used_tokens`) that
    were computed and never used.

    NOTE(review): `tokenizer.encode` may already insert special tokens for this
    tokenizer, which would duplicate BOS/EOS here — confirm with
    add_special_tokens behaviour of the pinned tokenizer.
    NOTE(review): the model itself is never moved to *device*; only the inputs
    are, so any device other than the model's own will fail — verify intent.
    """
    tokens = tokenizer.encode(query)
    # leave room for the BOS and EOS tokens added below
    tokens = tokens[:tokenizer.model_max_length - 2]
    tokens = torch.tensor([tokenizer.bos_token_id] + tokens + [tokenizer.eos_token_id]).unsqueeze(0)
    mask = torch.ones_like(tokens)
    with torch.no_grad():
        logits = model(tokens.to(device), attention_mask=mask.to(device))[0]
    probs = logits.softmax(dim=-1)
    # assumes the label order is [fake, real] — TODO confirm against model.config.id2label
    fake, real = probs.detach().cpu().flatten().numpy().tolist()
    return real
def findRealProb(text):
    """Score *text* and return {"Real": p, "Fake": 1-p, "results": [...], "text": text}.

    The text is split into <=900-char chunks, each chunk is scored with
    `predict`, and the per-chunk probabilities are averaged weighted by chunk
    length.

    Fixes vs. original: guards against ZeroDivisionError when the total chunk
    length is 0 (empty or whitespace-only input) — in that case "Real"/"Fake"
    are returned as None instead of crashing.
    """
    chunksOfText = chunks_of_900(text)
    results = []
    for chunk in chunksOfText:
        output = predict(chunk)
        results.append([output, len(chunk)])

    total_length = sum(length for _, length in results)
    if total_length == 0:
        # nothing to score — avoid dividing by zero
        return {"Real": None, "Fake": None, "results": results, "text": text}

    weighted_sum = sum(prob * length for prob, length in results)
    realProb = weighted_sum / total_length
    return {"Real": realProb, "Fake": 1-realProb, "results": results, "text": text}
def upload_file(file):
    """Gradio handler: extract text from an uploaded PDF and return the detector verdict.

    Only the first 6 pages are read (to bound latency).  Returns the dict from
    `findRealProb`, or {"error": ...} when no file was provided.

    Fixes vs. original: `page.extract_text` returns None for image-only pages,
    which crashed the `text +=` concatenation with a TypeError; treat None as
    an empty string instead.
    """
    if not file:
        return {"error":'No PDF file found in request'}
    text = ""
    with pdfplumber.open(file.name) as pdf:
        # original loop broke after the 6th page; keep that cap
        for page in pdf.pages[:6]:
            # extract_text may return None (e.g. scanned/image-only pages)
            text += page.extract_text(x_tolerance=1) or ""
    text = text.replace('\n', ' ')
    return findRealProb(text)
# Gradio UI: a single PDF upload, answered with the JSON verdict from findRealProb.
# NOTE(review): `gr.outputs.JSON()` and `interpretation=` exist only in Gradio 3.x
# and were removed in 4.x — verify the pinned gradio version before upgrading.
demo = gr.Interface(
    fn=upload_file,
    inputs=gr.File(),
    article = "Visit <a href = \"https://ai-content-detector.online/\">AI Content Detector</a> for better user experience!",
    outputs=gr.outputs.JSON(),
    interpretation="default",)
# show_api=False hides the auto-generated API docs page
demo.launch(show_api=False)