# NOTE(review): removed Hugging Face Spaces page-scrape residue that preceded
# this line ("Runtime error" banner, file-size line, commit hashes, and the
# line-number gutter) — it was never part of the Python source.
from flask import Flask, request
from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig
import torch
import gradio as gr
import os
import re
import PyPDF2
import pdfplumber
# NOTE(review): `app` is never given any routes and Gradio serves the UI at the
# bottom of the file — the Flask app (and the `request` import) look like
# leftovers from an earlier Flask-based version; confirm before removing.
app = Flask(__name__)
# Hugging Face access token for the gated model repo; raises KeyError at import
# time if the environment variable is not set.
ACCESS_TOKEN = os.environ["ACCESS_TOKEN"]
# Sequence classifier fine-tuned to distinguish AI-generated from human text
# (index 0 = "fake"/AI, index 1 = "real"/human — see predict() below).
config = RobertaConfig.from_pretrained("PirateXX/ChatGPT-Text-Detector", use_auth_token= ACCESS_TOKEN)
model = RobertaForSequenceClassification.from_pretrained("PirateXX/ChatGPT-Text-Detector", use_auth_token= ACCESS_TOKEN, config = config)
model_name = "roberta-base"
# NOTE(review): `map_location` is a torch.load() argument, not a tokenizer
# option — from_pretrained ignores the unexpected kwarg, so this is harmless
# but misleading; presumably intended to force CPU-only loading. Confirm.
tokenizer = RobertaTokenizer.from_pretrained(model_name, map_location=torch.device('cpu'))
# function to break text into an array of sentences
def text_to_sentences(text):
    """Split *text* into sentence fragments on the terminators ``. ! ?``.

    A space is first inserted after any terminator that is not followed by
    whitespace, so "a.b" splits the same way as "a. b". The terminators
    themselves are dropped by ``re.split``, and the result may end with an
    empty string when the text ends with a terminator.
    """
    # Bug fix: the original called re.sub without binding its return value;
    # strings are immutable, so the substitution was silently a no-op.
    text = re.sub(r'(?<=[.!?])(?=[^\s])', r' ', text)
    return re.split(r'[.!?]', text)
# function to concatenate sentences into chunks of size 600 or less
def chunks_of_600(text, chunk_size=600):
    """Greedily pack the sentences of *text* into chunks of at most
    *chunk_size* characters.

    Sentences are never split: a single sentence longer than *chunk_size*
    still becomes its own chunk. The trailing buffer is always appended,
    so the result contains at least one (possibly empty) chunk.
    """
    chunks = []
    buffer = ""
    for sentence in text_to_sentences(text):
        if len(buffer) + len(sentence) <= chunk_size:
            buffer += sentence
        else:
            chunks.append(buffer)
            buffer = sentence
    chunks.append(buffer)
    return chunks
def predict(query, device="cpu"):
    """Return the model's probability that *query* is human-written ("real").

    The text is tokenized and truncated to the tokenizer's maximum length
    minus two (to leave room for the BOS/EOS special tokens) before being
    scored. The softmax over the two logits is unpacked as (fake, real).
    """
    token_ids = tokenizer.encode(query)
    all_tokens = len(token_ids)  # token count before truncation (not used downstream)
    token_ids = token_ids[:tokenizer.model_max_length - 2]
    used_tokens = len(token_ids)  # token count actually scored (not used downstream)
    input_ids = torch.tensor(
        [tokenizer.bos_token_id] + token_ids + [tokenizer.eos_token_id]
    ).unsqueeze(0)
    attention = torch.ones_like(input_ids)
    with torch.no_grad():
        logits = model(input_ids.to(device), attention_mask=attention.to(device))[0]
    probabilities = logits.softmax(dim=-1)
    fake, real = probabilities.detach().cpu().flatten().numpy().tolist()
    return real
def findRealProb(text):
    """Score *text* chunk by chunk and return an aggregate verdict.

    The text is split into <=600-character chunks, each chunk is scored by
    ``predict``, and the per-chunk probabilities are averaged weighted by
    chunk length.

    Returns a dict with the weighted "Real"/"Fake" probabilities, the raw
    per-chunk ``[probability, length]`` pairs, and the original text.
    """
    chunksOfText = chunks_of_600(text)
    results = []
    for chunk in chunksOfText:
        # (debug prints of each chunk removed)
        results.append([predict(chunk), len(chunk)])
    weighted_sum = 0
    total_length = 0
    for prob, length in results:
        weighted_sum += prob * length
        total_length += length
    # Bug fix: the original divided by len(text), but the chunks exclude the
    # .!? terminators, so sum(lengths) < len(text) and the weighted average
    # was systematically biased toward "Fake". Normalize by the length that
    # was actually scored, and guard against empty input (ZeroDivisionError).
    realProb = weighted_sum / total_length if total_length else 0
    return {"Real": realProb, "Fake": 1 - realProb, "results": results, "text": text}
def upload_file(file):
    """Gradio handler: extract text from an uploaded PDF and score it.

    Reads at most the first six pages (to bound latency on large PDFs),
    flattens newlines to spaces, and returns the verdict dict produced by
    ``findRealProb``. Returns an error dict when no file was supplied.

    (Two large blocks of commented-out legacy Flask/PyPDF2 code removed.)
    """
    # Guard clause instead of the original if/else pyramid.
    if not file:
        return {"error": 'No PDF file found in request'}
    text = ""
    with pdfplumber.open(file.name) as pdf:
        for page_number, page in enumerate(pdf.pages):
            page_text = page.extract_text(x_tolerance=1)
            # Bug fix: extract_text() returns None for pages with no
            # extractable text (e.g. scanned images); the original raised
            # TypeError on `text += None`. Skip such pages instead.
            if page_text:
                text += page_text
            if page_number >= 5:  # stop after 6 pages, as in the original
                break
    text = text.replace('\n', ' ')
    return findRealProb(text)
# Gradio UI: a single file-upload input feeding upload_file, JSON output.
demo = gr.Interface(
    fn=upload_file,
    inputs=gr.File(),
    outputs=gr.outputs.JSON(),
    interpretation="default",
    article="Visit <a href = \"https://ai-content-detector.online/\">AI Content Detector</a> for better user experience!",
)
# NOTE(review): `gr.outputs.JSON` and the `interpretation` kwarg were removed
# in Gradio 4 (use `gr.JSON()` and drop `interpretation`); this code targets
# the Gradio 3.x API — confirm the pinned version before upgrading.
# (A trailing "|" scrape artifact was removed from the launch line below —
# it was a syntax error, likely the cause of the Space's "Runtime error".)
demo.launch(show_api=False)