from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig
import torch
import gradio as gr
import os
import re
import pdfplumber

# Hugging Face access token used to download the detector model
ACCESS_TOKEN = os.environ["ACCESS_TOKEN"]

# Load the fine-tuned detector model and the base RoBERTa tokenizer
config = RobertaConfig.from_pretrained("PirateXX/ChatGPT-Text-Detector", use_auth_token=ACCESS_TOKEN)
model = RobertaForSequenceClassification.from_pretrained(
    "PirateXX/ChatGPT-Text-Detector", use_auth_token=ACCESS_TOKEN, config=config
)

model_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)


# function to break text into an array of sentences
def text_to_sentences(text):
    # ensure there is a space after every sentence-ending punctuation mark
    text = re.sub(r'(?<=[.!?])(?=[^\s])', r' ', text)
    return re.split(r'[.!?]', text)


# function to concatenate sentences into chunks of size 600 or less
def chunks_of_600(text, chunk_size=600):
    sentences = text_to_sentences(text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk + sentence) <= chunk_size:
            current_chunk += sentence
        else:
            chunks.append(current_chunk)
            current_chunk = sentence
    chunks.append(current_chunk)
    return chunks


def predict(query, device="cpu"):
    # tokenize without special tokens, truncate, then add <s> and </s> manually
    tokens = tokenizer.encode(query, add_special_tokens=False)
    tokens = tokens[:tokenizer.model_max_length - 2]
    tokens = torch.tensor([tokenizer.bos_token_id] + tokens + [tokenizer.eos_token_id]).unsqueeze(0)
    mask = torch.ones_like(tokens)
    with torch.no_grad():
        logits = model(tokens.to(device), attention_mask=mask.to(device))[0]
        probs = logits.softmax(dim=-1)
    # the classifier outputs two class probabilities: [fake, real]
    fake, real = probs.detach().cpu().flatten().numpy().tolist()
    return real


def findRealProb(text):
    # score each chunk separately and combine the scores as a length-weighted average
    chunksOfText = chunks_of_600(text)
    results = []
    for chunk in chunksOfText:
        output = predict(chunk)
        results.append([output, len(chunk)])

    ans = 0
    for prob, length in results:
        ans = ans + prob * length
    realProb = ans / len(text)
    return {"Real": realProb, "Fake": 1 - realProb, "results": results, "text": text}


def upload_file(file):
    if file:
        pdf_file = file.name
        text = ""
        # extract text from only the first few pages to keep latency bounded
        with pdfplumber.open(pdf_file) as pdf:
            cnt = 0
            for page in pdf.pages:
                cnt += 1
                text += page.extract_text(x_tolerance=1) or ""
                if cnt > 5:
                    break
        text = text.replace('\n', ' ')
        return findRealProb(text)
    else:
        return {"error": 'No PDF file found in request'}


demo = gr.Interface(
    fn=upload_file,
    inputs=gr.File(),
    outputs=gr.JSON(),
    article="Visit AI Content Detector for better user experience!",
)

demo.launch(show_api=False)
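
# Minimal local smoke test (hypothetical sketch, not part of the app itself).
# It assumes this file is saved as app.py and that ACCESS_TOKEN is set in the
# environment; it bypasses the Gradio UI and calls findRealProb directly:
#
#     from app import findRealProb
#     result = findRealProb("Paste a paragraph of text to score here.")
#     print(result["Real"], result["Fake"])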