from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig
import torch
import gradio as gr
import os
import re
import pdfplumber

# A Hugging Face access token is expected in the ACCESS_TOKEN environment
# variable (a Space secret) so the detector weights can be downloaded.
ACCESS_TOKEN = os.environ["ACCESS_TOKEN"]
config = RobertaConfig.from_pretrained("PirateXX/ChatGPT-Text-Detector", use_auth_token=ACCESS_TOKEN)
model = RobertaForSequenceClassification.from_pretrained("PirateXX/ChatGPT-Text-Detector", use_auth_token=ACCESS_TOKEN, config=config)

# The detector is a fine-tuned RoBERTa model, so the stock roberta-base
# tokenizer is used.
model_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)

# function to break text into a list of sentence fragments: first ensure a
# space follows end-of-sentence punctuation, then split on that punctuation
def text_to_sentences(text):
    text = re.sub(r'(?<=[.!?])(?=[^\s])', r' ', text)
    return re.split(r'[.!?]', text)
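
# Illustrative behaviour (not part of the app flow):
#     text_to_sentences("Hi.How are you?")  ->  ['Hi', ' How are you', '']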

# function to concatenate sentences into chunks of at most chunk_size (600 by default) characters
def chunks_of_600(text, chunk_size=600):
    sentences = text_to_sentences(text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk + sentence) <= chunk_size:
            current_chunk += sentence
        else:
            chunks.append(current_chunk)
            current_chunk = sentence
    chunks.append(current_chunk)
    return chunks
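
# Quick sanity check (illustrative only):
#     chunks_of_600("First sentence. Second sentence.", chunk_size=20)
#     -> ['First sentence', ' Second sentence']
# Note: the sentence split drops terminal punctuation, so the summed chunk
# lengths slightly undercount len(text).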
    
def predict(query, device="cpu"):
    tokens = tokenizer.encode(query)
    # Truncate so the sequence still fits once BOS/EOS are added.
    tokens = tokens[:tokenizer.model_max_length - 2]
    tokens = torch.tensor([tokenizer.bos_token_id] + tokens + [tokenizer.eos_token_id]).unsqueeze(0)
    mask = torch.ones_like(tokens)

    with torch.no_grad():
        logits = model(tokens.to(device), attention_mask=mask.to(device))[0]
        probs = logits.softmax(dim=-1)

    # Assumes the model's two output labels are ordered [fake, real].
    fake, real = probs.detach().cpu().flatten().numpy().tolist()
    return real
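
# predict() returns the probability that a chunk is human-written, e.g.
#     predict("Some paragraph of text.")  ->  0.87   (illustrative value)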

def findRealProb(text):
    chunksOfText = chunks_of_600(text)
    results = []
    for chunk in chunksOfText:
        output = predict(chunk)
        results.append([output, len(chunk)])

    # Length-weighted average of the per-chunk probabilities. Normalising by
    # the summed chunk lengths (rather than len(text)) keeps the weights
    # consistent, since sentence splitting drops punctuation characters.
    total_length = sum(length for _, length in results)
    ans = 0
    for prob, length in results:
        ans = ans + prob * length
    realProb = ans / total_length if total_length > 0 else 0
    return {"Real": realProb, "Fake": 1 - realProb, "results": results, "text": text}

def upload_file(file):
    if file:
        pdf_file = file.name
        text = ""
        with pdfplumber.open(pdf_file) as pdf:
            # Only scan the first few pages to keep latency bounded.
            cnt = 0
            for page in pdf.pages:
                cnt += 1
                # extract_text() can return None for image-only pages.
                text += page.extract_text(x_tolerance=1) or ""
                if cnt > 5:
                    break
            text = text.replace('\n', ' ')
            return findRealProb(text)
    else:
        return {"error": 'No PDF file found in request'}


demo = gr.Interface(
    fn=upload_file,
    inputs=gr.File(),
    outputs=gr.JSON(),
    article="Visit <a href=\"https://ai-content-detector.online/\">AI Content Detector</a> for better user experience!",
)

demo.launch(show_api=False)
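
# Local smoke test without the UI (a minimal sketch; "sample.pdf" is a
# hypothetical path, and SimpleNamespace stands in for the upload object
# Gradio passes to upload_file):
#     import types
#     fake_upload = types.SimpleNamespace(name="sample.pdf")
#     print(upload_file(fake_upload))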