PirateXX committed on
Commit
3ced2ed
·
1 Parent(s): b7eaa44

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +88 -0
app.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flask import Flask, request
2
+ from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig
3
+ import torch
4
+ import gradio as gr
5
+ import os
6
+ import re
7
app = Flask(__name__)

# Access token for the private Hugging Face model repo; raises KeyError if the
# ACCESS_TOKEN environment variable is unset (fail-fast at startup).
ACCESS_TOKEN = os.environ["ACCESS_TOKEN"]
# Fine-tuned RoBERTa sequence classifier (2 labels: fake / real).
config = RobertaConfig.from_pretrained("PirateXX/ChatGPT-Text-Detector", use_auth_token= ACCESS_TOKEN)
model = RobertaForSequenceClassification.from_pretrained("PirateXX/ChatGPT-Text-Detector", use_auth_token= ACCESS_TOKEN, config = config)

# Tokenizer is loaded from the base roberta-base vocabulary, not the fine-tuned repo.
# NOTE(review): map_location is a torch.load kwarg, not a tokenizer kwarg — it is
# silently ignored here; presumably intended to force CPU loading. Confirm and remove.
model_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name, map_location=torch.device('cpu'))
15
+
16
# function to break text into an array of sentences
def text_to_sentences(text):
    """Split *text* into sentence fragments on '.', '!' and '?'.

    Bug fix: the original called re.sub without keeping its return value;
    since strings are immutable, the "insert a space after sentence-ending
    punctuation" step was a no-op. The result is now assigned back, so
    words fused across sentence boundaries ("Hi.There") are separated.
    """
    # Ensure a space follows ./!/? when the next char is not whitespace.
    text = re.sub(r'(?<=[.!?])(?=[^\s])', r' ', text)
    # Note: re.split keeps a trailing "" when text ends with a delimiter.
    return re.split(r'[.!?]', text)
20
+
21
# function to concatenate sentences into chunks of size 600 or less
def chunks_of_600(text, chunk_size=600):
    """Greedily pack the sentences of *text* into chunks of at most
    *chunk_size* characters.

    A single sentence longer than *chunk_size* still becomes its own
    chunk (sentences are never split). Bug fix: the original emitted
    empty-string chunks (e.g. for empty input, or when a first sentence
    alone exceeded the limit), which downstream scoring then divided by
    zero on — empty chunks are no longer appended.
    """
    sentences = text_to_sentences(text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk + sentence) <= chunk_size:
            current_chunk += sentence
        else:
            if current_chunk:  # guard: never emit an empty chunk
                chunks.append(current_chunk)
            current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk)
    return chunks
34
+
35
def predict(query, device="cpu"):
    """Return the probability that *query* is human-written ("real").

    The text is tokenized with the module-level tokenizer, truncated to
    the model's maximum length minus two (reserving room for the special
    tokens), wrapped in BOS/EOS, and classified by the module-level
    model. Softmax index 0 is treated as "fake", index 1 as "real".

    Cleanup: removed the unused `all_tokens`/`used_tokens` locals the
    original computed but never read.
    """
    tokens = tokenizer.encode(query)
    # Reserve two positions for the BOS/EOS special tokens added below.
    tokens = tokens[:tokenizer.model_max_length - 2]
    tokens = torch.tensor([tokenizer.bos_token_id] + tokens + [tokenizer.eos_token_id]).unsqueeze(0)
    mask = torch.ones_like(tokens)

    with torch.no_grad():
        logits = model(tokens.to(device), attention_mask=mask.to(device))[0]
        probs = logits.softmax(dim=-1)

    # Label order (fake, real) — presumably fixed by the fine-tuned head; verify
    # against the model card if labels ever look inverted.
    fake, real = probs.detach().cpu().flatten().numpy().tolist()
    return real
49
+
50
def findRealProb(text):
    """Score *text* and return its real/fake probabilities.

    The text is chunked with chunks_of_600(), each chunk is scored by
    predict(), and the per-chunk probabilities are combined as a
    length-weighted average.

    Fixes over the original:
    - the weighted average is normalised by the total length of the
      scored chunks, not len(text): the chunks do not contain the
      sentence delimiters, so dividing by len(text) systematically
      under-estimated the probability;
    - empty / unscorable input returns a neutral result instead of
      raising ZeroDivisionError.
    """
    chunksOfText = chunks_of_600(text)
    results = []
    for chunk in chunksOfText:
        output = predict(chunk)
        results.append([output, len(chunk)])

    total_weight = sum(length for _, length in results)
    if total_weight == 0:
        # Nothing scorable: report a neutral 50/50 instead of crashing.
        return {"Real": 0.5, "Fake": 0.5, "results": results, "text": text}

    weighted = sum(prob * length for prob, length in results)
    realProb = weighted / total_weight
    return {"Real": realProb, "Fake": 1 - realProb, "results": results, "text": text}
62
+
63
def upload_file(uploaded=None):
    """Extract text from an uploaded PDF and score it with findRealProb.

    Fixes over the original:
    - accepts the file as a direct argument, which is how Gradio invokes
      the fn for a gr.File input (the zero-argument signature made that
      call raise TypeError); the Flask `request.files['pdfFile']` path is
      kept as a fallback, so the old behaviour is preserved;
    - pdfplumber was used but never imported (NameError at first call) —
      imported locally here;
    - page.extract_text() can return None for pages with no text layer,
      which crashed the original's `+=` with TypeError — now skipped.
    """
    import pdfplumber  # local import: only used here and missing from the top-level imports

    pdf_source = uploaded
    if pdf_source is None and 'pdfFile' in request.files:
        pdf_source = request.files['pdfFile']
    if pdf_source is None:
        return {"error": 'No PDF file found in request'}

    text = ""
    with pdfplumber.open(pdf_source) as pdf:
        for cnt, page in enumerate(pdf.pages, start=1):
            page_text = page.extract_text(x_tolerance=1)
            if page_text:  # extract_text() may return None
                text += page_text
            print(text)
            # Stop after six pages (cnt>5 checked post-processing, as in the
            # original) to bound latency on large documents.
            if cnt > 5:
                break
    return findRealProb(text)
    # return jsonify({'text': text})
79
+
80
+
81
# Gradio UI: upload a PDF, get back the real/fake probability JSON.
# Bug fix: the original never closed the gr.Interface(...) call — the file
# ended in a SyntaxError and could not run at all.
demo = gr.Interface(
    fn=upload_file,
    inputs=gr.File(),
    article = "Visit <a href = \"https://ai-content-detector.online/\">AI Content Detector</a> for better user experience!",
    outputs=gr.outputs.JSON(),
    interpretation="default",
)

demo.launch(show_api=False)