minko186 committed on
Commit
029c7a1
·
1 Parent(s): fe15d80

refactored plagiarism checker

Browse files
Files changed (2) hide show
  1. app.py +1 -187
  2. plagiarism.py +340 -0
app.py CHANGED
@@ -32,6 +32,7 @@ from utils import cos_sim_torch, embed_text
32
  import multiprocessing
33
  from functools import partial
34
  import concurrent.futures
 
35
 
36
  nltk.download("punkt")
37
 
@@ -50,193 +51,6 @@ from writing_analysis import (
50
  np.set_printoptions(suppress=True)
51
 
52
 
53
- def plagiarism_check(
54
- plag_option,
55
- input,
56
- year_from,
57
- month_from,
58
- day_from,
59
- year_to,
60
- month_to,
61
- day_to,
62
- domains_to_skip,
63
- ):
64
- api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
65
- api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
66
- api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
67
- # api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
68
- api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
69
-
70
- cse_id = "851813e81162b4ed4"
71
-
72
- time1 = time.perf_counter()
73
- start = time.perf_counter()
74
- sentences = getSentences(input)
75
- urlCount = {}
76
- ScoreArray = []
77
- urlList = []
78
-
79
- date_from = build_date(year_from, month_from, day_from)
80
- date_to = build_date(year_to, month_to, day_to)
81
- sort_date = f"date:r:{date_from}:{date_to}"
82
-
83
- # get list of URLS to check
84
- urlCount, ScoreArray = googleSearch(
85
- plag_option,
86
- sentences,
87
- urlCount,
88
- ScoreArray,
89
- urlList,
90
- sort_date,
91
- domains_to_skip,
92
- api_key,
93
- cse_id,
94
- )
95
- print(f"Time for google search: {time.perf_counter()-time1}")
96
- time1 = time.perf_counter()
97
-
98
- print("Number of URLs: ", len(urlCount))
99
- print(urlList)
100
-
101
- # Scrape URLs in list
102
- formatted_tokens = []
103
- soups = asyncio.run(parallel_scrap(urlList))
104
-
105
- print(f"Time for scraping: {time.perf_counter()-time1}")
106
- time1 = time.perf_counter()
107
- print(len(soups))
108
- print(
109
- "Successful scraping: "
110
- + str(len([x for x in soups if x is not None]))
111
- + "out of "
112
- + str(len(urlList))
113
- )
114
-
115
- source_embeddings = []
116
- for i, soup in enumerate(soups):
117
- if soup:
118
- page_content = soup.text
119
- source_embeddings.append(embed_text(page_content))
120
- else:
121
- source_embeddings.append(None)
122
-
123
- # Populate matching scores for scrapped pages
124
- # for i, soup in enumerate(soups):
125
- # print(f"Analyzing {i+1} of {len(soups)} soups........................")
126
- # if soup:
127
- # page_content = soup.text
128
- # for j, sent in enumerate(sentences):
129
- # # score = matchingScore(sent, page_content)
130
- # # score = matchingScoreWithTimeout(sent, page_content)
131
- # score = cos_sim_torch(embed_text(sent), source_embeddings[i])
132
- # ScoreArray[i][j] = score
133
-
134
- def compute_cosine_similarity(args):
135
- sent, source_embedding, i, j = args
136
- score = cos_sim_torch(embed_text(sent), source_embedding)
137
- return i, j, score
138
-
139
- def main(soups, sentences):
140
- source_embeddings = [preprocess(soup) for soup in soups]
141
- ScoreArray = [[0 for _ in sentences] for _ in soups]
142
- args_list = []
143
- for i, soup in enumerate(soups):
144
- if soup:
145
- for j, sent in enumerate(sentences):
146
- args_list.append((sent, source_embeddings[i], i, j))
147
- with concurrent.futures.ProcessPoolExecutor() as executor:
148
- results = executor.map(compute_cosine_similarity, args_list)
149
- for i, j, score in results:
150
- ScoreArray[i][j] = score
151
- return ScoreArray
152
-
153
- ScoreArray = main(soups, sentences)
154
-
155
- print(f"Time for matching score: {time.perf_counter()-time1}")
156
- time1 = time.perf_counter()
157
-
158
- # ScoreArray = asyncio.run(parallel_analyze_2(soups, sentences, ScoreArray))
159
- # print("New Score Array:\n")
160
- # print2D(ScoreArray)
161
-
162
- # Gradio formatting section
163
- sentencePlag = [False] * len(sentences)
164
- sentenceToMaxURL = [-1] * len(sentences)
165
- for j in range(len(sentences)):
166
- if j > 0:
167
- maxScore = ScoreArray[sentenceToMaxURL[j - 1]][j]
168
- sentenceToMaxURL[j] = sentenceToMaxURL[j - 1]
169
- else:
170
- maxScore = -1
171
- for i in range(len(ScoreArray)):
172
- margin = (
173
- 0.1
174
- if (j > 0 and sentenceToMaxURL[j] == sentenceToMaxURL[j - 1])
175
- else 0
176
- )
177
- if ScoreArray[i][j] - maxScore > margin:
178
- maxScore = ScoreArray[i][j]
179
- sentenceToMaxURL[j] = i
180
- if maxScore > 0.5:
181
- sentencePlag[j] = True
182
-
183
- if (
184
- (len(sentences) > 1)
185
- and (sentenceToMaxURL[1] != sentenceToMaxURL[0])
186
- and (
187
- ScoreArray[sentenceToMaxURL[0]][0]
188
- - ScoreArray[sentenceToMaxURL[1]][0]
189
- < 0.1
190
- )
191
- ):
192
- sentenceToMaxURL[0] = sentenceToMaxURL[1]
193
-
194
- index = np.unique(sentenceToMaxURL)
195
-
196
- urlScore = {}
197
- for url in index:
198
- s = [
199
- ScoreArray[url][sen]
200
- for sen in range(len(sentences))
201
- if sentenceToMaxURL[sen] == url
202
- ]
203
- urlScore[url] = sum(s) / len(s)
204
-
205
- index_descending = sorted(urlScore, key=urlScore.get, reverse=True)
206
-
207
- urlMap = {}
208
- for count, i in enumerate(index_descending):
209
- urlMap[i] = count + 1
210
- for i, sent in enumerate(sentences):
211
- formatted_tokens.append(
212
- (sent, "[" + str(urlMap[sentenceToMaxURL[i]]) + "]")
213
- )
214
-
215
- formatted_tokens.append(("\n", None))
216
- formatted_tokens.append(("\n", None))
217
- formatted_tokens.append(("\n", None))
218
-
219
- print(formatted_tokens)
220
- print(index_descending)
221
-
222
- for ind in index_descending:
223
- formatted_tokens.append(
224
- (
225
- urlList[ind]
226
- + " --- Matching Score: "
227
- + f"{str(round(urlScore[ind] * 100, 2))}%",
228
- "[" + str(urlMap[ind]) + "]",
229
- )
230
- )
231
- formatted_tokens.append(("\n", None))
232
-
233
- print(f"Formatted Tokens: {formatted_tokens}")
234
-
235
- print(f"Time for plagiarism check: {time.perf_counter()-start}")
236
-
237
- return formatted_tokens
238
-
239
-
240
  """
241
  AI DETECTION SECTION
242
  """
 
32
  import multiprocessing
33
  from functools import partial
34
  import concurrent.futures
35
+ from plagiarism import plagiarism_check
36
 
37
  nltk.download("punkt")
38
 
 
51
  np.set_printoptions(suppress=True)
52
 
53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  """
55
  AI DETECTION SECTION
56
  """
plagiarism.py CHANGED
@@ -0,0 +1,340 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ from nltk.tokenize import sent_tokenize
3
+ from googleapiclient.discovery import build
4
+ from collections import Counter
5
+ import re, math
6
+ from sentence_transformers import SentenceTransformer, util
7
+ import asyncio
8
+ import httpx
9
+ from bs4 import BeautifulSoup
10
+ import numpy as np
11
+
12
+
13
+ WORD = re.compile(r"\w+")
14
+ model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
15
+
16
+
17
def get_cosine(vec1, vec2):
    """Return the cosine similarity of two sparse term-weight vectors.

    Args:
        vec1: Mapping from term to numeric weight (e.g. a ``Counter``).
        vec2: Mapping from term to numeric weight.

    Returns:
        A float: the dot product over the terms present in both mappings
        divided by the product of the two Euclidean norms, or ``0.0`` when
        that product is zero (for example when either mapping is empty).
    """
    # Only terms that occur in both vectors contribute to the dot product.
    numerator = sum(vec1[term] * vec2[term] for term in vec1 if term in vec2)

    norm1 = math.sqrt(sum(weight ** 2 for weight in vec1.values()))
    norm2 = math.sqrt(sum(weight ** 2 for weight in vec2.values()))
    denominator = norm1 * norm2

    # A zero-norm vector has no direction; avoid dividing by zero.
    if denominator == 0:
        return 0.0
    return float(numerator) / denominator
36
+
37
+
38
def text_to_vector(text):
    """Turn *text* into a bag-of-words vector.

    Every match of the module-level ``WORD`` pattern in *text* is counted;
    the result is a ``Counter`` mapping each matched word to the number of
    times it occurs.
    """
    return Counter(WORD.findall(text))
44
+
45
+
46
def cosineSim(text1, text2):
    """Return the bag-of-words cosine similarity of two texts.

    Both texts are converted with ``text_to_vector`` and the resulting
    word-count vectors are compared with ``get_cosine``.
    """
    return get_cosine(text_to_vector(text1), text_to_vector(text2))
54
+
55
+
56
def cos_sim_torch(embedding_1, embedding_2):
    """Return the cosine similarity of two embeddings as a Python scalar.

    The similarity is computed with ``util.pytorch_cos_sim`` and unwrapped
    with ``.item()``.
    """
    # NOTE(review): .item() presumably requires a single-value result, i.e.
    # one embedding per argument - confirm against callers.
    similarity = util.pytorch_cos_sim(embedding_1, embedding_2)
    return similarity.item()
58
+
59
+
60
def embed_text(text):
    """Encode *text* with the module-level ``model`` and return a tensor."""
    embedding = model.encode(text, convert_to_tensor=True)
    return embedding
62
+
63
+
64
def sentence_similarity(text1, text2):
    """Return the embedding-based cosine similarity of two texts.

    Each text is encoded with ``embed_text`` (the module-level model,
    ``convert_to_tensor=True``) and the two embeddings are compared with
    ``cos_sim_torch``, which yields a Python scalar.
    """
    return cos_sim_torch(embed_text(text1), embed_text(text2))
70
+
71
+
72
def _clean_snippet(snippet):
    """Strip a leading and a trailing "..." fragment from a search snippet."""
    # A marker found at offsets 10-19 is treated as the end of a short
    # leading fragment; drop the fragment together with the marker.
    ind = snippet.find("...")
    if 9 < ind < 20:
        snippet = snippet[ind + len("... "):]
    # A marker within the last few characters is a trailing ellipsis. The
    # explicit -1 check keeps short snippets without any marker intact.
    ind = snippet.find("...")
    if ind != -1 and ind > len(snippet) - 5:
        snippet = snippet[:ind]
    return snippet


def google_search(
    plag_option,
    sentences,
    urlCount,
    scoreArray,
    urlList,
    sorted_date,
    domains_to_skip,
    api_key,
    cse_id,
    **kwargs,
):
    """Search each sentence and score the returned snippets against it.

    For every sentence the Custom Search service is queried and the first
    three results are inspected. Results whose link contains ".<domain>" for
    any entry of *domains_to_skip* are ignored (they still count towards the
    three). Each remaining URL gets a row in *scoreArray* (one column per
    sentence) holding the similarity between the sentence and the cleaned
    result snippet.

    Args:
        plag_option: ``"Standard"`` selects ``cosineSim``; any other value
            selects ``sentence_similarity``.
        sentences: Sequence of query strings.
        urlCount: Dict mapping URL to hit count; updated in place.
        scoreArray: List of score rows, parallel to *urlList*; updated in
            place.
        urlList: List of known URLs; new URLs are appended in place.
        sorted_date: Value passed as the ``sort`` parameter of the query.
        domains_to_skip: Iterable of domain strings to exclude; ``None`` is
            treated as empty.
        api_key: Developer key for the search service.
        cse_id: Custom search engine id (``cx``).
        **kwargs: Extra parameters forwarded to ``cse().list``.

    Returns:
        The ``(urlCount, scoreArray)`` pair (the same objects passed in).
    """
    service = build("customsearch", "v1", developerKey=api_key)
    skipped_domains = domains_to_skip or []
    similarity = cosineSim if plag_option == "Standard" else sentence_similarity

    # URL -> row index, so a row lookup is O(1) instead of urlList.index().
    # setdefault keeps the first occurrence, matching list.index semantics.
    row_of = {}
    for row, known_url in enumerate(urlList):
        row_of.setdefault(known_url, row)

    for i, sentence in enumerate(sentences):
        results = (
            service.cse()
            .list(q=sentence, cx=cse_id, sort=sorted_date, **kwargs)
            .execute()
        )
        # stop after 3 results
        for link in results.get("items", [])[:3]:
            url = link["link"]
            # skip user selected domains
            if any(("." + domain) in url for domain in skipped_domains):
                continue
            # A result without a snippet is scored against an empty string.
            snippet = _clean_snippet(link.get("snippet", ""))

            if url not in row_of:
                row_of[url] = len(urlList)
                urlList.append(url)
                scoreArray.append([0] * len(sentences))
            urlCount[url] = urlCount.get(url, 0) + 1
            scoreArray[row_of[url]][i] = similarity(sentence, snippet)
    return urlCount, scoreArray
125
+
126
+
127
def split_sentence_blocks(text):
    """Split *text* into blocks of up to two consecutive sentences.

    The text is tokenized with ``sent_tokenize``; sentences are then paired
    in order and joined with a single space. With an odd number of sentences
    the last block holds one sentence.
    """
    sentences = sent_tokenize(text)
    return [
        " ".join(sentences[start:start + 2])
        for start in range(0, len(sentences), 2)
    ]
137
+
138
+
139
# English month name -> two-digit month number.
months = {
    "January": "01",
    "February": "02",
    "March": "03",
    "April": "04",
    "May": "05",
    "June": "06",
    "July": "07",
    "August": "08",
    "September": "09",
    "October": "10",
    "November": "11",
    "December": "12",
}


def build_date(year=2024, month="March", day=1):
    """Return the date as a ``YYYYMMDD`` string.

    Args:
        year: Year as an int or numeric string.
        month: English month name; must be a key of ``months``.
        day: Day of month as an int or numeric string.

    Returns:
        An eight-character string such as ``"20240301"``.

    Raises:
        KeyError: If *month* is not a key of ``months``.
        ValueError: If *year* or *day* cannot be converted to ``int``.
    """
    # The day (and year) must be zero-padded: without padding, day 1 gave a
    # seven-digit value such as "2024031", which is not a YYYYMMDD date.
    return f"{int(year):04d}{months[month]}{int(day):02d}"
157
+
158
+
159
async def get_url_data(url, client):
    """Fetch *url* with *client* and parse the response body as HTML.

    Returns a ``BeautifulSoup`` document (``html.parser``) when the response
    status is 200. Returns ``None`` for any other status and when any
    exception is raised while fetching or parsing (best-effort scraping).
    """
    try:
        response = await client.get(url)
        if response.status_code != 200:
            return None
        return BeautifulSoup(response.content, "html.parser")
    except Exception:
        return None
169
+
170
+
171
def remove_punc(text):
    """Return *text* with every character that is neither a word character
    nor whitespace removed."""
    return re.sub(r"[^\w\s]", "", text)
174
+
175
+
176
def split_ngrams(text, n):
    """Return every run of *n* consecutive whitespace-separated words.

    Each n-gram is a list of words; the n-grams are returned in order of
    their starting position. A text with fewer than *n* words yields an
    empty list.
    """
    words = text.split()
    ngrams = []
    for start in range(len(words) - n + 1):
        ngrams.append(words[start:start + n])
    return ngrams
180
+
181
+
182
async def parallel_scrap(urls):
    """Fetch all *urls* concurrently through one shared HTTP client.

    A single ``httpx.AsyncClient`` with a 30 second timeout is used for
    every request. Returns the ``asyncio.gather`` results in the order of
    *urls*; because ``return_exceptions=True`` is set, an entry may be an
    exception object instead of a ``get_url_data`` result.
    """
    async with httpx.AsyncClient(timeout=30) as client:
        fetches = [get_url_data(url=url, client=client) for url in urls]
        return await asyncio.gather(*fetches, return_exceptions=True)
189
+
190
+
191
def matching_score(sentence, content):
    """Score how much of *sentence* appears verbatim in *content*.

    Punctuation is removed from both texts and whitespace runs are collapsed
    to single spaces before comparing.

    Args:
        sentence: The text chunk to look for.
        content: The text to search in (e.g. the text of a scraped page).

    Returns:
        ``1`` if the whole normalised sentence occurs in the content;
        otherwise the fraction of the sentence's 5-word n-grams that occur in
        the content; ``0`` when the sentence has no words left after
        normalisation or has fewer than five words.
    """
    # Scraped text contains line breaks and repeated spaces; n-grams are
    # joined with single spaces, so both sides must be normalised alike.
    sentence = " ".join(remove_punc(sentence).split())
    content = " ".join(remove_punc(content).split())

    # An empty string is a substring of everything; without this guard a
    # punctuation-only sentence would be reported as a perfect match.
    if not sentence:
        return 0
    if sentence in content:
        return 1

    n = 5
    ngrams = split_ngrams(sentence, n)
    if not ngrams:
        return 0
    matched = sum(1 for gram in ngrams if " ".join(gram) in content)
    return matched / len(ngrams)
203
+
204
+
205
def plagiarism_check(
    plag_option,
    input,
    year_from,
    month_from,
    day_from,
    year_to,
    month_to,
    day_to,
    domains_to_skip,
):
    """Find the most likely web source for each chunk of the input text.

    The input is split into sentence blocks, each block is searched with
    ``google_search`` inside the given date range, the resulting URLs are
    scraped, and every block is scored against every scraped page with
    ``matching_score``. Each block is then attributed to one URL.

    Args:
        plag_option: Similarity mode forwarded to ``google_search``.
        input: The text to check.
        year_from, month_from, day_from: Start of the date range, forwarded
            to ``build_date``.
        year_to, month_to, day_to: End of the date range, forwarded to
            ``build_date``.
        domains_to_skip: Domains to exclude, forwarded to ``google_search``.

    Returns:
        A list of ``(text, label)`` tuples: one ``(block, "[k]")`` entry per
        sentence block, followed by, for each source in descending score
        order, a ``("<url> --- Matching Score: <pct>%", "[k]")`` entry and a
        ``("\\n", None)`` separator. When the search yields no URLs, the
        blocks are returned with a ``None`` label and no source entries.
    """
    import os  # function-scope import keeps this block self-contained

    # SECURITY: credentials should not live in source control. The
    # environment variables take precedence; the literals below are only the
    # previous defaults, kept for backward compatibility.
    # TODO: revoke the committed key and drop the fallbacks.
    api_key = os.environ.get(
        "GOOGLE_API_KEY", "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
    )
    cse_id = os.environ.get("GOOGLE_CSE_ID", "851813e81162b4ed4")

    sentences = split_sentence_blocks(input)
    urlCount = {}
    ScoreArray = []
    urlList = []
    date_from = build_date(year_from, month_from, day_from)
    date_to = build_date(year_to, month_to, day_to)
    sort_date = f"date:r:{date_from}:{date_to}"

    # get list of URLS to check
    urlCount, ScoreArray = google_search(
        plag_option,
        sentences,
        urlCount,
        ScoreArray,
        urlList,
        sort_date,
        domains_to_skip,
        api_key,
        cse_id,
    )

    # Without any candidate URL there is nothing to attribute; the indexing
    # below would raise IndexError on the empty score table.
    if not ScoreArray:
        return [(sent, None) for sent in sentences]

    # Scrape URLs in list
    formatted_tokens = []
    soups = asyncio.run(parallel_scrap(urlList))

    # Populate matching scores for scraped pages. Pages that could not be
    # fetched keep the snippet-based scores set by google_search.
    for i, soup in enumerate(soups):
        print(f"Analyzing {i+1} of {len(soups)} soups........................")
        # gather(return_exceptions=True) can hand back exception objects.
        if soup is None or isinstance(soup, BaseException):
            continue
        page_content = soup.text
        for j, sent in enumerate(sentences):
            ScoreArray[i][j] = matching_score(sent, page_content)

    # NOTE: an embedding-based scoring path (embed_text / cos_sim_torch run
    # in a process pool) was disabled here; it needs a preprocess() function
    # that this module does not define.

    # Calculate URL of max matching score for each sentence chunk
    sentenceToMaxURL = [-1] * len(sentences)
    for j in range(len(sentences)):
        if j > 0:
            # Start from the previous chunk's source so that consecutive
            # chunks stay on the same URL unless another one clearly wins.
            maxScore = ScoreArray[sentenceToMaxURL[j - 1]][j]
            sentenceToMaxURL[j] = sentenceToMaxURL[j - 1]
        else:
            maxScore = -1

        for i in range(len(ScoreArray)):
            # While the current choice is still the previous chunk's source,
            # a challenger must beat it by more than 0.1.
            margin = (
                0.1
                if (j > 0 and sentenceToMaxURL[j] == sentenceToMaxURL[j - 1])
                else 0
            )
            if ScoreArray[i][j] - maxScore > margin:
                maxScore = ScoreArray[i][j]
                sentenceToMaxURL[j] = i

    index = np.unique(sentenceToMaxURL)

    # Average score of each selected URL over the chunks attributed to it.
    urlScore = {}
    for url in index:
        s = [
            ScoreArray[url][sen]
            for sen in range(len(sentences))
            if sentenceToMaxURL[sen] == url
        ]
        urlScore[url] = sum(s) / len(s)

    index_descending = sorted(urlScore, key=urlScore.get, reverse=True)

    # 1-based display rank of each selected URL, best score first.
    urlMap = {}
    for count, i in enumerate(index_descending):
        urlMap[i] = count + 1
    for i, sent in enumerate(sentences):
        formatted_tokens.append(
            (sent, "[" + str(urlMap[sentenceToMaxURL[i]]) + "]")
        )
    for ind in index_descending:
        formatted_tokens.append(
            (
                urlList[ind]
                + " --- Matching Score: "
                + f"{str(round(urlScore[ind] * 100, 2))}%",
                "[" + str(urlMap[ind]) + "]",
            )
        )
        formatted_tokens.append(("\n", None))

    return formatted_tokens