Spaces:
Sleeping
Sleeping
change highlight from gradio to html
Browse files- app.py +34 -14
- plagiarism.py +135 -58
app.py
CHANGED
@@ -4,7 +4,7 @@ from datetime import date
|
|
4 |
from predictors import predict_bc_scores, predict_mc_scores, predict_1on1_scores
|
5 |
from analysis import depth_analysis
|
6 |
from predictors import predict_quillbot
|
7 |
-
from plagiarism import plagiarism_check, build_date
|
8 |
from highlighter import analyze_and_highlight
|
9 |
from utils import extract_text_from_pdf, len_validator
|
10 |
import yaml
|
@@ -20,7 +20,9 @@ model_list = params["MC_OUTPUT_LABELS"]
|
|
20 |
|
21 |
|
22 |
analyze_and_highlight_bc = partial(analyze_and_highlight, model_type="bc")
|
23 |
-
analyze_and_highlight_quillbot = partial(
|
|
|
|
|
24 |
|
25 |
|
26 |
def ai_generated_test(option, input, models):
|
@@ -46,7 +48,18 @@ def main(
|
|
46 |
domains_to_skip,
|
47 |
):
|
48 |
|
49 |
-
formatted_tokens = plagiarism_check(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
plag_option,
|
51 |
input,
|
52 |
year_from,
|
@@ -211,15 +224,19 @@ with gr.Blocks() as demo:
|
|
211 |
|
212 |
with gr.Row():
|
213 |
with gr.Column():
|
214 |
-
sentenceBreakdown = gr.HighlightedText(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
215 |
label="Source Detection Sentence Breakdown",
|
216 |
-
|
217 |
-
color_map={
|
218 |
-
"[1]": "red",
|
219 |
-
"[2]": "orange",
|
220 |
-
"[3]": "yellow",
|
221 |
-
"[4]": "green",
|
222 |
-
},
|
223 |
)
|
224 |
|
225 |
with gr.Row():
|
@@ -268,7 +285,8 @@ with gr.Blocks() as demo:
|
|
268 |
)
|
269 |
|
270 |
only_plagiarism_btn.click(
|
271 |
-
fn=plagiarism_check,
|
|
|
272 |
inputs=[
|
273 |
plag_option,
|
274 |
input_text,
|
@@ -311,5 +329,7 @@ with gr.Blocks() as demo:
|
|
311 |
date_to = ""
|
312 |
|
313 |
|
314 |
-
if __name__ == "__main__":
|
315 |
-
demo.launch(
|
|
|
|
|
|
4 |
from predictors import predict_bc_scores, predict_mc_scores, predict_1on1_scores
|
5 |
from analysis import depth_analysis
|
6 |
from predictors import predict_quillbot
|
7 |
+
from plagiarism import plagiarism_check, build_date, html_highlight
|
8 |
from highlighter import analyze_and_highlight
|
9 |
from utils import extract_text_from_pdf, len_validator
|
10 |
import yaml
|
|
|
20 |
|
21 |
|
22 |
analyze_and_highlight_bc = partial(analyze_and_highlight, model_type="bc")
|
23 |
+
analyze_and_highlight_quillbot = partial(
|
24 |
+
analyze_and_highlight, model_type="quillbot"
|
25 |
+
)
|
26 |
|
27 |
|
28 |
def ai_generated_test(option, input, models):
|
|
|
48 |
domains_to_skip,
|
49 |
):
|
50 |
|
51 |
+
# formatted_tokens = plagiarism_check(
|
52 |
+
# plag_option,
|
53 |
+
# input,
|
54 |
+
# year_from,
|
55 |
+
# month_from,
|
56 |
+
# day_from,
|
57 |
+
# year_to,
|
58 |
+
# month_to,
|
59 |
+
# day_to,
|
60 |
+
# domains_to_skip,
|
61 |
+
# )
|
62 |
+
formatted_tokens = html_highlight(
|
63 |
plag_option,
|
64 |
input,
|
65 |
year_from,
|
|
|
224 |
|
225 |
with gr.Row():
|
226 |
with gr.Column():
|
227 |
+
# sentenceBreakdown = gr.HighlightedText(
|
228 |
+
# label="Source Detection Sentence Breakdown",
|
229 |
+
# combine_adjacent=True,
|
230 |
+
# color_map={
|
231 |
+
# "[1]": "red",
|
232 |
+
# "[2]": "orange",
|
233 |
+
# "[3]": "yellow",
|
234 |
+
# "[4]": "green",
|
235 |
+
# },
|
236 |
+
# )
|
237 |
+
sentenceBreakdown = gr.HTML(
|
238 |
label="Source Detection Sentence Breakdown",
|
239 |
+
value="Source Detection Sentence Breakdown",
|
|
|
|
|
|
|
|
|
|
|
|
|
240 |
)
|
241 |
|
242 |
with gr.Row():
|
|
|
285 |
)
|
286 |
|
287 |
only_plagiarism_btn.click(
|
288 |
+
# fn=plagiarism_check,
|
289 |
+
fn=html_highlight,
|
290 |
inputs=[
|
291 |
plag_option,
|
292 |
input_text,
|
|
|
329 |
date_to = ""
|
330 |
|
331 |
|
332 |
+
if __name__ == "__main__":
|
333 |
+
demo.launch(
|
334 |
+
share=True, server_name="0.0.0.0", auth=("polygraf-admin", "test@aisd")
|
335 |
+
)
|
plagiarism.py
CHANGED
@@ -20,6 +20,7 @@ model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
|
|
20 |
# input: two vectors
|
21 |
# output: float between 0 and 1.
|
22 |
|
|
|
23 |
def get_cosine(vec1, vec2):
|
24 |
intersection = set(vec1.keys()) & set(vec2.keys())
|
25 |
|
@@ -75,9 +76,9 @@ def sentence_similarity(text1, text2):
|
|
75 |
def google_search(
|
76 |
plag_option,
|
77 |
sentences,
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
sorted_date,
|
82 |
domains_to_skip,
|
83 |
api_key,
|
@@ -112,19 +113,19 @@ def google_search(
|
|
112 |
|
113 |
# update cosine similarity between snippet and given text
|
114 |
url = link["link"]
|
115 |
-
if url not in
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
if plag_option == "Standard":
|
120 |
-
|
121 |
sentence, snippet
|
122 |
)
|
123 |
else:
|
124 |
-
|
125 |
sentence, snippet
|
126 |
)
|
127 |
-
return
|
128 |
|
129 |
|
130 |
def split_sentence_blocks(text):
|
@@ -191,7 +192,6 @@ async def parallel_scrap(urls):
|
|
191 |
return results
|
192 |
|
193 |
|
194 |
-
|
195 |
def matching_score(sentence_content_tuple):
|
196 |
sentence, content = sentence_content_tuple
|
197 |
if sentence in content:
|
@@ -204,11 +204,65 @@ def matching_score(sentence_content_tuple):
|
|
204 |
matched = [x for x in ngrams if " ".join(x) in content]
|
205 |
return len(matched) / len(ngrams)
|
206 |
|
|
|
207 |
def process_with_multiprocessing(input_data):
|
208 |
-
with Pool(processes=
|
209 |
scores = pool.map(matching_score, input_data)
|
210 |
return scores
|
211 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
212 |
def plagiarism_check(
|
213 |
plag_option,
|
214 |
input,
|
@@ -227,41 +281,44 @@ def plagiarism_check(
|
|
227 |
api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
|
228 |
cse_id = "851813e81162b4ed4"
|
229 |
|
|
|
|
|
|
|
|
|
230 |
sentences = split_sentence_blocks(input)
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
date_from = build_date(year_from, month_from, day_from)
|
235 |
date_to = build_date(year_to, month_to, day_to)
|
236 |
sort_date = f"date:r:{date_from}:{date_to}"
|
237 |
# get list of URLS to check
|
238 |
-
|
239 |
plag_option,
|
240 |
sentences,
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
sort_date,
|
245 |
domains_to_skip,
|
246 |
api_key,
|
247 |
cse_id,
|
248 |
)
|
249 |
-
|
250 |
# Scrape URLs in list
|
251 |
formatted_tokens = []
|
252 |
-
soups = asyncio.run(parallel_scrap(
|
253 |
-
|
254 |
# # Populate matching scores for scraped pages
|
255 |
# for i, soup in enumerate(soups):
|
256 |
# print(f"Analyzing {i+1} of {len(soups)} soups........................")
|
257 |
# if soup:
|
258 |
# page_content = soup.text
|
259 |
-
|
260 |
# for j, sent in enumerate(sentences):
|
261 |
# args_list = (sent, page_content)
|
262 |
# score = matching_score(args_list)
|
263 |
# # score = cos_sim_torch(embed_text(sent), source_embeddings[i])
|
264 |
-
#
|
265 |
|
266 |
input_data = []
|
267 |
for i, soup in enumerate(soups):
|
@@ -269,69 +326,89 @@ def plagiarism_check(
|
|
269 |
page_content = soup.text
|
270 |
for j, sent in enumerate(sentences):
|
271 |
input_data.append((sent, page_content))
|
272 |
-
|
273 |
scores = process_with_multiprocessing(input_data)
|
274 |
-
|
|
|
|
|
275 |
for i, soup in enumerate(soups):
|
276 |
if soup:
|
277 |
for j, _ in enumerate(sentences):
|
278 |
-
|
279 |
-
k += 1
|
280 |
-
|
|
|
|
|
281 |
sentenceToMaxURL = [-1] * len(sentences)
|
282 |
-
|
283 |
for j in range(len(sentences)):
|
284 |
if j > 0:
|
285 |
-
maxScore =
|
286 |
sentenceToMaxURL[j] = sentenceToMaxURL[j - 1]
|
287 |
else:
|
288 |
maxScore = -1
|
289 |
|
290 |
-
for i in range(len(
|
291 |
margin = (
|
292 |
-
0.
|
293 |
if (j > 0 and sentenceToMaxURL[j] == sentenceToMaxURL[j - 1])
|
294 |
else 0
|
295 |
)
|
296 |
-
if
|
297 |
-
maxScore =
|
298 |
sentenceToMaxURL[j] = i
|
|
|
|
|
|
|
299 |
|
300 |
index = np.unique(sentenceToMaxURL)
|
301 |
|
302 |
-
|
303 |
for url in index:
|
304 |
s = [
|
305 |
-
|
306 |
for sen in range(len(sentences))
|
307 |
if sentenceToMaxURL[sen] == url
|
308 |
]
|
309 |
-
|
310 |
|
311 |
-
index_descending = sorted(
|
312 |
|
313 |
urlMap = {}
|
314 |
for count, i in enumerate(index_descending):
|
315 |
urlMap[i] = count + 1
|
316 |
-
|
317 |
for i, sent in enumerate(sentences):
|
318 |
-
|
319 |
-
|
320 |
-
|
321 |
-
|
322 |
-
formatted_tokens.append(("\n", None))
|
323 |
-
formatted_tokens.append(("\n", None))
|
324 |
-
formatted_tokens.append(("\n", None))
|
325 |
-
|
326 |
-
for ind in index_descending:
|
327 |
-
formatted_tokens.append(
|
328 |
-
(
|
329 |
-
urlList[ind]
|
330 |
-
+ " --- Matching Score: "
|
331 |
-
+ f"{str(round(urlScore[ind] * 100, 2))}%",
|
332 |
-
"[" + str(urlMap[ind]) + "]",
|
333 |
)
|
|
|
|
|
|
|
|
|
|
|
334 |
)
|
335 |
-
formatted_tokens.append(("\n", None))
|
336 |
|
337 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
# input: two vectors
|
21 |
# output: float between 0 and 1.
|
22 |
|
23 |
+
|
24 |
def get_cosine(vec1, vec2):
|
25 |
intersection = set(vec1.keys()) & set(vec2.keys())
|
26 |
|
|
|
76 |
def google_search(
|
77 |
plag_option,
|
78 |
sentences,
|
79 |
+
url_count,
|
80 |
+
score_array,
|
81 |
+
url_list,
|
82 |
sorted_date,
|
83 |
domains_to_skip,
|
84 |
api_key,
|
|
|
113 |
|
114 |
# update cosine similarity between snippet and given text
|
115 |
url = link["link"]
|
116 |
+
if url not in url_list:
|
117 |
+
url_list.append(url)
|
118 |
+
score_array.append([0] * len(sentences))
|
119 |
+
url_count[url] = url_count[url] + 1 if url in url_count else 1
|
120 |
if plag_option == "Standard":
|
121 |
+
score_array[url_list.index(url)][i] = cosineSim(
|
122 |
sentence, snippet
|
123 |
)
|
124 |
else:
|
125 |
+
score_array[url_list.index(url)][i] = sentence_similarity(
|
126 |
sentence, snippet
|
127 |
)
|
128 |
+
return url_count, score_array
|
129 |
|
130 |
|
131 |
def split_sentence_blocks(text):
|
|
|
192 |
return results
|
193 |
|
194 |
|
|
|
195 |
def matching_score(sentence_content_tuple):
|
196 |
sentence, content = sentence_content_tuple
|
197 |
if sentence in content:
|
|
|
204 |
matched = [x for x in ngrams if " ".join(x) in content]
|
205 |
return len(matched) / len(ngrams)
|
206 |
|
207 |
+
|
208 |
def process_with_multiprocessing(input_data):
    """Apply ``matching_score`` to every item of ``input_data`` in a worker pool.

    ``input_data`` is an iterable of arguments for ``matching_score`` (the
    caller builds ``(sentence, page_content)`` tuples). The scores are
    returned as a list in the same order as the input.
    """
    # The pool is created with a single worker (processes=1).
    worker_pool = Pool(processes=1)
    with worker_pool:
        return worker_pool.map(matching_score, input_data)
|
212 |
+
|
213 |
+
|
214 |
+
def print2d(array):
    """Debug helper: print each row of ``array`` to stdout, one per line."""
    for current_row in iter(array):
        print(current_row)
|
217 |
+
|
218 |
+
|
219 |
+
def html_highlight(
    plag_option,
    input,
    year_from,
    month_from,
    day_from,
    year_to,
    month_to,
    day_to,
    domains_to_skip,
):
    """Run the plagiarism check and render its result as an HTML fragment.

    Every argument is forwarded unchanged to ``plagiarism_check``, which
    returns ``(sentence_scores, url_scores)``:

    * each ``sentence_scores`` entry is ``[sentence, score, url, idx]``;
      only ``sentence`` and ``idx`` are rendered here;
    * each ``url_scores`` entry is ``[url, score, idx]`` where ``score`` is
      already scaled to a percentage.

    Returns:
        One ``<div>`` string containing a highlighted ``<p>`` per sentence
        (tagged with its source index), an ``<hr>``, and then a ``<p>`` per
        source URL with its matching score.
    """
    # Local import so the block does not depend on a new module-level import.
    from html import escape

    sentence_scores, url_scores = plagiarism_check(
        plag_option,
        input,
        year_from,
        month_from,
        day_from,
        year_to,
        month_to,
        day_to,
        domains_to_skip,
    )
    color_map = [
        "#e06b63",
        "#eb9d59",
        "#c2ad36",
        "#e1ed72",
        "#c2db76",
        "#a2db76",
    ]

    def _color_for(idx):
        # Wrap around the palette so more than len(color_map) sources do not
        # raise IndexError; indices 1..6 map exactly as before.
        return color_map[(idx - 1) % len(color_map)]

    parts = [
        "<div style='font-family: Roboto; border: 2px solid black; background-color: #333333; padding: 10px; color: #FFFFFF;'>"
    ]

    for sentence, _, _, idx in sentence_scores:
        # The sentence is user-supplied text: escape it so it is displayed
        # literally instead of being interpreted as markup.
        parts.append(
            f'<p style="background-color: {_color_for(idx)}; padding: 5px;">'
            f"{escape(str(sentence))} [{idx}]</p>"
        )

    parts.append("<hr>")
    for url, score, idx in url_scores:
        # URLs come from external search results, so escape them as well.
        # The score is a percentage, hence the restored "%" suffix.
        parts.append(
            f'<p style="background-color: {_color_for(idx)}; padding: 5px;">'
            f"({idx}) {escape(str(url))} --- Matching Score: {score}%</p>"
        )

    parts.append("</div>")

    return "".join(parts)
|
264 |
+
|
265 |
+
|
266 |
def plagiarism_check(
|
267 |
plag_option,
|
268 |
input,
|
|
|
281 |
api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
|
282 |
cse_id = "851813e81162b4ed4"
|
283 |
|
284 |
+
url_scores = []
|
285 |
+
sentence_scores = []
|
286 |
+
# for input in input.split("\n\n"):
|
287 |
+
print(input)
|
288 |
sentences = split_sentence_blocks(input)
|
289 |
+
url_count = {}
|
290 |
+
score_array = []
|
291 |
+
url_list = []
|
292 |
date_from = build_date(year_from, month_from, day_from)
|
293 |
date_to = build_date(year_to, month_to, day_to)
|
294 |
sort_date = f"date:r:{date_from}:{date_to}"
|
295 |
# get list of URLS to check
|
296 |
+
url_count, score_array = google_search(
|
297 |
plag_option,
|
298 |
sentences,
|
299 |
+
url_count,
|
300 |
+
score_array,
|
301 |
+
url_list,
|
302 |
sort_date,
|
303 |
domains_to_skip,
|
304 |
api_key,
|
305 |
cse_id,
|
306 |
)
|
|
|
307 |
# Scrape URLs in list
|
308 |
formatted_tokens = []
|
309 |
+
soups = asyncio.run(parallel_scrap(url_list))
|
310 |
+
|
311 |
# # Populate matching scores for scraped pages
|
312 |
# for i, soup in enumerate(soups):
|
313 |
# print(f"Analyzing {i+1} of {len(soups)} soups........................")
|
314 |
# if soup:
|
315 |
# page_content = soup.text
|
316 |
+
|
317 |
# for j, sent in enumerate(sentences):
|
318 |
# args_list = (sent, page_content)
|
319 |
# score = matching_score(args_list)
|
320 |
# # score = cos_sim_torch(embed_text(sent), source_embeddings[i])
|
321 |
+
# score_array[i][j] = score
|
322 |
|
323 |
input_data = []
|
324 |
for i, soup in enumerate(soups):
|
|
|
326 |
page_content = soup.text
|
327 |
for j, sent in enumerate(sentences):
|
328 |
input_data.append((sent, page_content))
|
|
|
329 |
scores = process_with_multiprocessing(input_data)
|
330 |
+
|
331 |
+
k = 0
|
332 |
+
# Update score array for each (soup, sentence)
|
333 |
for i, soup in enumerate(soups):
|
334 |
if soup:
|
335 |
for j, _ in enumerate(sentences):
|
336 |
+
score_array[i][j] = scores[k]
|
337 |
+
k += 1
|
338 |
+
|
339 |
+
# Map each sentence to its best-scoring URL, with a small margin that favors keeping the same URL
|
340 |
+
# for consecutive sentences
|
341 |
sentenceToMaxURL = [-1] * len(sentences)
|
|
|
342 |
for j in range(len(sentences)):
|
343 |
if j > 0:
|
344 |
+
maxScore = score_array[sentenceToMaxURL[j - 1]][j]
|
345 |
sentenceToMaxURL[j] = sentenceToMaxURL[j - 1]
|
346 |
else:
|
347 |
maxScore = -1
|
348 |
|
349 |
+
for i in range(len(score_array)):
|
350 |
margin = (
|
351 |
+
0.05
|
352 |
if (j > 0 and sentenceToMaxURL[j] == sentenceToMaxURL[j - 1])
|
353 |
else 0
|
354 |
)
|
355 |
+
if score_array[i][j] - maxScore > margin:
|
356 |
+
maxScore = score_array[i][j]
|
357 |
sentenceToMaxURL[j] = i
|
358 |
+
# if score_array[i][j] > maxScore:
|
359 |
+
# maxScore = score_array[i][j]
|
360 |
+
# sentenceToMaxURL[j] = i
|
361 |
|
362 |
index = np.unique(sentenceToMaxURL)
|
363 |
|
364 |
+
url_source = {}
|
365 |
for url in index:
|
366 |
s = [
|
367 |
+
score_array[url][sen]
|
368 |
for sen in range(len(sentences))
|
369 |
if sentenceToMaxURL[sen] == url
|
370 |
]
|
371 |
+
url_source[url] = sum(s) / len(s)
|
372 |
|
373 |
+
index_descending = sorted(url_source, key=url_source.get, reverse=True)
|
374 |
|
375 |
urlMap = {}
|
376 |
for count, i in enumerate(index_descending):
|
377 |
urlMap[i] = count + 1
|
378 |
+
|
379 |
for i, sent in enumerate(sentences):
|
380 |
+
ind = sentenceToMaxURL[i]
|
381 |
+
if url_source[ind] > 0.1:
|
382 |
+
sentence_scores.append(
|
383 |
+
[sent, url_source[ind], url_list[ind], urlMap[ind]]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
384 |
)
|
385 |
+
else:
|
386 |
+
sentence_scores.append([sent, None, url_list[ind], urlMap[ind]])
|
387 |
+
for ind in index_descending:
|
388 |
+
url_scores.append(
|
389 |
+
[url_list[ind], round(url_source[ind] * 100, 2), urlMap[ind]]
|
390 |
)
|
|
|
391 |
|
392 |
+
return sentence_scores, url_scores
|
393 |
+
|
394 |
+
# for i, sent in enumerate(sentences):
|
395 |
+
# formatted_tokens.append(
|
396 |
+
# (sent, "[" + str(urlMap[sentenceToMaxURL[i]]) + "]")
|
397 |
+
# )
|
398 |
+
|
399 |
+
# formatted_tokens.append(("\n", None))
|
400 |
+
# formatted_tokens.append(("\n", None))
|
401 |
+
# formatted_tokens.append(("\n", None))
|
402 |
+
|
403 |
+
# for ind in index_descending:
|
404 |
+
# formatted_tokens.append(
|
405 |
+
# (
|
406 |
+
# url_list[ind]
|
407 |
+
# + " --- Matching Score: "
|
408 |
+
# + f"{str(round(url_source[ind] * 100, 2))}%",
|
409 |
+
# "[" + str(urlMap[ind]) + "]",
|
410 |
+
# )
|
411 |
+
# )
|
412 |
+
# formatted_tokens.append(("\n", None))
|
413 |
+
|
414 |
+
# return formatted_tokens
|