minko186 committed
Commit d03ef17 · 1 Parent(s): 62f91f8

update speed plagiarism

Files changed (1)
  1. plagiarism.py +19 -4
plagiarism.py CHANGED
@@ -111,17 +111,29 @@ async def parallel_scrap(urls):
 
 
 def matching_score(sentence_content_tuple):
-    sentence, content = sentence_content_tuple
+    sentence, content, score = sentence_content_tuple
     if sentence in content:
         return 1
+    # if score > 0.9:
+    #     return score
     else:
         n = 5
+
         ngrams = split_ngrams(sentence, n)
         if len(ngrams) == 0:
             return 0
         matched = [x for x in ngrams if " ".join(x) in content]
         return len(matched) / len(ngrams)
 
+    # ngrams_sentence = split_ngrams(sentence, n)
+    # if len(ngrams_sentence) == 0:
+    #     return 0
+    # ngrams_content = set(tuple(ngram) for ngram in split_ngrams(content, n))
+    # matched_count = sum(
+    #     1 for ngram in ngrams_sentence if tuple(ngram) in ngrams_content
+    # )
+    # return matched_count / len(ngrams_sentence)
+
 
 def process_with_multiprocessing(input_data):
     with Pool(processes=4) as pool:
@@ -216,12 +228,12 @@ def plagiarism_check(
     domains_to_skip,
     source_block_size,
 ):
-    # api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
+    api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
     # api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
     # api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
     # api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
     # api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
-    api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
+    # api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
     # api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
     cse_id = "851813e81162b4ed4"
 
@@ -254,7 +266,7 @@ def plagiarism_check(
         if soup:
             page_content = soup.text
             for j, sent in enumerate(sentences):
-                input_data.append((sent, page_content))
+                input_data.append((sent, page_content, score_array[i][j]))
     scores = process_with_multiprocessing(input_data)
 
     k = 0
@@ -311,6 +323,7 @@ def html_highlight(
     domains_to_skip,
     source_block_size,
 ):
+    start_time = time.perf_counter()
     sentence_scores, url_scores = plagiarism_check(
         plag_option,
         input,
@@ -351,4 +364,6 @@ def html_highlight(
 
     html_content += "</div>"
 
+    print("PLAGIARISM PROCESSING TIME: ", time.perf_counter() - start_time)
+
     return html_content
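The block left commented out at the bottom of matching_score is the speed idea this commit is staging: hash the content's n-grams once, then test each sentence n-gram with a set lookup instead of scanning the whole page text. Below is a minimal, self-contained sketch of that variant. The split_ngrams here is an assumed stand-in (a sliding window over whitespace tokens) and may differ from the real helper in plagiarism.py; note too that the set lookup is not exactly equivalent to the existing " ".join(x) in content substring test, which requires the n-gram to appear single-space-joined in the raw page text.

from typing import List, Tuple


def split_ngrams(text: str, n: int) -> List[Tuple[str, ...]]:
    # Assumed behavior: sliding window of n whitespace-separated tokens.
    words = text.split()
    return [tuple(words[i : i + n]) for i in range(len(words) - n + 1)]


def matching_score_set(sentence: str, content: str, n: int = 5) -> float:
    # Exact containment short-circuits to a perfect score, as in the diff.
    if sentence in content:
        return 1.0
    ngrams_sentence = split_ngrams(sentence, n)
    if len(ngrams_sentence) == 0:
        return 0.0
    # Hash the content n-grams once; each membership test is then an O(1)
    # average-case set lookup instead of an O(len(content)) substring scan.
    ngrams_content = set(split_ngrams(content, n))
    matched_count = sum(1 for ngram in ngrams_sentence if ngram in ngrams_content)
    return matched_count / len(ngrams_sentence)

For a sentence with k n-grams and a page of length m, the substring version performs k scans of the page (roughly O(k * m)), while the set version builds the lookup once and answers each query in O(1) on average.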
 
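Read together, the three call-site changes are one pipeline: plagiarism_check queues (sentence, page_content, score) 3-tuples, process_with_multiprocessing fans them out over a four-worker pool, and html_highlight now brackets the whole check with time.perf_counter(). A hedged end-to-end sketch of that flow follows; score_array, the pool.map body, and the toy data are assumptions standing in for code outside this diff.

import time
from multiprocessing import Pool


def matching_score(sentence_content_tuple):
    # New 3-tuple signature from the diff; `score` is unpacked but not yet
    # used (the early exit on score > 0.9 is still commented out).
    sentence, content, score = sentence_content_tuple
    return 1 if sentence in content else 0  # stand-in for the n-gram scoring


def process_with_multiprocessing(input_data):
    # The diff shows only the Pool(processes=4) context manager;
    # pool.map is an assumed but conventional body for it.
    with Pool(processes=4) as pool:
        return pool.map(matching_score, input_data)


if __name__ == "__main__":
    start_time = time.perf_counter()
    sentences = ["the quick brown fox", "lorem ipsum dolor"]
    pages = ["a page mentioning the quick brown fox", "an unrelated page"]
    # score_array stands in for the per-(page, sentence) pre-scores that
    # plagiarism_check now threads through to the workers.
    score_array = [[0.0] * len(sentences) for _ in pages]
    input_data = []
    for i, page_content in enumerate(pages):
        for j, sent in enumerate(sentences):
            input_data.append((sent, page_content, score_array[i][j]))
    scores = process_with_multiprocessing(input_data)
    print("PLAGIARISM PROCESSING TIME: ", time.perf_counter() - start_time)

The score element is plumbing for the commented-out early exit in matching_score; until that branch is enabled, the timing print is the only externally visible change.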