minko186 committed
Commit d03ef17 · 1 Parent(s): 62f91f8

update speed plagiarism

Files changed (1)
  1. plagiarism.py +19 -4
plagiarism.py CHANGED
@@ -111,17 +111,29 @@ async def parallel_scrap(urls):
 
 
 def matching_score(sentence_content_tuple):
-    sentence, content = sentence_content_tuple
+    sentence, content, score = sentence_content_tuple
     if sentence in content:
         return 1
+    # if score > 0.9:
+    #     return score
     else:
         n = 5
+
         ngrams = split_ngrams(sentence, n)
         if len(ngrams) == 0:
             return 0
         matched = [x for x in ngrams if " ".join(x) in content]
         return len(matched) / len(ngrams)
 
+    # ngrams_sentence = split_ngrams(sentence, n)
+    # if len(ngrams_sentence) == 0:
+    #     return 0
+    # ngrams_content = set(tuple(ngram) for ngram in split_ngrams(content, n))
+    # matched_count = sum(
+    #     1 for ngram in ngrams_sentence if tuple(ngram) in ngrams_content
+    # )
+    # return matched_count / len(ngrams_sentence)
+
 
 def process_with_multiprocessing(input_data):
     with Pool(processes=4) as pool:
@@ -216,12 +228,12 @@ def plagiarism_check(
     domains_to_skip,
     source_block_size,
 ):
-    # api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
+    api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
     # api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
     # api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
     # api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
     # api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
-    api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
+    # api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
     # api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
     cse_id = "851813e81162b4ed4"
 
@@ -254,7 +266,7 @@ def plagiarism_check(
         if soup:
             page_content = soup.text
             for j, sent in enumerate(sentences):
-                input_data.append((sent, page_content))
+                input_data.append((sent, page_content, score_array[i][j]))
     scores = process_with_multiprocessing(input_data)
 
     k = 0
@@ -311,6 +323,7 @@ def html_highlight(
     domains_to_skip,
     source_block_size,
 ):
+    start_time = time.perf_counter()
     sentence_scores, url_scores = plagiarism_check(
         plag_option,
         input,
@@ -351,4 +364,6 @@ def html_highlight(
 
     html_content += "</div>"
 
+    print("PLAGIARISM PROCESSING TIME: ", time.perf_counter() - start_time)
+
     return html_content
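The block left commented out at the bottom of matching_score is the speed idea this commit is staging: hash the content's n-grams once, then test each sentence n-gram with a set lookup instead of scanning the whole page text. Below is a minimal, self-contained sketch of that variant. The split_ngrams here is an assumed stand-in (a sliding window over whitespace tokens) and may differ from the real helper in plagiarism.py; note too that the set lookup is not exactly equivalent to the existing " ".join(x) in content substring test, which requires the n-gram to appear single-space-joined in the raw page text.

from typing import List, Tuple


def split_ngrams(text: str, n: int) -> List[Tuple[str, ...]]:
    # Assumed behavior: sliding window of n whitespace-separated tokens.
    words = text.split()
    return [tuple(words[i : i + n]) for i in range(len(words) - n + 1)]


def matching_score_set(sentence: str, content: str, n: int = 5) -> float:
    # Exact containment short-circuits to a perfect score, as in the diff.
    if sentence in content:
        return 1.0
    ngrams_sentence = split_ngrams(sentence, n)
    if len(ngrams_sentence) == 0:
        return 0.0
    # Hash the content n-grams once; each membership test is then an O(1)
    # average-case set lookup instead of an O(len(content)) substring scan.
    ngrams_content = set(split_ngrams(content, n))
    matched_count = sum(1 for ngram in ngrams_sentence if ngram in ngrams_content)
    return matched_count / len(ngrams_sentence)

For a sentence with k n-grams and a page of length m, the substring version performs k scans of the page (roughly O(k * m)), while the set version builds the lookup once and answers each query in O(1) on average.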
 
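Read together, the three call-site changes are one pipeline: plagiarism_check queues (sentence, page_content, score) 3-tuples, process_with_multiprocessing fans them out over a four-worker pool, and html_highlight now brackets the whole check with time.perf_counter(). A hedged end-to-end sketch of that flow follows; score_array, the pool.map body, and the toy data are assumptions standing in for code outside this diff.

import time
from multiprocessing import Pool


def matching_score(sentence_content_tuple):
    # New 3-tuple signature from the diff; `score` is unpacked but not yet
    # used (the early exit on score > 0.9 is still commented out).
    sentence, content, score = sentence_content_tuple
    return 1 if sentence in content else 0  # stand-in for the n-gram scoring


def process_with_multiprocessing(input_data):
    # The diff shows only the Pool(processes=4) context manager;
    # pool.map is an assumed but conventional body for it.
    with Pool(processes=4) as pool:
        return pool.map(matching_score, input_data)


if __name__ == "__main__":
    start_time = time.perf_counter()
    sentences = ["the quick brown fox", "lorem ipsum dolor"]
    pages = ["a page mentioning the quick brown fox", "an unrelated page"]
    # score_array stands in for the per-(page, sentence) pre-scores that
    # plagiarism_check now threads through to the workers.
    score_array = [[0.0] * len(sentences) for _ in pages]
    input_data = []
    for i, page_content in enumerate(pages):
        for j, sent in enumerate(sentences):
            input_data.append((sent, page_content, score_array[i][j]))
    scores = process_with_multiprocessing(input_data)
    print("PLAGIARISM PROCESSING TIME: ", time.perf_counter() - start_time)

The score element is plumbing for the commented-out early exit in matching_score; until that branch is enabled, the timing print is the only externally visible change.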