Spaces:
Sleeping
Sleeping
refactored plagiarism checker
Browse files- app.py +1 -187
- plagiarism.py +340 -0
app.py
CHANGED
@@ -32,6 +32,7 @@ from utils import cos_sim_torch, embed_text
|
|
32 |
import multiprocessing
|
33 |
from functools import partial
|
34 |
import concurrent.futures
|
|
|
35 |
|
36 |
nltk.download("punkt")
|
37 |
|
@@ -50,193 +51,6 @@ from writing_analysis import (
|
|
50 |
np.set_printoptions(suppress=True)
|
51 |
|
52 |
|
53 |
-
def plagiarism_check(
|
54 |
-
plag_option,
|
55 |
-
input,
|
56 |
-
year_from,
|
57 |
-
month_from,
|
58 |
-
day_from,
|
59 |
-
year_to,
|
60 |
-
month_to,
|
61 |
-
day_to,
|
62 |
-
domains_to_skip,
|
63 |
-
):
|
64 |
-
api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
|
65 |
-
api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
|
66 |
-
api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
|
67 |
-
# api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
|
68 |
-
api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
|
69 |
-
|
70 |
-
cse_id = "851813e81162b4ed4"
|
71 |
-
|
72 |
-
time1 = time.perf_counter()
|
73 |
-
start = time.perf_counter()
|
74 |
-
sentences = getSentences(input)
|
75 |
-
urlCount = {}
|
76 |
-
ScoreArray = []
|
77 |
-
urlList = []
|
78 |
-
|
79 |
-
date_from = build_date(year_from, month_from, day_from)
|
80 |
-
date_to = build_date(year_to, month_to, day_to)
|
81 |
-
sort_date = f"date:r:{date_from}:{date_to}"
|
82 |
-
|
83 |
-
# get list of URLS to check
|
84 |
-
urlCount, ScoreArray = googleSearch(
|
85 |
-
plag_option,
|
86 |
-
sentences,
|
87 |
-
urlCount,
|
88 |
-
ScoreArray,
|
89 |
-
urlList,
|
90 |
-
sort_date,
|
91 |
-
domains_to_skip,
|
92 |
-
api_key,
|
93 |
-
cse_id,
|
94 |
-
)
|
95 |
-
print(f"Time for google search: {time.perf_counter()-time1}")
|
96 |
-
time1 = time.perf_counter()
|
97 |
-
|
98 |
-
print("Number of URLs: ", len(urlCount))
|
99 |
-
print(urlList)
|
100 |
-
|
101 |
-
# Scrape URLs in list
|
102 |
-
formatted_tokens = []
|
103 |
-
soups = asyncio.run(parallel_scrap(urlList))
|
104 |
-
|
105 |
-
print(f"Time for scraping: {time.perf_counter()-time1}")
|
106 |
-
time1 = time.perf_counter()
|
107 |
-
print(len(soups))
|
108 |
-
print(
|
109 |
-
"Successful scraping: "
|
110 |
-
+ str(len([x for x in soups if x is not None]))
|
111 |
-
+ "out of "
|
112 |
-
+ str(len(urlList))
|
113 |
-
)
|
114 |
-
|
115 |
-
source_embeddings = []
|
116 |
-
for i, soup in enumerate(soups):
|
117 |
-
if soup:
|
118 |
-
page_content = soup.text
|
119 |
-
source_embeddings.append(embed_text(page_content))
|
120 |
-
else:
|
121 |
-
source_embeddings.append(None)
|
122 |
-
|
123 |
-
# Populate matching scores for scrapped pages
|
124 |
-
# for i, soup in enumerate(soups):
|
125 |
-
# print(f"Analyzing {i+1} of {len(soups)} soups........................")
|
126 |
-
# if soup:
|
127 |
-
# page_content = soup.text
|
128 |
-
# for j, sent in enumerate(sentences):
|
129 |
-
# # score = matchingScore(sent, page_content)
|
130 |
-
# # score = matchingScoreWithTimeout(sent, page_content)
|
131 |
-
# score = cos_sim_torch(embed_text(sent), source_embeddings[i])
|
132 |
-
# ScoreArray[i][j] = score
|
133 |
-
|
134 |
-
def compute_cosine_similarity(args):
|
135 |
-
sent, source_embedding, i, j = args
|
136 |
-
score = cos_sim_torch(embed_text(sent), source_embedding)
|
137 |
-
return i, j, score
|
138 |
-
|
139 |
-
def main(soups, sentences):
|
140 |
-
source_embeddings = [preprocess(soup) for soup in soups]
|
141 |
-
ScoreArray = [[0 for _ in sentences] for _ in soups]
|
142 |
-
args_list = []
|
143 |
-
for i, soup in enumerate(soups):
|
144 |
-
if soup:
|
145 |
-
for j, sent in enumerate(sentences):
|
146 |
-
args_list.append((sent, source_embeddings[i], i, j))
|
147 |
-
with concurrent.futures.ProcessPoolExecutor() as executor:
|
148 |
-
results = executor.map(compute_cosine_similarity, args_list)
|
149 |
-
for i, j, score in results:
|
150 |
-
ScoreArray[i][j] = score
|
151 |
-
return ScoreArray
|
152 |
-
|
153 |
-
ScoreArray = main(soups, sentences)
|
154 |
-
|
155 |
-
print(f"Time for matching score: {time.perf_counter()-time1}")
|
156 |
-
time1 = time.perf_counter()
|
157 |
-
|
158 |
-
# ScoreArray = asyncio.run(parallel_analyze_2(soups, sentences, ScoreArray))
|
159 |
-
# print("New Score Array:\n")
|
160 |
-
# print2D(ScoreArray)
|
161 |
-
|
162 |
-
# Gradio formatting section
|
163 |
-
sentencePlag = [False] * len(sentences)
|
164 |
-
sentenceToMaxURL = [-1] * len(sentences)
|
165 |
-
for j in range(len(sentences)):
|
166 |
-
if j > 0:
|
167 |
-
maxScore = ScoreArray[sentenceToMaxURL[j - 1]][j]
|
168 |
-
sentenceToMaxURL[j] = sentenceToMaxURL[j - 1]
|
169 |
-
else:
|
170 |
-
maxScore = -1
|
171 |
-
for i in range(len(ScoreArray)):
|
172 |
-
margin = (
|
173 |
-
0.1
|
174 |
-
if (j > 0 and sentenceToMaxURL[j] == sentenceToMaxURL[j - 1])
|
175 |
-
else 0
|
176 |
-
)
|
177 |
-
if ScoreArray[i][j] - maxScore > margin:
|
178 |
-
maxScore = ScoreArray[i][j]
|
179 |
-
sentenceToMaxURL[j] = i
|
180 |
-
if maxScore > 0.5:
|
181 |
-
sentencePlag[j] = True
|
182 |
-
|
183 |
-
if (
|
184 |
-
(len(sentences) > 1)
|
185 |
-
and (sentenceToMaxURL[1] != sentenceToMaxURL[0])
|
186 |
-
and (
|
187 |
-
ScoreArray[sentenceToMaxURL[0]][0]
|
188 |
-
- ScoreArray[sentenceToMaxURL[1]][0]
|
189 |
-
< 0.1
|
190 |
-
)
|
191 |
-
):
|
192 |
-
sentenceToMaxURL[0] = sentenceToMaxURL[1]
|
193 |
-
|
194 |
-
index = np.unique(sentenceToMaxURL)
|
195 |
-
|
196 |
-
urlScore = {}
|
197 |
-
for url in index:
|
198 |
-
s = [
|
199 |
-
ScoreArray[url][sen]
|
200 |
-
for sen in range(len(sentences))
|
201 |
-
if sentenceToMaxURL[sen] == url
|
202 |
-
]
|
203 |
-
urlScore[url] = sum(s) / len(s)
|
204 |
-
|
205 |
-
index_descending = sorted(urlScore, key=urlScore.get, reverse=True)
|
206 |
-
|
207 |
-
urlMap = {}
|
208 |
-
for count, i in enumerate(index_descending):
|
209 |
-
urlMap[i] = count + 1
|
210 |
-
for i, sent in enumerate(sentences):
|
211 |
-
formatted_tokens.append(
|
212 |
-
(sent, "[" + str(urlMap[sentenceToMaxURL[i]]) + "]")
|
213 |
-
)
|
214 |
-
|
215 |
-
formatted_tokens.append(("\n", None))
|
216 |
-
formatted_tokens.append(("\n", None))
|
217 |
-
formatted_tokens.append(("\n", None))
|
218 |
-
|
219 |
-
print(formatted_tokens)
|
220 |
-
print(index_descending)
|
221 |
-
|
222 |
-
for ind in index_descending:
|
223 |
-
formatted_tokens.append(
|
224 |
-
(
|
225 |
-
urlList[ind]
|
226 |
-
+ " --- Matching Score: "
|
227 |
-
+ f"{str(round(urlScore[ind] * 100, 2))}%",
|
228 |
-
"[" + str(urlMap[ind]) + "]",
|
229 |
-
)
|
230 |
-
)
|
231 |
-
formatted_tokens.append(("\n", None))
|
232 |
-
|
233 |
-
print(f"Formatted Tokens: {formatted_tokens}")
|
234 |
-
|
235 |
-
print(f"Time for plagiarism check: {time.perf_counter()-start}")
|
236 |
-
|
237 |
-
return formatted_tokens
|
238 |
-
|
239 |
-
|
240 |
"""
|
241 |
AI DETECTION SECTION
|
242 |
"""
|
|
|
32 |
import multiprocessing
|
33 |
from functools import partial
|
34 |
import concurrent.futures
|
35 |
+
from plagiarism import plagiarism_check
|
36 |
|
37 |
nltk.download("punkt")
|
38 |
|
|
|
51 |
np.set_printoptions(suppress=True)
|
52 |
|
53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
"""
|
55 |
AI DETECTION SECTION
|
56 |
"""
|
plagiarism.py
CHANGED
@@ -0,0 +1,340 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import time
|
2 |
+
from nltk.tokenize import sent_tokenize
|
3 |
+
from googleapiclient.discovery import build
|
4 |
+
from collections import Counter
|
5 |
+
import re, math
|
6 |
+
from sentence_transformers import SentenceTransformer, util
|
7 |
+
import asyncio
|
8 |
+
import httpx
|
9 |
+
from bs4 import BeautifulSoup
|
10 |
+
import numpy as np
|
11 |
+
|
12 |
+
|
13 |
+
# Pattern matching runs of word characters; text_to_vector uses it to tokenize text.
WORD = re.compile(r"\w+")
|
14 |
+
# Sentence-embedding model, loaded once at import time and used by
# embed_text and sentence_similarity.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
|
15 |
+
|
16 |
+
|
17 |
+
def get_cosine(vec1, vec2):
    """Return the cosine similarity of two sparse term-weight vectors.

    Args:
        vec1: Dict-like mapping of term -> numeric weight (e.g. a ``Counter``).
        vec2: Dict-like mapping of term -> numeric weight.

    Returns:
        A float: the dot product over the terms the two vectors share,
        divided by the product of their Euclidean norms. For non-negative
        weights such as word counts the value lies in [0, 1]. ``0.0`` is
        returned when either vector has zero norm (for example, is empty).
    """
    # Only terms present in both vectors contribute to the dot product.
    shared_terms = vec1.keys() & vec2.keys()
    numerator = sum(vec1[term] * vec2[term] for term in shared_terms)

    norm1 = math.sqrt(sum(weight ** 2 for weight in vec1.values()))
    norm2 = math.sqrt(sum(weight ** 2 for weight in vec2.values()))
    denominator = norm1 * norm2

    # Guard against division by zero for empty / all-zero vectors.
    if denominator == 0:
        return 0.0
    return float(numerator) / denominator
|
36 |
+
|
37 |
+
|
38 |
+
def text_to_vector(text):
    """Turn a text into a bag-of-words vector.

    Every match of the module-level ``WORD`` pattern in ``text`` is one
    token; the returned ``Counter`` maps each token to its occurrence count.
    """
    return Counter(WORD.findall(text))
|
44 |
+
|
45 |
+
|
46 |
+
def cosineSim(text1, text2):
    """Return the bag-of-words cosine similarity of two texts.

    Each text is converted with ``text_to_vector`` and the two resulting
    vectors are compared with ``get_cosine``.
    """
    return get_cosine(text_to_vector(text1), text_to_vector(text2))
|
54 |
+
|
55 |
+
|
56 |
+
def cos_sim_torch(embedding_1, embedding_2):
    """Return the cosine similarity of two embeddings as a single scalar.

    The similarity is computed by ``util.pytorch_cos_sim`` and unwrapped
    with ``.item()``.
    NOTE(review): ``.item()`` implies one embedding per argument -- confirm
    against callers.
    """
    similarity = util.pytorch_cos_sim(embedding_1, embedding_2)
    return similarity.item()
|
58 |
+
|
59 |
+
|
60 |
+
def embed_text(text):
    """Encode ``text`` with the module-level sentence-transformer ``model``.

    The encoder is asked for tensor output (``convert_to_tensor=True``).
    """
    embedding = model.encode(text, convert_to_tensor=True)
    return embedding
|
62 |
+
|
63 |
+
|
64 |
+
def sentence_similarity(text1, text2):
    """Return the embedding-based cosine similarity between two texts.

    Both texts are encoded via ``embed_text`` (the shared model with tensor
    output) and compared via ``cos_sim_torch``, which yields a scalar.
    """
    return cos_sim_torch(embed_text(text1), embed_text(text2))
|
70 |
+
|
71 |
+
|
72 |
+
def _strip_snippet_ellipses(snippet):
    """Remove a short leading "<prefix> ... " and a trailing "..." from a snippet."""
    ind = snippet.find("...")
    # A first ellipsis at offset 10-19 marks a short prefix; drop it together
    # with the "... " that follows.
    if 9 < ind < 20:
        snippet = snippet[ind + len("... "):]
    # Look at the LAST ellipsis for the trailing cut, and make sure one was
    # actually found: -1 would otherwise pass the comparison for very short
    # snippets and chop off their final character.
    ind = snippet.rfind("...")
    if ind != -1 and ind > len(snippet) - 5:
        snippet = snippet[:ind]
    return snippet


def google_search(
    plag_option,
    sentences,
    urlCount,
    scoreArray,
    urlList,
    sorted_date,
    domains_to_skip,
    api_key,
    cse_id,
    **kwargs,
):
    """Search each sentence with Google Custom Search and score the snippets.

    For every sentence the first three results are examined. Results whose
    URL contains ``"." + domain`` for any entry of ``domains_to_skip`` are
    ignored (they still count toward the three). Each remaining result's
    snippet is scored against the sentence and stored in ``scoreArray``.

    Args:
        plag_option: ``"Standard"`` scores with ``cosineSim`` (bag of words);
            any other value scores with ``sentence_similarity``.
        sentences: Sequence of query strings, one search per entry.
        urlCount: Dict updated in place with url -> number of times seen.
        scoreArray: List of rows updated in place; row ``k`` belongs to
            ``urlList[k]`` and has one column per sentence.
        urlList: List of distinct URLs, appended to in place.
        sorted_date: Value passed as the ``sort`` parameter of the search.
        domains_to_skip: Iterable of domain strings to ignore (``None`` is
            treated as empty).
        api_key: Developer key for the Custom Search service.
        cse_id: Search engine id (``cx``).
        **kwargs: Extra keyword arguments forwarded to ``cse().list``.

    Returns:
        The tuple ``(urlCount, scoreArray)`` (the same objects passed in).
    """
    service = build("customsearch", "v1", developerKey=api_key)
    score_snippet = cosineSim if plag_option == "Standard" else sentence_similarity
    skipped_domains = list(domains_to_skip or ())

    # url -> row index, so each lookup is O(1) instead of urlList.index().
    # setdefault keeps the FIRST position of a URL, like list.index would.
    row_of_url = {}
    for row, known_url in enumerate(urlList):
        row_of_url.setdefault(known_url, row)

    for i, sentence in enumerate(sentences):
        results = (
            service.cse()
            .list(q=sentence, cx=cse_id, sort=sorted_date, **kwargs)
            .execute()
        )
        # stop after 3 results per sentence
        for link in (results.get("items") or [])[:3]:
            url = link["link"]
            # skip user selected domains
            if any(("." + domain) in url for domain in skipped_domains):
                continue

            # Some results carry no snippet; treat that as empty text rather
            # than failing the whole search.
            snippet = _strip_snippet_ellipses(link.get("snippet", ""))

            row = row_of_url.get(url)
            if row is None:
                row = len(urlList)
                row_of_url[url] = row
                urlList.append(url)
                scoreArray.append([0] * len(sentences))
            urlCount[url] = urlCount.get(url, 0) + 1

            # update similarity between the snippet and the searched sentence
            scoreArray[row][i] = score_snippet(sentence, snippet)
    return urlCount, scoreArray
|
125 |
+
|
126 |
+
|
127 |
+
def split_sentence_blocks(text):
    """Split ``text`` into blocks of up to two consecutive sentences.

    Sentences come from ``sent_tokenize``; sentences 0+1, 2+3, ... are joined
    with a single space. With an odd number of sentences the last block
    holds one sentence.
    """
    sentences = sent_tokenize(text)
    return [
        " ".join(sentences[start : start + 2])
        for start in range(0, len(sentences), 2)
    ]
|
137 |
+
|
138 |
+
|
139 |
+
# English month name -> two-digit month number, as used by build_date.
months = {
    "January": "01",
    "February": "02",
    "March": "03",
    "April": "04",
    "May": "05",
    "June": "06",
    "July": "07",
    "August": "08",
    "September": "09",
    "October": "10",
    "November": "11",
    "December": "12",
}


def build_date(year=2024, month="March", day=1):
    """Return the date as a ``YYYYMMDD`` string.

    Args:
        year: Year, as an int or a string of digits.
        month: Full English month name; must be a key of ``months``.
        day: Day of the month, as a number or a string of digits.

    Returns:
        The concatenation of year, two-digit month and two-digit day.

    Raises:
        KeyError: If ``month`` is not a key of ``months``.
    """
    # Zero-pad the day: without it, day 1 yields a 7-character date such as
    # "2024031", which is not a valid YYYYMMDD bound for a date-range filter.
    return f"{year}{months[month]}{int(day):02d}"
|
157 |
+
|
158 |
+
|
159 |
+
async def get_url_data(url, client):
    """Fetch ``url`` with ``client`` and parse the body as HTML.

    Returns a ``BeautifulSoup`` built with the ``html.parser`` backend when
    the response status is 200. Returns ``None`` for any other status and
    when fetching or parsing raises an ``Exception`` (best-effort scraping).
    """
    try:
        response = await client.get(url)
        if response.status_code != 200:
            return None
        return BeautifulSoup(response.content, "html.parser")
    except Exception:
        return None
|
169 |
+
|
170 |
+
|
171 |
+
def remove_punc(text):
    """Delete every character that is neither a word character nor whitespace.

    Underscores count as word characters for the regex and are kept.
    """
    return re.sub(r"[^\w\s]", "", text)
|
174 |
+
|
175 |
+
|
176 |
+
def split_ngrams(text, n):
    """Return all runs of ``n`` consecutive whitespace-separated words.

    Each n-gram is a list of words; the n-grams are returned in order of
    appearance. A text with fewer than ``n`` words yields an empty list.
    """
    tokens = text.split()
    window_count = len(tokens) - n + 1
    grams = []
    for start in range(window_count):
        grams.append(tokens[start : start + n])
    return grams
|
180 |
+
|
181 |
+
|
182 |
+
async def parallel_scrap(urls):
    """Fetch and parse all ``urls`` concurrently.

    Args:
        urls: Iterable of URL strings.

    Returns:
        A list aligned with ``urls``: the value returned by ``get_url_data``
        for each URL, or ``None`` where the fetch task raised.
    """
    async with httpx.AsyncClient(timeout=30) as client:
        tasks = [get_url_data(url=url, client=client) for url in urls]
        results = await asyncio.gather(*tasks, return_exceptions=True)

    # return_exceptions=True places raised exceptions into the result list.
    # Callers only test entries for truthiness before reading `.text`, so
    # map any exception object to None (the "failed page" marker).
    return [
        None if isinstance(result, BaseException) else result
        for result in results
    ]
|
189 |
+
|
190 |
+
|
191 |
+
def matching_score(sentence, content):
    """Score how much of ``sentence`` appears verbatim in ``content``.

    Punctuation is removed from both texts and whitespace runs are collapsed
    to single spaces before comparing.

    Args:
        sentence: The text to look for.
        content: The text to search in (e.g. the text of a scraped page).

    Returns:
        ``1`` if the whole normalized sentence occurs in the content;
        otherwise the fraction of the sentence's 5-word n-grams that occur
        in the content. ``0`` if the sentence has no words, or has fewer
        than five words and is not contained in the content.
    """
    # split()/join collapses newlines and repeated spaces. The n-grams below
    # are rebuilt with single spaces, so the content must be normalized the
    # same way or text wrapped across lines could never match.
    sentence = " ".join(remove_punc(sentence).split())
    content = " ".join(remove_punc(content).split())

    # An empty sentence is a substring of everything; do not report it as a
    # full match.
    if not sentence:
        return 0
    if sentence in content:
        return 1

    ngrams = split_ngrams(sentence, 5)
    if not ngrams:
        return 0
    matched = sum(1 for gram in ngrams if " ".join(gram) in content)
    return matched / len(ngrams)
|
203 |
+
|
204 |
+
|
205 |
+
def _best_url_per_sentence(score_array, sentence_count):
    """Return, for each sentence column, the index of its best-scoring row."""
    best_rows = [-1] * sentence_count
    for j in range(sentence_count):
        if j > 0:
            # Start from the source chosen for the previous sentence so that
            # consecutive sentences stay on one source unless another source
            # is clearly better.
            best_rows[j] = best_rows[j - 1]
            best_score = score_array[best_rows[j]][j]
        else:
            # -inf guarantees the first row is always accepted, so a valid
            # row index is chosen whenever score_array is non-empty.
            best_score = float("-inf")

        for i, row in enumerate(score_array):
            # While the candidate is still the previous sentence's source,
            # another source must beat it by more than 0.1 to take over.
            still_on_previous = j > 0 and best_rows[j] == best_rows[j - 1]
            margin = 0.1 if still_on_previous else 0
            if row[j] - best_score > margin:
                best_score = row[j]
                best_rows[j] = i
    return best_rows


def plagiarism_check(
    plag_option,
    input,
    year_from,
    month_from,
    day_from,
    year_to,
    month_to,
    day_to,
    domains_to_skip,
):
    """Find likely web sources for a text and label each sentence block.

    The text is split into sentence blocks, each block is searched with
    Google Custom Search restricted to the given date range, the result
    pages are scraped, and every block is attributed to the page with the
    best matching score.

    Args:
        plag_option: Passed to ``google_search``; ``"Standard"`` selects
            bag-of-words scoring of the search snippets.
        input: The text to check.
        year_from, month_from, day_from: Start of the date range, in the
            form accepted by ``build_date`` (month is a full English name).
        year_to, month_to, day_to: End of the date range, same form.
        domains_to_skip: Domains whose results are ignored by the search.

    Returns:
        A list of ``(text, label)`` tuples: one ``(block, "[k]")`` per
        sentence block, followed for each cited source by
        ``("<url> --- Matching Score: <pct>%", "[k]")`` and ``("\\n", None)``.
        Sources are numbered from 1 in descending order of average score.
        When the search yields no URL at all, each block is returned with a
        ``None`` label and no source lines.
    """
    import os  # local import: the module's import block is left untouched

    # SECURITY NOTE(review): these credentials are committed to source
    # control. They are kept only as fallbacks so existing deployments keep
    # working; rotate them and supply GOOGLE_API_KEY / GOOGLE_CSE_ID through
    # the environment instead. The earlier keys were dead assignments
    # (immediately overwritten) and have been removed.
    api_key = os.environ.get(
        "GOOGLE_API_KEY", "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
    )
    cse_id = os.environ.get("GOOGLE_CSE_ID", "851813e81162b4ed4")

    sentences = split_sentence_blocks(input)
    urlCount = {}
    ScoreArray = []
    urlList = []
    date_from = build_date(year_from, month_from, day_from)
    date_to = build_date(year_to, month_to, day_to)
    sort_date = f"date:r:{date_from}:{date_to}"

    # get list of URLs to check; ScoreArray[k] is the score row of urlList[k]
    urlCount, ScoreArray = google_search(
        plag_option,
        sentences,
        urlCount,
        ScoreArray,
        urlList,
        sort_date,
        domains_to_skip,
        api_key,
        cse_id,
    )

    # Without any candidate URL there is nothing to attribute; indexing the
    # empty score table below would raise IndexError.
    if not ScoreArray:
        return [(sent, None) for sent in sentences]

    # Scrape URLs in list
    formatted_tokens = []
    soups = asyncio.run(parallel_scrap(urlList))

    # Populate matching scores for scraped pages. Pages that could not be
    # fetched keep the snippet-based score from google_search.
    # NOTE: an embedding-based scorer (cos_sim_torch over embed_text, fanned
    # out with a process pool) used to be sketched here as commented-out
    # code; it relied on a preprocess() helper that is not defined in this
    # module.
    for i, soup in enumerate(soups):
        print(f"Analyzing {i+1} of {len(soups)} soups........................")
        if isinstance(soup, BaseException) or not soup:
            continue
        page_content = soup.text
        for j, sent in enumerate(sentences):
            ScoreArray[i][j] = matching_score(sent, page_content)

    # Calculate URL of max matching score for each sentence chunk
    sentenceToMaxURL = _best_url_per_sentence(ScoreArray, len(sentences))

    # Average score of each cited URL over the blocks attributed to it.
    urlScore = {}
    for url in sorted(set(sentenceToMaxURL)):
        scores = [
            ScoreArray[url][sen]
            for sen, best in enumerate(sentenceToMaxURL)
            if best == url
        ]
        urlScore[url] = sum(scores) / len(scores)

    index_descending = sorted(urlScore, key=urlScore.get, reverse=True)

    # Citation number shown to the user: 1 = highest average score.
    urlMap = {url: rank for rank, url in enumerate(index_descending, start=1)}

    for sent, url in zip(sentences, sentenceToMaxURL):
        formatted_tokens.append((sent, f"[{urlMap[url]}]"))
    for ind in index_descending:
        formatted_tokens.append(
            (
                f"{urlList[ind]} --- Matching Score: "
                f"{round(urlScore[ind] * 100, 2)}%",
                f"[{urlMap[ind]}]",
            )
        )
        formatted_tokens.append(("\n", None))

    return formatted_tokens
|