Spaces:
Running
Running
aliasgerovs
committed on
Commit
·
8ad69ed
1
Parent(s):
ff03afa
Update utils.py
Browse files
utils.py
CHANGED
@@ -34,6 +34,7 @@ def remove_special_characters(text):
|
|
34 |
text = remove_accents(text)
|
35 |
pattern = r'[^\w\s\d.,!?\'"()-;]+'
|
36 |
text = re.sub(pattern, "", text)
|
|
|
37 |
return text
|
38 |
|
39 |
|
@@ -76,289 +77,4 @@ def extract_text_from_pdf(pdf_path):
|
|
76 |
|
77 |
|
78 |
WORD = re.compile(r"\w+")
|
79 |
-
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
|
80 |
-
|
81 |
-
|
82 |
-
# returns cosine similarity of two vectors
|
83 |
-
# input: two vectors
|
84 |
-
# output: integer between 0 and 1.
|
85 |
-
# def get_cosine(vec1, vec2):
|
86 |
-
# intersection = set(vec1.keys()) & set(vec2.keys())
|
87 |
-
|
88 |
-
# # calculating numerator
|
89 |
-
# numerator = sum([vec1[x] * vec2[x] for x in intersection])
|
90 |
-
|
91 |
-
# # calculating denominator
|
92 |
-
# sum1 = sum([vec1[x] ** 2 for x in vec1.keys()])
|
93 |
-
# sum2 = sum([vec2[x] ** 2 for x in vec2.keys()])
|
94 |
-
# denominator = math.sqrt(sum1) * math.sqrt(sum2)
|
95 |
-
|
96 |
-
# # checking for divide by zero
|
97 |
-
# if denominator == 0:
|
98 |
-
# return 0.0
|
99 |
-
# else:
|
100 |
-
# return float(numerator) / denominator
|
101 |
-
|
102 |
-
|
103 |
-
# # converts given text into a vector
|
104 |
-
# def text_to_vector(text):
|
105 |
-
# # uses the Regular expression above and gets all words
|
106 |
-
# words = WORD.findall(text)
|
107 |
-
# # returns a counter of all the words (count of number of occurences)
|
108 |
-
# return Counter(words)
|
109 |
-
|
110 |
-
|
111 |
-
# # returns cosine similarity of two words
|
112 |
-
# # uses: text_to_vector(text) and get_cosine(v1,v2)
|
113 |
-
# def cosineSim(text1, text2):
|
114 |
-
# vector1 = text_to_vector(text1)
|
115 |
-
# vector2 = text_to_vector(text2)
|
116 |
-
# # print vector1,vector2
|
117 |
-
# cosine = get_cosine(vector1, vector2)
|
118 |
-
# return cosine
|
119 |
-
|
120 |
-
|
121 |
-
# def cos_sim_torch(embedding_1, embedding_2):
|
122 |
-
# return util.pytorch_cos_sim(embedding_1, embedding_2).item()
|
123 |
-
|
124 |
-
|
125 |
-
# def embed_text(text):
|
126 |
-
# return model.encode(text, convert_to_tensor=True)
|
127 |
-
|
128 |
-
|
129 |
-
# def sentence_similarity(text1, text2):
|
130 |
-
# embedding_1 = model.encode(text1, convert_to_tensor=True)
|
131 |
-
# embedding_2 = model.encode(text2, convert_to_tensor=True)
|
132 |
-
|
133 |
-
# o = util.pytorch_cos_sim(embedding_1, embedding_2)
|
134 |
-
# return o.item()
|
135 |
-
|
136 |
-
|
137 |
-
# def get_soup_requests(url):
|
138 |
-
# page = requests.get(url)
|
139 |
-
# if page.status_code == 200:
|
140 |
-
# soup = BeautifulSoup(page.content, "html.parser")
|
141 |
-
# return soup
|
142 |
-
# print("HTML soup failed")
|
143 |
-
# return None
|
144 |
-
|
145 |
-
|
146 |
-
# def get_soup_httpx(url):
|
147 |
-
# client = httpx.Client(timeout=30)
|
148 |
-
# try:
|
149 |
-
# page = client.get(url)
|
150 |
-
# if page.status_code == httpx.codes.OK:
|
151 |
-
# soup = BeautifulSoup(page.content, "html.parser")
|
152 |
-
# return soup
|
153 |
-
# except:
|
154 |
-
# print("HTTPx soup failed")
|
155 |
-
# return None
|
156 |
-
|
157 |
-
|
158 |
-
# def getSentences(text):
|
159 |
-
# from nltk.tokenize import sent_tokenize
|
160 |
-
|
161 |
-
# sents = sent_tokenize(text)
|
162 |
-
# two_sents = []
|
163 |
-
# for i in range(len(sents)):
|
164 |
-
# if (i % 2) == 0:
|
165 |
-
# two_sents.append(sents[i])
|
166 |
-
# else:
|
167 |
-
# two_sents[len(two_sents) - 1] += " " + sents[i]
|
168 |
-
# return two_sents
|
169 |
-
|
170 |
-
|
171 |
-
# def googleSearch(
|
172 |
-
# plag_option,
|
173 |
-
# sentences,
|
174 |
-
# urlCount,
|
175 |
-
# scoreArray,
|
176 |
-
# urlList,
|
177 |
-
# sorted_date,
|
178 |
-
# domains_to_skip,
|
179 |
-
# api_key,
|
180 |
-
# cse_id,
|
181 |
-
# **kwargs,
|
182 |
-
# ):
|
183 |
-
# service = build("customsearch", "v1", developerKey=api_key)
|
184 |
-
# for i, sentence in enumerate(sentences):
|
185 |
-
# results = (
|
186 |
-
# service.cse()
|
187 |
-
# .list(q=sentence, cx=cse_id, sort=sorted_date, **kwargs)
|
188 |
-
# .execute()
|
189 |
-
# )
|
190 |
-
# if "items" in results and len(results["items"]) > 0:
|
191 |
-
# for count, link in enumerate(results["items"]):
|
192 |
-
# # stop after 3 pages
|
193 |
-
# if count >= 3:
|
194 |
-
# break
|
195 |
-
# # skip user selected domains
|
196 |
-
# if any(
|
197 |
-
# ("." + domain) in link["link"] for domain in domains_to_skip
|
198 |
-
# ):
|
199 |
-
# continue
|
200 |
-
# # clean up snippet of '...'
|
201 |
-
# snippet = link["snippet"]
|
202 |
-
# ind = snippet.find("...")
|
203 |
-
# if ind < 20 and ind > 9:
|
204 |
-
# snippet = snippet[ind + len("... ") :]
|
205 |
-
# ind = snippet.find("...")
|
206 |
-
# if ind > len(snippet) - 5:
|
207 |
-
# snippet = snippet[:ind]
|
208 |
-
|
209 |
-
# # update cosine similarity between snippet and given text
|
210 |
-
# url = link["link"]
|
211 |
-
# if url not in urlList:
|
212 |
-
# urlList.append(url)
|
213 |
-
# scoreArray.append([0] * len(sentences))
|
214 |
-
# urlCount[url] = urlCount[url] + 1 if url in urlCount else 1
|
215 |
-
# if plag_option == "Standard":
|
216 |
-
# scoreArray[urlList.index(url)][i] = cosineSim(
|
217 |
-
# sentence, snippet
|
218 |
-
# )
|
219 |
-
# else:
|
220 |
-
# scoreArray[urlList.index(url)][i] = sentence_similarity(
|
221 |
-
# sentence, snippet
|
222 |
-
# )
|
223 |
-
# else:
|
224 |
-
# print("Google Search failed")
|
225 |
-
# return urlCount, scoreArray
|
226 |
-
|
227 |
-
|
228 |
-
# def getQueries(text, n):
|
229 |
-
# # return n-grams of size n
|
230 |
-
# words = text.split()
|
231 |
-
# return [words[i : i + n] for i in range(len(words) - n + 1)]
|
232 |
-
|
233 |
-
|
234 |
-
# def print2D(array):
|
235 |
-
# print(np.array(array))
|
236 |
-
|
237 |
-
|
238 |
-
# def removePunc(text):
|
239 |
-
# res = re.sub(r"[^\w\s]", "", text)
|
240 |
-
# return res
|
241 |
-
|
242 |
-
|
243 |
-
# async def get_url_data(url, client):
|
244 |
-
# try:
|
245 |
-
# r = await client.get(url)
|
246 |
-
# # print(r.status_code)
|
247 |
-
# if r.status_code == 200:
|
248 |
-
# # print("in")
|
249 |
-
# soup = BeautifulSoup(r.content, "html.parser")
|
250 |
-
# return soup
|
251 |
-
# except Exception:
|
252 |
-
# print("HTTPx parallel soup failed")
|
253 |
-
# return None
|
254 |
-
|
255 |
-
|
256 |
-
# async def parallel_scrap(urls):
|
257 |
-
# async with httpx.AsyncClient(timeout=30) as client:
|
258 |
-
# tasks = []
|
259 |
-
# for url in urls:
|
260 |
-
# tasks.append(get_url_data(url=url, client=client))
|
261 |
-
# results = await asyncio.gather(*tasks, return_exceptions=True)
|
262 |
-
# return results
|
263 |
-
|
264 |
-
|
265 |
-
# class TimeoutError(Exception):
|
266 |
-
# pass
|
267 |
-
|
268 |
-
|
269 |
-
# def matchingScore(sentence, content):
|
270 |
-
# if sentence in content:
|
271 |
-
# return 1
|
272 |
-
# sentence = removePunc(sentence)
|
273 |
-
# content = removePunc(content)
|
274 |
-
# if sentence in content:
|
275 |
-
# return 1
|
276 |
-
# else:
|
277 |
-
# n = 5
|
278 |
-
# ngrams = getQueries(sentence, n)
|
279 |
-
# if len(ngrams) == 0:
|
280 |
-
# return 0
|
281 |
-
# matched = [x for x in ngrams if " ".join(x) in content]
|
282 |
-
# return len(matched) / len(ngrams)
|
283 |
-
|
284 |
-
|
285 |
-
# # def matchingScoreWithTimeout(sentence, content):
|
286 |
-
# # def timeout_handler():
|
287 |
-
# # raise TimeoutError("Function timed out")
|
288 |
-
|
289 |
-
# # timer = threading.Timer(10, timeout_handler) # Set a timer for 2 seconds
|
290 |
-
# # timer.start()
|
291 |
-
# # try:
|
292 |
-
# # score = sentence_similarity(sentence, content)
|
293 |
-
# # # score = matchingScore(sentence, content)
|
294 |
-
# # timer.cancel() # Cancel the timer if calculation completes before timeout
|
295 |
-
# # return score
|
296 |
-
# # except TimeoutError:
|
297 |
-
# # return 0
|
298 |
-
|
299 |
-
|
300 |
-
# # async def matchingScoreAsync(sentences, content, content_idx, ScoreArray):
|
301 |
-
# # content = removePunc(content)
|
302 |
-
# # for j, sentence in enumerate(sentences):
|
303 |
-
# # sentence = removePunc(sentence)
|
304 |
-
# # if sentence in content:
|
305 |
-
# # ScoreArray[content_idx][j] = 1
|
306 |
-
# # else:
|
307 |
-
# # n = 5
|
308 |
-
# # ngrams = getQueries(sentence, n)
|
309 |
-
# # if len(ngrams) == 0:
|
310 |
-
# # return 0
|
311 |
-
# # matched = [x for x in ngrams if " ".join(x) in content]
|
312 |
-
# # ScoreArray[content_idx][j] = len(matched) / len(ngrams)
|
313 |
-
# # print(
|
314 |
-
# # f"Analyzed {content_idx+1} of soups (SOUP SUCCEEDED)........................"
|
315 |
-
# # )
|
316 |
-
# # return ScoreArray
|
317 |
-
|
318 |
-
|
319 |
-
# async def matchingScoreAsync(
|
320 |
-
# sentences, content, content_idx, ScoreArray, model, util
|
321 |
-
# ):
|
322 |
-
# content = removePunc(content)
|
323 |
-
# for j, sentence in enumerate(sentences):
|
324 |
-
# sentence = removePunc(sentence)
|
325 |
-
# similarity_score = sentence_similarity(sentence, content, model, util)
|
326 |
-
# ScoreArray[content_idx][j] = similarity_score
|
327 |
-
# print(
|
328 |
-
# f"Analyzed {content_idx+1} of contents (CONTENT ANALYZED)........................"
|
329 |
-
# )
|
330 |
-
# return ScoreArray
|
331 |
-
|
332 |
-
|
333 |
-
# async def parallel_analyze(soups, sentences, ScoreArray):
|
334 |
-
# tasks = []
|
335 |
-
# for i, soup in enumerate(soups):
|
336 |
-
# if soup:
|
337 |
-
# page_content = soup.text
|
338 |
-
# tasks.append(
|
339 |
-
# matchingScoreAsync(sentences, page_content, i, ScoreArray)
|
340 |
-
# )
|
341 |
-
# else:
|
342 |
-
# print(
|
343 |
-
# f"Analyzed {i+1} of soups (SOUP FAILED)........................"
|
344 |
-
# )
|
345 |
-
# ScoreArray = await asyncio.gather(*tasks, return_exceptions=True)
|
346 |
-
# return ScoreArray
|
347 |
-
|
348 |
-
|
349 |
-
# async def parallel_analyze_2(soups, sentences, ScoreArray):
|
350 |
-
# tasks = [[0] * len(ScoreArray[0]) for i in range(len(ScoreArray))]
|
351 |
-
# for i, soup in enumerate(soups):
|
352 |
-
# if soup:
|
353 |
-
# page_content = soup.text
|
354 |
-
# for j, sent in enumerate(sentences):
|
355 |
-
# print(
|
356 |
-
# f"Analyzing {i+1} of {len(soups)} soups with {j+1} of {len(sentences)} sentences........................"
|
357 |
-
# )
|
358 |
-
# tasks[i][j] = sentence_similarity(sent, page_content)
|
359 |
-
# else:
|
360 |
-
# print(
|
361 |
-
# f"Analyzed {i+1} of soups (SOUP FAILED)........................"
|
362 |
-
# )
|
363 |
-
# ScoreArray = await asyncio.gather(*tasks, return_exceptions=True)
|
364 |
-
# return ScoreArray
|
|
|
34 |
text = remove_accents(text)
|
35 |
pattern = r'[^\w\s\d.,!?\'"()-;]+'
|
36 |
text = re.sub(pattern, "", text)
|
37 |
+
text = text.replace("<s>", "").replace("</s>", "")
|
38 |
return text
|
39 |
|
40 |
|
|
|
77 |
|
78 |
|
79 |
WORD = re.compile(r"\w+")
|
80 |
+
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|