Prathmesh48 committed on
Commit
324113f
1 Parent(s): a5bb707

Upload 3 files

Files changed (3)
  1. embedding.py +378 -0
  2. preprocess.py +205 -0
  3. search.py +229 -0
embedding.py ADDED
@@ -0,0 +1,378 @@
+ from PyPDF2 import PdfReader
+ import requests
+ import json
+ import os
+ import concurrent.futures
+ import random
+ from langchain_google_genai import ChatGoogleGenerativeAI
+ from langchain_community.document_loaders import WebBaseLoader
+ from langchain_community.document_loaders import PyPDFLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ import google.generativeai as genai
+ from langchain_core.messages import HumanMessage
+ from io import BytesIO
+ import numpy as np
+ import re
+ import torch
+ from transformers import AutoTokenizer, AutoModel
+ 
+ from search import search_images
+ 
+ # Gemini text models (chosen at random per call to spread load across API keys)
+ gemini = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001", google_api_key='AIzaSyCo-TeDp0Ou--UwhlTgMwCoTEZxg6-v7wA', temperature=0.1)
+ gemini1 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001", google_api_key='AIzaSyAtnUk8QKSUoJd3uOBpmeBNN-t8WXBt0zI', temperature=0.1)
+ gemini2 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001", google_api_key='AIzaSyBzbZQBffHFK3N-gWnhDDNbQ9yZnZtaS2E', temperature=0.1)
+ gemini3 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001", google_api_key='AIzaSyBNN4VDMAOB2gSZha6HjsTuH71PVV69FLM', temperature=0.1)
+ 
+ # Gemini vision models for image descriptions
+ vision = ChatGoogleGenerativeAI(model="gemini-1.5-flash", google_api_key='AIzaSyCo-TeDp0Ou--UwhlTgMwCoTEZxg6-v7wA', temperature=0.1)
+ vision1 = ChatGoogleGenerativeAI(model="gemini-1.5-flash", google_api_key='AIzaSyAtnUk8QKSUoJd3uOBpmeBNN-t8WXBt0zI', temperature=0.1)
+ vision2 = ChatGoogleGenerativeAI(model="gemini-1.5-flash", google_api_key='AIzaSyBzbZQBffHFK3N-gWnhDDNbQ9yZnZtaS2E', temperature=0.1)
+ vision3 = ChatGoogleGenerativeAI(model="gemini-1.5-flash", google_api_key='AIzaSyBNN4VDMAOB2gSZha6HjsTuH71PVV69FLM', temperature=0.1)
+ 
+ # Local sentence-embedding model
+ tokenizer = AutoTokenizer.from_pretrained('Alibaba-NLP/gte-base-en-v1.5', trust_remote_code=True)
+ model = AutoModel.from_pretrained('Alibaba-NLP/gte-base-en-v1.5', trust_remote_code=True)
+ model.to('cpu')  # Ensure the model is on the CPU
+ 
+ genai.configure(api_key="AIzaSyAtnUk8QKSUoJd3uOBpmeBNN-t8WXBt0zI")
+ 
+ 
+ def pdf_extractor(link):
+     """Download a PDF from a URL and return its concatenated page text."""
+     text = ''
+ 
+     try:
+         # Fetch the PDF file from the URL
+         response = requests.get(link)
+         response.raise_for_status()  # Raise an error for bad status codes
+ 
+         # Use BytesIO to handle the PDF content in memory
+         pdf_file = BytesIO(response.content)
+ 
+         # Load the PDF file
+         reader = PdfReader(pdf_file)
+         for page in reader.pages:
+             text += page.extract_text()  # Extract text from each page
+ 
+     except requests.exceptions.HTTPError as e:
+         print(f'HTTP error occurred: {e}')
+     except Exception as e:
+         print(f'An error occurred: {e}')
+ 
+     return text
+ 
+ 
+ def web_extractor(link):
+     """Load a web page and return its concatenated text content."""
+     text = ''
+ 
+     try:
+         loader = WebBaseLoader(link)
+         pages = loader.load_and_split()
+ 
+         for page in pages:
+             text += page.page_content
+     except Exception:
+         pass
+ 
+     return text
+ 
+ 
+ def imporve_text(text):
+     """Rewrite the given text to be shorter and clearer using a Gemini model."""
+     prompt = f'''
+     Please rewrite the following text to make it short, concise, and of high quality.
+     Ensure that all essential information and key points are retained.
+     Focus on improving clarity, coherence, and word choice without altering the original meaning.
+ 
+     text = {text}
+     '''
+ 
+     model = random.choice([gemini, gemini1, gemini2, gemini3])
+     result = model.invoke(prompt)
+ 
+     return result.content
+ 
+ 
+ def feature_extraction(tag, history, context):
+     """Merge new context into the existing history for a single product field."""
+     prompt = f'''
+     You are an intelligent assistant tasked with updating product information. You have two data sources:
+     1. Tag_History: Previously gathered information about the product.
+     2. Tag_Context: New data that might contain additional details.
+     Your job is to read the Tag_Context and update the relevant field in the Tag_History with any new details found. The field to be updated is the {tag} FIELD.
+     Guidelines:
+     - Only add new details that are relevant to the {tag} FIELD.
+     - Do not add or modify any other fields in the Tag_History.
+     - Ensure your response is in coherent sentences, integrating the new details seamlessly into the existing information.
+     Here is the data:
+     Tag_Context: {str(context)}
+     Tag_History: {history}
+     Respond with the updated Tag_History.
+     '''
+ 
+     model = random.choice([gemini, gemini1, gemini2, gemini3])
+     result = model.invoke(prompt)
+ 
+     return result.content
+ 
+ 
+ def feature_extraction_image(url):
+     """Describe an image URL with a Gemini vision model; return 'None' on failure."""
+     vision = ChatGoogleGenerativeAI(model="gemini-1.5-flash", google_api_key='AIzaSyBzbZQBffHFK3N-gWnhDDNbQ9yZnZtaS2E', temperature=0.1)
+ 
+     text = 'None'
+     message = HumanMessage(content=[
+         {"type": "text", "text": "Please, Describe this image in detail"},
+         {"type": "image_url", "image_url": url}
+     ])
+     try:
+         model = random.choice([vision, vision1, vision2, vision3])
+         text = model.invoke([message])
+     except Exception:
+         return text
+     return text.content
+ 
+ 
+ def detailed_feature_extraction(find, context):
+     """Fill the given output format using only details found in the context."""
+     prompt = f'''
+     You are an intelligent assistant tasked with finding product information. You have one data source and one output format:
+     1. Context: The gathered information about the product.
+     2. Format: Details which need to be filled based on Context.
+     Your job is to read the Context and update the relevant field in Format using Context.
+     Guidelines:
+     - Only add details that are relevant to the individual FIELD.
+     - Do not add or modify any other fields in the Format.
+     - If nothing is found, return None.
+     Here is the data:
+     The Context is {str(context)}
+     The Format is {str(find)}
+     '''
+ 
+     model = random.choice([gemini, gemini1, gemini2, gemini3])
+     result = model.invoke(prompt)
+ 
+     return result.content
+ 
+ 
+ def detailed_history(history):
+     """Expand each section of the history into a detailed, structured template."""
+     details = {
+         "Introduction": {
+             "Product Name": None,
+             "Overview of the product": None,
+             "Purpose of the manual": None,
+             "Audience": None,
+             "Additional Details": None
+         },
+         "Specifications": {
+             "Technical specifications": None,
+             "Performance metrics": None,
+             "Additional Details": None
+         },
+         "Product Overview": {
+             "Product features": None,
+             "Key components and parts": None,
+             "Additional Details": None
+         },
+         "Safety Information": {
+             "Safety warnings and precautions": None,
+             "Compliance and certification information": None,
+             "Additional Details": None
+         },
+         "Installation Instructions": {
+             "Unboxing and inventory checklist": None,
+             "Step-by-step installation guide": None,
+             "Required tools and materials": None,
+             "Additional Details": None
+         },
+         "Setup and Configuration": {
+             "Initial setup procedures": None,
+             "Configuration settings": None,
+             "Troubleshooting setup issues": None,
+             "Additional Details": None
+         },
+         "Operation Instructions": {
+             "How to use the product": None,
+             "Detailed instructions for different functionalities": None,
+             "User interface guide": None,
+             "Additional Details": None
+         },
+         "Maintenance and Care": {
+             "Cleaning instructions": None,
+             "Maintenance schedule": None,
+             "Replacement parts and accessories": None,
+             "Additional Details": None
+         },
+         "Troubleshooting": {
+             "Common issues and solutions": None,
+             "Error messages and their meanings": None,
+             "Support Information": None,
+             "Additional Details": None
+         },
+         "Warranty Information": {
+             "Terms and Conditions": None,
+             "Service and repair information": None,
+             "Additional Details": None
+         },
+         "Legal Information": {
+             "Copyright information": None,
+             "Trademarks and patents": None,
+             "Disclaimers": None,
+             "Additional Details": None
+         }
+     }
+ 
+     for key, val in history.items():
+         find = details[key]
+         details[key] = str(detailed_feature_extraction(find, val))
+ 
+     return details
+ 
+ 
+ def get_embeddings(link, tag_option):
+     """Extract text from a link, summarise it per tag, and embed each tag with Gemini."""
+     print(f"\n--> Creating Embeddings - {link}")
+ 
+     if tag_option == 'Complete Document Similarity':
+         history = {"Details": ""}
+     else:
+         history = {
+             "Introduction": "",
+             "Specifications": "",
+             "Product Overview": "",
+             "Safety Information": "",
+             "Installation Instructions": "",
+             "Setup and Configuration": "",
+             "Operation Instructions": "",
+             "Maintenance and Care": "",
+             "Troubleshooting": "",
+             "Warranty Information": "",
+             "Legal Information": ""
+         }
+ 
+     # Extract Text -----------------------------
+     print("Extracting Text")
+     if link[-3:] == '.md' or link[8:11] == 'en.':
+         text = web_extractor(link)
+     else:
+         text = pdf_extractor(link)
+ 
+     # Create Chunks ----------------------------
+     print("Writing Tag Data")
+ 
+     if tag_option == "Complete Document Similarity":
+         history["Details"] = feature_extraction("Details", history["Details"], text[:50000])
+     else:
+         chunks = text_splitter.create_documents([text])
+ 
+         for chunk in chunks:
+             with concurrent.futures.ThreadPoolExecutor() as executor:
+                 future_to_key = {
+                     executor.submit(
+                         feature_extraction, f"Product {key}", history[key], chunk.page_content
+                     ): key for key in history
+                 }
+                 for future in concurrent.futures.as_completed(future_to_key):
+                     key = future_to_key[future]
+                     try:
+                         response = future.result()
+                         history[key] = response
+                     except Exception as e:
+                         print(f"Error processing {key}: {e}")
+ 
+     print("Creating Vectors")
+     genai_embeddings = []
+ 
+     for tag in history:
+         result = genai.embed_content(
+             model="models/embedding-001",
+             content=history[tag],
+             task_type="retrieval_document")
+         genai_embeddings.append(result['embedding'])
+ 
+     return history, genai_embeddings
+ 
+ 
+ def get_embed_chroma(link):
+     """Extract text from a link, chunk it, and embed each chunk with the local GTE model."""
+     print(f"\n--> Creating Embeddings - {link}")
+ 
+     # Extract Text -----------------------------
+     if link[-3:] == '.md' or link[8:11] == 'en.':
+         text = web_extractor(link)
+     else:
+         text = pdf_extractor(link)
+     print("\u2713 Extracting Text")
+ 
+     # Create Chunks ----------------------------
+     text = re.sub(r'\.{2,}', '.', text)
+     text = re.sub(r'\s{2,}', ' ', text)
+     text = [re.sub(r'\n{2,}', '\n', text)]
+ 
+     chunks = text_splitter_small.create_documents(text)
+     print("\u2713 Writing Tag Data")
+ 
+     # Creating Vectors
+     embedding_vectors = []
+     textual_data = []
+     print("\u2713 Creating Vectors")
+ 
+     for chunk in chunks:
+         inputs = tokenizer(chunk.page_content, return_tensors="pt", padding=True, truncation=True)
+         inputs = {k: v.to('cpu') for k, v in inputs.items()}
+ 
+         # Get the model's outputs
+         with torch.no_grad():
+             outputs = model(**inputs)
+ 
+         # Mean-pool the last hidden state to get one vector per chunk
+         embeddings = outputs.last_hidden_state.mean(dim=1)
+         embedding_vectors.append(embeddings.squeeze().cpu().numpy().tolist())
+         textual_data.append(chunk.page_content)
+ 
+     return textual_data, embedding_vectors
+ 
+ 
+ def get_image_embeddings(Product):
+     """Search product images, describe them with a vision model, and embed the descriptions."""
+     image_embeddings = []
+ 
+     links = search_images(Product)
+     with concurrent.futures.ThreadPoolExecutor() as executor:
+         descriptions = list(executor.map(feature_extraction_image, links))
+ 
+     for description in descriptions:
+         result = genai.embed_content(
+             model="models/embedding-001",
+             content=description,
+             task_type="retrieval_document")
+ 
+         image_embeddings.append(result['embedding'])
+     # print(image_embeddings)
+     return image_embeddings
+ 
+ 
+ # Shared text splitters used by the functions above
+ text_splitter = RecursiveCharacterTextSplitter(
+     chunk_size=10000,
+     chunk_overlap=100,
+     separators=["", '', " "]
+ )
+ 
+ text_splitter_small = RecursiveCharacterTextSplitter(
+     chunk_size=2000,
+     chunk_overlap=100,
+     separators=["", '', " "]
+ )
+ 
+ 
+ if __name__ == '__main__':
+     print(get_embed_chroma('https://www.galaxys24manual.com/wp-content/uploads/pdf/galaxy-s24-manual-SAM-S921-S926-S928-OS14-011824-FINAL-US-English.pdf'))
+     # print(get_image_embeddings(Product='Samsung Galaxy S24'))
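Editor's note: a minimal usage sketch for the helpers above, not part of the commit. It assumes the hard-coded API keys are valid, that search.py sits next to embedding.py, and that network access is available; the manual URL is the same one used in the file's __main__ block.

# Illustrative only: results depend on live services and valid keys.
from embedding import get_embed_chroma, get_embeddings

manual_url = 'https://www.galaxys24manual.com/wp-content/uploads/pdf/galaxy-s24-manual-SAM-S921-S926-S928-OS14-011824-FINAL-US-English.pdf'

# Chunk-level vectors from the local GTE model (e.g. to load into a vector store)
texts, vectors = get_embed_chroma(manual_url)
print(len(texts), len(vectors[0]))  # number of chunks, embedding width

# Whole-document summary plus Gemini embeddings
history, tag_vectors = get_embeddings(manual_url, 'Complete Document Similarity')
print(history['Details'][:200])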
preprocess.py ADDED
@@ -0,0 +1,205 @@
+ import requests
+ import json
+ import random
+ import concurrent.futures
+ from concurrent.futures import ThreadPoolExecutor
+ from langchain_community.document_loaders import PyPDFLoader
+ from langdetect import detect_langs
+ from PyPDF2 import PdfReader
+ from io import BytesIO
+ from langchain_community.document_loaders import WebBaseLoader
+ from langchain_google_genai import ChatGoogleGenerativeAI
+ import logging
+ from pymongo import MongoClient
+ 
+ 
+ # Mongo Connections
+ # srv_connection_uri = "mongodb+srv://adityasm1410:[email protected]/?retryWrites=true&w=majority&appName=Patseer"
+ 
+ # client = MongoClient(srv_connection_uri)
+ # db = client['embeddings']
+ # collection = db['data']
+ 
+ 
+ # API URLs -----
+ 
+ # main_url = "http://127.0.0.1:5000/search/all"
+ main_url = "http://127.0.0.1:8000/search/all"
+ # main_product = "Samsung Galaxy s23 ultra"
+ 
+ # Relevance-checking models -----
+ gemini = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001", google_api_key='AIzaSyCo-TeDp0Ou--UwhlTgMwCoTEZxg6-v7wA', temperature=0.1)
+ gemini1 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001", google_api_key='AIzaSyAtnUk8QKSUoJd3uOBpmeBNN-t8WXBt0zI', temperature=0.1)
+ gemini2 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001", google_api_key='AIzaSyBzbZQBffHFK3N-gWnhDDNbQ9yZnZtaS2E', temperature=0.1)
+ gemini3 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001", google_api_key='AIzaSyBNN4VDMAOB2gSZha6HjsTuH71PVV69FLM', temperature=0.1)
+ 
+ 
+ API_URL = "https://api-inference.huggingface.co/models/google/flan-t5-xxl"
+ headers = {"Authorization": "Bearer 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"}
+ 
+ # Error Debug
+ logging.basicConfig(level=logging.INFO)
+ 
+ 
+ # Global Var --------
+ 
+ data = False
+ seen = set()
+ existing_products_urls = set('123')
+ 
+ 
+ def get_links(main_product, api_key):
+     """Query the search API for links about the product and cache the JSON response."""
+     params = {
+         "API_KEY": f"{api_key}",
+         "product": f"{main_product}",
+     }
+ 
+     # Flask
+     response = requests.get(main_url, params=params)
+ 
+     # FastAPI
+     # response = requests.post(main_url, json=params)
+ 
+     if response.status_code == 200:
+         results = response.json()
+         with open('data.json', 'w') as f:
+             json.dump(results, f)
+     else:
+         print(f"Failed to fetch results: {response.status_code}")
+ 
+ 
+ def language_preprocess(text):
+     """Return True only if the text is detected as English."""
+     try:
+         if detect_langs(text)[0].lang == 'en':
+             return True
+         return False
+     except Exception:
+         return False
+ 
+ 
+ def relevant(product, similar_product, content):
+     """Ask a Gemini model whether the content is relevant to both products."""
+     try:
+         payload = {"inputs": f'''Do you think that the given content is similar to {similar_product} and {product}, just Respond True or False \nContent for similar product: {content}'''}
+ 
+         # response = requests.post(API_URL, headers=headers, json=payload)
+         # output = response.json()
+         # return bool(output[0]['generated_text'])
+ 
+         model = random.choice([gemini, gemini1, gemini2, gemini3])
+         result = model.invoke(f'''Do you think that the given content is similar to {similar_product} and {product}, just Respond True or False \nContent for similar product: {content}''')
+         return bool(result)
+ 
+     except Exception:
+         return False
+ 
+ 
+ def download_pdf(url, timeout=10):
+     """Download a PDF and return it as an in-memory BytesIO object, or None on failure."""
+     try:
+         response = requests.get(url, timeout=timeout)
+         response.raise_for_status()
+         return BytesIO(response.content)
+ 
+     except requests.RequestException as e:
+         logging.error(f"PDF download error: {e}")
+         return None
+ 
+ 
+ def extract_text_from_pdf(pdf_file, pages):
+     """Extract text from the given page numbers of an in-memory PDF."""
+     reader = PdfReader(pdf_file)
+     extracted_text = ""
+ 
+     total_pages = len(reader.pages)
+ 
+     try:
+         for page_num in pages:
+             if page_num < total_pages:
+                 page = reader.pages[page_num]
+                 extracted_text += page.extract_text() + "\n"
+             else:
+                 print(f"Page {page_num} does not exist in the document.")
+ 
+         return extracted_text
+ 
+     except Exception:
+         return 'This does not work'  # sentinel returned when extraction fails
+ 
+ 
+ def extract_text_online(link):
+     """Load a web page and return the text of its first few sections."""
+     loader = WebBaseLoader(link)
+     pages = loader.load_and_split()
+ 
+     text = ''
+ 
+     for page in pages[:3]:
+         text += page.page_content
+ 
+     return text
+ 
+ 
+ def process_link(link, main_product, similar_product):
+     """Extract a link's text and accept it if it is English and relevant to both products."""
+     if link in seen:
+         return None
+     seen.add(link)
+     try:
+         if link[-3:] == '.md' or link[8:11] == 'en.':
+             text = extract_text_online(link)
+         else:
+             pdf_file = download_pdf(link)
+             text = extract_text_from_pdf(pdf_file, [0, 2, 4])
+ 
+         if language_preprocess(text):
+             if relevant(main_product, similar_product, text):
+                 print("Accepted -", link)
+                 return link
+     except Exception:
+         pass
+     print("Rejected -", link)
+     return None
+ 
+ 
+ def filtering(urls, main_product, similar_product, link_count):
+     """Return up to link_count accepted links, preferring URLs that are already known."""
+     res = []
+ 
+     # print(f"Filtering Links of ---- {similar_product}")
+     # Main Preprocess ------------------------------
+     # with ThreadPoolExecutor() as executor:
+     #     futures = {executor.submit(process_link, link, main_product, similar_product): link for link in urls}
+     #     for future in concurrent.futures.as_completed(futures):
+     #         result = future.result()
+     #         if result is not None:
+     #             res.append(result)
+     # return res
+ 
+     count = 0
+ 
+     print(f"--> Filtering Links of - {similar_product}")
+ 
+     for link in urls:
+         if link in existing_products_urls:
+             res.append((link, 1))
+             count += 1
+         else:
+             result = process_link(link, main_product, similar_product)
+ 
+             if result is not None:
+                 res.append((result, 0))
+                 count += 1
+ 
+         if count == link_count:
+             break
+ 
+     return res
+ 
+ 
+ # Main Functions -------------------------------------------------->
+ 
+ # get_links()
+ # preprocess()
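Editor's note: a small usage sketch for the filtering pipeline above, not part of the commit. The candidate URLs and product names are made up for illustration; real inputs would come from the cached search results written by get_links.

# Illustrative only: URLs below are placeholders.
from preprocess import filtering

candidate_urls = [
    'https://example.com/manuals/galaxy-s24-manual.pdf',
    'https://example.com/docs/en.galaxy-s24-quickstart',
]
accepted = filtering(candidate_urls, 'Samsung Galaxy S24', 'Samsung Galaxy S23 Ultra', link_count=1)
print(accepted)  # e.g. [('https://example.com/manuals/galaxy-s24-manual.pdf', 0)]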
search.py ADDED
@@ -0,0 +1,229 @@
+ # Library Imports
+ import requests
+ from bs4 import BeautifulSoup
+ from googlesearch import search
+ from duckduckgo_search import DDGS
+ import concurrent.futures
+ import re
+ 
+ 
+ # Search Functions -------------------------------------------------------------->
+ 
+ # Function to search DuckDuckGo
+ def search_duckduckgo(query):
+     try:
+         results = DDGS().text(f"{query} manual filetype:pdf", max_results=5)
+         return [res['href'] for res in results]
+     except Exception:
+         return []
+ 
+ 
+ # Function to search Google
+ def search_google(query):
+     links = []
+     try:
+         api_key = 'AIzaSyDV_uJwrgNtawqtl6GDfeUj6NqO-H1tA4c'
+         search_engine_id = 'c4ca951b9fc6949cb'
+ 
+         url = "https://www.googleapis.com/customsearch/v1"
+         params = {
+             "key": api_key,
+             "cx": search_engine_id,
+             "q": query + " manual filetype:pdf"
+         }
+ 
+         response = requests.get(url, params=params)
+         results = response.json()
+ 
+         for item in results.get('items', []):
+             links.append(item['link'])
+     except Exception:
+         pass
+ 
+     try:
+         extension = "ext:pdf"
+         for result in search(query + " manual " + extension, num_results=5):
+             if result.endswith('.pdf'):
+                 links.append(result)
+     except Exception:
+         pass
+ 
+     return links
+ 
+ 
+ # Function to search Internet Archive
+ def search_archive(query):
+     try:
+         url = "https://archive.org/advancedsearch.php"
+         params = {
+             'q': f'{query} manual',
+             'fl[]': ['identifier', 'title', 'format'],
+             'rows': 50,
+             'page': 1,
+             'output': 'json'
+         }
+ 
+         # Make the request
+         response = requests.get(url, params=params)
+         data = response.json()
+ 
+         # Function to extract hyperlinks from a webpage
+         def extract_hyperlinks(url):
+             # Send a GET request to the URL
+             response = requests.get(url)
+ 
+             # Check if the request was successful
+             if response.status_code == 200:
+                 # Parse the HTML content of the page
+                 soup = BeautifulSoup(response.text, 'html.parser')
+ 
+                 # Find all <a> tags (hyperlinks)
+                 for link in soup.find_all('a', href=True):
+                     href = link['href']
+                     if href.endswith('.pdf'):
+                         pdf_files.append(url + '/' + href)
+                     if href.endswith('.iso'):
+                         # If the link ends with .iso, follow the link and extract .pdf hyperlinks
+                         extract_pdf_from_iso(url + '/' + href + '/')
+ 
+         # Function to extract .pdf hyperlinks from an .iso file
+         def extract_pdf_from_iso(iso_url):
+             # Send a GET request to the ISO URL
+             iso_response = requests.get(iso_url)
+ 
+             # Check if the request was successful
+             if iso_response.status_code == 200:
+                 # Parse the HTML content of the ISO page
+                 iso_soup = BeautifulSoup(iso_response.text, 'html.parser')
+ 
+                 # Find all <a> tags (hyperlinks) in the ISO page
+                 for link in iso_soup.find_all('a', href=True):
+                     href = link['href']
+                     if href.endswith('.pdf'):
+                         pdf_files.append('https:' + href)
+ 
+         pdf_files = []
+ 
+         def process_doc(doc):
+             identifier = doc.get('identifier', 'N/A')
+             # title = doc.get('title', 'N/A')
+             # format = doc.get('format', 'N/A')
+             pdf_link = f"https://archive.org/download/{identifier}"
+             extract_hyperlinks(pdf_link)
+ 
+         with concurrent.futures.ThreadPoolExecutor() as executor:
+             futures = [executor.submit(process_doc, doc) for doc in data['response']['docs']]
+ 
+             # Optionally, wait for all futures to complete and handle any exceptions
+             for future in concurrent.futures.as_completed(futures):
+                 try:
+                     future.result()  # This will raise an exception if the function call raised
+                 except Exception as exc:
+                     print(f'Generated an exception: {exc}')
+ 
+         return pdf_files
+ 
+     except Exception:
+         return []
+ 
+ 
+ def search_github(query):
+     try:
+         # GitHub Search API endpoint
+         url = f"https://api.github.com/search/code?q={query}+extension:md"
+ 
+         headers = {
+             'Authorization': 'Token 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'
+         }
+ 
+         # Make the request
+         response = requests.get(url, headers=headers)
+         data = response.json()
+         links = [item['html_url'] for item in data['items']]
+ 
+         return links
+ 
+     except Exception:
+         return []
+ 
+ 
+ def search_wikipedia(product):
+     api_url = "https://en.wikipedia.org/w/api.php"
+     params = {
+         "action": "opensearch",
+         "search": product,
+         "limit": 5,
+         "namespace": 0,
+         "format": "json"
+     }
+ 
+     try:
+         response = requests.get(api_url, params=params)
+         response.raise_for_status()  # Raise an HTTPError for bad responses (4xx and 5xx)
+         data = response.json()
+ 
+         if data and len(data) > 3 and len(data[3]) > 0:
+             return data[3]  # The URL is in the fourth element of the response array
+         else:
+             return []
+ 
+     except requests.RequestException as e:
+         print(f"An error occurred: {e}")
+         return []
+ 
+ 
+ # def search_all(product, num):
+ #     similar_products = extract_similar_products(product)[num]
+ #
+ #     # results = {
+ #     #     product: [{'duckduckgo': duckduckgo_search(product)}, {'google': google_search(product)}, {'github': github_search(product)}, {'archive': archive_search(product)}]
+ #     # }
+ #
+ #     results = {}
+ #
+ #     def search_product(p):
+ #         return {
+ #             'product': p,
+ #             'duckduckgo': duckduckgo_search(p),
+ #             'google': google_search(p),
+ #             'github': github_search(p),
+ #             'archive': archive_search(p),
+ #             'wikipedia': wikipedia_search(p)
+ #         }
+ #
+ #     with concurrent.futures.ThreadPoolExecutor() as executor:
+ #         future_to_product = {executor.submit(search_product, p): p for p in similar_products}
+ #
+ #         for future in concurrent.futures.as_completed(future_to_product):
+ #             result = future.result()
+ #             product = result['product']
+ #             results[product] = [
+ #                 {'duckduckgo': result['duckduckgo']},
+ #                 {'google': result['google']},
+ #                 {'github': result['github']},
+ #                 {'archive': result['archive']},
+ #                 {'wikipedia': result['wikipedia']}
+ #             ]
+ #
+ #     return results
+ 
+ 
+ def search_images(product):
+     results = DDGS().images(f"{product}", max_results=5)
+     # print(results)
+     return [r['image'] for r in results]
+ 
+ 
+ # Similarity Check -------------------------------------->
+ 
+ def extract_similar_products(query):
+     print(f"\n--> Fetching similar items of - {query}")
+     results = DDGS().chat(f'{query} Similar Products')
+ 
+     # Pull item names out of a numbered list such as "1. Product: detail"
+     pattern = r'^\d+\.\s(.+)$'
+     matches = re.findall(pattern, results, re.MULTILINE)
+     matches = [item.split(': ')[0] for item in matches]
+     return matches
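Editor's note: a brief usage sketch for the search helpers above, not part of the commit. Results depend on live DuckDuckGo, Google Custom Search, and Wikipedia responses, so the printed output is only indicative.

# Illustrative only: network access and the configured keys are assumed.
from search import search_duckduckgo, search_google, search_wikipedia, extract_similar_products

product = 'Samsung Galaxy S24'
print(extract_similar_products(product))   # product names parsed from the DDGS chat answer
print(search_duckduckgo(product)[:3])      # up to three PDF manual links
print(search_google(product)[:3])
print(search_wikipedia(product))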