That1BrainCell committed
Commit f8ed7e3 · verified · 1 Parent(s): 761fdd9

Upload 3 files

Files changed (3)
  1. embedding.py +255 -0
  2. preprocess.py +168 -0
  3. search.py +227 -0
embedding.py ADDED
@@ -0,0 +1,255 @@
+ import requests
+ import json
+ import os
+ import concurrent.futures
+ import random
+ from langchain_google_genai import ChatGoogleGenerativeAI
+ from langchain_community.document_loaders import WebBaseLoader
+ from langchain_community.document_loaders import PyPDFLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ import google.generativeai as genai
+
+
+ gemini = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBmZtXjJgp7yIAo9joNCZGSxK9PbGMcVaA',temperature = 0.1)
+ gemini1 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyABsaDjPujPCBlz4LLxcXDX_bDA9uEL7Xc',temperature = 0.1)
+ gemini2 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBCIQgt1uK7-sJH5Afg5vUZ99EWkx5gSU0',temperature = 0.1)
+ gemini3 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBot9W5Q-BKQ66NAYRUmVeloXWEbXOXTmM',temperature = 0.1)
+
+ genai.configure(api_key="AIzaSyBmZtXjJgp7yIAo9joNCZGSxK9PbGMcVaA")
+
+
+ def pdf_extractor(link):
+     text = ''
+
+     try:
+         loader = PyPDFLoader(link)
+         pages = loader.load_and_split()
+
+         for page in pages:
+             text += page.page_content
+     except:
+         pass
+
+     return [text]
+
+ def web_extractor(link):
+     text = ''
+
+     try:
+         loader = WebBaseLoader(link)
+         pages = loader.load_and_split()
+
+         for page in pages:
+             text += page.page_content
+     except:
+         pass
+
+     return [text]
+
+
+ def feature_extraction(tag, history, context):
+
+     prompt = f'''
+     You are an intelligent assistant tasked with updating product information. You have two data sources:
+     1. Tag_History: Previously gathered information about the product.
+     2. Tag_Context: New data that might contain additional details.
+
+     Your job is to read the Tag_Context and update the relevant field in the Tag_History with any new details found. The field to be updated is the {tag} FIELD.
+
+     Guidelines:
+     - Only add new details that are relevant to the {tag} FIELD.
+     - Do not add or modify any other fields in the Tag_History.
+     - Ensure your response is in coherent sentences, integrating the new details seamlessly into the existing information.
+
+     Here is the data:
+
+     Tag_Context: {str(context)}
+     Tag_History: {history}
+
+     Respond with the updated Tag_History.
+     '''
+
+     model = random.choice([gemini, gemini1])
+     result = model.invoke(prompt)
+
+     return result.content
+
+ def detailed_feature_extraction(find, context):
+
+     prompt = f'''
+     You are an intelligent assistant tasked with finding product information. You have one data source and one output format:
+     1. Context: The gathered information about the product.
+     2. Format: Details which need to be filled based on Context.
+
+     Your job is to read the Context and update the relevant field in Format using Context.
+
+     Guidelines:
+     - Only add details that are relevant to the individual FIELD.
+     - Do not add or modify any other fields in the Format.
+     - If nothing found return None.
+
+     Here is the data:
+
+     The Context is {str(context)}
+     The Format is {str(find)}
+     '''
+
+     model = random.choice([gemini, gemini1, gemini2, gemini3])
+     result = model.invoke(prompt)
+
+     return result.content
+
+ def detailed_history(history):
+
+     details = {
+         "Introduction": {
+             "Product Name": None,
+             "Overview of the product": None,
+             "Purpose of the manual": None,
+             "Audience": None,
+             "Additional Details": None
+         },
+         "Specifications": {
+             "Technical specifications": None,
+             "Performance metrics": None,
+             "Additional Details": None
+         },
+         "Product Overview": {
+             "Product features": None,
+             "Key components and parts": None,
+             "Additional Details": None
+         },
+         "Safety Information": {
+             "Safety warnings and precautions": None,
+             "Compliance and certification information": None,
+             "Additional Details": None
+         },
+         "Installation Instructions": {
+             "Unboxing and inventory checklist": None,
+             "Step-by-step installation guide": None,
+             "Required tools and materials": None,
+             "Additional Details": None
+         },
+         "Setup and Configuration": {
+             "Initial setup procedures": None,
+             "Configuration settings": None,
+             "Troubleshooting setup issues": None,
+             "Additional Details": None
+         },
+         "Operation Instructions": {
+             "How to use the product": None,
+             "Detailed instructions for different functionalities": None,
+             "User interface guide": None,
+             "Additional Details": None
+         },
+         "Maintenance and Care": {
+             "Cleaning instructions": None,
+             "Maintenance schedule": None,
+             "Replacement parts and accessories": None,
+             "Additional Details": None
+         },
+         "Troubleshooting": {
+             "Common issues and solutions": None,
+             "Error messages and their meanings": None,
+             "Support Information": None,
+             "Additional Details": None
+         },
+         "Warranty Information": {
+             "Terms and Conditions": None,
+             "Service and repair information": None,
+             "Additional Details": None
+         },
+         "Legal Information": {
+             "Copyright information": None,
+             "Trademarks and patents": None,
+             "Disclaimers": None,
+             "Additional Details": None
+         }
+     }
+
+     for key, val in history.items():
+
+         find = details[key]
+
+         details[key] = str(detailed_feature_extraction(find, val))
+
+     return details
+
+
+ def get_embeddings(link):
+
+     print(f"\nCreating Embeddings ----- {link}")
+     history = {
+         "Introduction": "",
+         "Specifications": "",
+         "Product Overview": "",
+         "Safety Information": "",
+         "Installation Instructions": "",
+         "Setup and Configuration": "",
+         "Operation Instructions": "",
+         "Maintenance and Care": "",
+         "Troubleshooting": "",
+         "Warranty Information": "",
+         "Legal Information": ""
+     }
+
+     # Extract Text -----------------------------
+     print("Extracting Text")
+     if link[-3:] == '.md' or link[8:11] == 'en.':
+         text = web_extractor(link)
+     else:
+         text = pdf_extractor(link)
+
+     # Create Chunks ----------------------------
+     print("Writing Tag Data")
+     chunks = text_splitter.create_documents(text)
+
+     for chunk in chunks:
+
+         with concurrent.futures.ThreadPoolExecutor() as executor:
+             future_to_key = {
+                 executor.submit(
+                     feature_extraction, f"Product {key}", history[key], chunk.page_content
+                 ): key for key in history
+             }
+             for future in concurrent.futures.as_completed(future_to_key):
+                 key = future_to_key[future]
+                 try:
+                     response = future.result()
+                     history[key] = response
+                 except Exception as e:
+                     print(f"Error processing {key}: {e}")
+
+     # history = detailed_history(history)
+     print("Creating Vectors")
+     print(history)
+     genai_embeddings = []
+
+     for tag in history:
+         try:
+             result = genai.embed_content(
+                 model="models/embedding-001",
+                 content=history[tag],
+                 task_type="retrieval_document")
+             genai_embeddings.append(result['embedding'])
+         except:
+             genai_embeddings.append([0]*768)
+
+
+     return history, genai_embeddings
+
+ global text_splitter
+ global data
+ global history
+
+
+ text_splitter = RecursiveCharacterTextSplitter(
+     chunk_size = 10000,
+     chunk_overlap = 100,
+     separators = ["", '', " "]
+ )
+
+
+ if __name__ == '__main__':
+     pass
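
embedding.py only defines its helpers and ends with an empty __main__ guard, so a short driver may help orientation. The following is a minimal sketch, not part of this commit; the manual URL and the output filename are hypothetical, and working Gemini/embedding API keys are assumed:

# usage sketch -- illustration only, not part of the commit
import json
from embedding import get_embeddings

if __name__ == '__main__':
    link = "https://example.com/manuals/sample_product_manual.pdf"  # hypothetical manual link

    # history: per-section text; vectors: one 768-dim embedding per section
    history, vectors = get_embeddings(link)

    with open("embeddings.json", "w") as f:  # hypothetical output path
        json.dump({"history": history, "vectors": vectors}, f)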
preprocess.py ADDED
@@ -0,0 +1,168 @@
+ import requests
+ import json
+ import random
+ import concurrent.futures
+ from concurrent.futures import ThreadPoolExecutor
+ from langchain_community.document_loaders import PyPDFLoader
+ from langdetect import detect_langs
+ import requests
+ from PyPDF2 import PdfReader
+ from io import BytesIO
+ from langchain_community.document_loaders import WebBaseLoader
+ from langchain_google_genai import ChatGoogleGenerativeAI
+ import logging
+
+ data = False
+ seen = set()
+
+ # API Urls -----
+
+ # main_url = "http://127.0.0.1:5000/search/all"
+ main_url = "http://127.0.0.1:8000/search/all"
+ # main_product = "Samsung Galaxy s23 ultra"
+
+ # Relevance Checking Models -----
+ gemini = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBmZtXjJgp7yIAo9joNCZGSxK9PbGMcVaA',temperature = 0.1)
+ gemini1 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyABsaDjPujPCBlz4LLxcXDX_bDA9uEL7Xc',temperature = 0.1)
+ gemini2 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBCIQgt1uK7-sJH5Afg5vUZ99EWkx5gSU0',temperature = 0.1)
+ gemini3 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBot9W5Q-BKQ66NAYRUmVeloXWEbXOXTmM',temperature = 0.1)
+
+
+ API_URL = "https://api-inference.huggingface.co/models/google/flan-t5-xxl"
+ headers = {"Authorization": "Bearer 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"}
+
+ # Error Debug
+ logging.basicConfig(level=logging.INFO)
+
+
+ def get_links(main_product, api_key):
+     params = {
+         "API_KEY": f"{api_key}",
+         "product": f"{main_product}",
+     }
+
+     # Flask
+     response = requests.get(main_url, params=params)
+
+     # FastAPI
+     # response = requests.post(main_url, json=params)
+
+
+     if response.status_code == 200:
+         results = response.json()
+         with open('data.json', 'w') as f:
+             json.dump(results, f)
+     else:
+         print(f"Failed to fetch results: {response.status_code}")
+
+
+
+ def language_preprocess(text):
+     try:
+         if detect_langs(text)[0].lang == 'en':
+             return True
+         return False
+     except:
+         return False
+
+
+ def relevant(product, similar_product, content):
+
+     try:
+         payload = { "inputs": f'''Do you think that the given content is similar to {similar_product} and {product}, just Respond True or False \nContent for similar product: {content}'''}
+
+         # response = requests.post(API_URL, headers=headers, json=payload)
+         # output = response.json()
+         # return bool(output[0]['generated_text'])
+
+         model = random.choice([gemini, gemini1, gemini2, gemini3])
+         result = model.invoke(f'''Do you think that the given content is similar to {similar_product} and {product}, just Respond True or False \nContent for similar product: {content}''')
+         return 'true' in result.content.lower()  # parse the model's True/False reply
+
+     except:
+         return False
+
+
+
+ def download_pdf(url, timeout=10):
+     try:
+         response = requests.get(url, timeout=timeout)
+         response.raise_for_status()
+         return BytesIO(response.content)
+
+     except requests.RequestException as e:
+         logging.error(f"PDF download error: {e}")
+         return None
+
+ def extract_text_from_pdf(pdf_file, pages):
+     reader = PdfReader(pdf_file)
+     extracted_text = ""
+
+     l = len(reader.pages)
+
+     try:
+         for page_num in pages:
+             if page_num < l:
+                 page = reader.pages[page_num]
+                 extracted_text += page.extract_text() + "\n"
+             else:
+                 print(f"Page {page_num} does not exist in the document.")
+
+         return extracted_text
+
+     except:
+         return 'हे चालत नाही'  # Marathi for "this does not work"; non-English, so language_preprocess rejects the link
+
+ def extract_text_online(link):
+
+     loader = WebBaseLoader(link)
+     pages = loader.load_and_split()
+
+     text = ''
+
+     for page in pages[:3]:
+         text += page.page_content
+
+     return text
+
+
+ def process_link(link, main_product, similar_product):
+     if link in seen:
+         return None
+     seen.add(link)
+     try:
+         if link[-3:] == '.md' or link[8:11] == 'en.':
+             text = extract_text_online(link)
+         else:
+             pdf_file = download_pdf(link)
+             text = extract_text_from_pdf(pdf_file, [0, 2, 4])
+
+         if language_preprocess(text):
+             if relevant(main_product, similar_product, text):
+                 print("Accepted", link)
+                 return link
+     except:
+         pass
+     print("NOT Accepted", link)
+     return None
+
+ def filtering(urls, main_product, similar_product):
+     res = []
+
+     print(f"Filtering Links of ---- {similar_product}")
+     # Main Preprocess ------------------------------
+     with ThreadPoolExecutor() as executor:
+         futures = {executor.submit(process_link, link, main_product, similar_product): link for link in urls}
+         for future in concurrent.futures.as_completed(futures):
+             result = future.result()
+             if result is not None:
+                 res.append(result)
+
+     return res
+
+
+ # Main Functions -------------------------------------------------->
+
+ # get_links()
+ # preprocess()
+
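
preprocess.py exposes get_links and filtering but only references them in the trailing comments. A minimal sketch of driving filtering follows; it is not part of the commit, and the candidate URLs and product names are hypothetical examples:

# usage sketch -- illustration only, not part of the commit
from preprocess import filtering

if __name__ == '__main__':
    urls = [
        "https://example.com/manuals/galaxy_s23_ultra.pdf",       # hypothetical candidates
        "https://en.example.org/docs/galaxy_s23_ultra_guide.md",
    ]
    accepted = filtering(urls, "Samsung Galaxy S23 Ultra", "Samsung Galaxy S22 Ultra")
    print(accepted)  # the subset of urls that downloaded, parsed as English, and were judged relevant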
search.py ADDED
@@ -0,0 +1,227 @@
+ # Library Imports
+ import requests
+ from bs4 import BeautifulSoup
+ from googlesearch import search
+ from duckduckgo_search import DDGS
+ import concurrent.futures
+ import re
+
+
+
+ # Search Functions -------------------------------------------------------------->
+
+ # Function to search DuckDuckGo
+ def search_duckduckgo(query):
+     print("Fetching Duckduckgo Links -----")
+     try:
+         results = DDGS().text(f"{query} manual filetype:pdf", max_results=5)
+         return [res['href'] for res in results]
+     except:
+         return []
+
+ # Function to search Google
+ def search_google(query):
+     print("Fetching Google Links -----")
+
+     links = []
+     try:
+         api_key = 'AIzaSyDV_uJwrgNtawqtl6GDfeUj6NqO-H1tA4c'
+         search_engine_id = 'c4ca951b9fc6949cb'
+
+         url = "https://www.googleapis.com/customsearch/v1"
+         params = {
+             "key": api_key,
+             "cx": search_engine_id,
+             "q": query + " manual filetype:pdf"
+         }
+
+         response = requests.get(url, params=params)
+         results = response.json()
+
+         for item in results.get('items', []):
+             links.append(item['link'])
+     except:
+         pass
+
+     try:
+         extension = "ext:pdf"
+         for result in search(query + " manual " + extension, num_results=5):
+             if result.endswith('.pdf'):
+                 links.append(result)
+     except:
+         pass
+
+     return links
+
+ # Function to search Internet Archive
+ def search_archive(query):
+     print("Fetching Archive Links -----")
+
+     try:
+         url = "https://archive.org/advancedsearch.php"
+         params = {
+             'q': f'{query} manual',
+             'fl[]': ['identifier', 'title', 'format'],
+             'rows': 50,
+             'page': 1,
+             'output': 'json'
+         }
+
+         # Make the request
+         response = requests.get(url, params=params)
+         data = response.json()
+
+         # Function to extract hyperlinks from a webpage
+         def extract_hyperlinks(url):
+             # Send a GET request to the URL
+             response = requests.get(url)
+
+             # Check if the request was successful
+             if response.status_code == 200:
+                 # Parse the HTML content of the page
+                 soup = BeautifulSoup(response.text, 'html.parser')
+
+                 # Find all <a> tags (hyperlinks)
+                 for link in soup.find_all('a', href=True):
+                     href = link['href']
+                     if href.endswith('.pdf'):
+                         pdf_files.append(url + '/' + href)
+                     if href.endswith('.iso'):
+                         # If the link ends with .iso, follow the link and extract .pdf hyperlinks
+                         extract_pdf_from_iso(url + '/' + href + '/')
+
+         # Function to extract .pdf hyperlinks from an .iso file
+         def extract_pdf_from_iso(iso_url):
+             # Send a GET request to the ISO URL
+             iso_response = requests.get(iso_url)
+
+             # Check if the request was successful
+             if iso_response.status_code == 200:
+                 # Parse the HTML content of the ISO page
+                 iso_soup = BeautifulSoup(iso_response.text, 'html.parser')
+
+                 # Find all <a> tags (hyperlinks) in the ISO page
+                 for link in iso_soup.find_all('a', href=True):
+                     href = link['href']
+                     if href.endswith('.pdf'):
+                         pdf_files.append('https:' + href)
+
+         pdf_files = []
+
+         def process_doc(doc):
+             identifier = doc.get('identifier', 'N/A')
+             # title = doc.get('title', 'N/A')
+             # format = doc.get('format', 'N/A')
+             pdf_link = f"https://archive.org/download/{identifier}"
+             extract_hyperlinks(pdf_link)
+
+         with concurrent.futures.ThreadPoolExecutor() as executor:
+             futures = [executor.submit(process_doc, doc) for doc in data['response']['docs']]
+
+             # Optionally, wait for all futures to complete and handle any exceptions
+             for future in concurrent.futures.as_completed(futures):
+                 try:
+                     future.result()  # This will raise an exception if the function call raised
+                 except Exception as exc:
+                     print(f'Generated an exception: {exc}')
+
+
+         return pdf_files
+
+     except:
+         return []
+
+ def search_github(query):
+     print("Fetching Github Links -----")
+
+     try:
+         # GitHub Search API endpoint
+         url = f"https://api.github.com/search/code?q={query}+extension:md"
+
+         headers = {
+             'Authorization': 'Token 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'
+         }
+
+         # Make the request
+         response = requests.get(url, headers=headers)
+         data = response.json()
+         links = [item['html_url'] for item in data['items']]
+
+         return links
+
+     except:
+         return []
+
+ def search_wikipedia(product):
+     print("Fetching Wikipedia Links -----")
+
+     api_url = "https://en.wikipedia.org/w/api.php"
+     params = {
+         "action": "opensearch",
+         "search": product,
+         "limit": 5,
+         "namespace": 0,
+         "format": "json"
+     }
+
+     try:
+         response = requests.get(api_url, params=params)
+         response.raise_for_status()  # Raise an HTTPError for bad responses (4xx and 5xx)
+         data = response.json()
+
+         if data and len(data) > 3 and len(data[3]) > 0:
+             return data[3]  # The URL is in the fourth element of the response array
+         else:
+             return []
+
+     except requests.RequestException as e:
+         print(f"An error occurred: {e}")
+         return []
+
+ # def search_all(product,num):
+
+ #     similar_products = extract_similar_products(product)[num]
+
+ #     # results = {
+ #     #     product : [{'duckduckgo': duckduckgo_search(product)},{'google': google_search(product)},{'github': github_search(product)},{'archive': archive_search(product)}]
+ #     # }
+
+ #     results = {}
+
+ #     def search_product(p):
+ #         return {
+ #             'product': p,
+ #             'duckduckgo': duckduckgo_search(p),
+ #             'google': google_search(p),
+ #             'github': github_search(p),
+ #             'archive': archive_search(p),
+ #             'wikipedia': wikipedia_search(p)
+ #         }
+
+ #     with concurrent.futures.ThreadPoolExecutor() as executor:
+ #         future_to_product = {executor.submit(search_product, p): p for p in similar_products}
+
+ #         for future in concurrent.futures.as_completed(future_to_product):
+ #             result = future.result()
+ #             product = result['product']
+ #             results[product] = [
+ #                 {'duckduckgo': result['duckduckgo']},
+ #                 {'google': result['google']},
+ #                 {'github': result['github']},
+ #                 {'archive': result['archive']},
+ #                 {'wikipedia': result['wikipedia']}
+ #             ]
+
+ #     return results
+
+ # Similarity Check -------------------------------------->
+
+ def extract_similar_products(query):
+     print(f"\nFetching similar items of -----> {query}")
+     results = DDGS().chat(f'{query} Similar Products')
+
+     pattern = r'^\d+\.\s(.+)$'
+     matches = re.findall(pattern, results, re.MULTILINE)
+     matches = [item.split(': ')[0] for item in matches]
+     return matches
+
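
search.py likewise defines one function per source without a driver (search_all remains commented out). A sketch of combining the sources follows; it is not part of the commit, the product name is an example, and each call simply returns an empty list on failure:

# usage sketch -- illustration only, not part of the commit
from search import (search_duckduckgo, search_google, search_archive,
                    search_github, search_wikipedia, extract_similar_products)

if __name__ == '__main__':
    product = "Samsung Galaxy S23 Ultra"  # example query

    # Candidate manual links from every source for the product itself
    links = (search_duckduckgo(product) + search_google(product) +
             search_archive(product) + search_github(product) +
             search_wikipedia(product))
    print(f"{len(links)} candidate links")

    # Similar products suggested via DuckDuckGo chat, which could be searched the same way
    print(extract_similar_products(product))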