Prathmesh48 committed on
Commit
5c5c323
1 Parent(s): 9ba9756

Update embedding.py

Browse files
Files changed (1) hide show
  1. embedding.py +424 -424
embedding.py CHANGED
@@ -1,425 +1,425 @@
1
- from PyPDF2 import PdfReader
2
- import requests
3
- import json
4
- import os
5
- import concurrent.futures
6
- import random
7
- from langchain_google_genai import ChatGoogleGenerativeAI
8
- from langchain_community.document_loaders import WebBaseLoader
9
- from langchain_community.document_loaders import PyPDFLoader
10
- from langchain.text_splitter import RecursiveCharacterTextSplitter
11
- import google.generativeai as genai
12
- from langchain_core.messages import HumanMessage
13
- from io import BytesIO
14
- import numpy as np
15
- import re
16
- import torch
17
- from transformers import AutoTokenizer, AutoModel
18
- import numpy as np
19
- import onnxruntime as ort
20
- # import torch._dynamo
21
- import time
22
- # torch._dynamo.config.suppress_errors = True
23
-
24
- from search import search_images
25
-
26
- gemini = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyCo-TeDp0Ou--UwhlTgMwCoTEZxg6-v7wA',temperature = 0.1)
27
- gemini1 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyAtnUk8QKSUoJd3uOBpmeBNN-t8WXBt0zI',temperature = 0.1)
28
- gemini2 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBzbZQBffHFK3N-gWnhDDNbQ9yZnZtaS2E',temperature = 0.1)
29
- gemini3 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBNN4VDMAOB2gSZha6HjsTuH71PVV69FLM',temperature = 0.1)
30
-
31
- vision = ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key='AIzaSyCo-TeDp0Ou--UwhlTgMwCoTEZxg6-v7wA',temperature = 0.1)
32
- vision1 = ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key='AIzaSyAtnUk8QKSUoJd3uOBpmeBNN-t8WXBt0zI',temperature = 0.1)
33
- vision2 = ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key='AIzaSyBzbZQBffHFK3N-gWnhDDNbQ9yZnZtaS2E',temperature = 0.1)
34
- vision3 = ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key='AIzaSyBNN4VDMAOB2gSZha6HjsTuH71PVV69FLM',temperature = 0.1)
35
-
36
- tokenizer = AutoTokenizer.from_pretrained('dwzhu/e5-base-4k',trust_remote_code = True)
37
- # model = AutoModel.from_pretrained('dwzhu/e5-base-4k',trust_remote_code = True)
38
- model_path = "model_opt2_QInt8.onnx"
39
-
40
- session = ort.InferenceSession(model_path)
41
- # model = torch.compile(model)
42
- # model.to('cpu') # Ensure the model is on the CPU
43
-
44
- from transformers import PreTrainedTokenizerFast
45
-
46
- class TokenBasedTextSplitter:
47
- def __init__(self, tokenizer_path='tokenizer.json', chunk_size=2000, chunk_overlap=50):
48
- self.tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path)
49
- self.chunk_size = chunk_size
50
- self.chunk_overlap = chunk_overlap
51
-
52
- def split_text(self, text):
53
- tokens = self.tokenizer.tokenize(text)
54
- chunks = []
55
-
56
- for i in range(0, len(tokens), self.chunk_size - self.chunk_overlap):
57
- chunk = tokens[i:i + self.chunk_size]
58
- chunks.append(self.tokenizer.convert_tokens_to_string(chunk))
59
-
60
- return chunks
61
-
62
-
63
-
64
- genai.configure(api_key="AIzaSyAtnUk8QKSUoJd3uOBpmeBNN-t8WXBt0zI")
65
-
66
- def pdf_extractor(link):
67
- text = ''
68
-
69
- try:
70
- # Fetch the PDF file from the URL
71
- response = requests.get(link)
72
- response.raise_for_status() # Raise an error for bad status codes
73
-
74
- # Use BytesIO to handle the PDF content in memory
75
- pdf_file = BytesIO(response.content)
76
-
77
- # Load the PDF file
78
- reader = PdfReader(pdf_file)
79
- for page in reader.pages:
80
- text += page.extract_text() # Extract text from each page
81
-
82
- except requests.exceptions.HTTPError as e:
83
- print(f'HTTP error occurred: {e}')
84
- except Exception as e:
85
- print(f'An error occurred: {e}')
86
-
87
- return text
88
-
89
- def web_extractor(link):
90
- text = ''
91
-
92
- try:
93
- loader = WebBaseLoader(link)
94
- pages = loader.load_and_split()
95
-
96
- for page in pages:
97
- text+=page.page_content
98
- except:
99
- pass
100
-
101
- return text
102
-
103
- def imporve_text(text):
104
-
105
- prompt = f'''
106
- Please rewrite the following text to make it short, descriptive, concise, and of high quality.
107
- Ensure that all essential information is retained.
108
- Focus on improving clarity, coherence, and word choice without altering the original meaning.
109
-
110
- text = {text}
111
- '''
112
-
113
- model = random.choice([gemini,gemini1,gemini2,gemini3])
114
- result = model.invoke(prompt)
115
-
116
- return result.content
117
-
118
- def feature_extraction(tag, history , context):
119
-
120
- prompt = f'''
121
- You are an intelligent assistant tasked with updating product information. You have two data sources:
122
- 1. Tag_History: Previously gathered information about the product.
123
- 2. Tag_Context: New data that might contain additional details.
124
- Your job is to read the Tag_Context and update the relevant field in the Tag_History with any new details found. The field to be updated is the {tag} FIELD.
125
- Guidelines:
126
- - Only add new details that are relevant to the {tag} FIELD.
127
- - Do not add or modify any other fields in the Tag_History.
128
- - Ensure your response is in coherent sentences, integrating the new details seamlessly into the existing information.
129
- Here is the data:
130
- Tag_Context: {str(context)}
131
- Tag_History: {history}
132
- Respond with the updated Tag_History.
133
- '''
134
-
135
- model = random.choice([gemini,gemini1,gemini2,gemini3])
136
- result = model.invoke(prompt)
137
-
138
- return result.content
139
-
140
- def feature_extraction_image(url):
141
- text = ' '
142
- model = genai.GenerativeModel('gemini-1.5-flash-001')
143
- try:
144
- res = model.generate_content(['Describe this image to me',url])
145
- text = res.text
146
-
147
- except:
148
- pass
149
- return text
150
-
151
- def detailed_feature_extraction(find, context):
152
-
153
- prompt = f'''
154
- You are an intelligent assistant tasked with finding product information. You have one data source and one output format:
155
- 1. Context: The gathered information about the product.
156
- 2. Format: Details which need to be filled based on Context.
157
- Your job is to read the Context and update the relevant field in Format using Context.
158
- Guidelines:
159
- - Only add details that are relevant to the individual FIELD.
160
- - Do not add or modify any other fields in the Format.
161
- - If nothing found return None.
162
- Here is the data:
163
- The Context is {str(context)}
164
- The Format is {str(find)}
165
- '''
166
-
167
- model = random.choice([gemini,gemini1,gemini2,gemini3])
168
- result = model.invoke(prompt)
169
-
170
- return result.content
171
-
172
- def detailed_history(history):
173
-
174
- details = {
175
- "Introduction": {
176
- "Product Name": None,
177
- "Overview of the product": None,
178
- "Purpose of the manual": None,
179
- "Audience": None,
180
- "Additional Details": None
181
- },
182
- "Specifications": {
183
- "Technical specifications": None,
184
- "Performance metrics": None,
185
- "Additional Details": None
186
- },
187
- "Product Overview": {
188
- "Product features": None,
189
- "Key components and parts": None,
190
- "Additional Details": None
191
- },
192
- "Safety Information": {
193
- "Safety warnings and precautions": None,
194
- "Compliance and certification information": None,
195
- "Additional Details": None
196
- },
197
- "Installation Instructions": {
198
- "Unboxing and inventory checklist": None,
199
- "Step-by-step installation guide": None,
200
- "Required tools and materials": None,
201
- "Additional Details": None
202
- },
203
- "Setup and Configuration": {
204
- "Initial setup procedures": None,
205
- "Configuration settings": None,
206
- "Troubleshooting setup issues": None,
207
- "Additional Details": None
208
- },
209
- "Operation Instructions": {
210
- "How to use the product": None,
211
- "Detailed instructions for different functionalities": None,
212
- "User interface guide": None,
213
- "Additional Details": None
214
- },
215
- "Maintenance and Care": {
216
- "Cleaning instructions": None,
217
- "Maintenance schedule": None,
218
- "Replacement parts and accessories": None,
219
- "Additional Details": None
220
- },
221
- "Troubleshooting": {
222
- "Common issues and solutions": None,
223
- "Error messages and their meanings": None,
224
- "Support Information": None,
225
- "Additional Details": None
226
- },
227
- "Warranty Information": {
228
- "Terms and Conditions": None,
229
- "Service and repair information": None,
230
- "Additional Details": None
231
- },
232
- "Legal Information": {
233
- "Copyright information": None,
234
- "Trademarks and patents": None,
235
- "Disclaimers": None,
236
- "Additional Details": None
237
-
238
- }
239
- }
240
-
241
- for key,val in history.items():
242
-
243
- find = details[key]
244
-
245
- details[key] = str(detailed_feature_extraction(find,val))
246
-
247
- return details
248
-
249
-
250
- def get_embeddings(link,tag_option):
251
-
252
- print(f"\n--> Creating Embeddings - {link}")
253
-
254
- if tag_option=='Complete Document Similarity':
255
- history = { "Details": "" }
256
-
257
- else:
258
- history = {
259
- "Introduction": "",
260
- "Specifications": "",
261
- "Product Overview": "",
262
- "Safety Information": "",
263
- "Installation Instructions": "",
264
- "Setup and Configuration": "",
265
- "Operation Instructions": "",
266
- "Maintenance and Care": "",
267
- "Troubleshooting": "",
268
- "Warranty Information": "",
269
- "Legal Information": ""
270
- }
271
-
272
- # Extract Text -----------------------------
273
- print("Extracting Text")
274
- if link[-3:] == '.md' or link[8:11] == 'en.':
275
- text = web_extractor(link)
276
- else:
277
- text = pdf_extractor(link)
278
-
279
- # Create Chunks ----------------------------
280
- print("Writing Tag Data")
281
-
282
-
283
- if tag_option=="Complete Document Similarity":
284
- history["Details"] = feature_extraction("Details", history["Details"], text[0][:50000])
285
-
286
- else:
287
- chunks = text_splitter.create_documents(text)
288
-
289
- for chunk in chunks:
290
-
291
- with concurrent.futures.ThreadPoolExecutor() as executor:
292
- future_to_key = {
293
- executor.submit(
294
- feature_extraction, f"Product {key}", history[key], chunk.page_content
295
- ): key for key in history
296
- }
297
- for future in concurrent.futures.as_completed(future_to_key):
298
- key = future_to_key[future]
299
- try:
300
- response = future.result()
301
- history[key] = response
302
- except Exception as e:
303
- print(f"Error processing {key}: {e}")
304
-
305
- print("Creating Vectors")
306
- genai_embeddings=[]
307
-
308
- for tag in history:
309
- result = genai.embed_content(
310
- model="models/embedding-001",
311
- content=history[tag],
312
- task_type="retrieval_document")
313
- genai_embeddings.append(result['embedding'])
314
-
315
-
316
- return history,genai_embeddings
317
-
318
- def get_embed_chroma(link):
319
-
320
- print(f"\n--> Creating Embeddings - {link}")
321
-
322
- # Extract Text -----------------------------
323
- if link[-3:] == '.md' or link[8:11] == 'en.':
324
- text = web_extractor(link)
325
- else:
326
- text = pdf_extractor(link)
327
- print("\u2713 Extracting Text")
328
-
329
- # Create Chunks ----------------------------
330
-
331
- text = re.sub(r'\.{2,}', '.', text)
332
- text = re.sub(r'\s{2,}', ' ', text)
333
- text = re.sub(r'\d{7,}', '', text)
334
-
335
- text = re.sub(r'\n{2,}', '\n', text)
336
-
337
-
338
- chunks = text_splitter_small.split_text(text)
339
- # print(chunks[:2])
340
- print("\u2713 Writing Tag Data")
341
-
342
- # Creating Vector
343
- embedding_vectors=[]
344
- # textual_data = []
345
- print("\u2713 Creating Vectors")
346
-
347
-
348
- # batch_size = 1
349
- # # Process chunks in batches
350
- # for i in range(0, len(chunks), batch_size):
351
- # batch = chunks[i:i + batch_size]
352
-
353
- # # texts = [text for text in batch]
354
- # # print(texts)
355
-
356
- t1 = time.time()
357
- for chunk in chunks:
358
- # Tokenize the input text
359
- inputs = tokenizer(chunk, return_tensors="np", padding=True, truncation=True)
360
-
361
- # Convert inputs to int64
362
- input_ids = inputs['input_ids'].astype(np.int64)
363
- attention_mask = inputs['attention_mask'].astype(np.int64)
364
- token_type_ids = inputs.get('token_type_ids', np.zeros_like(input_ids)).astype(np.int64) # Some models might not use token_type_ids
365
-
366
- # Create the input feed dictionary
367
- input_feed = {
368
- 'input_ids': input_ids,
369
- 'attention_mask': attention_mask,
370
- 'token_type_ids': token_type_ids
371
- }
372
-
373
- # Get the model's outputs
374
- outputs = session.run(None, input_feed)
375
-
376
- # Convert the outputs to numpy and process as needed
377
- last_hidden_state = np.array(outputs[0])
378
- embeddings = last_hidden_state.mean(axis=1).tolist()
379
- embedding_vectors.append(embeddings)
380
- # textual_data.a(text)
381
-
382
- t2 = time.time()
383
- print(t2-t1)
384
- return chunks , embedding_vectors
385
-
386
-
387
- def get_image_embeddings(Product):
388
- image_embeddings = []
389
-
390
- links = search_images(Product)
391
- with concurrent.futures.ThreadPoolExecutor() as executor:
392
- descriptions = list(executor.map(feature_extraction_image, links))
393
-
394
- for description in descriptions:
395
- result = genai.embed_content(
396
- model="models/embedding-001",
397
- content=description,
398
- task_type="retrieval_document")
399
-
400
- image_embeddings.append(result['embedding'])
401
- # print(image_embeddings)
402
- return image_embeddings , links
403
-
404
- global text_splitter
405
- global data
406
- global history
407
-
408
- text_splitter = RecursiveCharacterTextSplitter(
409
- chunk_size = 10000,
410
- chunk_overlap = 100,
411
- separators = ["",''," "]
412
- )
413
-
414
- # text_splitter_small = RecursiveCharacterTextSplitter(
415
- # chunk_size = 2000,
416
- # chunk_overlap = 100,
417
- # separators = ["",''," "]
418
- # )
419
-
420
- text_splitter_small = TokenBasedTextSplitter(chunk_size=500, chunk_overlap=50)
421
- # chunks = splitter.split_text(text)
422
-
423
- if __name__ == '__main__':
424
- print(get_embed_chroma('https://www.galaxys24manual.com/wp-content/uploads/pdf/galaxy-s24-manual-SAM-S921-S926-S928-OS14-011824-FINAL-US-English.pdf'))
425
  # print(get_image_embeddings(Product='Samsung Galaxy S24'))
 
1
+ from PyPDF2 import PdfReader
2
+ import requests
3
+ import json
4
+ import os
5
+ import concurrent.futures
6
+ import random
7
+ from langchain_google_genai import ChatGoogleGenerativeAI
8
+ from langchain_community.document_loaders import WebBaseLoader
9
+ from langchain_community.document_loaders import PyPDFLoader
10
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
11
+ import google.generativeai as genai
12
+ from langchain_core.messages import HumanMessage
13
+ from io import BytesIO
14
+ import numpy as np
15
+ import re
16
+ import torch
17
+ from transformers import AutoTokenizer, AutoModel
18
+ import numpy as np
19
+ import onnxruntime as ort
20
+ # import torch._dynamo
21
+ import time
22
+ # torch._dynamo.config.suppress_errors = True
23
+
24
+ from search import search_images
25
+
26
+ gemini = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyCo-TeDp0Ou--UwhlTgMwCoTEZxg6-v7wA',temperature = 0.1)
27
+ gemini1 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyAtnUk8QKSUoJd3uOBpmeBNN-t8WXBt0zI',temperature = 0.1)
28
+ gemini2 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBzbZQBffHFK3N-gWnhDDNbQ9yZnZtaS2E',temperature = 0.1)
29
+ gemini3 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBNN4VDMAOB2gSZha6HjsTuH71PVV69FLM',temperature = 0.1)
30
+
31
+ vision = ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key='AIzaSyCo-TeDp0Ou--UwhlTgMwCoTEZxg6-v7wA',temperature = 0.1)
32
+ vision1 = ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key='AIzaSyAtnUk8QKSUoJd3uOBpmeBNN-t8WXBt0zI',temperature = 0.1)
33
+ vision2 = ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key='AIzaSyBzbZQBffHFK3N-gWnhDDNbQ9yZnZtaS2E',temperature = 0.1)
34
+ vision3 = ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key='AIzaSyBNN4VDMAOB2gSZha6HjsTuH71PVV69FLM',temperature = 0.1)
35
+
36
+ tokenizer = AutoTokenizer.from_pretrained('dwzhu/e5-base-4k',trust_remote_code = True)
37
+ # model = AutoModel.from_pretrained('dwzhu/e5-base-4k',trust_remote_code = True)
38
+ model_path = "model_opt2_QInt8.onnx"
39
+
40
+ session = ort.InferenceSession(model_path)
41
+ # model = torch.compile(model)
42
+ # model.to('cpu') # Ensure the model is on the CPU
43
+
44
+ from transformers import PreTrainedTokenizerFast
45
+
46
class TokenBasedTextSplitter:
    """Split text into overlapping windows measured in tokenizer tokens.

    Consecutive windows share ``chunk_overlap`` tokens, so the window start
    advances by ``chunk_size - chunk_overlap`` tokens each step.
    """

    def __init__(self, tokenizer_path='tokenizer.json', chunk_size=2000, chunk_overlap=50):
        """Load the tokenizer file and store the window geometry.

        Args:
            tokenizer_path: Path handed to ``PreTrainedTokenizerFast`` as
                ``tokenizer_file``.
            chunk_size: Maximum number of tokens per chunk.
            chunk_overlap: Number of tokens shared by consecutive chunks.

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``chunk_overlap``
                is not in ``[0, chunk_size)``.
        """
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        if not 0 <= chunk_overlap < chunk_size:
            raise ValueError(
                f"chunk_overlap must be in [0, chunk_size), got {chunk_overlap}"
            )
        self.tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path)
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def split_text(self, text):
        """Return the list of chunk strings for ``text``.

        Returns an empty list when the tokenizer produces no tokens.

        Raises:
            ValueError: If the configured overlap leaves no forward progress.
        """
        step = self.chunk_size - self.chunk_overlap
        if step <= 0:
            # A zero step makes range() fail with an opaque message and a
            # negative one silently yields no chunks at all.
            raise ValueError("chunk_overlap must be smaller than chunk_size")

        tokens = self.tokenizer.tokenize(text)
        chunks = []
        for start in range(0, len(tokens), step):
            window = tokens[start:start + self.chunk_size]
            chunks.append(self.tokenizer.convert_tokens_to_string(window))
            # Stop once a window reaches the end; a further window would hold
            # only overlap tokens that this one already contains.
            if start + self.chunk_size >= len(tokens):
                break
        return chunks
61
+
62
+
63
+
64
+ genai.configure(api_key="AIzaSyAtnUk8QKSUoJd3uOBpmeBNN-t8WXBt0zI")
65
+
66
def pdf_extractor(link):
    """Download the PDF at ``link`` and return the text of all its pages.

    Best effort: failures are printed, not raised, and whatever text was
    extracted before the failure is still returned (possibly ``''``).

    Args:
        link: URL of the PDF document.

    Returns:
        The concatenated page text as a single string.
    """
    page_texts = []

    try:
        # Fetch the PDF; the timeout keeps a stalled server from hanging the
        # caller indefinitely.
        response = requests.get(link, timeout=60)
        response.raise_for_status()  # Raise an error for bad status codes

        # Parse the downloaded bytes in memory
        reader = PdfReader(BytesIO(response.content))
        for page in reader.pages:
            # Guard against a page yielding no text (None) so one such page
            # does not abort extraction of the remaining pages.
            page_texts.append(page.extract_text() or '')

    except requests.exceptions.HTTPError as e:
        print(f'HTTP error occurred: {e}')
    except Exception as e:
        print(f'An error occurred: {e}')

    return ''.join(page_texts)
88
+
89
def web_extractor(link):
    """Load the web page at ``link`` and return its text content.

    Best effort: a failure is printed, not raised, and the text gathered
    before the failure is returned (possibly ``''``).

    Args:
        link: URL of the page to load with ``WebBaseLoader``.

    Returns:
        The concatenated ``page_content`` of every loaded document.
    """
    parts = []

    try:
        for page in WebBaseLoader(link).load_and_split():
            parts.append(page.page_content)
    except Exception as e:
        # Report the failure instead of hiding it; a bare ``except`` would
        # also have swallowed KeyboardInterrupt / SystemExit.
        print(f'An error occurred: {e}')

    return ''.join(parts)
102
+
103
def imporve_text(text):
    """Ask one of the Gemini clients to rewrite ``text`` more concisely.

    Args:
        text: The text to be rewritten; it is interpolated into the prompt.

    Returns:
        The ``content`` of the model response.
    """
    instructions = f'''
Please rewrite the following text to make it short, descriptive, concise, and of high quality.
Ensure that all essential information is retained.
Focus on improving clarity, coherence, and word choice without altering the original meaning.

text = {text}
'''

    # One of the four module-level clients is picked at random per call.
    client_pool = [gemini, gemini1, gemini2, gemini3]
    reply = random.choice(client_pool).invoke(instructions)
    return reply.content
117
+
118
def feature_extraction(tag, history , context):
    """Merge details from ``context`` into the running summary for ``tag``.

    Args:
        tag: Name of the field being updated; interpolated into the prompt.
        history: The text gathered so far for that field.
        context: New material to mine for details (stringified with ``str``).

    Returns:
        The ``content`` of the model response, i.e. the updated history text.
    """
    context_text = str(context)

    request = f'''
You are an intelligent assistant tasked with updating product information. You have two data sources:
1. Tag_History: Previously gathered information about the product.
2. Tag_Context: New data that might contain additional details.
Your job is to read the Tag_Context and update the relevant field in the Tag_History with any new details found. The field to be updated is the {tag} FIELD.
Guidelines:
- Only add new details that are relevant to the {tag} FIELD.
- Do not add or modify any other fields in the Tag_History.
- Ensure your response is in coherent sentences, integrating the new details seamlessly into the existing information.
Here is the data:
Tag_Context: {context_text}
Tag_History: {history}
Respond with the updated Tag_History.
'''

    # One of the four module-level clients is picked at random per call.
    client_pool = [gemini, gemini1, gemini2, gemini3]
    reply = random.choice(client_pool).invoke(request)
    return reply.content
139
+
140
def feature_extraction_image(url):
    """Ask the Gemini flash model to describe the image referenced by ``url``.

    Best effort: on any failure the error is printed and a single space is
    returned, so callers always receive a non-empty string.

    Args:
        url: Image reference forwarded to ``generate_content``.

    Returns:
        The model's description, or ``' '`` when the request fails.
    """
    text = ' '
    model = genai.GenerativeModel('gemini-1.5-flash-001')
    try:
        # NOTE(review): ``url`` is forwarded as a plain string, so the model
        # presumably sees the URL text rather than the image bytes - confirm.
        res = model.generate_content(['Describe this image to me', url])
        text = res.text
    except Exception as e:
        # Keep the placeholder result but surface the reason; a bare
        # ``except`` would also have swallowed KeyboardInterrupt.
        print(f'An error occurred: {e}')
    return text
150
+
151
def detailed_feature_extraction(find, context):
    """Ask a Gemini client to fill the ``find`` template from ``context``.

    Args:
        find: The format/template whose fields should be filled (stringified).
        context: The gathered product information (stringified).

    Returns:
        The ``content`` of the model response.
    """
    context_text = str(context)
    format_text = str(find)

    request = f'''
You are an intelligent assistant tasked with finding product information. You have one data source and one output format:
1. Context: The gathered information about the product.
2. Format: Details which need to be filled based on Context.
Your job is to read the Context and update the relevant field in Format using Context.
Guidelines:
- Only add details that are relevant to the individual FIELD.
- Do not add or modify any other fields in the Format.
- If nothing found return None.
Here is the data:
The Context is {context_text}
The Format is {format_text}
'''

    # One of the four module-level clients is picked at random per call.
    client_pool = [gemini, gemini1, gemini2, gemini3]
    reply = random.choice(client_pool).invoke(request)
    return reply.content
171
+
172
def detailed_history(history):
    """Expand per-section summaries into a detailed, field-level breakdown.

    A template of manual sections is built, each mapping its field names to
    ``None``. For every ``(section, summary)`` pair in ``history`` the
    section's template is replaced by the string returned from
    ``detailed_feature_extraction(template, summary)``.

    Args:
        history: Mapping of section name to the summary text for it.

    Returns:
        A dict keyed by section name. Sections present in ``history`` hold
        the extracted string; the others keep their untouched field template.
        Sections in ``history`` that the template does not know (for example
        ``"Details"``) are extracted against a generic one-field template
        instead of raising ``KeyError``.
    """
    extra = "Additional Details"

    # Field names per section; every section also gets the ``extra`` field.
    section_fields = {
        "Introduction": (
            "Product Name",
            "Overview of the product",
            "Purpose of the manual",
            "Audience",
        ),
        "Specifications": (
            "Technical specifications",
            "Performance metrics",
        ),
        "Product Overview": (
            "Product features",
            "Key components and parts",
        ),
        "Safety Information": (
            "Safety warnings and precautions",
            "Compliance and certification information",
        ),
        "Installation Instructions": (
            "Unboxing and inventory checklist",
            "Step-by-step installation guide",
            "Required tools and materials",
        ),
        "Setup and Configuration": (
            "Initial setup procedures",
            "Configuration settings",
            "Troubleshooting setup issues",
        ),
        "Operation Instructions": (
            "How to use the product",
            "Detailed instructions for different functionalities",
            "User interface guide",
        ),
        "Maintenance and Care": (
            "Cleaning instructions",
            "Maintenance schedule",
            "Replacement parts and accessories",
        ),
        "Troubleshooting": (
            "Common issues and solutions",
            "Error messages and their meanings",
            "Support Information",
        ),
        "Warranty Information": (
            "Terms and Conditions",
            "Service and repair information",
        ),
        "Legal Information": (
            "Copyright information",
            "Trademarks and patents",
            "Disclaimers",
        ),
    }

    # dict.fromkeys keeps field order and fills every value with None.
    details = {
        section: dict.fromkeys((*fields, extra))
        for section, fields in section_fields.items()
    }

    for key, val in history.items():
        # Unknown sections fall back to a generic template rather than
        # crashing with KeyError.
        find = details[key] if key in details else {extra: None}
        details[key] = str(detailed_feature_extraction(find, val))

    return details
248
+
249
+
250
def get_embeddings(link,tag_option):
    """Summarise the document at ``link`` per tag and embed each summary.

    Args:
        link: URL of the document. Links ending in ``.md`` or whose
            characters 8-10 are ``en.`` go through ``web_extractor``; all
            others go through ``pdf_extractor``.
        tag_option: ``'Complete Document Similarity'`` builds one
            ``"Details"`` summary from the first 50000 characters; any other
            value builds one summary per product-manual section.

    Returns:
        ``(history, genai_embeddings)``: the dict of tag -> summary text and
        a list with one embedding per tag, in the dict's order.
    """
    print(f"\n--> Creating Embeddings - {link}")

    if tag_option=='Complete Document Similarity':
        history = { "Details": "" }

    else:
        history = {
            "Introduction": "",
            "Specifications": "",
            "Product Overview": "",
            "Safety Information": "",
            "Installation Instructions": "",
            "Setup and Configuration": "",
            "Operation Instructions": "",
            "Maintenance and Care": "",
            "Troubleshooting": "",
            "Warranty Information": "",
            "Legal Information": ""
        }

    # Extract Text -----------------------------
    print("Extracting Text")
    if link[-3:] == '.md' or link[8:11] == 'en.':
        text = web_extractor(link)
    else:
        text = pdf_extractor(link)

    # Create Chunks ----------------------------
    print("Writing Tag Data")

    if tag_option=="Complete Document Similarity":
        # Both extractors return a str, so slice the text itself;
        # ``text[0]`` would select only its first character.
        history["Details"] = feature_extraction("Details", history["Details"], text[:50000])

    else:
        # create_documents takes a list of texts; a bare str would be
        # iterated character by character.
        chunks = text_splitter.create_documents([text])

        # One pool serves every chunk. Chunks are still processed one after
        # another because each round feeds the summaries of the previous one.
        with concurrent.futures.ThreadPoolExecutor() as executor:
            for chunk in chunks:
                future_to_key = {
                    executor.submit(
                        feature_extraction, f"Product {key}", history[key], chunk.page_content
                    ): key for key in history
                }
                for future in concurrent.futures.as_completed(future_to_key):
                    key = future_to_key[future]
                    try:
                        history[key] = future.result()
                    except Exception as e:
                        # Keep the previous summary for this tag and move on.
                        print(f"Error processing {key}: {e}")

    print("Creating Vectors")
    genai_embeddings=[]

    for tag in history:
        result = genai.embed_content(
            model="models/embedding-001",
            content=history[tag],
            task_type="retrieval_document")
        genai_embeddings.append(result['embedding'])

    return history,genai_embeddings
317
+
318
def get_embed_chroma(link):
    """Chunk the document at ``link`` and embed every chunk with the ONNX model.

    Args:
        link: URL of the document. Links ending in ``.md`` or whose
            characters 8-10 are ``en.`` go through ``web_extractor``; all
            others go through ``pdf_extractor``.

    Returns:
        ``(chunks, embedding_vectors)``: the chunk strings and a list with
        one pooled embedding per chunk, aligned by index. Both are empty
        when no text could be extracted.
    """
    print(f"\n--> Creating Embeddings - {link}")

    # Extract Text -----------------------------
    if link[-3:] == '.md' or link[8:11] == 'en.':
        text = web_extractor(link)
    else:
        text = pdf_extractor(link)
    print("\u2713 Extracting Text")

    # Create Chunks ----------------------------
    text = re.sub(r'\.{2,}', '.', text)   # collapse runs of dots
    text = re.sub(r'\s{2,}', ' ', text)   # collapse whitespace runs
    text = re.sub(r'\d{7,}', '', text)    # drop digit runs of 7 or more
    # NOTE: ``\s`` already matched newline runs above, so this substitution
    # finds nothing; kept to preserve the original cleaning sequence.
    text = re.sub(r'\n{2,}', '\n', text)

    chunks = text_splitter_small.split_text(text)
    print("\u2713 Writing Tag Data")

    # Creating Vector
    print("\u2713 Creating Vectors")

    # Collect one vector per chunk. Returning only the loop variable would
    # yield just the last chunk's embedding and fail when there are no chunks.
    embedding_vectors = []
    for chunk in chunks:
        # Tokenize the input text
        inputs = tokenizer(chunk, return_tensors="np", padding=True, truncation=True)

        # The inputs are cast to int64 before being fed to the session
        input_ids = inputs['input_ids'].astype(np.int64)
        attention_mask = inputs['attention_mask'].astype(np.int64)
        # Some tokenizers do not emit token_type_ids; feed zeros in that case
        token_type_ids = inputs.get('token_type_ids', np.zeros_like(input_ids)).astype(np.int64)

        input_feed = {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'token_type_ids': token_type_ids
        }

        outputs = session.run(None, input_feed)

        # Averaged over axis 1 - assumes outputs[0] is (batch, seq, dim);
        # TODO confirm against the exported model.
        last_hidden_state = np.array(outputs[0])
        embedding_vectors.extend(last_hidden_state.mean(axis=1).tolist())

    return chunks , embedding_vectors
385
+
386
+
387
def get_image_embeddings(Product):
    """Find images for ``Product``, describe each one and embed the descriptions.

    Args:
        Product: Query passed to ``search_images``.

    Returns:
        ``(image_embeddings, links)``: one embedding per description, and
        the links returned by ``search_images``.
    """
    links = search_images(Product)

    # Describe all images concurrently; map() keeps the order of ``links``.
    with concurrent.futures.ThreadPoolExecutor() as pool:
        descriptions = list(pool.map(feature_extraction_image, links))

    image_embeddings = [
        genai.embed_content(
            model="models/embedding-001",
            content=caption,
            task_type="retrieval_document")['embedding']
        for caption in descriptions
    ]
    return image_embeddings , links
403
+
404
+ global text_splitter
405
+ global data
406
+ global history
407
+
408
+ text_splitter = RecursiveCharacterTextSplitter(
409
+ chunk_size = 10000,
410
+ chunk_overlap = 100,
411
+ separators = ["",''," "]
412
+ )
413
+
414
+ # text_splitter_small = RecursiveCharacterTextSplitter(
415
+ # chunk_size = 2000,
416
+ # chunk_overlap = 100,
417
+ # separators = ["",''," "]
418
+ # )
419
+
420
+ text_splitter_small = TokenBasedTextSplitter(chunk_size=500, chunk_overlap=50)
421
+ # chunks = splitter.split_text(text)
422
+
423
+ if __name__ == '__main__':
424
+ print(get_embed_chroma('https://www.galaxys24manual.com/wp-content/uploads/pdf/galaxy-s24-manual-SAM-S921-S926-S928-OS14-011824-FINAL-US-English.pdf'))
425
  # print(get_image_embeddings(Product='Samsung Galaxy S24'))