That1BrainCell committed on
Commit
32bb13b
1 Parent(s): 30032ef

Update embedding.py

Browse files
Files changed (1) hide show
  1. embedding.py +378 -369
embedding.py CHANGED
@@ -1,370 +1,379 @@
1
- from PyPDF2 import PdfReader
2
- import requests
3
- import json
4
- import os
5
- import concurrent.futures
6
- import random
7
- from langchain_google_genai import ChatGoogleGenerativeAI
8
- from langchain_community.document_loaders import WebBaseLoader
9
- from langchain_community.document_loaders import PyPDFLoader
10
- from langchain.text_splitter import RecursiveCharacterTextSplitter
11
- import google.generativeai as genai
12
- from langchain_core.messages import HumanMessage
13
- from io import BytesIO
14
- import numpy as np
15
- import re
16
- import torch
17
- from transformers import AutoTokenizer, AutoModel
18
-
19
- from search import search_images
20
-
21
# --- Model clients -----------------------------------------------------------
# API keys are read from the environment instead of being committed to source
# control. GEMINI_API_KEYS is a comma-separated list; the clients below occupy
# four key slots so that calls can be spread across keys. If fewer keys are
# supplied they are reused round-robin.
# NOTE: any key that was previously committed in this file must be treated as
# compromised and rotated.
_API_KEYS = [
    key.strip()
    for key in os.environ.get("GEMINI_API_KEYS", "").split(",")
    if key.strip()
]
if not _API_KEYS:
    raise RuntimeError(
        "GEMINI_API_KEYS is not set; expected a comma-separated list of Google API keys"
    )


def _api_key(slot):
    """Return the API key for *slot*, wrapping around when fewer keys are configured."""
    return _API_KEYS[slot % len(_API_KEYS)]


# Text clients; the prompt helpers pick one of these at random per call.
gemini = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001", google_api_key=_api_key(0), temperature=0.1)
gemini1 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001", google_api_key=_api_key(1), temperature=0.1)
gemini2 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001", google_api_key=_api_key(2), temperature=0.1)
gemini3 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001", google_api_key=_api_key(3), temperature=0.1)

# Multimodal clients, one per key slot.
vision = ChatGoogleGenerativeAI(model="gemini-1.5-flash", google_api_key=_api_key(0), temperature=0.1)
vision1 = ChatGoogleGenerativeAI(model="gemini-1.5-flash", google_api_key=_api_key(1), temperature=0.1)
vision2 = ChatGoogleGenerativeAI(model="gemini-1.5-flash", google_api_key=_api_key(2), temperature=0.1)
vision3 = ChatGoogleGenerativeAI(model="gemini-1.5-flash", google_api_key=_api_key(3), temperature=0.1)

# Local embedding model used by get_embed_chroma.
tokenizer = AutoTokenizer.from_pretrained('Alibaba-NLP/gte-base-en-v1.5', trust_remote_code=True)
model = AutoModel.from_pretrained('Alibaba-NLP/gte-base-en-v1.5', trust_remote_code=True)
model.to('cpu')  # Ensure the model is on the CPU

# The google.generativeai SDK uses the same key slot as `gemini1`, as before.
genai.configure(api_key=_api_key(1))
37
-
38
def pdf_extractor(link, timeout=30):
    """Download the PDF at *link* and return the text of its pages, concatenated.

    Args:
        link: URL of the PDF document.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The extracted text. On any failure the error is printed and whatever
        text was extracted before the failure (possibly ``''``) is returned.
    """
    page_texts = []

    try:
        # A timeout keeps an unresponsive server from hanging the pipeline.
        response = requests.get(link, timeout=timeout)
        response.raise_for_status()  # Raise an error for bad status codes

        # Parse the PDF from memory; nothing is written to disk.
        reader = PdfReader(BytesIO(response.content))
        for page in reader.pages:
            # `or ''` guards against a None result so that a single page
            # without extractable text cannot abort the whole extraction.
            page_texts.append(page.extract_text() or '')

    except requests.exceptions.HTTPError as e:
        print(f'HTTP error occurred: {e}')
    except Exception as e:
        print(f'An error occurred: {e}')

    return ''.join(page_texts)
60
-
61
def web_extractor(link):
    """Load the web page at *link* and return its text content.

    The page is loaded and split with ``WebBaseLoader``; the ``page_content``
    of every resulting document is concatenated in order.

    Returns:
        The concatenated text, or ``''`` if loading fails. Failures are
        printed rather than silently discarded; unlike a bare ``except``,
        KeyboardInterrupt and SystemExit are allowed to propagate.
    """
    try:
        pages = WebBaseLoader(link).load_and_split()
        return ''.join(page.page_content for page in pages)
    except Exception as e:
        print(f'An error occurred while loading {link}: {e}')
        return ''
74
-
75
def imporve_text(text):
    """Ask one of the Gemini text clients to rewrite *text* more concisely.

    A client is picked at random from the four module-level clients and the
    ``content`` of its reply is returned.
    """
    llm = random.choice([gemini, gemini1, gemini2, gemini3])

    prompt = f'''
    Please rewrite the following text to make it short, concise, and of high quality.
    Ensure that all essential information and key points are retained.
    Focus on improving clarity, coherence, and word choice without altering the original meaning.

    text = {text}
    '''

    return llm.invoke(prompt).content
89
-
90
def feature_extraction(tag, history , context):
    """Merge new details from *context* into the running *history* for one tag.

    Args:
        tag: Name of the field being updated; interpolated into the prompt.
        history: Text gathered so far for this tag.
        context: New material to mine for additional details.

    Returns:
        The ``content`` of the reply from a randomly chosen Gemini text client.
    """
    llm = random.choice([gemini, gemini1, gemini2, gemini3])

    prompt = f'''
    You are an intelligent assistant tasked with updating product information. You have two data sources:
    1. Tag_History: Previously gathered information about the product.
    2. Tag_Context: New data that might contain additional details.
    Your job is to read the Tag_Context and update the relevant field in the Tag_History with any new details found. The field to be updated is the {tag} FIELD.
    Guidelines:
    - Only add new details that are relevant to the {tag} FIELD.
    - Do not add or modify any other fields in the Tag_History.
    - Ensure your response is in coherent sentences, integrating the new details seamlessly into the existing information.
    Here is the data:
    Tag_Context: {str(context)}
    Tag_History: {history}
    Respond with the updated Tag_History.
    '''

    reply = llm.invoke(prompt)
    return reply.content
111
-
112
def feature_extraction_image(url):
    """Ask Gemini for a description of the image referenced by *url*.

    Returns:
        The description text, or a single space ``' '`` if the request fails.
        Failures are printed instead of being silently swallowed.
    """
    vision_model = genai.GenerativeModel('gemini-1.5-flash-001')
    try:
        # NOTE(review): `url` is passed as a plain string next to the prompt;
        # this function does not download the image itself. Confirm the SDK
        # resolves the URL rather than treating it as text.
        response = vision_model.generate_content(['Describe this image to me', url])
        return response.text
    except Exception as e:
        print(f'Image description failed for {url}: {e}')
        return ' '
122
-
123
def detailed_feature_extraction(find, context):
    """Fill the *find* template using information found in *context*.

    Both arguments are stringified into the prompt, which is sent to a
    randomly chosen Gemini text client.

    Returns:
        The ``content`` of the model's reply.
    """
    llm = random.choice([gemini, gemini1, gemini2, gemini3])

    prompt = f'''
    You are an intelligent assistant tasked with finding product information. You have one data source and one output format:
    1. Context: The gathered information about the product.
    2. Format: Details which need to be filled based on Context.
    Your job is to read the Context and update the relevant field in Format using Context.
    Guidelines:
    - Only add details that are relevant to the individual FIELD.
    - Do not add or modify any other fields in the Format.
    - If nothing found return None.
    Here is the data:
    The Context is {str(context)}
    The Format is {str(find)}
    '''

    reply = llm.invoke(prompt)
    return reply.content
143
-
144
def detailed_history(history):
    """Expand per-section summaries into a per-section detail structure.

    A fresh template is built on every call: each manual section maps to a
    dict of field names whose values start as ``None``. For every
    ``(section, summary)`` pair in *history*, the section's template is
    replaced by the stringified result of ``detailed_feature_extraction``.
    Sections absent from *history* keep their untouched template dict.

    Raises:
        KeyError: if *history* contains a section that is not in the template.
    """
    # Section name -> its specific fields. Every section additionally ends
    # with an "Additional Details" field.
    section_fields = (
        ("Introduction", ("Product Name", "Overview of the product",
                          "Purpose of the manual", "Audience")),
        ("Specifications", ("Technical specifications", "Performance metrics")),
        ("Product Overview", ("Product features", "Key components and parts")),
        ("Safety Information", ("Safety warnings and precautions",
                                "Compliance and certification information")),
        ("Installation Instructions", ("Unboxing and inventory checklist",
                                       "Step-by-step installation guide",
                                       "Required tools and materials")),
        ("Setup and Configuration", ("Initial setup procedures",
                                     "Configuration settings",
                                     "Troubleshooting setup issues")),
        ("Operation Instructions", ("How to use the product",
                                    "Detailed instructions for different functionalities",
                                    "User interface guide")),
        ("Maintenance and Care", ("Cleaning instructions",
                                  "Maintenance schedule",
                                  "Replacement parts and accessories")),
        ("Troubleshooting", ("Common issues and solutions",
                             "Error messages and their meanings",
                             "Support Information")),
        ("Warranty Information", ("Terms and Conditions",
                                  "Service and repair information")),
        ("Legal Information", ("Copyright information",
                               "Trademarks and patents",
                               "Disclaimers")),
    )

    details = {
        section: dict.fromkeys((*fields, "Additional Details"))
        for section, fields in section_fields
    }

    for section, gathered in history.items():
        template = details[section]
        details[section] = str(detailed_feature_extraction(template, gathered))

    return details
220
-
221
-
222
def get_embeddings(link,tag_option):
    """Summarise the document at *link* per tag and embed each summary.

    Args:
        link: URL of the document. Links ending in ``.md`` or whose characters
            8-10 are ``en.`` go through ``web_extractor``; everything else is
            treated as a PDF.
        tag_option: ``'Complete Document Similarity'`` produces one "Details"
            summary; any other value produces one summary per manual section.

    Returns:
        ``(history, genai_embeddings)``: the tag -> summary dict and, in the
        same order, one embedding per tag.
    """
    print(f"\n--> Creating Embeddings - {link}")

    if tag_option == 'Complete Document Similarity':
        history = {"Details": ""}
    else:
        history = {
            "Introduction": "",
            "Specifications": "",
            "Product Overview": "",
            "Safety Information": "",
            "Installation Instructions": "",
            "Setup and Configuration": "",
            "Operation Instructions": "",
            "Maintenance and Care": "",
            "Troubleshooting": "",
            "Warranty Information": "",
            "Legal Information": "",
        }

    # Extract Text -----------------------------
    print("Extracting Text")
    if link[-3:] == '.md' or link[8:11] == 'en.':
        text = web_extractor(link)
    else:
        text = pdf_extractor(link)

    # Create Chunks ----------------------------
    print("Writing Tag Data")

    if tag_option == "Complete Document Similarity":
        # The extractors return a plain string, so the first 50000 characters
        # are `text[:50000]`. The former `text[0][:50000]` passed only the
        # first character and raised IndexError when extraction came back empty.
        history["Details"] = feature_extraction("Details", history["Details"], text[:50000])

    else:
        # create_documents expects a list of texts; handing it the bare string
        # would make every single character its own document.
        chunks = text_splitter.create_documents([text])

        # One pool serves all chunks. Chunks are still processed one after
        # another because each chunk's prompts depend on the history produced
        # by the previous chunk; only the per-tag calls run concurrently.
        with concurrent.futures.ThreadPoolExecutor() as executor:
            for chunk in chunks:
                future_to_key = {
                    executor.submit(
                        feature_extraction, f"Product {key}", history[key], chunk.page_content
                    ): key for key in history
                }
                for future in concurrent.futures.as_completed(future_to_key):
                    key = future_to_key[future]
                    try:
                        history[key] = future.result()
                    except Exception as e:
                        # Keep the previous history for this tag and carry on.
                        print(f"Error processing {key}: {e}")

    print("Creating Vectors")
    genai_embeddings = [
        genai.embed_content(
            model="models/embedding-001",
            content=history[tag],
            task_type="retrieval_document")['embedding']
        for tag in history
    ]

    return history,genai_embeddings
288
-
289
def get_embed_chroma(link):
    """Chunk the document at *link* and embed every chunk with the local model.

    Returns:
        ``(textual_data, embedding_vectors)``: the chunk texts and, in the
        same order, one mean-pooled embedding (a list of floats) per chunk.
    """
    print(f"\n--> Creating Embeddings - {link}")

    # Extract Text -----------------------------
    is_web_page = link[-3:] == '.md' or link[8:11] == 'en.'
    raw_text = web_extractor(link) if is_web_page else pdf_extractor(link)
    print("\u2713 Extracting Text")

    # Create Chunks ----------------------------
    # Collapse runs of dots, then runs of whitespace, then runs of newlines.
    cleaned = raw_text
    for pattern, replacement in ((r'\.{2,}', '.'), (r'\s{2,}', ' '), (r'\n{2,}', '\n')):
        cleaned = re.sub(pattern, replacement, cleaned)

    chunks = text_splitter_small.create_documents([cleaned])
    print("\u2713 Writing Tag Data")

    # Creating Vector
    textual_data = []
    embedding_vectors = []
    print("\u2713 Creating Vectors")

    for chunk in chunks:
        encoded = tokenizer(chunk.page_content, return_tensors="pt", padding=True, truncation=True)
        encoded = {name: tensor.to('cpu') for name, tensor in encoded.items()}

        # Inference only: no gradients needed.
        with torch.no_grad():
            outputs = model(**encoded)

        # Mean over the token axis gives one vector per chunk.
        pooled = outputs.last_hidden_state.mean(dim=1)
        embedding_vectors.append(pooled.squeeze().cpu().numpy().tolist())
        textual_data.append(chunk.page_content)

    return textual_data , embedding_vectors
329
-
330
-
331
-
332
def get_image_embeddings(Product):
    """Search images for *Product*, describe each one, and embed the descriptions.

    Returns:
        ``(image_embeddings, links)``: one embedding per image link, in the
        order ``search_images`` returned the links.
    """
    links = search_images(Product)

    # Describe all images concurrently; map() keeps results in link order.
    with concurrent.futures.ThreadPoolExecutor() as pool:
        descriptions = list(pool.map(feature_extraction_image, links))

    image_embeddings = [
        genai.embed_content(
            model="models/embedding-001",
            content=caption,
            task_type="retrieval_document")['embedding']
        for caption in descriptions
    ]
    return image_embeddings , links
348
-
349
-
350
-
351
# Module-level splitters shared by the embedding functions. (The former
# module-scope `global` statements had no effect and were dropped.)
#
# Separators are tried in order. The previous list started with the empty
# string, which the recursive splitter selects immediately, so the later
# entries were never tried and text was cut at arbitrary character positions.
# The empty string now comes last, as the fallback of last resort.
_SEPARATORS = ["\n\n", "\n", " ", ""]

# Large chunks for the per-tag summarisation in get_embeddings.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=10000,
    chunk_overlap=100,
    separators=_SEPARATORS,
)

# Small chunks for the retrieval embeddings in get_embed_chroma.
text_splitter_small = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=100,
    separators=_SEPARATORS,
)
367
-
368
if __name__ == '__main__':
    # Manual smoke run: embed a sample PDF manual and print the result.
    sample_manual = 'https://www.galaxys24manual.com/wp-content/uploads/pdf/galaxy-s24-manual-SAM-S921-S926-S928-OS14-011824-FINAL-US-English.pdf'
    print(get_embed_chroma(sample_manual))
 
 
 
 
 
 
 
 
 
370
  # print(get_image_embeddings(Product='Samsung Galaxy S24'))
 
1
+ from PyPDF2 import PdfReader
2
+ import requests
3
+ import json
4
+ import os
5
+ import concurrent.futures
6
+ import random
7
+ from langchain_google_genai import ChatGoogleGenerativeAI
8
+ from langchain_community.document_loaders import WebBaseLoader
9
+ from langchain_community.document_loaders import PyPDFLoader
10
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
11
+ import google.generativeai as genai
12
+ from langchain_core.messages import HumanMessage
13
+ from io import BytesIO
14
+ import numpy as np
15
+ import re
16
+ import torch
17
+ from transformers import AutoTokenizer, AutoModel
18
+ # import torch._dynamo
19
+ import time
20
+ # torch._dynamo.config.suppress_errors = True
21
+
22
+ from search import search_images
23
+
24
# --- Model clients -----------------------------------------------------------
# API keys are read from the environment instead of being committed to source
# control. GEMINI_API_KEYS is a comma-separated list; the clients below occupy
# four key slots so that calls can be spread across keys. If fewer keys are
# supplied they are reused round-robin.
# NOTE: any key that was previously committed in this file must be treated as
# compromised and rotated.
_API_KEYS = [
    key.strip()
    for key in os.environ.get("GEMINI_API_KEYS", "").split(",")
    if key.strip()
]
if not _API_KEYS:
    raise RuntimeError(
        "GEMINI_API_KEYS is not set; expected a comma-separated list of Google API keys"
    )


def _api_key(slot):
    """Return the API key for *slot*, wrapping around when fewer keys are configured."""
    return _API_KEYS[slot % len(_API_KEYS)]


# Text clients; the prompt helpers pick one of these at random per call.
gemini = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001", google_api_key=_api_key(0), temperature=0.1)
gemini1 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001", google_api_key=_api_key(1), temperature=0.1)
gemini2 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001", google_api_key=_api_key(2), temperature=0.1)
gemini3 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001", google_api_key=_api_key(3), temperature=0.1)

# Multimodal clients, one per key slot.
vision = ChatGoogleGenerativeAI(model="gemini-1.5-flash", google_api_key=_api_key(0), temperature=0.1)
vision1 = ChatGoogleGenerativeAI(model="gemini-1.5-flash", google_api_key=_api_key(1), temperature=0.1)
vision2 = ChatGoogleGenerativeAI(model="gemini-1.5-flash", google_api_key=_api_key(2), temperature=0.1)
vision3 = ChatGoogleGenerativeAI(model="gemini-1.5-flash", google_api_key=_api_key(3), temperature=0.1)

# Local embedding model used by get_embed_chroma.
tokenizer = AutoTokenizer.from_pretrained('dwzhu/e5-base-4k', trust_remote_code=True)
model = AutoModel.from_pretrained('dwzhu/e5-base-4k', trust_remote_code=True)
# model = torch.compile(model)
model.to('cpu')  # Ensure the model is on the CPU

# The google.generativeai SDK uses the same key slot as `gemini1`, as before.
genai.configure(api_key=_api_key(1))
41
+
42
def pdf_extractor(link, timeout=30):
    """Download the PDF at *link* and return the text of its pages, concatenated.

    Args:
        link: URL of the PDF document.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The extracted text. On any failure the error is printed and whatever
        text was extracted before the failure (possibly ``''``) is returned.
    """
    page_texts = []

    try:
        # A timeout keeps an unresponsive server from hanging the pipeline.
        response = requests.get(link, timeout=timeout)
        response.raise_for_status()  # Raise an error for bad status codes

        # Parse the PDF from memory; nothing is written to disk.
        reader = PdfReader(BytesIO(response.content))
        for page in reader.pages:
            # `or ''` guards against a None result so that a single page
            # without extractable text cannot abort the whole extraction.
            page_texts.append(page.extract_text() or '')

    except requests.exceptions.HTTPError as e:
        print(f'HTTP error occurred: {e}')
    except Exception as e:
        print(f'An error occurred: {e}')

    return ''.join(page_texts)
64
+
65
def web_extractor(link):
    """Load the web page at *link* and return its text content.

    The page is loaded and split with ``WebBaseLoader``; the ``page_content``
    of every resulting document is concatenated in order.

    Returns:
        The concatenated text, or ``''`` if loading fails. Failures are
        printed rather than silently discarded; unlike a bare ``except``,
        KeyboardInterrupt and SystemExit are allowed to propagate.
    """
    try:
        pages = WebBaseLoader(link).load_and_split()
        return ''.join(page.page_content for page in pages)
    except Exception as e:
        print(f'An error occurred while loading {link}: {e}')
        return ''
78
+
79
def imporve_text(text):
    """Ask one of the Gemini text clients to rewrite *text* more concisely.

    A client is picked at random from the four module-level clients and the
    ``content`` of its reply is returned.
    """
    llm = random.choice([gemini, gemini1, gemini2, gemini3])

    prompt = f'''
    Please rewrite the following text to make it short, descriptive, concise, and of high quality.
    Ensure that all essential information is retained.
    Focus on improving clarity, coherence, and word choice without altering the original meaning.

    text = {text}
    '''

    return llm.invoke(prompt).content
93
+
94
def feature_extraction(tag, history , context):
    """Merge new details from *context* into the running *history* for one tag.

    Args:
        tag: Name of the field being updated; interpolated into the prompt.
        history: Text gathered so far for this tag.
        context: New material to mine for additional details.

    Returns:
        The ``content`` of the reply from a randomly chosen Gemini text client.
    """
    llm = random.choice([gemini, gemini1, gemini2, gemini3])

    prompt = f'''
    You are an intelligent assistant tasked with updating product information. You have two data sources:
    1. Tag_History: Previously gathered information about the product.
    2. Tag_Context: New data that might contain additional details.
    Your job is to read the Tag_Context and update the relevant field in the Tag_History with any new details found. The field to be updated is the {tag} FIELD.
    Guidelines:
    - Only add new details that are relevant to the {tag} FIELD.
    - Do not add or modify any other fields in the Tag_History.
    - Ensure your response is in coherent sentences, integrating the new details seamlessly into the existing information.
    Here is the data:
    Tag_Context: {str(context)}
    Tag_History: {history}
    Respond with the updated Tag_History.
    '''

    reply = llm.invoke(prompt)
    return reply.content
115
+
116
def feature_extraction_image(url):
    """Ask Gemini for a description of the image referenced by *url*.

    Returns:
        The description text, or a single space ``' '`` if the request fails.
        Failures are printed instead of being silently swallowed.
    """
    vision_model = genai.GenerativeModel('gemini-1.5-flash-001')
    try:
        # NOTE(review): `url` is passed as a plain string next to the prompt;
        # this function does not download the image itself. Confirm the SDK
        # resolves the URL rather than treating it as text.
        response = vision_model.generate_content(['Describe this image to me', url])
        return response.text
    except Exception as e:
        print(f'Image description failed for {url}: {e}')
        return ' '
126
+
127
def detailed_feature_extraction(find, context):
    """Fill the *find* template using information found in *context*.

    Both arguments are stringified into the prompt, which is sent to a
    randomly chosen Gemini text client.

    Returns:
        The ``content`` of the model's reply.
    """
    llm = random.choice([gemini, gemini1, gemini2, gemini3])

    prompt = f'''
    You are an intelligent assistant tasked with finding product information. You have one data source and one output format:
    1. Context: The gathered information about the product.
    2. Format: Details which need to be filled based on Context.
    Your job is to read the Context and update the relevant field in Format using Context.
    Guidelines:
    - Only add details that are relevant to the individual FIELD.
    - Do not add or modify any other fields in the Format.
    - If nothing found return None.
    Here is the data:
    The Context is {str(context)}
    The Format is {str(find)}
    '''

    reply = llm.invoke(prompt)
    return reply.content
147
+
148
def detailed_history(history):
    """Expand per-section summaries into a per-section detail structure.

    A fresh template is built on every call: each manual section maps to a
    dict of field names whose values start as ``None``. For every
    ``(section, summary)`` pair in *history*, the section's template is
    replaced by the stringified result of ``detailed_feature_extraction``.
    Sections absent from *history* keep their untouched template dict.

    Raises:
        KeyError: if *history* contains a section that is not in the template.
    """
    # Section name -> its specific fields. Every section additionally ends
    # with an "Additional Details" field.
    section_fields = (
        ("Introduction", ("Product Name", "Overview of the product",
                          "Purpose of the manual", "Audience")),
        ("Specifications", ("Technical specifications", "Performance metrics")),
        ("Product Overview", ("Product features", "Key components and parts")),
        ("Safety Information", ("Safety warnings and precautions",
                                "Compliance and certification information")),
        ("Installation Instructions", ("Unboxing and inventory checklist",
                                       "Step-by-step installation guide",
                                       "Required tools and materials")),
        ("Setup and Configuration", ("Initial setup procedures",
                                     "Configuration settings",
                                     "Troubleshooting setup issues")),
        ("Operation Instructions", ("How to use the product",
                                    "Detailed instructions for different functionalities",
                                    "User interface guide")),
        ("Maintenance and Care", ("Cleaning instructions",
                                  "Maintenance schedule",
                                  "Replacement parts and accessories")),
        ("Troubleshooting", ("Common issues and solutions",
                             "Error messages and their meanings",
                             "Support Information")),
        ("Warranty Information", ("Terms and Conditions",
                                  "Service and repair information")),
        ("Legal Information", ("Copyright information",
                               "Trademarks and patents",
                               "Disclaimers")),
    )

    details = {
        section: dict.fromkeys((*fields, "Additional Details"))
        for section, fields in section_fields
    }

    for section, gathered in history.items():
        template = details[section]
        details[section] = str(detailed_feature_extraction(template, gathered))

    return details
224
+
225
+
226
def get_embeddings(link,tag_option):
    """Summarise the document at *link* per tag and embed each summary.

    Args:
        link: URL of the document. Links ending in ``.md`` or whose characters
            8-10 are ``en.`` go through ``web_extractor``; everything else is
            treated as a PDF.
        tag_option: ``'Complete Document Similarity'`` produces one "Details"
            summary; any other value produces one summary per manual section.

    Returns:
        ``(history, genai_embeddings)``: the tag -> summary dict and, in the
        same order, one embedding per tag.
    """
    print(f"\n--> Creating Embeddings - {link}")

    if tag_option == 'Complete Document Similarity':
        history = {"Details": ""}
    else:
        history = {
            "Introduction": "",
            "Specifications": "",
            "Product Overview": "",
            "Safety Information": "",
            "Installation Instructions": "",
            "Setup and Configuration": "",
            "Operation Instructions": "",
            "Maintenance and Care": "",
            "Troubleshooting": "",
            "Warranty Information": "",
            "Legal Information": "",
        }

    # Extract Text -----------------------------
    print("Extracting Text")
    if link[-3:] == '.md' or link[8:11] == 'en.':
        text = web_extractor(link)
    else:
        text = pdf_extractor(link)

    # Create Chunks ----------------------------
    print("Writing Tag Data")

    if tag_option == "Complete Document Similarity":
        # The extractors return a plain string, so the first 50000 characters
        # are `text[:50000]`. The former `text[0][:50000]` passed only the
        # first character and raised IndexError when extraction came back empty.
        history["Details"] = feature_extraction("Details", history["Details"], text[:50000])

    else:
        # create_documents expects a list of texts; handing it the bare string
        # would make every single character its own document.
        chunks = text_splitter.create_documents([text])

        # One pool serves all chunks. Chunks are still processed one after
        # another because each chunk's prompts depend on the history produced
        # by the previous chunk; only the per-tag calls run concurrently.
        with concurrent.futures.ThreadPoolExecutor() as executor:
            for chunk in chunks:
                future_to_key = {
                    executor.submit(
                        feature_extraction, f"Product {key}", history[key], chunk.page_content
                    ): key for key in history
                }
                for future in concurrent.futures.as_completed(future_to_key):
                    key = future_to_key[future]
                    try:
                        history[key] = future.result()
                    except Exception as e:
                        # Keep the previous history for this tag and carry on.
                        print(f"Error processing {key}: {e}")

    print("Creating Vectors")
    genai_embeddings = [
        genai.embed_content(
            model="models/embedding-001",
            content=history[tag],
            task_type="retrieval_document")['embedding']
        for tag in history
    ]

    return history,genai_embeddings
292
+
293
def get_embed_chroma(link, batch_size=1):
    """Chunk the document at *link* and embed every chunk with the local model.

    Args:
        link: URL of the document. Links ending in ``.md`` or whose characters
            8-10 are ``en.`` go through ``web_extractor``; everything else is
            treated as a PDF.
        batch_size: Number of chunks sent through the model per forward pass.
            Defaults to 1, the value that was previously hard-coded.

    Returns:
        ``(textual_data, embedding_vectors)``: the chunk texts and, in the
        same order, one mean-pooled embedding (a list of floats) per chunk.
    """
    print(f"\n--> Creating Embeddings - {link}")

    # Extract Text -----------------------------
    if link[-3:] == '.md' or link[8:11] == 'en.':
        raw_text = web_extractor(link)
    else:
        raw_text = pdf_extractor(link)
    print("\u2713 Extracting Text")

    # Create Chunks ----------------------------
    cleaned = re.sub(r'\.{2,}', '.', raw_text)
    cleaned = re.sub(r'\s{2,}', ' ', cleaned)
    # Kept for parity with the original pipeline; after the whitespace
    # collapse above no run of newlines remains, so this cannot match.
    cleaned = re.sub(r'\n{2,}', '\n', cleaned)

    chunks = text_splitter_small.create_documents([cleaned])
    print("\u2713 Writing Tag Data")

    # Creating Vector
    embedding_vectors = []
    textual_data = []
    print("\u2713 Creating Vectors")

    # Process chunks in batches
    for start in range(0, len(chunks), batch_size):
        texts = [doc.page_content for doc in chunks[start:start + batch_size]]
        inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
        inputs = {k: v.to('cpu') for k, v in inputs.items()}

        # Inference only: no gradients needed.
        with torch.no_grad():
            outputs = model(**inputs)

        hidden = outputs.last_hidden_state
        attention_mask = inputs.get('attention_mask')
        if attention_mask is None:
            pooled = hidden.mean(dim=1)
        else:
            # Average over real tokens only. A plain mean would also average
            # the padding positions added when a batch holds sequences of
            # different lengths. With a single unpadded sequence the mask is
            # all ones, so this equals the plain mean.
            mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
            pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)

        embedding_vectors.extend(pooled.cpu().numpy().tolist())
        textual_data.extend(texts)

    return textual_data , embedding_vectors
338
+
339
+
340
+
341
def get_image_embeddings(Product):
    """Search images for *Product*, describe each one, and embed the descriptions.

    Returns:
        ``(image_embeddings, links)``: one embedding per image link, in the
        order ``search_images`` returned the links.
    """
    links = search_images(Product)

    # Describe all images concurrently; map() keeps results in link order.
    with concurrent.futures.ThreadPoolExecutor() as pool:
        descriptions = list(pool.map(feature_extraction_image, links))

    image_embeddings = [
        genai.embed_content(
            model="models/embedding-001",
            content=caption,
            task_type="retrieval_document")['embedding']
        for caption in descriptions
    ]
    return image_embeddings , links
357
+
358
+
359
+
360
# Module-level splitters shared by the embedding functions. (The former
# module-scope `global` statements had no effect and were dropped.)
#
# Separators are tried in order. The previous list started with the empty
# string, which the recursive splitter selects immediately, so the later
# entries were never tried and text was cut at arbitrary character positions.
# The empty string now comes last, as the fallback of last resort.
_SEPARATORS = ["\n\n", "\n", " ", ""]

# Large chunks for the per-tag summarisation in get_embeddings.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=10000,
    chunk_overlap=100,
    separators=_SEPARATORS,
)

# Small chunks for the retrieval embeddings in get_embed_chroma.
text_splitter_small = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=100,
    separators=_SEPARATORS,
)
376
+
377
if __name__ == '__main__':
    # Manual smoke run: embed a sample PDF manual and print the result.
    sample_manual = 'https://www.galaxys24manual.com/wp-content/uploads/pdf/galaxy-s24-manual-SAM-S921-S926-S928-OS14-011824-FINAL-US-English.pdf'
    print(get_embed_chroma(sample_manual))
379
  # print(get_image_embeddings(Product='Samsung Galaxy S24'))