akshansh36 committed
Commit 1fd56ea · verified · 1 Parent(s): 70288fd

Update helper/process_pdf.py

Files changed (1)
  1. helper/process_pdf.py +351 -347
helper/process_pdf.py CHANGED
@@ -1,347 +1,351 @@
- import requests
- from pdf2image import convert_from_path
- import base64
- from pymongo import MongoClient
- from langchain_google_genai import ChatGoogleGenerativeAI
- from langchain_google_genai import GoogleGenerativeAIEmbeddings
- from langchain_core.messages import HumanMessage
- import os
- import re
- import json
- import uuid
- from dotenv import load_dotenv
- import pinecone
- load_dotenv()
- MONGO_URI = os.getenv("MONGO_URI")
- DB_NAME = os.getenv("DB_NAME")
- COLLECTION_NAME = os.getenv("COLLECTION_NAME")
- FLASH_API = os.getenv("FLASH_API")
- mongo_client = MongoClient(MONGO_URI)
- db = mongo_client[DB_NAME]
- collection = db[COLLECTION_NAME]
- collection2=db['about_company']
- model = ChatGoogleGenerativeAI(model="gemini-1.5-flash-002", temperature=0.2, max_tokens=None, google_api_key=FLASH_API)
- google_embeddings = GoogleGenerativeAIEmbeddings(
-     model="models/embedding-001", # Correct model name
-     google_api_key="AIzaSyANNRKfEb-YnVIBaSAq6hQ38XpxxGwvaws" # Your API key
- )
- pc = pinecone.Pinecone(
-     api_key="4a80f293-ae6d-489a-a7d8-33ea3fcdd26b" # Your Pinecone API key
- )
- index_name = "mospi"
- index = pc.Index(index_name)
-
- about_company_doc=collection2.find_one({"type":"about_company"})
- if about_company_doc:
-     about_company=about_company_doc.get('company_description','')
-
- pdf_temp_dir = 'temp/pdf_files'
- image_temp_dir = 'temp/page_images'
-
- os.makedirs(pdf_temp_dir, exist_ok=True)
- os.makedirs(image_temp_dir, exist_ok=True)
-
- pdf_path = os.path.join(pdf_temp_dir, 'downloaded_file.pdf')
-
- def download_and_split_pdf_to_image(url):
-     try:
-         response = requests.get(url)
-         with open(pdf_path, 'wb') as pdf_file:
-             pdf_file.write(response.content)
-
-
-     except Exception as e:
-         print(f"error occured during downloading pdf from object url : {e}")
-         return None
-
-     try:
-         images = convert_from_path(pdf_path)
-         for i, image in enumerate(images):
-             image_path = os.path.join(image_temp_dir, f'page_{i + 1}.png')
-             image.save(image_path, 'PNG')
-             print(f'Saved image: {image_path}')
-         return True
-
-     except Exception as e:
-         print(f"error occured in converting pdf pages to image : {e}")
-         return None
-
-
-
- system_prompt_text = f"""Given is an image of a PDF page.Your task is to extract all the information from this image and give a detailed summary of the page, do not miss out on any information, include keywords or any terms mentioned in the pdf.Its possible that sometimes the information might look redundant or repetitive still extract it.'
- Given below is a company information whose pdf page is givn to you, to understand the context.
- - About Company: {about_company}
- Follow this Expected output format given below:
- Expected Output format : {{"description":"String"}}
-
- """
-
-
-
- # def process_image_using_llm(image, page_number, url):
- #     try:
- #         message = HumanMessage(
- #             content=[
- #                 {"type": "text", "text": system_prompt_text},
- #                 {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image}"}},
- #             ],
- #         )
- #         response = model.invoke([message])
- #         print(f"LLM response for page {page_number}: {response}")
- #
- #         # Extract JSON from the response content using regex
- #         match = re.search(r"\{.*\}", response.content.strip())
- #         if match:
- #             json_data = match.group(0)
- #
- #             # Step 1: Locate the "description" field and escape all single quotes within it
- #             description_match = re.search(r"'description'\s*:\s*('.*?'|\".*?\")", json_data)
- #
- #             if description_match:
- #                 description_text = description_match.group(1)
- #
- #                 # Replace outer single quotes with double quotes if necessary
- #                 if description_text.startswith("'") and description_text.endswith("'"):
- #                     description_text = f'"{description_text[1:-1]}"'
- #                 elif description_text.startswith('"') and description_text.endswith('"'):
- #                     pass # No change needed if already double quotes
- #
- #                 # Escape all single quotes within the description text
- #                 description_text = description_text.replace("'", "\\'")
- #
- #                 # Replace the original match with the updated description text
- #                 json_data = (
- #                     json_data[:description_match.start(1)] +
- #                     description_text +
- #                     json_data[description_match.end(1):]
- #                 )
- #
- #             # Step 2: Attempt to load the cleaned JSON string
- #             try:
- #                 data = json.loads(json_data) # Load as JSON
- #                 description = data.get("description", "None").strip()
- #                 can_find_description = description != "None"
- #
- #                 return {
- #                     "page_number": page_number,
- #                     "description": description if can_find_description else None,
- #                     "can_find_description": can_find_description
- #                 }
- #             except json.JSONDecodeError as e:
- #                 print(f"Error decoding JSON for page {page_number}: {e}")
- #                 return {
- #                     "page_number": page_number,
- #                     "description": None,
- #                     "can_find_description": False
- #                 }
- #         else:
- #             print(f"No valid JSON found in the response for page {page_number}")
- #             return {
- #                 "page_number": page_number,
- #                 "description": None,
- #                 "can_find_description": False
- #             }
- #
- #     except Exception as e:
- #         print(f"Error processing page {page_number}: {e}")
- #         return {
- #             "page_number": page_number,
- #             "description": None,
- #             "can_find_description": False
- #         }
- #
-
- def process_image_using_llm(image, page_number, url, max_retries=3):
-     for attempt in range(1, max_retries + 1):
-         try:
-             # Send the image and system prompt to the LLM
-             message = HumanMessage(
-                 content=[
-                     {"type": "text", "text": system_prompt_text},
-                     {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image}"}},
-                 ],
-             )
-             response = model.invoke([message])
-             print(f"Attempt {attempt}: LLM response for page {page_number}: {response}")
-
-             # Retry only if JSON data is missing or JSON decode error occurs
-             try:
-                 # Check for JSON content in the response
-                 match = re.search(r"\{.*\}", response.content.strip())
-                 if match:
-                     json_data = match.group(0)
-
-                     # Locate the "description" field and escape single quotes within it
-                     description_match = re.search(r"'description'\s*:\s*('.*?'|\".*?\")", json_data)
-                     if description_match:
-                         description_text = description_match.group(1)
-                         if description_text.startswith("'") and description_text.endswith("'"):
-                             description_text = f'"{description_text[1:-1]}"'
-                         description_text = description_text.replace("'", "\\'")
-
-                         # Replace the original match with the updated description text
-                         json_data = (
-                             json_data[:description_match.start(1)] +
-                             description_text +
-                             json_data[description_match.end(1):]
-                         )
-
-                     # Load the cleaned JSON data
-                     data = json.loads(json_data)
-                     description = data.get("description", "None").strip()
-                     can_find_description = description != "None"
-
-                     return {
-                         "page_number": page_number,
-                         "description": description if can_find_description else None,
-                         "can_find_description": can_find_description
-                     }
-                 else:
-                     print(f"No valid JSON found in the response for page {page_number} on attempt {attempt}")
-
-                 # Trigger retry if no JSON data is found
-                 if attempt == max_retries:
-                     print(f"Max retries reached for page {page_number}. Skipping.")
-                     return {
-                         "page_number": page_number,
-                         "description": None,
-                         "can_find_description": False
-                     }
-
-             except json.JSONDecodeError as e:
-                 print(f"JSON decode error on attempt {attempt} for page {page_number}: {e}")
-                 if attempt == max_retries:
-                     return {
-                         "page_number": page_number,
-                         "description": None,
-                         "can_find_description": False
-                     }
-
-         # Handle any other exceptions without retrying
-         except Exception as e:
-             print(f"Outer exception for page {page_number}: {e}")
-             return {
-                 "page_number": page_number,
-                 "description": None,
-                 "can_find_description": False
-             }
-
-
- def create_embedding_for_pdf_chunks(page,description,url,tags,categories):
-     try:
-         document = collection.find_one({'object_url': url})
-         file_type = document.get("type")
-         mongo_id = str(document.get('_id'))
-         embedding = google_embeddings.embed_query(description)
-         pinecone_id = str(uuid.uuid4())
-
-         vectors = [{
-             'id': pinecone_id,
-             'values': embedding,
-             'metadata': {
-                 'description': description,
-                 "url": url,
-                 "page_number":page,
-                 "tag": file_type,
-                 "mongo_id": mongo_id,
-                 "tags": ','.join(tags),
-                 "categories": ','.join(categories) # Store MongoDB ID in metadata
-             }
-         }]
-         index.upsert(vectors)
-         print(f"Inserted: page {page} in Pinecone with MongoDB ID {mongo_id} in metadata")
-
-         collection.update_one(
-             {
-                 "_id": document["_id"],
-                 "chunks.page_number": page # Match document and specific chunk by page number
-             },
-             {
-                 "$set": {
-                     "chunks.$.pinecone_id": pinecone_id,
-                     "chunks.$.successfully_embedding_created": True
-                 }
-             }
-         )
-         return True
-
-     except Exception as e:
-         print(f"error occured in creating embedding for pdf with mongo id {mongo_id} for page {page}")
-         collection.update_one(
-             {
-                 "_id": document["_id"],
-                 "chunks.page_number": page # Match document and specific chunk by page number
-             },
-             {
-                 "$set": {
-                     "chunks.$.successfully_embedding_created": False
-                 }
-             }
-         )
-         return False
-
-
-
- def cleanup_directory(directory_path):
-     try:
-         for filename in os.listdir(directory_path):
-             file_path = os.path.join(directory_path, filename)
-             if os.path.isfile(file_path):
-                 os.remove(file_path)
-         print(f"Cleaned up files in {directory_path}")
-     except Exception as e:
-         print(f"Error cleaning up directory {directory_path}: {e}")
-
-
- def process_pdf(url,tags,categories):
-     print(f"Processing PDF with URL: {url}")
-     if download_and_split_pdf_to_image(url):
-         chunks = []
-         image_files = sorted(
-             os.listdir(image_temp_dir),
-             key=lambda x: int(re.search(r'page_(\d+)', x).group(1))
-         )
-         for count, image_name in enumerate(image_files, start=1):
-             print(f"Processing page {count} of the PDF")
-             image_path = os.path.join(image_temp_dir, image_name)
-             with open(image_path, "rb") as image_file:
-                 image_data = base64.b64encode(image_file.read()).decode("utf-8")
-             page_result = process_image_using_llm(image_data,count, url)
-             chunks.append(page_result)
-
-         # Update the MongoDB document with the chunks array
-         collection.update_one(
-             {"object_url": url},
-             {"$set": {"chunks": chunks}},
-             upsert=True
-         )
-         print("Saved chunks to MongoDB.")
-
-         total_pages=len(chunks)
-         failed_pages=0
-         for chunk in chunks:
-             page_number=chunk.get('page_number')
-             description=chunk.get("description")
-             if description:
-                 embedding_created=create_embedding_for_pdf_chunks(page_number,description,url,tags,categories)
-                 if not embedding_created:
-                     failed_pages+=1
-             else:
-                 print(f"skipping the page {page_number} as description is None")
-                 failed_pages+=1
-                 continue
-
-         cleanup_directory(pdf_temp_dir)
-         cleanup_directory(image_temp_dir)
-
-         return failed_pages < total_pages
-
-
-
-
-
-
-
-
-
-
+ import requests
+ from pdf2image import convert_from_path
+ import base64
+ from pymongo import MongoClient
+ from langchain_google_genai import ChatGoogleGenerativeAI
+ from langchain_google_genai import GoogleGenerativeAIEmbeddings
+ from langchain_core.messages import HumanMessage
+ import os
+ import re
+ import json
+ import uuid
+ from dotenv import load_dotenv
+ import pinecone
+ load_dotenv()
+ MONGO_URI = os.getenv("MONGO_URI")
+ DB_NAME = os.getenv("DB_NAME")
+ COLLECTION_NAME = os.getenv("COLLECTION_NAME")
+ FLASH_API = os.getenv("FLASH_API")
+ PINECONE_API=os.getenv("PINECONE_API")
+ PINECONE_INDEX=os.getenv("PINECONE_INDEX")
+
+
+ mongo_client = MongoClient(MONGO_URI)
+ db = mongo_client[DB_NAME]
+ collection = db[COLLECTION_NAME]
+ collection2=db['about_company']
+ model = ChatGoogleGenerativeAI(model="gemini-1.5-flash-002", temperature=0.2, max_tokens=None, google_api_key=FLASH_API)
+ google_embeddings = GoogleGenerativeAIEmbeddings(
+     model="models/embedding-001", # Correct model name
+     google_api_key=FLASH_API # Your API key
+ )
+ pc = pinecone.Pinecone(
+     api_key=PINECONE_API # Your Pinecone API key
+ )
+
+ index = pc.Index(PINECONE_INDEX)
+
+ about_company_doc=collection2.find_one({"type":"about_company"})
+ if about_company_doc:
+     about_company=about_company_doc.get('company_description','')
+
+ pdf_temp_dir = 'temp/pdf_files'
+ image_temp_dir = 'temp/page_images'
+
+ os.makedirs(pdf_temp_dir, exist_ok=True)
+ os.makedirs(image_temp_dir, exist_ok=True)
+
+ pdf_path = os.path.join(pdf_temp_dir, 'downloaded_file.pdf')
+
+ def download_and_split_pdf_to_image(url):
+     try:
+         response = requests.get(url)
+         with open(pdf_path, 'wb') as pdf_file:
+             pdf_file.write(response.content)
+
+
+     except Exception as e:
+         print(f"error occured during downloading pdf from object url : {e}")
+         return None
+
+     try:
+         images = convert_from_path(pdf_path)
+         for i, image in enumerate(images):
+             image_path = os.path.join(image_temp_dir, f'page_{i + 1}.png')
+             image.save(image_path, 'PNG')
+             print(f'Saved image: {image_path}')
+         return True
+
+     except Exception as e:
+         print(f"error occured in converting pdf pages to image : {e}")
+         return None
+
+
+
+ system_prompt_text = f"""Given is an image of a PDF page.Your task is to extract all the information from this image and give a detailed summary of the page, do not miss out on any information, include keywords or any terms mentioned in the pdf.Its possible that sometimes the information might look redundant or repetitive still extract it.'
+ Given below is a company information whose pdf page is givn to you, to understand the context.
+ - About Company: {about_company}
+ Follow this Expected output format given below:
+ Expected Output format : {{"description":"String"}}
+
+ """
+
+
+
+ # def process_image_using_llm(image, page_number, url):
+ #     try:
+ #         message = HumanMessage(
+ #             content=[
+ #                 {"type": "text", "text": system_prompt_text},
+ #                 {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image}"}},
+ #             ],
+ #         )
+ #         response = model.invoke([message])
+ #         print(f"LLM response for page {page_number}: {response}")
+ #
+ #         # Extract JSON from the response content using regex
+ #         match = re.search(r"\{.*\}", response.content.strip())
+ #         if match:
+ #             json_data = match.group(0)
+ #
+ #             # Step 1: Locate the "description" field and escape all single quotes within it
+ #             description_match = re.search(r"'description'\s*:\s*('.*?'|\".*?\")", json_data)
+ #
+ #             if description_match:
+ #                 description_text = description_match.group(1)
+ #
+ #                 # Replace outer single quotes with double quotes if necessary
+ #                 if description_text.startswith("'") and description_text.endswith("'"):
+ #                     description_text = f'"{description_text[1:-1]}"'
+ #                 elif description_text.startswith('"') and description_text.endswith('"'):
+ #                     pass # No change needed if already double quotes
+ #
+ #                 # Escape all single quotes within the description text
+ #                 description_text = description_text.replace("'", "\\'")
+ #
+ #                 # Replace the original match with the updated description text
+ #                 json_data = (
+ #                     json_data[:description_match.start(1)] +
+ #                     description_text +
+ #                     json_data[description_match.end(1):]
+ #                 )
+ #
+ #             # Step 2: Attempt to load the cleaned JSON string
+ #             try:
+ #                 data = json.loads(json_data) # Load as JSON
+ #                 description = data.get("description", "None").strip()
+ #                 can_find_description = description != "None"
+ #
+ #                 return {
+ #                     "page_number": page_number,
+ #                     "description": description if can_find_description else None,
+ #                     "can_find_description": can_find_description
+ #                 }
+ #             except json.JSONDecodeError as e:
+ #                 print(f"Error decoding JSON for page {page_number}: {e}")
+ #                 return {
+ #                     "page_number": page_number,
+ #                     "description": None,
+ #                     "can_find_description": False
+ #                 }
+ #         else:
+ #             print(f"No valid JSON found in the response for page {page_number}")
+ #             return {
+ #                 "page_number": page_number,
+ #                 "description": None,
+ #                 "can_find_description": False
+ #             }
+ #
+ #     except Exception as e:
+ #         print(f"Error processing page {page_number}: {e}")
+ #         return {
+ #             "page_number": page_number,
+ #             "description": None,
+ #             "can_find_description": False
+ #         }
+ #
+
+ def process_image_using_llm(image, page_number, url, max_retries=3):
+     for attempt in range(1, max_retries + 1):
+         try:
+             # Send the image and system prompt to the LLM
+             message = HumanMessage(
+                 content=[
+                     {"type": "text", "text": system_prompt_text},
+                     {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image}"}},
+                 ],
+             )
+             response = model.invoke([message])
+             print(f"Attempt {attempt}: LLM response for page {page_number}: {response}")
+
+             # Retry only if JSON data is missing or JSON decode error occurs
+             try:
+                 # Check for JSON content in the response
+                 match = re.search(r"\{.*\}", response.content.strip())
+                 if match:
+                     json_data = match.group(0)
+
+                     # Locate the "description" field and escape single quotes within it
+                     description_match = re.search(r"'description'\s*:\s*('.*?'|\".*?\")", json_data)
+                     if description_match:
+                         description_text = description_match.group(1)
+                         if description_text.startswith("'") and description_text.endswith("'"):
+                             description_text = f'"{description_text[1:-1]}"'
+                         description_text = description_text.replace("'", "\\'")
+
+                         # Replace the original match with the updated description text
+                         json_data = (
+                             json_data[:description_match.start(1)] +
+                             description_text +
+                             json_data[description_match.end(1):]
+                         )
+
+                     # Load the cleaned JSON data
+                     data = json.loads(json_data)
+                     description = data.get("description", "None").strip()
+                     can_find_description = description != "None"
+
+                     return {
+                         "page_number": page_number,
+                         "description": description if can_find_description else None,
+                         "can_find_description": can_find_description
+                     }
+                 else:
+                     print(f"No valid JSON found in the response for page {page_number} on attempt {attempt}")
+
+                 # Trigger retry if no JSON data is found
+                 if attempt == max_retries:
+                     print(f"Max retries reached for page {page_number}. Skipping.")
+                     return {
+                         "page_number": page_number,
+                         "description": None,
+                         "can_find_description": False
+                     }
+
+             except json.JSONDecodeError as e:
+                 print(f"JSON decode error on attempt {attempt} for page {page_number}: {e}")
+                 if attempt == max_retries:
+                     return {
+                         "page_number": page_number,
+                         "description": None,
+                         "can_find_description": False
+                     }
+
+         # Handle any other exceptions without retrying
+         except Exception as e:
+             print(f"Outer exception for page {page_number}: {e}")
+             return {
+                 "page_number": page_number,
+                 "description": None,
+                 "can_find_description": False
+             }
+
+
+ def create_embedding_for_pdf_chunks(page,description,url,tags,categories):
+     try:
+         document = collection.find_one({'object_url': url})
+         file_type = document.get("type")
+         mongo_id = str(document.get('_id'))
+         embedding = google_embeddings.embed_query(description)
+         pinecone_id = str(uuid.uuid4())
+
+         vectors = [{
+             'id': pinecone_id,
+             'values': embedding,
+             'metadata': {
+                 'description': description,
+                 "url": url,
+                 "page_number":page,
+                 "tag": file_type,
+                 "mongo_id": mongo_id,
+                 "tags": ','.join(tags),
+                 "categories": ','.join(categories) # Store MongoDB ID in metadata
+             }
+         }]
+         index.upsert(vectors)
+         print(f"Inserted: page {page} in Pinecone with MongoDB ID {mongo_id} in metadata")
+
+         collection.update_one(
+             {
+                 "_id": document["_id"],
+                 "chunks.page_number": page # Match document and specific chunk by page number
+             },
+             {
+                 "$set": {
+                     "chunks.$.pinecone_id": pinecone_id,
+                     "chunks.$.successfully_embedding_created": True
+                 }
+             }
+         )
+         return True
+
+     except Exception as e:
+         print(f"error occured in creating embedding for pdf with mongo id {mongo_id} for page {page}")
+         collection.update_one(
+             {
+                 "_id": document["_id"],
+                 "chunks.page_number": page # Match document and specific chunk by page number
+             },
+             {
+                 "$set": {
+                     "chunks.$.successfully_embedding_created": False
+                 }
+             }
+         )
+         return False
+
+
+
+ def cleanup_directory(directory_path):
+     try:
+         for filename in os.listdir(directory_path):
+             file_path = os.path.join(directory_path, filename)
+             if os.path.isfile(file_path):
+                 os.remove(file_path)
+         print(f"Cleaned up files in {directory_path}")
+     except Exception as e:
+         print(f"Error cleaning up directory {directory_path}: {e}")
+
+
+ def process_pdf(url,tags,categories):
+     print(f"Processing PDF with URL: {url}")
+     if download_and_split_pdf_to_image(url):
+         chunks = []
+         image_files = sorted(
+             os.listdir(image_temp_dir),
+             key=lambda x: int(re.search(r'page_(\d+)', x).group(1))
+         )
+         for count, image_name in enumerate(image_files, start=1):
+             print(f"Processing page {count} of the PDF")
+             image_path = os.path.join(image_temp_dir, image_name)
+             with open(image_path, "rb") as image_file:
+                 image_data = base64.b64encode(image_file.read()).decode("utf-8")
+             page_result = process_image_using_llm(image_data,count, url)
+             chunks.append(page_result)
+
+         # Update the MongoDB document with the chunks array
+         collection.update_one(
+             {"object_url": url},
+             {"$set": {"chunks": chunks}},
+             upsert=True
+         )
+         print("Saved chunks to MongoDB.")
+
+         total_pages=len(chunks)
+         failed_pages=0
+         for chunk in chunks:
+             page_number=chunk.get('page_number')
+             description=chunk.get("description")
+             if description:
+                 embedding_created=create_embedding_for_pdf_chunks(page_number,description,url,tags,categories)
+                 if not embedding_created:
+                     failed_pages+=1
+             else:
+                 print(f"skipping the page {page_number} as description is None")
+                 failed_pages+=1
+                 continue
+
+         cleanup_directory(pdf_temp_dir)
+         cleanup_directory(image_temp_dir)
+
+         return failed_pages < total_pages
+
+
+
+
+
+
+
+
+
+
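
For anyone applying this commit: the substance of the change is that the Google and Pinecone credentials (and the Pinecone index name) are now read from the environment via load_dotenv() instead of being hardcoded. A minimal sketch of the .env file the updated module would expect is below; every value is a placeholder, and only the variable names are taken from the code.

    MONGO_URI=mongodb+srv://<user>:<password>@<cluster-host>/
    DB_NAME=<database-name>
    COLLECTION_NAME=<collection-name>
    # FLASH_API is now passed to both the chat model and the embeddings client
    FLASH_API=<google-generative-ai-key>
    PINECONE_API=<pinecone-api-key>
    PINECONE_INDEX=<pinecone-index-name>

With those set, a caller would drive the pipeline through process_pdf. The sketch below is hypothetical (the URL, tags, and categories are illustrative), and it assumes the MongoDB document with that object_url already exists, since create_embedding_for_pdf_chunks looks it up to attach a pinecone_id to each chunk:

    from helper.process_pdf import process_pdf

    # Downloads the PDF, renders each page to PNG, summarizes every page with
    # gemini-1.5-flash-002, saves the per-page chunks to MongoDB, and upserts
    # one embedding per described page into the Pinecone index.
    ok = process_pdf(
        "https://example.com/some-report.pdf",
        ["annual-report"],
        ["statistics"],
    )
    print("embedded at least one page" if ok else "no page could be embedded")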