Update helper/process_pdf.py
helper/process_pdf.py  CHANGED  (+351 -347)
@@ -1,347 +1,351 @@
-import requests
-from pdf2image import convert_from_path
-import base64
-from pymongo import MongoClient
-from langchain_google_genai import ChatGoogleGenerativeAI
-from langchain_google_genai import GoogleGenerativeAIEmbeddings
-from langchain_core.messages import HumanMessage
-import os
-import re
-import json
-import uuid
-from dotenv import load_dotenv
-import pinecone
-load_dotenv()
-MONGO_URI = os.getenv("MONGO_URI")
-DB_NAME = os.getenv("DB_NAME")
-COLLECTION_NAME = os.getenv("COLLECTION_NAME")
-FLASH_API = os.getenv("FLASH_API")
+import requests
+from pdf2image import convert_from_path
+import base64
+from pymongo import MongoClient
+from langchain_google_genai import ChatGoogleGenerativeAI
+from langchain_google_genai import GoogleGenerativeAIEmbeddings
+from langchain_core.messages import HumanMessage
+import os
+import re
+import json
+import uuid
+from dotenv import load_dotenv
+import pinecone
+load_dotenv()
+MONGO_URI = os.getenv("MONGO_URI")
+DB_NAME = os.getenv("DB_NAME")
+COLLECTION_NAME = os.getenv("COLLECTION_NAME")
+FLASH_API = os.getenv("FLASH_API")
+PINECONE_API = os.getenv("PINECONE_API")
+PINECONE_INDEX = os.getenv("PINECONE_INDEX")
+
+
+mongo_client = MongoClient(MONGO_URI)
+db = mongo_client[DB_NAME]
+collection = db[COLLECTION_NAME]
+collection2 = db['about_company']
+model = ChatGoogleGenerativeAI(model="gemini-1.5-flash-002", temperature=0.2, max_tokens=None, google_api_key=FLASH_API)
+google_embeddings = GoogleGenerativeAIEmbeddings(
+    model="models/embedding-001",  # Correct model name
+    google_api_key=FLASH_API  # Your API key
+)
+pc = pinecone.Pinecone(
+    api_key=PINECONE_API  # Your Pinecone API key
+)
+
+index = pc.Index(PINECONE_INDEX)
+
+about_company_doc = collection2.find_one({"type": "about_company"})
+if about_company_doc:
+    about_company = about_company_doc.get('company_description', '')
+
+pdf_temp_dir = 'temp/pdf_files'
+image_temp_dir = 'temp/page_images'
+
+os.makedirs(pdf_temp_dir, exist_ok=True)
+os.makedirs(image_temp_dir, exist_ok=True)
+
+pdf_path = os.path.join(pdf_temp_dir, 'downloaded_file.pdf')
+
+def download_and_split_pdf_to_image(url):
+    try:
+        response = requests.get(url)
+        with open(pdf_path, 'wb') as pdf_file:
+            pdf_file.write(response.content)
+
+    except Exception as e:
+        print(f"error occured during downloading pdf from object url : {e}")
+        return None
+
+    try:
+        images = convert_from_path(pdf_path)
+        for i, image in enumerate(images):
+            image_path = os.path.join(image_temp_dir, f'page_{i + 1}.png')
+            image.save(image_path, 'PNG')
+            print(f'Saved image: {image_path}')
+        return True
+
+    except Exception as e:
+        print(f"error occured in converting pdf pages to image : {e}")
+        return None
+
+
+
+system_prompt_text = f"""Given is an image of a PDF page.Your task is to extract all the information from this image and give a detailed summary of the page, do not miss out on any information, include keywords or any terms mentioned in the pdf.Its possible that sometimes the information might look redundant or repetitive still extract it.'
+Given below is a company information whose pdf page is givn to you, to understand the context.
+- About Company: {about_company}
+Follow this Expected output format given below:
+Expected Output format : {{"description":"String"}}
+
+"""
+
+
+
+# def process_image_using_llm(image, page_number, url):
+#     try:
+#         message = HumanMessage(
+#             content=[
+#                 {"type": "text", "text": system_prompt_text},
+#                 {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image}"}},
+#             ],
+#         )
+#         response = model.invoke([message])
+#         print(f"LLM response for page {page_number}: {response}")
+#
+#         # Extract JSON from the response content using regex
+#         match = re.search(r"\{.*\}", response.content.strip())
+#         if match:
+#             json_data = match.group(0)
+#
+#             # Step 1: Locate the "description" field and escape all single quotes within it
+#             description_match = re.search(r"'description'\s*:\s*('.*?'|\".*?\")", json_data)
+#
+#             if description_match:
+#                 description_text = description_match.group(1)
+#
+#                 # Replace outer single quotes with double quotes if necessary
+#                 if description_text.startswith("'") and description_text.endswith("'"):
+#                     description_text = f'"{description_text[1:-1]}"'
+#                 elif description_text.startswith('"') and description_text.endswith('"'):
+#                     pass  # No change needed if already double quotes
+#
+#                 # Escape all single quotes within the description text
+#                 description_text = description_text.replace("'", "\\'")
+#
+#                 # Replace the original match with the updated description text
+#                 json_data = (
+#                     json_data[:description_match.start(1)] +
+#                     description_text +
+#                     json_data[description_match.end(1):]
+#                 )
+#
+#             # Step 2: Attempt to load the cleaned JSON string
+#             try:
+#                 data = json.loads(json_data)  # Load as JSON
+#                 description = data.get("description", "None").strip()
+#                 can_find_description = description != "None"
+#
+#                 return {
+#                     "page_number": page_number,
+#                     "description": description if can_find_description else None,
+#                     "can_find_description": can_find_description
+#                 }
+#             except json.JSONDecodeError as e:
+#                 print(f"Error decoding JSON for page {page_number}: {e}")
+#                 return {
+#                     "page_number": page_number,
+#                     "description": None,
+#                     "can_find_description": False
+#                 }
+#         else:
+#             print(f"No valid JSON found in the response for page {page_number}")
+#             return {
+#                 "page_number": page_number,
+#                 "description": None,
+#                 "can_find_description": False
+#             }
+#
+#     except Exception as e:
+#         print(f"Error processing page {page_number}: {e}")
+#         return {
+#             "page_number": page_number,
+#             "description": None,
+#             "can_find_description": False
+#         }
+#
+
+def process_image_using_llm(image, page_number, url, max_retries=3):
+    for attempt in range(1, max_retries + 1):
+        try:
+            # Send the image and system prompt to the LLM
+            message = HumanMessage(
+                content=[
+                    {"type": "text", "text": system_prompt_text},
+                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image}"}},
+                ],
+            )
+            response = model.invoke([message])
+            print(f"Attempt {attempt}: LLM response for page {page_number}: {response}")
+
+            # Retry only if JSON data is missing or JSON decode error occurs
+            try:
+                # Check for JSON content in the response
+                match = re.search(r"\{.*\}", response.content.strip())
+                if match:
+                    json_data = match.group(0)
+
+                    # Locate the "description" field and escape single quotes within it
+                    description_match = re.search(r"'description'\s*:\s*('.*?'|\".*?\")", json_data)
+                    if description_match:
+                        description_text = description_match.group(1)
+                        if description_text.startswith("'") and description_text.endswith("'"):
+                            description_text = f'"{description_text[1:-1]}"'
+                        description_text = description_text.replace("'", "\\'")
+
+                        # Replace the original match with the updated description text
+                        json_data = (
+                            json_data[:description_match.start(1)] +
+                            description_text +
+                            json_data[description_match.end(1):]
+                        )
+
+                    # Load the cleaned JSON data
+                    data = json.loads(json_data)
+                    description = data.get("description", "None").strip()
+                    can_find_description = description != "None"
+
+                    return {
+                        "page_number": page_number,
+                        "description": description if can_find_description else None,
+                        "can_find_description": can_find_description
+                    }
+                else:
+                    print(f"No valid JSON found in the response for page {page_number} on attempt {attempt}")
+
+                    # Trigger retry if no JSON data is found
+                    if attempt == max_retries:
+                        print(f"Max retries reached for page {page_number}. Skipping.")
+                        return {
+                            "page_number": page_number,
+                            "description": None,
+                            "can_find_description": False
+                        }
+
+            except json.JSONDecodeError as e:
+                print(f"JSON decode error on attempt {attempt} for page {page_number}: {e}")
+                if attempt == max_retries:
+                    return {
+                        "page_number": page_number,
+                        "description": None,
+                        "can_find_description": False
+                    }
+
+        # Handle any other exceptions without retrying
+        except Exception as e:
+            print(f"Outer exception for page {page_number}: {e}")
+            return {
+                "page_number": page_number,
+                "description": None,
+                "can_find_description": False
+            }
+
+
+def create_embedding_for_pdf_chunks(page, description, url, tags, categories):
+    try:
+        document = collection.find_one({'object_url': url})
+        file_type = document.get("type")
+        mongo_id = str(document.get('_id'))
+        embedding = google_embeddings.embed_query(description)
+        pinecone_id = str(uuid.uuid4())
+
+        vectors = [{
+            'id': pinecone_id,
+            'values': embedding,
+            'metadata': {
+                'description': description,
+                "url": url,
+                "page_number": page,
+                "tag": file_type,
+                "mongo_id": mongo_id,  # Store MongoDB ID in metadata
+                "tags": ','.join(tags),
+                "categories": ','.join(categories)
+            }
+        }]
+        index.upsert(vectors)
+        print(f"Inserted: page {page} in Pinecone with MongoDB ID {mongo_id} in metadata")
+
+        collection.update_one(
+            {
+                "_id": document["_id"],
+                "chunks.page_number": page  # Match document and specific chunk by page number
+            },
+            {
+                "$set": {
+                    "chunks.$.pinecone_id": pinecone_id,
+                    "chunks.$.successfully_embedding_created": True
+                }
+            }
+        )
+        return True
+
+    except Exception as e:
+        print(f"error occured in creating embedding for pdf with mongo id {mongo_id} for page {page}")
+        collection.update_one(
+            {
+                "_id": document["_id"],
+                "chunks.page_number": page  # Match document and specific chunk by page number
+            },
+            {
+                "$set": {
+                    "chunks.$.successfully_embedding_created": False
+                }
+            }
+        )
+        return False
+
+
+
+def cleanup_directory(directory_path):
+    try:
+        for filename in os.listdir(directory_path):
+            file_path = os.path.join(directory_path, filename)
+            if os.path.isfile(file_path):
+                os.remove(file_path)
+        print(f"Cleaned up files in {directory_path}")
+    except Exception as e:
+        print(f"Error cleaning up directory {directory_path}: {e}")
+
+
+def process_pdf(url, tags, categories):
+    print(f"Processing PDF with URL: {url}")
+    if download_and_split_pdf_to_image(url):
+        chunks = []
+        image_files = sorted(
+            os.listdir(image_temp_dir),
+            key=lambda x: int(re.search(r'page_(\d+)', x).group(1))
+        )
+        for count, image_name in enumerate(image_files, start=1):
+            print(f"Processing page {count} of the PDF")
+            image_path = os.path.join(image_temp_dir, image_name)
+            with open(image_path, "rb") as image_file:
+                image_data = base64.b64encode(image_file.read()).decode("utf-8")
+            page_result = process_image_using_llm(image_data, count, url)
+            chunks.append(page_result)
+
+        # Update the MongoDB document with the chunks array
+        collection.update_one(
+            {"object_url": url},
+            {"$set": {"chunks": chunks}},
+            upsert=True
+        )
+        print("Saved chunks to MongoDB.")
+
+        total_pages = len(chunks)
+        failed_pages = 0
+        for chunk in chunks:
+            page_number = chunk.get('page_number')
+            description = chunk.get("description")
+            if description:
+                embedding_created = create_embedding_for_pdf_chunks(page_number, description, url, tags, categories)
+                if not embedding_created:
+                    failed_pages += 1
+            else:
+                print(f"skipping the page {page_number} as description is None")
+                failed_pages += 1
+                continue
+
+        cleanup_directory(pdf_temp_dir)
+        cleanup_directory(image_temp_dir)
+
+        return failed_pages < total_pages
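
For context, a minimal sketch of how the updated helper might be invoked from application code. It assumes helper/ is importable as a package; the object URL, tags, and categories below are hypothetical placeholders supplied by the caller, not values from the commit.

from helper.process_pdf import process_pdf

# Hypothetical inputs: a downloadable object URL plus tag/category lists
# that are stored in the Pinecone metadata for each page.
object_url = "https://example.com/files/sample.pdf"
tags = ["brochure", "2024"]
categories = ["marketing"]

if process_pdf(object_url, tags, categories):
    print("At least one page was summarised and embedded.")
else:
    print("PDF could not be processed, or every page failed; check the chunks array in MongoDB.")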