HarshilRamiAISV committed · Commit 79ac2cd · verified · 1 Parent(s): 615f50c

Upload 3 files

Files changed (3):
  1. app.py +529 -0
  2. process.py +376 -0
  3. requirements.txt +120 -0
app.py ADDED
@@ -0,0 +1,529 @@
+ import os
+ import shutil
+ 
+ import PyPDF2
+ import streamlit as st
+ 
+ from process import process, process_using_llm
+ 
+ TEMP_DIR = "temp_files"
+ 
+ st.set_page_config(layout="wide", page_title="KYC Doc AI")
+ 
+ 
+ def cleanup_temp_files():
+     try:
+         if os.path.exists(TEMP_DIR):
+             # Remove all files in TEMP_DIR
+             shutil.rmtree(TEMP_DIR)
+             print(f"Temporary files in {TEMP_DIR} have been deleted.")
+             # Re-create the temp directory after cleanup
+             os.makedirs(TEMP_DIR)
+     except Exception as e:
+         print(f"An error occurred during cleanup: {e}")
+ 
+ 
+ def extract_pages(input_pdf_path, output_pdf_path, start_page=None, end_page=None):
+     try:
+         # Open the PDF file
+         with open(input_pdf_path, 'rb') as input_pdf:
+             reader = PyPDF2.PdfReader(input_pdf)
+             total_pages = len(reader.pages)
+ 
+             # Create a PDF writer object for the new PDF
+             writer = PyPDF2.PdfWriter()
+ 
+             # Default: extract only the first page if no specific input is provided
+             if start_page is None and end_page is None:
+                 start_page, end_page = 0, 0
+ 
+             # Special case: start_page=0 and end_page=-1 extracts the first and last page only
+             if start_page == 0 and end_page == -1:
+                 writer.add_page(reader.pages[0])   # First page
+                 writer.add_page(reader.pages[-1])  # Last page
+             # If only the first page is required
+             elif start_page == 0 and end_page == 0:
+                 writer.add_page(reader.pages[0])   # First page
+             else:
+                 print("Invalid input. Only first page or (first and last page) extraction is allowed.")
+                 return
+ 
+             # Write the combined PDF to a new file
+             with open(output_pdf_path, 'wb') as output_pdf:
+                 writer.write(output_pdf)
+ 
+         print(f"PDF saved as {output_pdf_path}")
+ 
+     except Exception as e:
+         print(f"An error occurred: {e}")
+ 
+ 
+ def merge_dicts_by_aadhaar(data):
+     """Merge the front/back extractions of the same Aadhaar card into a single record."""
+     new_dic = {}
+     for dic in data:
+         aadhar = dic.get("Aadhaar Number")
+         if not aadhar:
+             continue
+         aadhar = aadhar.replace(" ", "")
+         if aadhar not in new_dic:
+             new_dic[aadhar] = {}
+         # The side that carries Gender and Date of birth also carries Name and number;
+         # the other side carries the Address.
+         if dic.get('Gender') and dic.get('Date of birth'):
+             new_dic[aadhar]['Gender'] = dic.get('Gender')
+             new_dic[aadhar]['Date of birth'] = dic.get('Date of birth')
+             new_dic[aadhar]['Name'] = dic.get('Name')
+             new_dic[aadhar]['Aadhaar Number'] = aadhar
+         else:
+             new_dic[aadhar]['Address'] = dic.get('Address')
+ 
+     return list(new_dic.values())
+ 
+ 
+ def process_uploads(uploaded_files):
+     try:
+         company_name = ""
+         company_name_legal = ""
+         company_trade_name = ""
+         gst_number = ""
+         pan_number_company = ""
+         coi_number = ""
+         director_names = []
+ 
+         extracted_results = {}
+         if not os.path.exists(TEMP_DIR):
+             os.makedirs(TEMP_DIR)
+ 
+         # Director PAN cards: can be individual files or a single file
+         director_pans = uploaded_files.get('director_pans', None)
+         if director_pans:
+             director_pan_data = []
+             for pan in director_pans:
+                 file_path = os.path.join(TEMP_DIR, pan.name)
+                 with open(file_path, "wb") as temp_file:
+                     temp_file.write(pan.getbuffer())
+                 ocr_data = process(file_path)
+                 content = ""
+                 for page_num, text in ocr_data.items():
+                     content += text + '\n'
+ 
+                 details = process_using_llm(content, "pan_user")
+                 if details:
+                     director_pan_data.append(details)
+ 
+             for pan_data in director_pan_data:
+                 if "Name" in pan_data:
+                     director_names.append(pan_data.get('Name').strip().lower())
+             extracted_results['Pan Cards of Directors'] = director_pan_data
+ 
+         director_aadhars = uploaded_files.get("director_aadhars", None)
+         if director_aadhars:
+             director_aadhars_data = []
+             for aadhar in director_aadhars:
+                 file_path = os.path.join(TEMP_DIR, aadhar.name)
+                 with open(file_path, "wb") as temp_file:
+                     temp_file.write(aadhar.getbuffer())
+                 ocr_data = process(file_path)
+                 content = ""
+                 for page_num, text in ocr_data.items():
+                     content += text + '\n'
+ 
+                 details = process_using_llm(content, "aadhar_user")
+                 if details:
+                     director_aadhars_data.append(details)
+ 
+             director_aadhars_data_new = merge_dicts_by_aadhaar(director_aadhars_data)
+             for direc_adhar in director_aadhars_data_new:
+                 if "Name" in direc_adhar:
+                     name = direc_adhar.get('Name').strip().lower()
+                     if name not in director_names:
+                         director_names.append(name)
+             extracted_results["Aadhaar Cards of Directors"] = director_aadhars_data_new
+             print(director_aadhars_data_new)
+ 
+         gst_cert = uploaded_files.get('gst_certificate', None)
+         if gst_cert:
+             file_path = os.path.join(TEMP_DIR, gst_cert.name)
+             with open(file_path, "wb") as temp_file:
+                 temp_file.write(gst_cert.getbuffer())
+             ocr_data = process(file_path)
+             content = ""
+             for page_num, text in ocr_data.items():
+                 content += text + '\n'
+ 
+             details = process_using_llm(content, "gst")
+             if "Legal Name" in details:
+                 company_name_legal = details.get('Legal Name', '')
+                 details['Is Company Matching'] = "Yes"
+             elif "Trade Name" in details:
+                 company_trade_name = details.get('Trade Name', '')
+                 details['Is Company Matching'] = "Yes"
+ 
+             if "Registration Number" in details:
+                 gst_number = details.get('Registration Number')
+ 
+             if "Names of directors" in details:
+                 gst_director_names = details.get("Names of directors", [])
+                 details["Names of directors"] = ",".join(gst_director_names)
+                 missing_directors = []
+                 gst_director_names_lower = [name.strip().lower() for name in gst_director_names]
+                 for direc_name in director_names:
+                     if direc_name not in gst_director_names_lower:
+                         missing_directors.append(direc_name)
+ 
+                 if not missing_directors:
+                     details["All director names present?"] = "<span style='color: green;'><strong>Yes</strong></span>"
+                 else:
+                     # List missing director names in red
+                     missing_directors_text = ', '.join(
+                         [f"<span style='color: red;'>{name}</span>" for name in missing_directors])
+                     details["All director names present?"] = (
+                         f"<span style='color: red;'><strong>No</strong></span> (Missing: {missing_directors_text})")
+ 
+             extracted_results['GST Certificate Details'] = details
+ 
+         company_pan = uploaded_files.get('company_pan', None)
+         if company_pan:
+             file_path = os.path.join(TEMP_DIR, company_pan.name)
+             with open(file_path, "wb") as temp_file:
+                 temp_file.write(company_pan.getbuffer())
+             ocr_data = process(file_path)
+             content = ""
+             for page_num, text in ocr_data.items():
+                 content += text + '\n'
+ 
+             details = process_using_llm(content, "company_pan")
+             if "Company Name" in details:
+                 name = details.get("Company Name")
+                 if name == company_trade_name or name == company_name_legal:
+                     details['Is Company Matching'] = "Yes"
+                     company_name = name.strip()
+ 
+             if "PAN Number" in details:
+                 pan_number_company = details.get('PAN Number').strip()
+             extracted_results['Company PAN Details'] = details
+ 
+         coi = uploaded_files.get('coi', None)
+         if coi:
+             file_path = os.path.join(TEMP_DIR, coi.name)
+             with open(file_path, "wb") as temp_file:
+                 temp_file.write(coi.getbuffer())
+             ocr_data = process(file_path)
+             content = ""
+             for page_num, text in ocr_data.items():
+                 content += text + '\n'
+ 
+             details = process_using_llm(content, "coi")
+             if "Company Name" in details:
+                 name = details.get("Company Name")
+                 if name == company_trade_name or name == company_name_legal or name == company_name:
+                     details['Is Company Matching'] = "Yes"
+                     company_name = name.strip()
+                 else:
+                     details['Is Company Matching'] = "No"
+ 
+             if "PAN Number" in details and details.get('PAN Number', '').strip() == pan_number_company:
+                 details['Is Company PAN Number Matching'] = "Yes"
+             elif "PAN Number" in details and details.get('PAN Number', '').strip() != pan_number_company:
+                 details['Is Company PAN Number Matching'] = "No"
+ 
+             if "Corporate Identity Number" in details:
+                 coi_number = details.get("Corporate Identity Number").strip()
+ 
+             extracted_results['COI Details'] = details
+             print(details)
+ 
+         aoa = uploaded_files.get('aoa', None)
+         if aoa:
+             file_path = os.path.join(TEMP_DIR, aoa.name)
+             with open(file_path, "wb") as temp_file:
+                 temp_file.write(aoa.getbuffer())
+ 
+             # Keep only the first and last page before running OCR
+             extract_pages(file_path, file_path, 0, -1)
+             ocr_data = process(file_path)
+             content = ""
+             for page_num, text in ocr_data.items():
+                 content += text + '\n'
+ 
+             details = process_using_llm(content, "aoa")
+             if "Company Name" in details:
+                 name = details.get("Company Name").strip()
+                 if name == company_trade_name or name == company_name_legal or name == company_name:
+                     details['Is Company Matching'] = "Yes"
+                     company_name = name
+                 else:
+                     details['Is Company Matching'] = "No"
+             if "Share Holders" in details:
+                 share_holders = details.get("Share Holders", [])
+                 details["Share Holders"] = ",".join(share_holders)
+             extracted_results['AOA Details'] = details
+             print(details)
+ 
+         moa = uploaded_files.get('moa', None)
+         if moa:
+             file_path = os.path.join(TEMP_DIR, moa.name)
+             with open(file_path, "wb") as temp_file:
+                 temp_file.write(moa.getbuffer())
+ 
+             # Keep only the first and last page before running OCR
+             extract_pages(file_path, file_path, 0, -1)
+             ocr_data = process(file_path)
+             content = ""
+             for page_num, text in ocr_data.items():
+                 content += text + '\n'
+ 
+             details = process_using_llm(content, "moa")
+             if "Company Name" in details:
+                 name = details.get("Company Name").strip()
+                 if name == company_trade_name or name == company_name_legal or name == company_name:
+                     details['Is Company Matching'] = "Yes"
+                     company_name = name
+                 else:
+                     details['Is Company Matching'] = "No"
+             if "Share Holders" in details:
+                 share_holders = details.get("Share Holders", [])
+                 details["Share Holders"] = ",".join(share_holders)
+             extracted_results['MOA Details'] = details
+             print(details)
+ 
+         share = uploaded_files.get('share', None)
+         if share:
+             file_path = os.path.join(TEMP_DIR, share.name)
+             with open(file_path, "wb") as temp_file:
+                 temp_file.write(share.getbuffer())
+ 
+             ocr_data = process(file_path)
+             content = ""
+             for page_num, text in ocr_data.items():
+                 content += text + '\n'
+ 
+             details = process_using_llm(content, "share")
+             if "Company Name" in details:
+                 name = details.get("Company Name").strip()
+                 if name == company_trade_name or name == company_name_legal or name == company_name:
+                     details['Is Company Matching'] = "Yes"
+                     company_name = name
+                 else:
+                     details['Is Company Matching'] = "No"
+ 
+             if "Corporate Identity Number" in details and details.get("Corporate Identity Number").strip() == coi_number:
+                 details['Is Corporate Identity Number Matching'] = "Yes"
+             elif "Corporate Identity Number" in details and details.get("Corporate Identity Number").strip() != coi_number:
+                 details['Is Corporate Identity Number Matching'] = "No"
+ 
+             if "Share Holders" in details:
+                 share_holders = details.get("Share Holders", [])
+                 details["Share Holders"] = ",".join(share_holders)
+ 
+             extracted_results['Shareholding Details'] = details
+             print(details)
+ 
+         address_proof = uploaded_files.get('address_proof', None)
+         if address_proof:
+             file_path = os.path.join(TEMP_DIR, address_proof.name)
+             with open(file_path, "wb") as temp_file:
+                 temp_file.write(address_proof.getbuffer())
+             # Keep only the first page before running OCR
+             extract_pages(file_path, file_path)
+             ocr_data = process(file_path)
+             content = ""
+             for page_num, text in ocr_data.items():
+                 content += text + '\n'
+ 
+             details = process_using_llm(content, "stamp")
+             if "Stamp Duty" in details:
+                 duty = details.get("Stamp Duty", None)
+                 if isinstance(duty, (int, float)) and duty >= 100:
+                     details['Valid Stamp'] = "Yes"
+             subword = "nota"
+             if subword in content.lower():
+                 details['Notary Stamp'] = "Present"
+ 
+             extracted_results['Address Proof Details(Non Judicial Stamp)'] = details
+             print(details)
+ 
+         return extracted_results
+     except Exception as e:
+         print(f"Error occurred in processing files: {e}")
+     finally:
+         cleanup_temp_files()
+ 
+ 
+ def display_results_in_cards(extracted_results):
+     if not extracted_results:
+         st.warning("No results to display.")
+         return
+ 
+     col1, col2, col3 = st.columns(3)  # Create three columns
+     count = 0  # To keep track of the row/column positioning
+ 
+     for key in extracted_results:
+         # Determine which column to use (cycle through 3 columns)
+         current_col = col1 if count % 3 == 0 else col2 if count % 3 == 1 else col3
+ 
+         with current_col:
+             # Process director PAN data
+             if key == "Pan Cards of Directors":
+                 d_pans = extracted_results[key]
+                 text = ""
+ 
+                 # Build the text for each director's PAN information
+                 # (separate loop variable so the column counter is not clobbered)
+                 for idx, d_pan in enumerate(d_pans):
+                     text += f"<h4 style='color:black;'>Pan Information of Director {idx + 1}</h4>"
+                     for key2 in d_pan:
+                         # Add each field to the text
+                         field_text = f"<strong style='color: #3498db;'>{key2}</strong>: {d_pan[key2]}<br>"
+                         text += field_text
+ 
+                 # Display in a custom card-like layout
+                 current_col.markdown(f"""
+                     <div style="border: 1px solid #ddd; padding: 15px; border-radius: 10px; margin-bottom: 20px; background-color: #f9f9f9; height: 250px; display: flex; flex-direction: column; justify-content: space-between;">
+                         <div style="color: #333; height: 150px; overflow-y: auto; scrollbar-width: thin; scrollbar-color: #888 #ddd;">
+                             <p>{text}</p>
+                         </div>
+                     </div>
+                 """, unsafe_allow_html=True)
+ 
+             # Process director Aadhaar data
+             elif key == "Aadhaar Cards of Directors":
+                 d_aadhars = extracted_results[key]
+                 text = ""
+ 
+                 for idx, d_aadhar in enumerate(d_aadhars):
+                     text += f"<h4 style='color:black;'>Aadhaar Information of Director {idx + 1}</h4>"
+                     for key2 in d_aadhar:
+                         field_text = f"<strong style='color: #3498db;'>{key2}</strong>: {d_aadhar[key2]}<br>"
+                         text += field_text
+ 
+                 current_col.markdown(f"""
+                     <div style="border: 1px solid #ddd; padding: 15px; border-radius: 10px; margin-bottom: 20px; background-color: #f9f9f9; height: 250px; display: flex; flex-direction: column; justify-content: space-between;">
+                         <div style="color: #333; height: 230px; overflow-y: auto; scrollbar-width: thin; scrollbar-color: #888 #ddd;">
+                             <p>{text}</p>
+                         </div>
+                     </div>
+                 """, unsafe_allow_html=True)
+ 
+             # Process other documents and check for 'Yes'/'No' status
+             else:
+                 data = extracted_results[key]
+                 text = f"<h4 style='color:black;'>{key}</h4>"
+ 
+                 for key2 in data:
+                     # Apply color and bold formatting for "Yes" and "No" values
+                     if key2 in ["Is Company Matching", "Is Corporate Identity Number Matching",
+                                 "Is Company PAN Number Matching"]:
+                         if data[key2] == "Yes":
+                             field_text = f"<strong style='color: #3498db;'>{key2}</strong>: <strong><span style='color: green;'>{data[key2]}</span></strong><br>"
+                         elif data[key2] == "No":
+                             field_text = f"<strong style='color: #3498db;'>{key2}</strong>: <strong><span style='color: red;'>{data[key2]}</span></strong><br>"
+                         else:
+                             field_text = f"<strong style='color: #3498db;'>{key2}</strong>: {data[key2]}<br>"
+                     else:
+                         field_text = f"<strong style='color: #3498db;'>{key2}</strong>: {data[key2]}<br>"
+ 
+                     text += field_text
+ 
+                 current_col.markdown(f"""
+                     <div style="border: 1px solid #ddd; padding: 15px; border-radius: 10px; margin-bottom: 20px; background-color: #f9f9f9; height: 250px; display: flex; flex-direction: column; justify-content: space-between;">
+                         <div style="color: #333; height: 230px; overflow-y: auto; scrollbar-width: thin; scrollbar-color: #888 #ddd;">
+                             <p>{text}</p>
+                         </div>
+                     </div>
+                 """, unsafe_allow_html=True)
+ 
+         count += 1
+ 
+ 
+ def main():
+     if "submitted" not in st.session_state:
+         st.session_state["submitted"] = False
+ 
+     if not st.session_state["submitted"]:
+         st.title("Welcome to KYC DOC AI")
+ 
+         # Create a form to handle all file uploads and submission
+         with st.form("kyc_form"):
+             uploaded_files = {}
+ 
+             # Director PAN Cards (multiple files)
+             director_pans = st.file_uploader("Upload PAN Cards of Directors", type=['png', 'jpg', 'jpeg', 'pdf'], accept_multiple_files=True)
+             if director_pans:
+                 uploaded_files['director_pans'] = director_pans
+ 
+             # Director Aadhaar Cards (multiple files)
+             director_aadhars = st.file_uploader("Upload Aadhaar Cards of Directors", type=['png', 'jpg', 'jpeg', 'pdf'], accept_multiple_files=True)
+             if director_aadhars:
+                 uploaded_files['director_aadhars'] = director_aadhars
+ 
+             # GST Certificate (single file)
+             gst_certificate = st.file_uploader("Upload GST Certificate", type=['png', 'jpg', 'jpeg', 'pdf'], accept_multiple_files=False)
+             if gst_certificate:
+                 uploaded_files['gst_certificate'] = gst_certificate
+ 
+             # Company PAN Card (single file)
+             company_pan = st.file_uploader("Upload PAN Card of Company", type=['png', 'jpg', 'jpeg', 'pdf'], accept_multiple_files=False)
+             if company_pan:
+                 uploaded_files['company_pan'] = company_pan
+ 
+             # Certificate of Incorporation (single file)
+             coi_certificate = st.file_uploader("Upload CERTIFICATE OF INCORPORATION", type=['png', 'jpg', 'jpeg', 'pdf'], accept_multiple_files=False)
+             if coi_certificate:
+                 uploaded_files['coi'] = coi_certificate
+ 
+             # AOA Document (single file)
+             aoa = st.file_uploader("Upload AOA document", type=['png', 'jpg', 'jpeg', 'pdf'], accept_multiple_files=False)
+             if aoa:
+                 uploaded_files['aoa'] = aoa
+ 
+             # MOA Document (single file)
+             moa = st.file_uploader("Upload MOA document", type=['png', 'jpg', 'jpeg', 'pdf'], accept_multiple_files=False)
+             if moa:
+                 uploaded_files['moa'] = moa
+ 
+             # Shareholding Document (single file)
+             share = st.file_uploader("Upload Shareholding Document", type=['png', 'jpg', 'jpeg', 'pdf'], accept_multiple_files=False)
+             if share:
+                 uploaded_files['share'] = share
+ 
+             # Address Proof of Company Office (single file)
+             address = st.file_uploader("Upload Address Proof of Company Office", type=['png', 'jpg', 'jpeg', 'pdf'], accept_multiple_files=False)
+             if address:
+                 uploaded_files['address_proof'] = address
+ 
+             # Submit button for the form
+             submitted = st.form_submit_button("Process Files")
+             if submitted:
+                 with st.spinner('Processing files...'):
+                     extracted_results = process_uploads(uploaded_files)
+ 
+                 st.session_state["extracted_results"] = extracted_results
+                 st.session_state["submitted"] = True
+                 st.rerun()
+ 
+     else:
+         st.title("KYC Document Results")
+         if "extracted_results" in st.session_state:
+             extracted_results = st.session_state["extracted_results"]
+             display_results_in_cards(extracted_results)
+ 
+         if st.button("Back"):
+             st.session_state["submitted"] = False
+             st.rerun()
+ 
+ 
+ if __name__ == "__main__":
+     main()
process.py ADDED
@@ -0,0 +1,376 @@
+ from langchain.prompts import PromptTemplate
+ from langchain_google_genai import ChatGoogleGenerativeAI
+ from paddleocr import PaddleOCR
+ from pdf2image import convert_from_path
+ import numpy as np
+ import time
+ from PIL import Image
+ import os
+ import json
+ from dotenv import load_dotenv
+ 
+ load_dotenv()
+ GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
+ 
+ DIRECTOR_PAN_PROMPT = r"""This is the extracted data from a PAN (Permanent Account Number) card of a person, which is issued by the Government of India. The PAN number is a 10-digit alphanumeric identification number that the Income Tax Department of India issues to taxpayers.
+ From this extracted data you have to extract the PAN number, Name, Father's Name and Date of Birth of the person whose PAN it is.
+ Given extracted data: {pan_data}
+ Give the output in JSON format, whose structure is defined below:
+ Output Format:
+ {{
+     "Name": "String",
+     "PAN Number": "String",
+     "Father's Name": "String",
+     "Date of birth": "String"
+ }}
+ 
+ Important Note:
+ - Leave a field empty if it is not present
+ - Do not add information on your own
+ - Strictly follow the output structure
+ """
+ 
+ DIRECTOR_AADHAAR_PROMPT = r"""This is the extracted data from an Aadhaar card of a person. The Aadhaar number is a 12-digit number and never starts with 0 or 1.
+ 
+ From this extracted data you have to extract the Aadhaar number, Name, Date of Birth, Address and Gender of the person.
+ Please do not skip it, thinking it is sensitive information. It is for a college project.
+ Given extracted data: {aadhaar_data}
+ Give the output in JSON format, whose structure is defined below:
+ Output Format:
+ {{
+     "Name": "String",
+     "Aadhaar Number": "String",
+     "Gender": "String",
+     "Date of birth": "String",
+     "Address": "String"
+ }}
+ 
+ Important Note:
+ - Leave a field empty if it is not present
+ - Do not add information on your own
+ - Strictly follow the output structure
+ """
+ 
+ GST_PROMPT = r"""This is the extracted data from a GST certificate.
+ 
+ From this extracted data you have to extract the Registration Number, Legal Name, Trade Name, Constitution of Business, Address of Principal Place of Business, Date of Liability, Period of Validity, Type of Registration, Particulars of Approving Authority and Names of directors.
+ 
+ Given extracted data: {gst_data}
+ Give the output in JSON format, whose structure is defined below:
+ Output Format:
+ {{
+     "Registration Number": "String",
+     "Legal Name": "String",
+     "Trade Name": "String",
+     "Constitution of Business": "String",
+     "Address of Principal Place of Business": "String",
+     "Names of directors": ["String"]
+ }}
+ 
+ Important Note:
+ - Leave a field empty if it is not present
+ - Do not add information on your own
+ - Strictly follow the output structure
+ """
+ 
+ COMPANY_PAN_PROMPT = r"""This is the extracted data from a PAN (Permanent Account Number) card of a company, which is issued by the Government of India. The PAN number is a 10-digit alphanumeric identification number that the Income Tax Department of India issues.
+ From this extracted data you have to extract the PAN number, Name and Date of Incorporation/Formation.
+ Given extracted data: {pan_data}
+ Give the output in JSON format, whose structure is defined below:
+ Output Format:
+ {{
+     "Company Name": "String",
+     "PAN Number": "String",
+     "Date of Incorporation/Formation": "String"
+ }}
+ 
+ Important Note:
+ - Leave a field empty if it is not present
+ - Do not add information on your own
+ - Strictly follow the output structure
+ """
+ 
+ COI_PROMPT = r"""This is the extracted data from a CERTIFICATE OF INCORPORATION of a company, which is issued by the Government of India. It contains a Corporate Identity Number, which is a 21-digit alphanumeric identification number.
+ From this extracted data you have to extract the PAN number of the company, the Name of the company and the Corporate Identity Number of the company.
+ Given extracted data: {coi_data}
+ Give the output in JSON format, whose structure is defined below:
+ Output Format:
+ {{
+     "Company Name": "String",
+     "PAN Number": "String",
+     "Corporate Identity Number": "String"
+ }}
+ 
+ Important Note:
+ - Leave a field empty if it is not present
+ - Do not add information on your own
+ - Strictly follow the output structure
+ """
+ 
+ SHARE_PROMPT = r"""This is the extracted data from a SHAREHOLDING document of a company. It describes how the shares of the company are divided, amongst whom, their quantity, price per share and total price.
+ From this extracted data you have to extract the Company Name, the names of the shareholders and the Corporate Identity Number of the company (a 21-digit alphanumeric number).
+ Given extracted data: {share_data}
+ Give the output in JSON format, whose structure is defined below:
+ Output Format:
+ {{
+     "Company Name": "String",
+     "Corporate Identity Number": "String",
+     "Share Holders": ["String"]
+ }}
+ 
+ Important Note:
+ - Leave a field empty if it is not present
+ - Do not add information on your own
+ - Strictly follow the output structure
+ """
+ 
+ AOA_PROMPT = r"""This is the extracted data from the AOA (Articles of Association) document of a company. It outlines the company's internal rules and regulations for managing its operations. It is the company's "rule book" and provides a legal framework for its internal governance. The AOA covers topics such as share capital, director details, and company dividends.
+ From this extracted data you have to extract the Company Name and the names of the shareholders (mentioned under the Subscriber Details heading, not the Signed Before Me heading).
+ Given extracted data: {aoa_data}
+ Give the output in JSON format, whose structure is defined below:
+ Output Format:
+ {{
+     "Company Name": "String",
+     "Share Holders": ["String"]
+ }}
+ 
+ Important Note:
+ - Leave a field empty if it is not present
+ - Do not add information on your own
+ - Strictly follow the output structure
+ """
+ 
+ MOA_PROMPT = r"""This is the extracted data from the MOA (Memorandum of Association) document of a company. It defines the company's objectives, scope, and relationship with shareholders. It is the company's foundational document and charter. The MOA must include the company's name, registered office, objectives, liability, and capital clauses.
+ 
+ From this extracted data you have to extract the Company Name and the names of the shareholders (mentioned under the Subscriber Details heading, not the Signed Before Me heading; get only the names).
+ Given extracted data: {moa_data}
+ Give the output in JSON format, whose structure is defined below:
+ Output Format:
+ {{
+     "Company Name": "String",
+     "Share Holders": ["String"]
+ }}
+ 
+ Important Note:
+ - Leave a field empty if it is not present
+ - Do not add information on your own
+ - Strictly follow the output structure
+ """
+ 
+ STAMP_PROMPT = r"""This is the extracted data from a Non-Judicial Stamp.
+ From this extracted data you have to extract the Certificate No, Certificate Issued Date, First Party, Second Party and Stamp Duty.
+ Given extracted data: {stamp_data}
+ Give the output in JSON format, whose structure is defined below:
+ Output Format:
+ {{
+     "Certificate No": "String",
+     "Certificate Issued Date": "String",
+     "First Party": "String",
+     "Second Party": "String",
+     "Stamp Duty": Integer
+ }}
+ 
+ Important Note:
+ - Leave a field empty if it is not present
+ - Do not add information on your own
+ - Strictly follow the output structure
+ """
+ 
+ 
+ # Path to the Poppler binaries used by pdf2image (Windows-specific; adjust for the deployment environment)
+ poppler_path = r"C:\Program Files\poppler-24.07.0\Library\bin"
+ 
+ 
+ def extract_from_image(img):
+     try:
+         ocr = PaddleOCR(use_angle_cls=True, lang="en", show_log=False)  # Initialize PaddleOCR
+         result = ocr.ocr(img, cls=True)  # Perform OCR on the image
+         return result
+     except Exception as e:
+         print(f"Error occurred in processing image using OCR: {e}")
+         return None
+ 
+ 
+ def extract_from_result(result):
+     content = ""
+     for r in result:
+         for r2 in r:
+             value = r2[-1][0]
+             # Strip a leading '/' if present
+             if value.startswith('/'):
+                 value = value.replace('/', '', 1)
+             content += value + '\n'
+     return content
+ 
+ 
+ def process(file_path):
+     """
+     Process either a PDF or an image.
+     The file type is detected from its extension.
+     """
+     start_time = time.time()
+     results = {}
+ 
+     # Check if the file is a PDF or an image
+     file_extension = os.path.splitext(file_path)[-1].lower()
+ 
+     if file_extension == '.pdf':
+         # Process as PDF
+         print("Processing PDF...")
+         images = convert_from_path(file_path, poppler_path=poppler_path)  # Convert PDF pages to images
+     elif file_extension in ['.png', '.jpg', '.jpeg', '.tiff', '.bmp']:
+         # Process as image
+         print("Processing Image...")
+         images = [Image.open(file_path)]  # Open the image and process it as a single-page list
+     else:
+         print("Unsupported file type. Please provide a PDF or an image.")
+         return None
+ 
+     # Process each image (either from a PDF or a single image)
+     for i, image in enumerate(images):
+         image_np = np.array(image)  # Convert image to numpy array
+         result = extract_from_image(image_np)  # Extract text using PaddleOCR
+         if result:
+             results[i] = extract_from_result(result)
+         else:
+             results[i] = "OCR extraction failed for this page."
+ 
+     end_time = time.time()
+     print(f"\nTotal processing time: {end_time - start_time:.2f} seconds")
+     return results
+ 
+ 
+ def chat_gemini(prompt):
+     print("Entered chat_gemini helper")
+     try:
+         llm = ChatGoogleGenerativeAI(
+             model="gemini-1.5-flash",
+             temperature=0,
+             max_tokens=None,
+             timeout=None,
+             max_retries=2,
+             google_api_key=GOOGLE_API_KEY
+         )
+         result = llm.invoke(prompt)
+         print(result)
+         if result.content:
+             # The model returns JSON wrapped in a ```json fence; strip it before parsing
+             json_content = json.loads(result.content.replace("```json", "").replace("```", ""))
+             return json_content
+     except Exception as e:
+         # Return None on failure so callers can simply check whether a result was produced
+         print(f"Error occurred in chat_gemini: {e}")
+         return None
+ 
+ 
+ # Map each document type to its prompt template and the name of its input variable
+ PROMPT_CONFIG = {
+     "pan_user": (DIRECTOR_PAN_PROMPT, "pan_data"),
+     "aadhar_user": (DIRECTOR_AADHAAR_PROMPT, "aadhaar_data"),
+     "gst": (GST_PROMPT, "gst_data"),
+     "company_pan": (COMPANY_PAN_PROMPT, "pan_data"),
+     "coi": (COI_PROMPT, "coi_data"),
+     "share": (SHARE_PROMPT, "share_data"),
+     "aoa": (AOA_PROMPT, "aoa_data"),
+     "moa": (MOA_PROMPT, "moa_data"),
+     "stamp": (STAMP_PROMPT, "stamp_data"),
+ }
+ 
+ 
+ def process_using_llm(input_info, type_data):
+     """Format the prompt for the given document type and send it to Gemini."""
+     config = PROMPT_CONFIG.get(type_data)
+     if config is None:
+         print(f"Unknown document type: {type_data}")
+         return None
+ 
+     template, input_variable = config
+     prompt = PromptTemplate(
+         input_variables=[input_variable],
+         template=template
+     )
+     prompt_formatted = prompt.format(**{input_variable: input_info})
+ 
+     return chat_gemini(prompt_formatted)
requirements.txt ADDED
@@ -0,0 +1,120 @@
+ aiohappyeyeballs==2.4.0
+ aiohttp==3.10.5
+ aiosignal==1.3.1
+ altair==5.4.1
+ annotated-types==0.7.0
+ anyio==4.6.0
+ astor==0.8.1
+ async-timeout==4.0.3
+ attrs==24.2.0
+ beautifulsoup4==4.12.3
+ blinker==1.8.2
+ cachetools==5.5.0
+ certifi==2024.8.30
+ charset-normalizer==3.3.2
+ click==8.1.7
+ colorama==0.4.6
+ contourpy==1.3.0
+ cycler==0.12.1
+ Cython==3.0.11
+ decorator==5.1.1
+ exceptiongroup==1.2.2
+ fire==0.6.0
+ fonttools==4.54.0
+ frozenlist==1.4.1
+ gitdb==4.0.11
+ GitPython==3.1.43
+ google-ai-generativelanguage==0.6.6
+ google-api-core==2.20.0
+ google-api-python-client==2.146.0
+ google-auth==2.35.0
+ google-auth-httplib2==0.2.0
+ google-generativeai==0.7.2
+ googleapis-common-protos==1.65.0
+ greenlet==3.1.1
+ grpcio==1.66.1
+ grpcio-status==1.62.3
+ h11==0.14.0
+ httpcore==1.0.5
+ httplib2==0.22.0
+ httpx==0.27.2
+ idna==3.10
+ imageio==2.35.1
+ imgaug==0.4.0
+ Jinja2==3.1.4
+ jsonpatch==1.33
+ jsonpointer==3.0.0
+ jsonschema==4.23.0
+ jsonschema-specifications==2023.12.1
+ kiwisolver==1.4.7
+ langchain==0.3.0
+ langchain-core==0.3.5
+ langchain-google-genai==2.0.0
+ langchain-text-splitters==0.3.0
+ langsmith==0.1.125
+ lazy_loader==0.4
+ lmdb==1.5.1
+ lxml==5.3.0
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.5
+ matplotlib==3.9.2
+ mdurl==0.1.2
+ multidict==6.1.0
+ narwhals==1.8.2
+ networkx==3.3
+ numpy==1.26.4
+ opencv-contrib-python==4.10.0.84
+ opencv-python==4.10.0.84
+ opt-einsum==3.3.0
+ orjson==3.10.7
+ packaging==24.1
+ paddleocr==2.8.1
+ paddlepaddle==2.6.2
+ pandas==2.2.3
+ pdf2image==1.17.0
+ pillow==10.4.0
+ proto-plus==1.24.0
+ protobuf==4.25.5
+ pyarrow==17.0.0
+ pyasn1==0.6.1
+ pyasn1_modules==0.4.1
+ pyclipper==1.3.0.post5
+ pydantic==2.9.2
+ pydantic_core==2.23.4
+ pydeck==0.9.1
+ Pygments==2.18.0
+ pyparsing==3.1.4
+ PyPDF2==3.0.1
+ python-dateutil==2.9.0.post0
+ python-docx==1.1.2
+ python-dotenv==1.0.1
+ pytz==2024.2
+ PyYAML==6.0.2
+ RapidFuzz==3.10.0
+ referencing==0.35.1
+ requests==2.32.3
+ rich==13.8.1
+ rpds-py==0.20.0
+ rsa==4.9
+ scikit-image==0.24.0
+ scipy==1.14.1
+ shapely==2.0.6
+ six==1.16.0
+ smmap==5.0.1
+ sniffio==1.3.1
+ soupsieve==2.6
+ SQLAlchemy==2.0.35
+ streamlit==1.38.0
+ streamlit-card==1.0.2
+ tenacity==8.5.0
+ termcolor==2.4.0
+ tifffile==2024.9.20
+ toml==0.10.2
+ tornado==6.4.1
+ tqdm==4.66.5
+ typing_extensions==4.12.2
+ tzdata==2024.2
+ uritemplate==4.1.1
+ urllib3==2.2.3
+ watchdog==4.0.2
+ yarl==1.12.0