"""Streamlit front-end for KYC Doc AI.

Uploaded KYC documents are written to a temporary directory, OCR'd with
``process`` and turned into field dictionaries with ``process_using_llm``.
The extracted company name, PAN and Corporate Identity Number are then
cross-checked between documents and the results are rendered as cards.
"""
import html
import io
import os
import re
import shutil

import PyPDF2
import streamlit as st

from process import process, process_using_llm

TEMP_DIR = "temp_files"

# Marker shown when there are not enough documents to cross-validate a field.
NEEDS_MORE_INFO_HTML = "<span style='color: #d4ac0d;'><strong>Yes</strong></span>"

# File extensions accepted by every uploader on the form.
UPLOAD_TYPES = ['png', 'jpg', 'jpeg', 'pdf']

# (label, key in the uploaded_files dict, accept_multiple_files), in form order.
UPLOAD_FIELDS = (
    ("Upload PAN Cards of Directors", 'director_pans', True),
    ("Upload Aadhar Cards of Directors", 'director_aadhars', True),
    ("Upload GST Certificate", 'gst_certificate', False),
    ("Upload PAN Card of Company", 'company_pan', False),
    ("Upload CERTIFICATE OF INCORPORATION", 'coi', False),
    ("Upload AOA document", 'aoa', False),
    ("Upload MOA document", 'moa', False),
    ("Upload Shareholding Document", 'share', False),
    ("Upload Address Proof of Company Office", 'address_proof', False),
)

# Fields whose "Yes"/"No" value is coloured green/red on the result cards.
MATCH_FIELDS = (
    "Is Company Matching",
    "Is Corporate Identity Number Matching",
    "Is Company PAN Number Matching",
)

# Fields whose value is HTML generated by this module (never document text),
# so it must reach the page unescaped.
GENERATED_HTML_FIELD = "All director names present?"
GENERATED_HTML_PREFIX = "More information needed to validate"

# One result card. Built without leading indentation because Markdown treats
# lines indented by four or more spaces as a code block.
CARD_TEMPLATE = (
    '<div style="border: 1px solid #ddd; padding: 15px; border-radius: 10px; '
    'margin-bottom: 20px; background-color: #f9f9f9; height: 250px; '
    'display: flex; flex-direction: column; justify-content: space-between;">'
    '<div style="color: #333; height: {inner_height}px; overflow-y: auto; '
    'scrollbar-width: thin; scrollbar-color: #888 #ddd;">'
    '<p>{body}</p>'
    '</div>'
    '</div>'
)

st.set_page_config(layout="wide", page_title="KYC Doc AI")


def cleanup_temp_files():
    """Delete everything under TEMP_DIR and leave an empty TEMP_DIR behind.

    Best effort: any failure is printed and swallowed so that cleanup can
    never mask the result of the processing it follows.
    """
    try:
        if os.path.exists(TEMP_DIR):
            shutil.rmtree(TEMP_DIR)
            print(f"Temporary files in {TEMP_DIR} have been deleted.")
        # Re-create the directory so the next run can write into it.
        os.makedirs(TEMP_DIR, exist_ok=True)
    except Exception as e:
        print(f"An error occurred during cleanup: {e}")


def extract_pages(input_pdf_path, output_pdf_path, start_page=None, end_page=None):
    """Write the first page, or the first and last pages, of a PDF to a new file.

    Args:
        input_pdf_path: Path of the PDF to read.
        output_pdf_path: Path to write; may be the same as ``input_pdf_path``.
        start_page: Must be 0 or None.
        end_page: 0 (or None together with ``start_page``) keeps only the
            first page; -1 keeps the first and the last page.

    Returns:
        None. Unsupported page arguments and any read/write error are
        printed, and the output file is left untouched in both cases.
    """
    if start_page is None and end_page is None:
        start_page, end_page = 0, 0

    if start_page != 0 or end_page not in (0, -1):
        print("Invalid input. Only first page or (first and last page) extraction is allowed.")
        return

    try:
        # Build the result in memory: callers pass the same path for input and
        # output, and opening it for writing while the reader still needs the
        # source would truncate the data being read.
        buffer = io.BytesIO()
        with open(input_pdf_path, 'rb') as input_pdf:
            reader = PyPDF2.PdfReader(input_pdf)
            total_pages = len(reader.pages)

            writer = PyPDF2.PdfWriter()
            writer.add_page(reader.pages[0])
            # For a one-page document the first page is also the last one;
            # do not emit it twice.
            if end_page == -1 and total_pages > 1:
                writer.add_page(reader.pages[-1])
            writer.write(buffer)

        with open(output_pdf_path, 'wb') as output_pdf:
            output_pdf.write(buffer.getvalue())
        print(f"PDF saved as {output_pdf_path}")
    except Exception as e:
        print(f"An error occurred: {e}")


def merge_dicts_by_aadhaar(data):
    """Merge per-file Aadhaar extractions that share an Aadhaar number.

    A record carrying both 'Gender' and 'Date of birth' contributes the
    Gender, Date of birth, Name and Aadhaar Number fields; any other record
    contributes only 'Address'. Spaces are removed from the number before
    grouping.

    Args:
        data: Iterable of dicts as returned for each uploaded Aadhaar file.

    Returns:
        A list with one merged dict per distinct Aadhaar number (in order of
        first appearance), followed by copies of the records that had no
        Aadhaar number and therefore could not be grouped.
    """
    merged = {}
    unkeyed = []
    for record in data:
        raw_number = record.get("Aadhaar Number")
        if not raw_number:
            # Without a number there is nothing to group on; keep the record
            # instead of failing the whole run.
            unkeyed.append(dict(record))
            continue

        number = str(raw_number).replace(" ", "")
        entry = merged.setdefault(number, {})
        if record.get('Gender') and record.get('Date of birth'):
            entry['Gender'] = record.get('Gender')
            entry['Date of birth'] = record.get('Date of birth')
            entry['Name'] = record.get('Name')
            entry['Aadhaar Number'] = number
        else:
            entry['Address'] = record.get('Address')

    return list(merged.values()) + unkeyed


def _clean(value):
    """Return ``value`` as a stripped string, or None when missing or blank."""
    if value is None:
        return None
    text = str(value).strip()
    return text or None


def _to_number(value):
    """Return ``value`` as a number, reading the first number out of a string.

    Returns None when no number can be found.
    """
    if isinstance(value, bool):
        return None
    if isinstance(value, (int, float)):
        return value
    if isinstance(value, str):
        match = re.search(r"\d+(?:\.\d+)?", value.replace(",", ""))
        if match:
            return float(match.group())
    return None


def _as_name_list(value):
    """Return ``value`` as a list of non-blank, stripped name strings.

    A string is split on commas; anything that is not a string, list or
    tuple yields an empty list.
    """
    if isinstance(value, str):
        value = value.split(",")
    if not isinstance(value, (list, tuple)):
        return []
    names = []
    for item in value:
        name = _clean(item)
        if name:
            names.append(name)
    return names


def _join_names(value):
    """Join a list/tuple of names with commas; return other values unchanged."""
    if isinstance(value, (list, tuple)):
        return ",".join(str(item) for item in value if item is not None)
    return value


def _yes_no(flag):
    """Map a boolean to the "Yes"/"No" strings used on the result cards."""
    return "Yes" if flag else "No"


def _save_upload(upload):
    """Write an uploaded file into TEMP_DIR and return the path written."""
    os.makedirs(TEMP_DIR, exist_ok=True)
    # The name comes from the browser; keep only its final component so it
    # cannot point outside TEMP_DIR.
    file_path = os.path.join(TEMP_DIR, os.path.basename(upload.name))
    with open(file_path, "wb") as temp_file:
        temp_file.write(upload.getbuffer())
    return file_path


def _read_document(upload, doc_type, page_range=None):
    """Save, optionally trim, OCR and LLM-parse one uploaded document.

    Args:
        upload: Uploaded file object exposing ``name`` and ``getbuffer()``.
        doc_type: Document type string passed to ``process_using_llm``.
        page_range: None to OCR the file as uploaded, or a
            ``(start_page, end_page)`` pair forwarded to ``extract_pages``.

    Returns:
        ``(fields, content)``: the dict returned by ``process_using_llm``
        (``{}`` when it returns nothing) and the concatenated OCR text.
    """
    file_path = _save_upload(upload)
    if page_range is not None:
        extract_pages(file_path, file_path, *page_range)
    ocr_data = process(file_path)
    content = "".join(text + '\n' for text in ocr_data.values())
    fields = process_using_llm(content, doc_type) or {}
    return fields, content


def _add_director_names(director_names, records):
    """Append each record's lower-cased 'Name' to ``director_names`` once."""
    for record in records:
        name = _clean(record.get('Name'))
        if name and name.lower() not in director_names:
            director_names.append(name.lower())


def _flag_needs_more_info(docs, field):
    """Mark every non-empty dict in ``docs`` as lacking data for ``field``."""
    for doc in docs:
        if doc:
            doc[field] = NEEDS_MORE_INFO_HTML


def _check_director_names(gst_dict, director_names):
    """Record on ``gst_dict`` whether all known directors are listed on it."""
    gst_names = _as_name_list(gst_dict.get("Names of directors"))
    if not gst_names:
        return

    gst_dict["Names of directors"] = ",".join(gst_names)
    if not director_names:
        gst_dict['More information needed to validate Director names'] = NEEDS_MORE_INFO_HTML
        return

    listed = {name.lower() for name in gst_names}
    missing = [name for name in director_names if name not in listed]
    if not missing:
        gst_dict["All director names present?"] = "<span style='color: green;'><strong>Yes</strong></span>"
        return

    # Names come from uploaded documents, so escape them before embedding.
    missing_text = ', '.join(
        f"<span style='color: red;'>{html.escape(name)}</span>" for name in missing)
    gst_dict["All director names present?"] = (
        f"<span style='color: red;'><strong>No</strong></span> (Missing: {missing_text})")


def process_uploads(uploaded_files):
    """Extract fields from every uploaded document and cross-validate them.

    Args:
        uploaded_files: Dict mapping the form keys ('director_pans',
            'director_aadhars', 'gst_certificate', 'company_pan', 'coi',
            'aoa', 'moa', 'share', 'address_proof') to uploaded file objects
            (lists of them for the two director keys). Keys may be absent.

    Returns:
        Dict mapping a section title to the extracted data: a list of dicts
        for the two director sections, a single dict for the others. If an
        error occurs it is printed and whatever was collected up to that
        point is returned. TEMP_DIR is emptied before returning either way.
    """
    extracted_results = {}
    try:
        company_name = None
        company_name_legal = None
        company_trade_name = None
        pan_number_company = None
        coi_number = None
        director_names = []
        gst_dict = {}
        pan_dict = {}
        coi_dict = {}
        moa_dict = {}
        aoa_dict = {}
        share_dict = {}
        # How many documents supplied each identifier; a value is only
        # cross-checked when more than one document carries it.
        total_pan_number = 0
        total_company_names = 0
        total_coi_numbers = 0

        # Director PAN cards: one or more files.
        director_pans = uploaded_files.get('director_pans')
        if director_pans:
            director_pan_data = []
            for upload in director_pans:
                fields, _ = _read_document(upload, "pan_user")
                if fields:
                    director_pan_data.append(fields)
            _add_director_names(director_names, director_pan_data)
            extracted_results['Pan Cards of Directors'] = director_pan_data

        # Director Aadhaar cards: records sharing a number are merged.
        director_aadhars = uploaded_files.get("director_aadhars")
        if director_aadhars:
            director_aadhars_data = []
            for upload in director_aadhars:
                fields, _ = _read_document(upload, "aadhar_user")
                if fields:
                    director_aadhars_data.append(fields)
            merged_aadhars = merge_dicts_by_aadhaar(director_aadhars_data)
            _add_director_names(director_names, merged_aadhars)
            extracted_results["Aadhaar Cards of Directors"] = merged_aadhars

        gst_cert = uploaded_files.get('gst_certificate')
        if gst_cert:
            gst_dict, _ = _read_document(gst_cert, "gst")
            # Keep both names: a certificate usually carries a legal and a
            # trade name, and either may be what another document shows.
            company_name_legal = _clean(gst_dict.get('Legal Name'))
            company_trade_name = _clean(gst_dict.get('Trade Name'))
            if company_name_legal or company_trade_name:
                total_company_names += 1

        company_pan = uploaded_files.get('company_pan')
        if company_pan:
            pan_dict, _ = _read_document(company_pan, "company_pan")
            name = _clean(pan_dict.get("Company Name"))
            if name:
                company_name = name
                total_company_names += 1
            number = _clean(pan_dict.get('PAN Number'))
            if number:
                pan_number_company = number
                total_pan_number += 1

        coi = uploaded_files.get('coi')
        if coi:
            coi_dict, _ = _read_document(coi, "coi")
            name = _clean(coi_dict.get("Company Name"))
            if name:
                company_name = name
                total_company_names += 1
            number = _clean(coi_dict.get("Corporate Identity Number"))
            if number:
                coi_number = number
                total_coi_numbers += 1
            number = _clean(coi_dict.get("PAN Number"))
            if number:
                pan_number_company = number
                total_pan_number += 1

        # AOA and MOA are trimmed to their first and last page before OCR.
        aoa = uploaded_files.get('aoa')
        if aoa:
            aoa_dict, _ = _read_document(aoa, "aoa", page_range=(0, -1))
            if "Share Holders" in aoa_dict:
                aoa_dict["Share Holders"] = _join_names(aoa_dict.get("Share Holders"))

        moa = uploaded_files.get('moa')
        if moa:
            moa_dict, _ = _read_document(moa, "moa", page_range=(0, -1))
            if "Share Holders" in moa_dict:
                moa_dict["Share Holders"] = _join_names(moa_dict.get("Share Holders"))

        share = uploaded_files.get('share')
        if share:
            share_dict, _ = _read_document(share, "share")
            if "Share Holders" in share_dict:
                share_dict["Share Holders"] = _join_names(share_dict.get("Share Holders"))
            number = _clean(share_dict.get('Corporate Identity Number'))
            if number:
                coi_number = number
                total_coi_numbers += 1

        # Address proof: only the first page is OCR'd.
        address_proof = uploaded_files.get('address_proof')
        if address_proof:
            add_dict, content = _read_document(address_proof, "stamp", page_range=(0, 0))
            duty = _to_number(add_dict.get("Stamp Duty"))
            if duty is not None and duty >= 100:
                add_dict['Valid Stamp'] = "Yes"
            # "nota" is searched as a substring of the OCR text.
            if "nota" in content.lower():
                add_dict['Notary Stamp'] = "Present"
            extracted_results['Address Proof Details(Non Judicial Stamp)'] = add_dict

        # Company name: a document matches when its name equals any name
        # collected above.
        # NOTE(review): the reference names include the document's own name,
        # so the document that supplied a name always matches itself.
        known_names = {
            name for name in (company_name, company_name_legal, company_trade_name) if name}
        if known_names and total_company_names > 1:
            for doc in (pan_dict, coi_dict, moa_dict, aoa_dict, share_dict):
                if doc:
                    doc['Is Company Matching'] = _yes_no(
                        _clean(doc.get("Company Name")) in known_names)
            if gst_dict:
                gst_names = (_clean(gst_dict.get("Legal Name")),
                             _clean(gst_dict.get("Trade Name")))
                gst_dict['Is Company Matching'] = _yes_no(
                    any(name in known_names for name in gst_names))
        else:
            _flag_needs_more_info(
                (pan_dict, coi_dict, gst_dict, aoa_dict, moa_dict, share_dict),
                'More information needed to validate company name')

        # Company PAN number: carried by the company PAN card and the COI.
        if pan_number_company is not None and total_pan_number > 1:
            for doc in (pan_dict, coi_dict):
                if doc:
                    doc['Is Company PAN Number Matching'] = _yes_no(
                        _clean(doc.get('PAN Number')) == pan_number_company)
        else:
            _flag_needs_more_info(
                (pan_dict, coi_dict),
                'More information needed to validate company PAN number')

        # Corporate Identity Number: carried by the COI and the shareholding
        # document.
        if coi_number is not None and total_coi_numbers > 1:
            for doc in (coi_dict, share_dict):
                if doc:
                    doc['Is Corporate Identity Number Matching'] = _yes_no(
                        _clean(doc.get('Corporate Identity Number')) == coi_number)
        else:
            _flag_needs_more_info(
                (coi_dict, share_dict),
                'More information needed to validate company COI number')

        _check_director_names(gst_dict, director_names)

        sections = (
            ('Company PAN Details', pan_dict),
            ('COI Details', coi_dict),
            ('GST Certificate Details', gst_dict),
            ('MOA Details', moa_dict),
            ('AOA Details', aoa_dict),
            ('Shareholding Details', share_dict),
        )
        for title, doc in sections:
            if doc:
                extracted_results[title] = doc
    except Exception as e:
        print(f"error occured in processing files {e}")
    finally:
        cleanup_temp_files()
    return extracted_results


def _field_html(field, value_html):
    """Return one '<label>: <value>' line of a card."""
    return f"<strong style='color: #3498db;'>{html.escape(str(field))}</strong>: {value_html}<br>"


def _value_html(field, value):
    """Return ``value`` ready for embedding in a card.

    Values under the fields this module fills with its own markup are passed
    through; everything else originates from uploaded documents and is
    escaped.
    """
    field = str(field)
    if field == GENERATED_HTML_FIELD or field.startswith(GENERATED_HTML_PREFIX):
        return str(value)
    return html.escape(str(value))


def _director_cards_html(records, heading):
    """Return the card body listing every director record under ``heading``."""
    parts = []
    for index, record in enumerate(records, start=1):
        parts.append(f"<h4 style='color:black;'>{heading} {index}</h4>")
        for field, value in record.items():
            parts.append(_field_html(field, html.escape(str(value))))
    return "".join(parts)


def _document_card_html(title, data):
    """Return the card body for one document, colouring Yes/No match fields."""
    parts = [f"<h4 style='color:black;'>{html.escape(str(title))}</h4>"]
    for field, value in data.items():
        if field in MATCH_FIELDS and value in ("Yes", "No"):
            colour = "green" if value == "Yes" else "red"
            rendered = f"<strong><span style='color: {colour};'>{value}</span></strong>"
        else:
            rendered = _value_html(field, value)
        parts.append(_field_html(field, rendered))
    return "".join(parts)


def display_results_in_cards(extracted_results):
    """Render every section of ``extracted_results`` as a card, three per row.

    Args:
        extracted_results: Dict as returned by ``process_uploads``. Nothing
            is rendered when it is empty or None.
    """
    if not extracted_results:
        return

    columns = st.columns(3)
    # The section index alone decides the column; the per-director counters
    # live inside the helpers so they cannot disturb it.
    for position, (title, data) in enumerate(extracted_results.items()):
        column = columns[position % 3]
        inner_height = 230
        if title == "Pan Cards of Directors":
            body = _director_cards_html(data, "Pan Information of Director")
            inner_height = 150
        elif title == "Aadhaar Cards of Directors":
            body = _director_cards_html(data, "Aadhaar Information of Director")
        else:
            body = _document_card_html(title, data)
        column.markdown(
            CARD_TEMPLATE.format(inner_height=inner_height, body=body),
            unsafe_allow_html=True)


def main():
    """Show the upload form, or the results of the last submission."""
    if "submitted" not in st.session_state:
        st.session_state["submitted"] = False

    if not st.session_state["submitted"]:
        st.title("Welcome to KYC DOC AI")

        # A single form so that all uploads are submitted together.
        with st.form("kyc_form"):
            uploaded_files = {}
            for label, key, multiple in UPLOAD_FIELDS:
                upload = st.file_uploader(
                    label, type=UPLOAD_TYPES, accept_multiple_files=multiple)
                if upload:
                    uploaded_files[key] = upload
            submitted = st.form_submit_button("Process Files")

        if submitted:
            with st.spinner('Processing files...'):
                extracted_results = process_uploads(uploaded_files)
            st.session_state["extracted_results"] = extracted_results
            st.session_state["submitted"] = True
            st.rerun()
    else:
        st.title("KYC Document Results")
        extracted_results = None
        if "extracted_results" in st.session_state:
            extracted_results = st.session_state["extracted_results"]

        if extracted_results:
            display_results_in_cards(extracted_results)
        else:
            st.info("No details could be extracted from the uploaded documents.")

        if st.button("Back"):
            st.session_state["submitted"] = False
            st.rerun()


if __name__ == "__main__":
    main()