Spaces:

chandanzeon
/

Fetch_Employer_Name

Sleeping

App Files Community

chandanzeon committed on Oct 15, 2024

Commit

7eab253

1 Parent(s): 891d816

removed commented code

Browse files

Files changed (2) hide show

app.py +0 -46
helper.py +0 -163

app.py CHANGED Viewed

@@ -80,49 +80,3 @@ if uploaded_excel and uploaded_text:
                        data=excel_data,
                        file_name='Fetch_Employer_Output.xlsx',
                        mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')
-# import streamlit as st
-# import pandas as pd
-# from io import BytesIO
-# from helper import get_res_df
-# def to_excel(df):
-#     output = BytesIO()
-#     with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
-#         df.to_excel(writer, index=False)
-#     processed_data = output.getvalue()
-#     return processed_data
-# def process_files(excel_file, text_file):
-#     print(excel_file,text_file)
-#     if excel_file.name.endswith('.csv'):
-#         df_excel = pd.read_csv(excel_file)
-#     else:
-#         df_excel = pd.read_excel(excel_file)
-#     df_excel['Employer Number']=[str(number).zfill(6) for number in df_excel['Employer Number']]
-#     lines = text_file.read().decode('utf-8').splitlines()
-#     data = [line.strip().split(',') for line in lines]
-#     df = pd.DataFrame(data)
-#     return df_excel,df
-# st.title("Fetch Employer")
-# uploaded_excel = st.file_uploader("Upload the Master file(.xls or .csv)", type=["csv", "xls", "xlsx"])
-# uploaded_text = st.file_uploader("Upload your Text file(.txt)", type=["txt"])
-# if uploaded_excel and uploaded_text:
-#     st.write("Processing the files...")
-#     master_data, df = process_files(uploaded_excel, uploaded_text)
-#     st.write("Final Output")
-#     res = get_res_df(master_data,df)
-#     st.dataframe(res)
-#     excel_data = to_excel(res)
-#     st.download_button(label="Download Excel",
-#                     data=excel_data,
-#                     file_name='Fetch_Employer_Output.xlsx',
-#                     mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')

                        data=excel_data,
                        file_name='Fetch_Employer_Output.xlsx',
                        mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')

helper.py CHANGED Viewed

@@ -234,166 +234,3 @@ def get_res_df(master_data, df):
     res_df = generate_df(master_data=master_data, df=df, employer_names=res_names)
     return res_df
-# import pandas as pd
-# import numpy as np
-# from rank_bm25 import BM25Okapi
-# import re
-# from nltk.stem import WordNetLemmatizer,PorterStemmer
-# from datetime import datetime
-# lemmatizer = WordNetLemmatizer()
-# threshold = 11
-# def clean_text(text):
-#     cleaned_text = text.lower()
-#     cleaned_text = re.sub(r'[^A-Za-z0-9\s./]', ' ', cleaned_text)
-#     cleaned_text = re.sub(r'\.', '', cleaned_text)
-#     cleaned_text = re.sub(r'\/', '', cleaned_text)
-#     cleaned_text = re.sub(r'\d{3,}', '', cleaned_text)
-#     cleaned_text = re.sub('pvt','private',cleaned_text)
-#     cleaned_text = re.sub('ltd','limited',cleaned_text)
-#     cleaned_text = re.sub(r'(?<!\w)dev(?!\w)', 'development',cleaned_text)
-#     cleaned_text = re.sub(r'(?<!\w)co(?!\w)', 'corporation',cleaned_text)
-#     cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
-#     cleaned_text = ' '.join([lemmatizer.lemmatize(word) for word in cleaned_text.split()])
-#     # cleaned_text = ' '.join([stemmer.stem(word) for word in cleaned_text.split()])
-#     return cleaned_text.strip()
-# def fetch_empno(text):
-#     return re.findall(r'\b\d{6}\b', text)
-# def preprocess_query(query):
-#     new_query = query
-#     if '||' in query:
-#         ind = query.find('||')
-#         new_query=query[ind+2:]
-#     elif '-' in query:
-#         ind = query.find('-')
-#         new_query=query[ind:]
-#     if len(new_query) < 20:
-#         new_query = query
-#     new_query = clean_text(new_query)
-#     return new_query
-# def parse_date(date_str):
-#     try:
-#         return datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S').strftime('%d/%m/%Y')
-#     except ValueError:
-#         try:
-#             return datetime.strptime(date_str, '%m/%d/%Y').strftime('%d/%m/%Y')
-#         except ValueError:
-#             return date_str.strftime('%m/%d/%Y')
-# def generate_df(master_data, df, employer_names):
-#     dates = [datetime.strptime(date_str, '%d%m%y').strftime('%d/%m/%Y') for date_str in df[4]]
-#     bank_desc = list(df[9])
-#     accounts = ['NASA' if i == '713' else 'EDAS' if i == '068' else None for i in df[0]]
-#     credits = list(df[7])
-#     employer_codes = []
-#     bank_statemnt_ref = []
-#     account_mgr = []
-#     emp_province = []
-#     region = []
-#     industry = []
-#     contributing_stts = []
-#     date_joined = []
-#     termination_date = []
-#     email_addr = []
-#     for name in employer_names:
-#         if name=="NOT FOUND":
-#             employer_codes.append(np.nan)
-#             bank_statemnt_ref.append(np.nan)
-#             account_mgr.append(np.nan)
-#             emp_province.append(np.nan)
-#             region.append(np.nan)
-#             industry.append(np.nan)
-#             contributing_stts.append(np.nan)
-#             date_joined.append(np.nan)
-#             termination_date.append(np.nan)
-#             email_addr.append(np.nan)
-#         else:
-#             tmp = master_data[master_data['Employer Name']==name]
-#             if tmp.empty:
-#                 employer_codes.append(np.nan)
-#                 bank_statemnt_ref.append(np.nan)
-#                 account_mgr.append(np.nan)
-#                 emp_province.append(np.nan)
-#                 region.append(np.nan)
-#                 industry.append(np.nan)
-#                 contributing_stts.append(np.nan)
-#                 date_joined.append(np.nan)
-#                 termination_date.append(np.nan)
-#                 email_addr.append(np.nan)
-#             else:
-#                 employer_codes.append(list(tmp['Employer Number'])[-1])
-#                 bank_statemnt_ref.append(list(tmp['Bank Statement Reference'])[-1])
-#                 account_mgr.append(list(tmp['NASFUNDContact'])[-1])
-#                 emp_province.append(list(tmp['Employer Province'])[-1])
-#                 region.append(list(tmp['Region'])[-1])
-#                 industry.append(list(tmp['Industry'])[-1])
-#                 contributing_stts.append(list(tmp['Contributing Status'])[-1])
-#                 date = str(list(tmp['Date Joined Plan'])[-1])
-#                 date_joined.append(parse_date(date))
-#                 termination_date.append(list(tmp['Termination Date'])[-1])
-#                 email_addr.append(list(tmp['Email Addresses'])[-1])
-#     res_df  = pd.DataFrame()
-#     res_df['Receipt Date'] = dates
-#     res_df['Bank Description'] = bank_desc
-#     res_df['Account'] = accounts
-#     res_df['  Credit  '] = credits
-#     res_df['Employer Code'] = employer_codes
-#     res_df['Employer Name'] = employer_names
-#     res_df['Bank Statement Reference'] = bank_statemnt_ref
-#     res_df['Account Manager'] = account_mgr
-#     res_df['Employer Province'] = emp_province
-#     res_df['Region'] = region
-#     res_df['Industry'] = industry
-#     res_df['Contributing Status'] = contributing_stts
-#     res_df['Date Joined Plan'] = date_joined
-#     res_df['Termination Date'] = termination_date
-#     res_df['Email Addresses'] = email_addr
-#     res_df['First Name'] = np.nan
-#     res_df['Surname'] = np.nan
-#     res_df['Membership#'] = np.nan
-#     return res_df
-# def get_res_df(master_data,df):
-#     corpus = list(master_data['Employer Name'])
-#     lower_case_corpus = [clean_text(name) for name in corpus]
-#     corpus = corpus[1:]
-#     lower_case_corpus = lower_case_corpus[1:]
-#     tokenized_corpus = [doc.split(' ') for doc in lower_case_corpus]
-#     bm25 = BM25Okapi(tokenized_corpus)
-#     queries = list(df[9])
-#     queries = [query[:query.rindex('-')] for query in queries]
-#     empnos = [fetch_empno(text) for text in queries]
-#     new_queries = [preprocess_query(query) for query in queries]
-#     res_names = []
-#     scores = []
-#     for query,empno_arr in zip(new_queries,empnos):
-#         name = ""
-#         if len(empno_arr) != 0:
-#             for empno in empno_arr:
-#                 names = list(master_data[master_data['Employer Number']==empno]['Employer Name'])
-#                 if len(names)!=0:
-#                     name=names[0]
-#                     scores.append(100)
-#                     res_names.append(name)
-#                     break
-#         if name=="":
-#             tokenized_query = query.split(" ")
-#             name = bm25.get_top_n(tokenized_query, corpus, n=1)
-#             doc_score = max(bm25.get_scores(tokenized_query))
-#             scores.append(doc_score)
-#             res_names.append(name[0] if doc_score>threshold else "NOT FOUND")
-#     not_found=0
-#     for score in scores:
-#         if score<threshold:
-#             not_found+=1
-#     res_df = generate_df(master_data=master_data,df=df,employer_names=res_names)
-#     return res_df


234	res_df = generate_df(master_data=master_data, df=df, employer_names=res_names)
235
236	return res_df