Spaces:
Sleeping
Sleeping
Commit
·
4efd35b
1
Parent(s):
80b184f
ignore other accounts
Browse files- __pycache__/helper.cpython-312.pyc +0 -0
- app.py +2 -0
- helper.py +54 -37
__pycache__/helper.cpython-312.pyc
CHANGED
Binary files a/__pycache__/helper.cpython-312.pyc and b/__pycache__/helper.cpython-312.pyc differ
|
|
app.py
CHANGED
@@ -44,6 +44,7 @@ def process_files(excel_file, text_file):
|
|
44 |
|
45 |
# Ensure the 'Employer Number' column values are formatted as zero-padded 6-digit strings
|
46 |
df_excel['Employer Number'] = [str(number).zfill(6) for number in df_excel['Employer Number']]
|
|
|
47 |
|
48 |
# Read and process the text file content into a list of lines
|
49 |
lines = text_file.read().decode('utf-8').splitlines()
|
@@ -51,6 +52,7 @@ def process_files(excel_file, text_file):
|
|
51 |
|
52 |
# Create a DataFrame from the parsed text file data
|
53 |
df = pd.DataFrame(data)
|
|
|
54 |
|
55 |
return df_excel, df
|
56 |
|
|
|
44 |
|
45 |
# Ensure the 'Employer Number' column values are formatted as zero-padded 6-digit strings
|
46 |
df_excel['Employer Number'] = [str(number).zfill(6) for number in df_excel['Employer Number']]
|
47 |
+
df_excel = df_excel.dropna(subset=['Employer Name'])
|
48 |
|
49 |
# Read and process the text file content into a list of lines
|
50 |
lines = text_file.read().decode('utf-8').splitlines()
|
|
|
52 |
|
53 |
# Create a DataFrame from the parsed text file data
|
54 |
df = pd.DataFrame(data)
|
55 |
+
df = df[df[1].isin(['1001010071', '1001233102'])]
|
56 |
|
57 |
return df_excel, df
|
58 |
|
helper.py
CHANGED
@@ -107,7 +107,7 @@ def generate_df(master_data, df, employer_names):
|
|
107 |
"""
|
108 |
dates = [datetime.strptime(date_str, '%d%m%y').strftime('%d/%m/%Y') for date_str in df[4]]
|
109 |
bank_desc = list(df[9])
|
110 |
-
accounts = ['NASA' if i == '
|
111 |
credits = list(df[7])
|
112 |
|
113 |
# Initialize lists for employer-related fields
|
@@ -128,6 +128,17 @@ def generate_df(master_data, df, employer_names):
|
|
128 |
date_joined.append(np.nan)
|
129 |
termination_date.append(np.nan)
|
130 |
email_addr.append(np.nan)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
131 |
else:
|
132 |
tmp = master_data[master_data['Employer Name'] == name]
|
133 |
if tmp.empty:
|
@@ -203,6 +214,7 @@ def get_res_df(master_data, df, thrshld):
|
|
203 |
# Preprocess queries from transaction data
|
204 |
queries = list(df[9])
|
205 |
queries = [query[:query.rindex('-')] for query in queries] # Extract part of the query before '-'
|
|
|
206 |
empnos = [fetch_empno(text) for text in queries]
|
207 |
new_queries = [preprocess_query(query) for query in queries]
|
208 |
|
@@ -218,46 +230,51 @@ def get_res_df(master_data, df, thrshld):
|
|
218 |
exact_matches.append('')
|
219 |
|
220 |
res_names, found_by, scores = [], [], []
|
221 |
-
found_by_direct_search, found_by_emp_no, found_by_bm5, not_found = 0, 0, 0, 0
|
222 |
|
223 |
# Match each query to an employer
|
224 |
-
for query,empno_arr,exact_match in zip(new_queries,empnos,exact_matches):
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
|
|
|
|
|
|
|
|
|
|
233 |
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
|
260 |
# Generate the final result DataFrame
|
261 |
res_df = generate_df(master_data=master_data, df=df, employer_names=res_names)
|
262 |
-
|
263 |
return res_df, found_by_direct_search, found_by_emp_no, found_by_bm5, not_found
|
|
|
107 |
"""
|
108 |
dates = [datetime.strptime(date_str, '%d%m%y').strftime('%d/%m/%Y') for date_str in df[4]]
|
109 |
bank_desc = list(df[9])
|
110 |
+
accounts = ['NASA' if i == '1001010071' else 'EDAS' if i == '1001233102' else '' for i in df[1]]
|
111 |
credits = list(df[7])
|
112 |
|
113 |
# Initialize lists for employer-related fields
|
|
|
128 |
date_joined.append(np.nan)
|
129 |
termination_date.append(np.nan)
|
130 |
email_addr.append(np.nan)
|
131 |
+
elif name == "EDAS":
|
132 |
+
employer_codes.append(np.nan)
|
133 |
+
bank_statemnt_ref.append(np.nan)
|
134 |
+
account_mgr.append(np.nan)
|
135 |
+
emp_province.append(np.nan)
|
136 |
+
region.append(np.nan)
|
137 |
+
industry.append(np.nan)
|
138 |
+
contributing_stts.append(np.nan)
|
139 |
+
date_joined.append(np.nan)
|
140 |
+
termination_date.append(np.nan)
|
141 |
+
email_addr.append(np.nan)
|
142 |
else:
|
143 |
tmp = master_data[master_data['Employer Name'] == name]
|
144 |
if tmp.empty:
|
|
|
214 |
# Preprocess queries from transaction data
|
215 |
queries = list(df[9])
|
216 |
queries = [query[:query.rindex('-')] for query in queries] # Extract part of the query before '-'
|
217 |
+
acc_nos = list(df[1])
|
218 |
empnos = [fetch_empno(text) for text in queries]
|
219 |
new_queries = [preprocess_query(query) for query in queries]
|
220 |
|
|
|
230 |
exact_matches.append('')
|
231 |
|
232 |
res_names, found_by, scores = [], [], []
|
233 |
+
found_by_direct_search, found_by_emp_no, found_by_bm5, not_found, edas = 0, 0, 0, 0, 0
|
234 |
|
235 |
# Match each query to an employer
|
236 |
+
for query,empno_arr,exact_match,acc_no in zip(new_queries,empnos,exact_matches,acc_nos):
|
237 |
+
if acc_no == '1001233102':
|
238 |
+
edas+=1
|
239 |
+
res_names.append("EDAS")
|
240 |
+
found_by.append("EDAS")
|
241 |
+
else:
|
242 |
+
name = ""
|
243 |
+
# Find Employer by Direct Search
|
244 |
+
if exact_match!='':
|
245 |
+
name = exact_match
|
246 |
+
scores.append(100)
|
247 |
+
found_by_direct_search+=1
|
248 |
+
found_by.append("Direct Search")
|
249 |
+
res_names.append(name)
|
250 |
|
251 |
+
# Try to find an employer using the employer number if Direct Search fails
|
252 |
+
elif len(empno_arr) != 0:
|
253 |
+
for empno in empno_arr:
|
254 |
+
names = list(master_data[master_data['Employer Number']==empno]['Employer Name'])
|
255 |
+
if len(names)!=0:
|
256 |
+
name=names[0]
|
257 |
+
scores.append(100) # Perfect match with employee number
|
258 |
+
found_by_emp_no+=1
|
259 |
+
found_by.append("Employer Number")
|
260 |
+
res_names.append(name)
|
261 |
+
break
|
262 |
+
# Fall back to BM25 matching if the employer number lookup fails
|
263 |
+
if name=="":
|
264 |
+
tokenized_query = query.split(" ")
|
265 |
+
name = bm25.get_top_n(tokenized_query, corpus, n=1)
|
266 |
+
doc_score = max(bm25.get_scores(tokenized_query))
|
267 |
+
scores.append(doc_score)
|
268 |
+
if doc_score>threshold:
|
269 |
+
found_by_bm5 += 1
|
270 |
+
res_names.append(name[0])
|
271 |
+
found_by.append("BM25")
|
272 |
+
else:
|
273 |
+
not_found+=1
|
274 |
+
res_names.append("NOT FOUND")
|
275 |
+
found_by.append("NOT FOUND")
|
276 |
|
277 |
# Generate the final result DataFrame
|
278 |
res_df = generate_df(master_data=master_data, df=df, employer_names=res_names)
|
279 |
+
print(f"{found_by_direct_search=},{found_by_emp_no=},{found_by_bm5=},{not_found=},{edas=}")
|
280 |
return res_df, found_by_direct_search, found_by_emp_no, found_by_bm5, not_found
|