chandanzeon committed on
Commit
4efd35b
·
1 Parent(s): 80b184f

ignore other accounts

Browse files
Files changed (3) hide show
  1. __pycache__/helper.cpython-312.pyc +0 -0
  2. app.py +2 -0
  3. helper.py +54 -37
__pycache__/helper.cpython-312.pyc CHANGED
Binary files a/__pycache__/helper.cpython-312.pyc and b/__pycache__/helper.cpython-312.pyc differ
 
app.py CHANGED
@@ -44,6 +44,7 @@ def process_files(excel_file, text_file):
44
 
45
  # Ensure the 'Employer Number' column values are formatted as zero-padded 6-digit strings
46
  df_excel['Employer Number'] = [str(number).zfill(6) for number in df_excel['Employer Number']]
 
47
 
48
  # Read and process the text file content into a list of lines
49
  lines = text_file.read().decode('utf-8').splitlines()
@@ -51,6 +52,7 @@ def process_files(excel_file, text_file):
51
 
52
  # Create a DataFrame from the parsed text file data
53
  df = pd.DataFrame(data)
 
54
 
55
  return df_excel, df
56
 
 
44
 
45
  # Ensure the 'Employer Number' column values are formatted as zero-padded 6-digit strings
46
  df_excel['Employer Number'] = [str(number).zfill(6) for number in df_excel['Employer Number']]
47
+ df_excel = df_excel.dropna(subset=['Employer Name'])
48
 
49
  # Read and process the text file content into a list of lines
50
  lines = text_file.read().decode('utf-8').splitlines()
 
52
 
53
  # Create a DataFrame from the parsed text file data
54
  df = pd.DataFrame(data)
55
+ df = df[df[1].isin(['1001010071', '1001233102'])]
56
 
57
  return df_excel, df
58
 
helper.py CHANGED
@@ -107,7 +107,7 @@ def generate_df(master_data, df, employer_names):
107
  """
108
  dates = [datetime.strptime(date_str, '%d%m%y').strftime('%d/%m/%Y') for date_str in df[4]]
109
  bank_desc = list(df[9])
110
- accounts = ['NASA' if i == '713' else 'EDAS' if i == '068' else None for i in df[0]]
111
  credits = list(df[7])
112
 
113
  # Initialize lists for employer-related fields
@@ -128,6 +128,17 @@ def generate_df(master_data, df, employer_names):
128
  date_joined.append(np.nan)
129
  termination_date.append(np.nan)
130
  email_addr.append(np.nan)
 
 
 
 
 
 
 
 
 
 
 
131
  else:
132
  tmp = master_data[master_data['Employer Name'] == name]
133
  if tmp.empty:
@@ -203,6 +214,7 @@ def get_res_df(master_data, df, thrshld):
203
  # Preprocess queries from transaction data
204
  queries = list(df[9])
205
  queries = [query[:query.rindex('-')] for query in queries] # Extract part of the query before '-'
 
206
  empnos = [fetch_empno(text) for text in queries]
207
  new_queries = [preprocess_query(query) for query in queries]
208
 
@@ -218,46 +230,51 @@ def get_res_df(master_data, df, thrshld):
218
  exact_matches.append('')
219
 
220
  res_names, found_by, scores = [], [], []
221
- found_by_direct_search, found_by_emp_no, found_by_bm5, not_found = 0, 0, 0, 0
222
 
223
  # Match each query to an employer
224
- for query,empno_arr,exact_match in zip(new_queries,empnos,exact_matches):
225
- name = ""
226
- # Find Employer by Direct Search
227
- if exact_match!='':
228
- name = exact_match
229
- scores.append(100)
230
- found_by_direct_search+=1
231
- found_by.append("Direct Search")
232
- res_names.append(name)
 
 
 
 
 
233
 
234
- # Try to find an employer using the employee number if Direct Search Fails
235
- elif len(empno_arr) != 0:
236
- for empno in empno_arr:
237
- names = list(master_data[master_data['Employer Number']==empno]['Employer Name'])
238
- if len(names)!=0:
239
- name=names[0]
240
- scores.append(100) # Perfect match with employee number
241
- found_by_emp_no+=1
242
- found_by.append("Employer Number")
243
- res_names.append(name)
244
- break
245
- # Fall back to BM25 matching if employee number fails
246
- if name=="":
247
- tokenized_query = query.split(" ")
248
- name = bm25.get_top_n(tokenized_query, corpus, n=1)
249
- doc_score = max(bm25.get_scores(tokenized_query))
250
- scores.append(doc_score)
251
- if doc_score>threshold:
252
- found_by_bm5 += 1
253
- res_names.append(name[0])
254
- found_by.append("BM25")
255
- else:
256
- not_found+=1
257
- res_names.append("NOT FOUND")
258
- found_by.append("NOT FOUND")
259
 
260
  # Generate the final result DataFrame
261
  res_df = generate_df(master_data=master_data, df=df, employer_names=res_names)
262
-
263
  return res_df, found_by_direct_search, found_by_emp_no, found_by_bm5, not_found
 
107
  """
108
  dates = [datetime.strptime(date_str, '%d%m%y').strftime('%d/%m/%Y') for date_str in df[4]]
109
  bank_desc = list(df[9])
110
+ accounts = ['NASA' if i == '1001010071' else 'EDAS' if i == '1001233102' else '' for i in df[1]]
111
  credits = list(df[7])
112
 
113
  # Initialize lists for employer-related fields
 
128
  date_joined.append(np.nan)
129
  termination_date.append(np.nan)
130
  email_addr.append(np.nan)
131
+ elif name == "EDAS":
132
+ employer_codes.append(np.nan)
133
+ bank_statemnt_ref.append(np.nan)
134
+ account_mgr.append(np.nan)
135
+ emp_province.append(np.nan)
136
+ region.append(np.nan)
137
+ industry.append(np.nan)
138
+ contributing_stts.append(np.nan)
139
+ date_joined.append(np.nan)
140
+ termination_date.append(np.nan)
141
+ email_addr.append(np.nan)
142
  else:
143
  tmp = master_data[master_data['Employer Name'] == name]
144
  if tmp.empty:
 
214
  # Preprocess queries from transaction data
215
  queries = list(df[9])
216
  queries = [query[:query.rindex('-')] for query in queries] # Extract part of the query before '-'
217
+ acc_nos = list(df[1])
218
  empnos = [fetch_empno(text) for text in queries]
219
  new_queries = [preprocess_query(query) for query in queries]
220
 
 
230
  exact_matches.append('')
231
 
232
  res_names, found_by, scores = [], [], []
233
+ found_by_direct_search, found_by_emp_no, found_by_bm5, not_found, edas = 0, 0, 0, 0, 0
234
 
235
  # Match each query to an employer
236
+ for query,empno_arr,exact_match,acc_no in zip(new_queries,empnos,exact_matches,acc_nos):
237
+ if acc_no == '1001233102':
238
+ edas+=1
239
+ res_names.append("EDAS")
240
+ found_by.append("EDAS")
241
+ else:
242
+ name = ""
243
+ # Find Employer by Direct Search
244
+ if exact_match!='':
245
+ name = exact_match
246
+ scores.append(100)
247
+ found_by_direct_search+=1
248
+ found_by.append("Direct Search")
249
+ res_names.append(name)
250
 
251
+ # Try to find an employer using the employee number if Direct Search Fails
252
+ elif len(empno_arr) != 0:
253
+ for empno in empno_arr:
254
+ names = list(master_data[master_data['Employer Number']==empno]['Employer Name'])
255
+ if len(names)!=0:
256
+ name=names[0]
257
+ scores.append(100) # Perfect match with employee number
258
+ found_by_emp_no+=1
259
+ found_by.append("Employer Number")
260
+ res_names.append(name)
261
+ break
262
+ # Fall back to BM25 matching if employee number fails
263
+ if name=="":
264
+ tokenized_query = query.split(" ")
265
+ name = bm25.get_top_n(tokenized_query, corpus, n=1)
266
+ doc_score = max(bm25.get_scores(tokenized_query))
267
+ scores.append(doc_score)
268
+ if doc_score>threshold:
269
+ found_by_bm5 += 1
270
+ res_names.append(name[0])
271
+ found_by.append("BM25")
272
+ else:
273
+ not_found+=1
274
+ res_names.append("NOT FOUND")
275
+ found_by.append("NOT FOUND")
276
 
277
  # Generate the final result DataFrame
278
  res_df = generate_df(master_data=master_data, df=df, employer_names=res_names)
279
+ print(f"{found_by_direct_search=},{found_by_emp_no=},{found_by_bm5=},{not_found=},{edas=}")
280
  return res_df, found_by_direct_search, found_by_emp_no, found_by_bm5, not_found