chandanzeon committed
Commit 7eab253 · 1 Parent(s): 891d816

removed commented-out code

Files changed (2)
  1. app.py +0 -46
  2. helper.py +0 -163
app.py CHANGED
@@ -80,49 +80,3 @@ if uploaded_excel and uploaded_text:
                       data=excel_data,
                       file_name='Fetch_Employer_Output.xlsx',
                       mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')
-
-
- # import streamlit as st
- # import pandas as pd
- # from io import BytesIO
- # from helper import get_res_df
-
- # def to_excel(df):
- #     output = BytesIO()
- #     with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
- #         df.to_excel(writer, index=False)
- #     processed_data = output.getvalue()
- #     return processed_data
-
- # def process_files(excel_file, text_file):
- #     print(excel_file,text_file)
- #     if excel_file.name.endswith('.csv'):
- #         df_excel = pd.read_csv(excel_file)
- #     else:
- #         df_excel = pd.read_excel(excel_file)
- #     df_excel['Employer Number']=[str(number).zfill(6) for number in df_excel['Employer Number']]
-
- #     lines = text_file.read().decode('utf-8').splitlines()
- #     data = [line.strip().split(',') for line in lines]
- #     df = pd.DataFrame(data)
-
- #     return df_excel,df
-
-
- # st.title("Fetch Employer")
-
- # uploaded_excel = st.file_uploader("Upload the Master file(.xls or .csv)", type=["csv", "xls", "xlsx"])
- # uploaded_text = st.file_uploader("Upload your Text file(.txt)", type=["txt"])
-
- # if uploaded_excel and uploaded_text:
- #     st.write("Processing the files...")
- #     master_data, df = process_files(uploaded_excel, uploaded_text)
-
- #     st.write("Final Output")
- #     res = get_res_df(master_data,df)
- #     st.dataframe(res)
- #     excel_data = to_excel(res)
- #     st.download_button(label="Download Excel",
- #                        data=excel_data,
- #                        file_name='Fetch_Employer_Output.xlsx',
- #                        mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')
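Note: the commented-out block deleted above simply duplicated the live export path that remains in app.py (old lines 80-82). A minimal, self-contained sketch of that pattern, writing a DataFrame into an in-memory Excel buffer and serving it via st.download_button; the sample frame below is illustrative and xlsxwriter is assumed to be installed:

import pandas as pd
import streamlit as st
from io import BytesIO

def to_excel(df: pd.DataFrame) -> bytes:
    # Render the DataFrame into an in-memory xlsx buffer instead of a temp file.
    output = BytesIO()
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
        df.to_excel(writer, index=False)
    return output.getvalue()

# Illustrative data; the real app builds this frame with get_res_df.
res = pd.DataFrame({'Employer Code': ['000123'], 'Employer Name': ['Example Limited']})
st.download_button(label="Download Excel",
                   data=to_excel(res),
                   file_name='Fetch_Employer_Output.xlsx',
                   mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')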
 
helper.py CHANGED
@@ -234,166 +234,3 @@ def get_res_df(master_data, df):
    res_df = generate_df(master_data=master_data, df=df, employer_names=res_names)

    return res_df
-
- # import pandas as pd
- # import numpy as np
- # from rank_bm25 import BM25Okapi
- # import re
- # from nltk.stem import WordNetLemmatizer,PorterStemmer
- # from datetime import datetime
- # lemmatizer = WordNetLemmatizer()
-
- # threshold = 11
-
- # def clean_text(text):
- #     cleaned_text = text.lower()
- #     cleaned_text = re.sub(r'[^A-Za-z0-9\s./]', ' ', cleaned_text)
- #     cleaned_text = re.sub(r'\.', '', cleaned_text)
- #     cleaned_text = re.sub(r'\/', '', cleaned_text)
- #     cleaned_text = re.sub(r'\d{3,}', '', cleaned_text)
- #     cleaned_text = re.sub('pvt','private',cleaned_text)
- #     cleaned_text = re.sub('ltd','limited',cleaned_text)
- #     cleaned_text = re.sub(r'(?<!\w)dev(?!\w)', 'development',cleaned_text)
- #     cleaned_text = re.sub(r'(?<!\w)co(?!\w)', 'corporation',cleaned_text)
- #     cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
- #     cleaned_text = ' '.join([lemmatizer.lemmatize(word) for word in cleaned_text.split()])
- #     # cleaned_text = ' '.join([stemmer.stem(word) for word in cleaned_text.split()])
- #     return cleaned_text.strip()
-
- # def fetch_empno(text):
- #     return re.findall(r'\b\d{6}\b', text)
-
- # def preprocess_query(query):
- #     new_query = query
- #     if '||' in query:
- #         ind = query.find('||')
- #         new_query=query[ind+2:]
- #     elif '-' in query:
- #         ind = query.find('-')
- #         new_query=query[ind:]
- #     if len(new_query) < 20:
- #         new_query = query
- #     new_query = clean_text(new_query)
- #     return new_query
-
- # def parse_date(date_str):
- #     try:
- #         return datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S').strftime('%d/%m/%Y')
- #     except ValueError:
- #         try:
- #             return datetime.strptime(date_str, '%m/%d/%Y').strftime('%d/%m/%Y')
- #         except ValueError:
- #             return date_str.strftime('%m/%d/%Y')
-
- # def generate_df(master_data, df, employer_names):
- #     dates = [datetime.strptime(date_str, '%d%m%y').strftime('%d/%m/%Y') for date_str in df[4]]
- #     bank_desc = list(df[9])
- #     accounts = ['NASA' if i == '713' else 'EDAS' if i == '068' else None for i in df[0]]
- #     credits = list(df[7])
- #     employer_codes = []
- #     bank_statemnt_ref = []
- #     account_mgr = []
- #     emp_province = []
- #     region = []
- #     industry = []
- #     contributing_stts = []
- #     date_joined = []
- #     termination_date = []
- #     email_addr = []
- #     for name in employer_names:
- #         if name=="NOT FOUND":
- #             employer_codes.append(np.nan)
- #             bank_statemnt_ref.append(np.nan)
- #             account_mgr.append(np.nan)
- #             emp_province.append(np.nan)
- #             region.append(np.nan)
- #             industry.append(np.nan)
- #             contributing_stts.append(np.nan)
- #             date_joined.append(np.nan)
- #             termination_date.append(np.nan)
- #             email_addr.append(np.nan)
- #         else:
- #             tmp = master_data[master_data['Employer Name']==name]
- #             if tmp.empty:
- #                 employer_codes.append(np.nan)
- #                 bank_statemnt_ref.append(np.nan)
- #                 account_mgr.append(np.nan)
- #                 emp_province.append(np.nan)
- #                 region.append(np.nan)
- #                 industry.append(np.nan)
- #                 contributing_stts.append(np.nan)
- #                 date_joined.append(np.nan)
- #                 termination_date.append(np.nan)
- #                 email_addr.append(np.nan)
- #             else:
- #                 employer_codes.append(list(tmp['Employer Number'])[-1])
- #                 bank_statemnt_ref.append(list(tmp['Bank Statement Reference'])[-1])
- #                 account_mgr.append(list(tmp['NASFUNDContact'])[-1])
- #                 emp_province.append(list(tmp['Employer Province'])[-1])
- #                 region.append(list(tmp['Region'])[-1])
- #                 industry.append(list(tmp['Industry'])[-1])
- #                 contributing_stts.append(list(tmp['Contributing Status'])[-1])
- #                 date = str(list(tmp['Date Joined Plan'])[-1])
- #                 date_joined.append(parse_date(date))
- #                 termination_date.append(list(tmp['Termination Date'])[-1])
- #                 email_addr.append(list(tmp['Email Addresses'])[-1])
-
- #     res_df = pd.DataFrame()
- #     res_df['Receipt Date'] = dates
- #     res_df['Bank Description'] = bank_desc
- #     res_df['Account'] = accounts
- #     res_df[' Credit '] = credits
- #     res_df['Employer Code'] = employer_codes
- #     res_df['Employer Name'] = employer_names
- #     res_df['Bank Statement Reference'] = bank_statemnt_ref
- #     res_df['Account Manager'] = account_mgr
- #     res_df['Employer Province'] = emp_province
- #     res_df['Region'] = region
- #     res_df['Industry'] = industry
- #     res_df['Contributing Status'] = contributing_stts
- #     res_df['Date Joined Plan'] = date_joined
- #     res_df['Termination Date'] = termination_date
- #     res_df['Email Addresses'] = email_addr
- #     res_df['First Name'] = np.nan
- #     res_df['Surname'] = np.nan
- #     res_df['Membership#'] = np.nan
-
- #     return res_df
-
- # def get_res_df(master_data,df):
- #     corpus = list(master_data['Employer Name'])
- #     lower_case_corpus = [clean_text(name) for name in corpus]
- #     corpus = corpus[1:]
- #     lower_case_corpus = lower_case_corpus[1:]
- #     tokenized_corpus = [doc.split(' ') for doc in lower_case_corpus]
- #     bm25 = BM25Okapi(tokenized_corpus)
- #     queries = list(df[9])
- #     queries = [query[:query.rindex('-')] for query in queries]
- #     empnos = [fetch_empno(text) for text in queries]
- #     new_queries = [preprocess_query(query) for query in queries]
-
- #     res_names = []
- #     scores = []
- #     for query,empno_arr in zip(new_queries,empnos):
- #         name = ""
- #         if len(empno_arr) != 0:
- #             for empno in empno_arr:
- #                 names = list(master_data[master_data['Employer Number']==empno]['Employer Name'])
- #                 if len(names)!=0:
- #                     name=names[0]
- #                     scores.append(100)
- #                     res_names.append(name)
- #                     break
- #         if name=="":
- #             tokenized_query = query.split(" ")
- #             name = bm25.get_top_n(tokenized_query, corpus, n=1)
- #             doc_score = max(bm25.get_scores(tokenized_query))
- #             scores.append(doc_score)
- #             res_names.append(name[0] if doc_score>threshold else "NOT FOUND")
- #     not_found=0
- #     for score in scores:
- #         if score<threshold:
- #             not_found+=1
- #     res_df = generate_df(master_data=master_data,df=df,employer_names=res_names)
-
- #     return res_df
 
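For context, the commented-out block deleted from helper.py mirrored the BM25 lookup that the live get_res_df still performs: employer names are cleaned and tokenized into a corpus, each bank-statement description becomes a query, and anything scoring at or below the threshold is reported as "NOT FOUND". A minimal sketch of that rank_bm25 usage, with an illustrative corpus and query (the threshold value 11 comes from the removed code):

from rank_bm25 import BM25Okapi

# Illustrative corpus; the real app tokenizes the cleaned 'Employer Name' column.
corpus = ['example private limited', 'sample development corporation', 'acme holdings']
tokenized_corpus = [doc.split(' ') for doc in corpus]
bm25 = BM25Okapi(tokenized_corpus)

threshold = 11  # cut-off below which the removed code flagged a match as NOT FOUND

query = 'acme holdings contribution'   # illustrative bank-statement description
tokenized_query = query.split(' ')
best_name = bm25.get_top_n(tokenized_query, corpus, n=1)[0]  # top-ranked employer name
best_score = max(bm25.get_scores(tokenized_query))           # its BM25 score
print(best_name if best_score > threshold else 'NOT FOUND')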