chandanzeon committed on
Commit
2d3bc6e
·
1 Parent(s): 54b1517

First Commit

Browse files
Files changed (3) hide show
  1. app.py +125 -0
  2. helper.py +399 -0
  3. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ from io import BytesIO
4
+ from helper import get_res_df
5
+
6
def to_excel(df):
    """Serialize *df* to an in-memory .xlsx workbook.

    Parameters:
        df (DataFrame): Table to export.

    Returns:
        bytes: Raw bytes of the generated Excel file, ready for download.
    """
    buffer = BytesIO()
    # ExcelWriter flushes the workbook into the buffer when the context exits.
    with pd.ExcelWriter(buffer, engine='xlsxwriter') as xl_writer:
        df.to_excel(xl_writer, index=False)
    return buffer.getvalue()
22
+
23
def process_files(excel_file, text_file):
    """Load the uploaded master sheet and the transaction text file.

    Parameters:
        excel_file (UploadedFile): Master data as .csv, .xls or .xlsx; must
            contain a 'cfcf' employer-code column.
        text_file (UploadedFile): Text file with one comma-separated
            transaction record per line.

    Returns:
        Tuple[DataFrame, DataFrame]: (master DataFrame with 'cfcf' normalized
        to zero-padded 6-digit strings, transaction DataFrame whose columns
        are positional integers).
    """
    # Pick the reader based on the uploaded file's extension.
    if excel_file.name.endswith('.csv'):
        df_excel = pd.read_csv(excel_file)
    else:
        df_excel = pd.read_excel(excel_file)

    # Employer codes must be zero-padded 6-digit strings so they compare
    # equal to the codes extracted from the text file.
    df_excel['cfcf'] = df_excel['cfcf'].astype(str).str.zfill(6)

    # Parse the text file: one comma-separated record per line.
    lines = text_file.read().decode('utf-8').splitlines()
    data = [line.strip().split(',') for line in lines]
    df = pd.DataFrame(data)

    return df_excel, df
54
+
55
+
56
# Streamlit UI section
st.title("Fetch Employer")  # Application title

# Widgets for the two required inputs: the master sheet and the bank text file.
excel_upload = st.file_uploader("Upload the Master file(.xls or .csv)", type=["csv", "xls", "xlsx"])
text_upload = st.file_uploader("Upload your Text file(.txt)", type=["txt"])

# Proceed only once both files have been supplied.
if excel_upload and text_upload:
    st.write("Processing the files...")
    master_data, df = process_files(excel_upload, text_upload)

    st.write("Final Output")
    result = get_res_df(master_data, df)
    st.dataframe(result)  # Render the matched table in the app

    # Offer the matched table back to the user as a downloadable workbook.
    st.download_button(label="Download Excel",
                       data=to_excel(result),
                       file_name='Fetch_Employer_Output.xlsx',
                       mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')
80
+
81
+
82
+ # import streamlit as st
83
+ # import pandas as pd
84
+ # from io import BytesIO
85
+ # from helper import get_res_df
86
+
87
+ # def to_excel(df):
88
+ # output = BytesIO()
89
+ # with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
90
+ # df.to_excel(writer, index=False)
91
+ # processed_data = output.getvalue()
92
+ # return processed_data
93
+
94
+ # def process_files(excel_file, text_file):
95
+ # print(excel_file,text_file)
96
+ # if excel_file.name.endswith('.csv'):
97
+ # df_excel = pd.read_csv(excel_file)
98
+ # else:
99
+ # df_excel = pd.read_excel(excel_file)
100
+ # df_excel['cfcf']=[str(number).zfill(6) for number in df_excel['cfcf']]
101
+
102
+ # lines = text_file.read().decode('utf-8').splitlines()
103
+ # data = [line.strip().split(',') for line in lines]
104
+ # df = pd.DataFrame(data)
105
+
106
+ # return df_excel,df
107
+
108
+
109
+ # st.title("Fetch Employer")
110
+
111
+ # uploaded_excel = st.file_uploader("Upload the Master file(.xls or .csv)", type=["csv", "xls", "xlsx"])
112
+ # uploaded_text = st.file_uploader("Upload your Text file(.txt)", type=["txt"])
113
+
114
+ # if uploaded_excel and uploaded_text:
115
+ # st.write("Processing the files...")
116
+ # master_data, df = process_files(uploaded_excel, uploaded_text)
117
+
118
+ # st.write("Final Output")
119
+ # res = get_res_df(master_data,df)
120
+ # st.dataframe(res)
121
+ # excel_data = to_excel(res)
122
+ # st.download_button(label="Download Excel",
123
+ # data=excel_data,
124
+ # file_name='Fetch_Employer_Output.xlsx',
125
+ # mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')
helper.py ADDED
@@ -0,0 +1,399 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from rank_bm25 import BM25Okapi
4
+ import re
5
+ from nltk.stem import WordNetLemmatizer, PorterStemmer
6
+ from datetime import datetime
7
+
8
+ lemmatizer = WordNetLemmatizer()
9
+
10
+ threshold = 11.6 # Threshold score for employer match
11
+
12
def clean_text(text):
    """Normalize an employer-name / narration string for BM25 matching.

    Steps, in order:
      - lowercase
      - replace special characters (everything but alphanumerics, whitespace,
        '.' and '/') with spaces, then strip periods and slashes
      - remove runs of 3 or more digits (account numbers, references, ...)
      - expand common abbreviations as whole words only
        (pvt -> private, ltd -> limited, dev -> development, co -> corporation)
      - collapse whitespace and lemmatize each word

    Parameters:
        text (str): Raw text to normalize.

    Returns:
        str: Cleaned, lemmatized text with surrounding whitespace stripped.
    """
    cleaned_text = text.lower()
    cleaned_text = re.sub(r'[^A-Za-z0-9\s./]', ' ', cleaned_text)  # Drop special characters
    cleaned_text = re.sub(r'\.', '', cleaned_text)  # Remove periods
    cleaned_text = re.sub(r'\/', '', cleaned_text)  # Remove slashes
    cleaned_text = re.sub(r'\d{3,}', '', cleaned_text)  # Remove runs of 3+ digits
    # Whole-word abbreviation expansion. The lookarounds match the treatment
    # already given to 'dev'/'co' below; the original bare 'pvt'/'ltd'
    # substitutions rewrote those sequences even inside longer words.
    cleaned_text = re.sub(r'(?<!\w)pvt(?!\w)', 'private', cleaned_text)
    cleaned_text = re.sub(r'(?<!\w)ltd(?!\w)', 'limited', cleaned_text)
    cleaned_text = re.sub(r'(?<!\w)dev(?!\w)', 'development', cleaned_text)
    cleaned_text = re.sub(r'(?<!\w)co(?!\w)', 'corporation', cleaned_text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)  # Collapse repeated whitespace
    cleaned_text = ' '.join(lemmatizer.lemmatize(word) for word in cleaned_text.split())
    return cleaned_text.strip()
38
+
39
def fetch_empno(text):
    """Pull every standalone 6-digit employee number out of *text*.

    Parameters:
        text (str): Free-form text (e.g. a bank narration line).

    Returns:
        list: All 6-digit sequences bounded by word boundaries, in order of
        appearance.
    """
    six_digit = re.compile(r'\b\d{6}\b')
    return six_digit.findall(text)
50
+
51
def preprocess_query(query):
    """Trim boilerplate from a raw narration and normalize it.

    When the narration contains '||', the text after it is kept; otherwise,
    when it contains '-', the text from the first '-' onward is kept. If the
    trimmed extract is shorter than 20 characters it is discarded in favour
    of the full narration. The chosen text is then run through clean_text().

    Parameters:
        query (str): Raw narration text.

    Returns:
        str: Cleaned query ready for BM25 tokenization.
    """
    candidate = query
    if '||' in query:
        candidate = query[query.find('||') + 2:]
    elif '-' in query:
        candidate = query[query.find('-'):]
    if len(candidate) < 20:
        candidate = query  # Too little left after trimming: keep the original
    return clean_text(candidate)
75
+
76
def parse_date(date_str):
    """Normalize a date string to 'DD/MM/YYYY'.

    Tries the known input formats in order. When none match, the input is
    returned unchanged — the original final fallback called
    date_str.strftime(...), which always raised AttributeError because
    date_str is a str (callers pass str(...)).

    Parameters:
        date_str (str): Date text, e.g. '2021-03-05 00:00:00' or '03/05/2021'.

    Returns:
        str: The date as 'DD/MM/YYYY', or the original string when no known
        format matches.
    """
    for fmt in ('%Y-%m-%d %H:%M:%S', '%m/%d/%Y'):
        try:
            return datetime.strptime(date_str, fmt).strftime('%d/%m/%Y')
        except ValueError:
            continue
    return date_str  # Unparseable (e.g. 'NaT' or free text): pass through as-is
94
+
95
def _employer_row(master_data, name):
    """Look up one employer's output fields in the master sheet.

    Returns the 10 per-employer output values (code, bank statement ref,
    account manager, province, region, industry, contributing status, date
    joined, termination date, email) taken from the LAST matching master row,
    or all-NaN when *name* is "NOT FOUND" or absent from the master data.
    """
    if name != "NOT FOUND":
        matches = master_data[master_data['Employer Name'] == name]
        if not matches.empty:
            last = matches.iloc[-1]  # Same row the original list(...)[-1] picked
            return [
                last['cfcf'],
                last['Bank Statement Reference'],
                last['NASFUNDContact'],
                last['Employer Province'],
                last['Region'],
                last['Industry'],
                last['Contributing Status'],
                parse_date(str(last['Date Joined Plan'])),
                last['Termination Date'],
                last['Email Addresses'],
            ]
    return [np.nan] * 10


def generate_df(master_data, df, employer_names):
    """Combine transaction details with matched employer information.

    Parameters:
        master_data (DataFrame): Master data containing employer information.
        df (DataFrame): Parsed transaction data (positional integer columns:
            0 = bank code, 4 = DDMMYY date, 7 = credit amount, 9 = narration).
        employer_names (list): Matched employer name per transaction
            ("NOT FOUND" for unmatched rows).

    Returns:
        DataFrame: One row per transaction with the employer fields attached.
    """
    dates = [datetime.strptime(date_str, '%d%m%y').strftime('%d/%m/%Y') for date_str in df[4]]
    bank_desc = list(df[9])
    # Bank code 713 -> NASA account, 068 -> EDAS, anything else unknown.
    accounts = ['NASA' if i == '713' else 'EDAS' if i == '068' else None for i in df[0]]
    credits = list(df[7])

    # One lookup per employer name; this replaces two duplicated 10-line
    # NaN-append branches over ten parallel lists in the original.
    rows = [_employer_row(master_data, name) for name in employer_names]
    (employer_codes, bank_statemnt_ref, account_mgr, emp_province, region,
     industry, contributing_stts, date_joined, termination_date,
     email_addr) = map(list, zip(*rows)) if rows else ([],) * 10

    return pd.DataFrame({
        'Receipt Date': dates,
        'Bank Description': bank_desc,
        'Account': accounts,
        ' Credit ': credits,
        'Employer Code': employer_codes,
        'Employer Name': employer_names,
        'Bank Statement Reference': bank_statemnt_ref,
        'Account Manager': account_mgr,
        'Employer Province': emp_province,
        'Region': region,
        'Industry': industry,
        'Contributing Status': contributing_stts,
        'Date Joined Plan': date_joined,
        'Termination Date': termination_date,
        'Email Addresses': email_addr,
        'First Name': np.nan,
        'Surname': np.nan,
        'Membership#': np.nan
    })
180
+
181
def get_res_df(master_data, df):
    """Match each bank narration to an employer and build the output table.

    Matching strategy, per narration:
      1. If the narration contains a 6-digit number present in
         master_data['cfcf'], use that employer (treated as a perfect match).
      2. Otherwise fall back to BM25 similarity between the cleaned narration
         and the cleaned employer names; scores at or below `threshold`
         become "NOT FOUND".

    Parameters:
        master_data (DataFrame): Master data containing employer information.
        df (DataFrame): Parsed transaction data (column 9 holds narrations).

    Returns:
        DataFrame: Transaction details joined with matched employer data.
    """
    # Build the BM25 corpus from the master employer names.
    corpus = list(master_data['Employer Name'])
    lower_case_corpus = [clean_text(name) for name in corpus]
    corpus = corpus[1:]  # Exclude the first row if it's a header
    lower_case_corpus = lower_case_corpus[1:]
    tokenized_corpus = [doc.split(' ') for doc in lower_case_corpus]
    bm25 = BM25Okapi(tokenized_corpus)

    # Keep only the narration text before the final '-' when one is present.
    # (The original unconditional rindex('-') raised ValueError on any
    # narration without a dash.)
    queries = list(df[9])
    queries = [q[:q.rindex('-')] if '-' in q else q for q in queries]
    empnos = [fetch_empno(text) for text in queries]
    new_queries = [preprocess_query(query) for query in queries]

    res_names, scores = [], []
    for query, empno_arr in zip(new_queries, empnos):
        name = ""
        # First preference: exact employee-number lookup in the master sheet.
        for empno in empno_arr:
            names = list(master_data[master_data['cfcf'] == empno]['Employer Name'])
            if names:
                name = names[0]
                scores.append(100)  # Treat an ID hit as a perfect match
                res_names.append(name)
                break
        if name == "":
            # Fall back to fuzzy name matching via BM25.
            tokenized_query = query.split(" ")
            top = bm25.get_top_n(tokenized_query, corpus, n=1)
            doc_score = max(bm25.get_scores(tokenized_query))
            scores.append(doc_score)
            res_names.append(top[0] if doc_score > threshold else "NOT FOUND")

    # NOTE: the original also tallied sub-threshold scores into an unused
    # `not_found` counter; that dead code has been removed.
    return generate_df(master_data=master_data, df=df, employer_names=res_names)
237
+
238
+ # import pandas as pd
239
+ # import numpy as np
240
+ # from rank_bm25 import BM25Okapi
241
+ # import re
242
+ # from nltk.stem import WordNetLemmatizer,PorterStemmer
243
+ # from datetime import datetime
244
+ # lemmatizer = WordNetLemmatizer()
245
+
246
+ # threshold = 11
247
+
248
+ # def clean_text(text):
249
+ # cleaned_text = text.lower()
250
+ # cleaned_text = re.sub(r'[^A-Za-z0-9\s./]', ' ', cleaned_text)
251
+ # cleaned_text = re.sub(r'\.', '', cleaned_text)
252
+ # cleaned_text = re.sub(r'\/', '', cleaned_text)
253
+ # cleaned_text = re.sub(r'\d{3,}', '', cleaned_text)
254
+ # cleaned_text = re.sub('pvt','private',cleaned_text)
255
+ # cleaned_text = re.sub('ltd','limited',cleaned_text)
256
+ # cleaned_text = re.sub(r'(?<!\w)dev(?!\w)', 'development',cleaned_text)
257
+ # cleaned_text = re.sub(r'(?<!\w)co(?!\w)', 'corporation',cleaned_text)
258
+ # cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
259
+ # cleaned_text = ' '.join([lemmatizer.lemmatize(word) for word in cleaned_text.split()])
260
+ # # cleaned_text = ' '.join([stemmer.stem(word) for word in cleaned_text.split()])
261
+ # return cleaned_text.strip()
262
+
263
+ # def fetch_empno(text):
264
+ # return re.findall(r'\b\d{6}\b', text)
265
+
266
+ # def preprocess_query(query):
267
+ # new_query = query
268
+ # if '||' in query:
269
+ # ind = query.find('||')
270
+ # new_query=query[ind+2:]
271
+ # elif '-' in query:
272
+ # ind = query.find('-')
273
+ # new_query=query[ind:]
274
+ # if len(new_query) < 20:
275
+ # new_query = query
276
+ # new_query = clean_text(new_query)
277
+ # return new_query
278
+
279
+ # def parse_date(date_str):
280
+ # try:
281
+ # return datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S').strftime('%d/%m/%Y')
282
+ # except ValueError:
283
+ # try:
284
+ # return datetime.strptime(date_str, '%m/%d/%Y').strftime('%d/%m/%Y')
285
+ # except ValueError:
286
+ # return date_str.strftime('%m/%d/%Y')
287
+
288
+ # def generate_df(master_data, df, employer_names):
289
+ # dates = [datetime.strptime(date_str, '%d%m%y').strftime('%d/%m/%Y') for date_str in df[4]]
290
+ # bank_desc = list(df[9])
291
+ # accounts = ['NASA' if i == '713' else 'EDAS' if i == '068' else None for i in df[0]]
292
+ # credits = list(df[7])
293
+ # employer_codes = []
294
+ # bank_statemnt_ref = []
295
+ # account_mgr = []
296
+ # emp_province = []
297
+ # region = []
298
+ # industry = []
299
+ # contributing_stts = []
300
+ # date_joined = []
301
+ # termination_date = []
302
+ # email_addr = []
303
+ # for name in employer_names:
304
+ # if name=="NOT FOUND":
305
+ # employer_codes.append(np.nan)
306
+ # bank_statemnt_ref.append(np.nan)
307
+ # account_mgr.append(np.nan)
308
+ # emp_province.append(np.nan)
309
+ # region.append(np.nan)
310
+ # industry.append(np.nan)
311
+ # contributing_stts.append(np.nan)
312
+ # date_joined.append(np.nan)
313
+ # termination_date.append(np.nan)
314
+ # email_addr.append(np.nan)
315
+ # else:
316
+ # tmp = master_data[master_data['Employer Name']==name]
317
+ # if tmp.empty:
318
+ # employer_codes.append(np.nan)
319
+ # bank_statemnt_ref.append(np.nan)
320
+ # account_mgr.append(np.nan)
321
+ # emp_province.append(np.nan)
322
+ # region.append(np.nan)
323
+ # industry.append(np.nan)
324
+ # contributing_stts.append(np.nan)
325
+ # date_joined.append(np.nan)
326
+ # termination_date.append(np.nan)
327
+ # email_addr.append(np.nan)
328
+ # else:
329
+ # employer_codes.append(list(tmp['cfcf'])[-1])
330
+ # bank_statemnt_ref.append(list(tmp['Bank Statement Reference'])[-1])
331
+ # account_mgr.append(list(tmp['NASFUNDContact'])[-1])
332
+ # emp_province.append(list(tmp['Employer Province'])[-1])
333
+ # region.append(list(tmp['Region'])[-1])
334
+ # industry.append(list(tmp['Industry'])[-1])
335
+ # contributing_stts.append(list(tmp['Contributing Status'])[-1])
336
+ # date = str(list(tmp['Date Joined Plan'])[-1])
337
+ # date_joined.append(parse_date(date))
338
+ # termination_date.append(list(tmp['Termination Date'])[-1])
339
+ # email_addr.append(list(tmp['Email Addresses'])[-1])
340
+
341
+ # res_df = pd.DataFrame()
342
+ # res_df['Receipt Date'] = dates
343
+ # res_df['Bank Description'] = bank_desc
344
+ # res_df['Account'] = accounts
345
+ # res_df[' Credit '] = credits
346
+ # res_df['Employer Code'] = employer_codes
347
+ # res_df['Employer Name'] = employer_names
348
+ # res_df['Bank Statement Reference'] = bank_statemnt_ref
349
+ # res_df['Account Manager'] = account_mgr
350
+ # res_df['Employer Province'] = emp_province
351
+ # res_df['Region'] = region
352
+ # res_df['Industry'] = industry
353
+ # res_df['Contributing Status'] = contributing_stts
354
+ # res_df['Date Joined Plan'] = date_joined
355
+ # res_df['Termination Date'] = termination_date
356
+ # res_df['Email Addresses'] = email_addr
357
+ # res_df['First Name'] = np.nan
358
+ # res_df['Surname'] = np.nan
359
+ # res_df['Membership#'] = np.nan
360
+
361
+ # return res_df
362
+
363
+ # def get_res_df(master_data,df):
364
+ # corpus = list(master_data['Employer Name'])
365
+ # lower_case_corpus = [clean_text(name) for name in corpus]
366
+ # corpus = corpus[1:]
367
+ # lower_case_corpus = lower_case_corpus[1:]
368
+ # tokenized_corpus = [doc.split(' ') for doc in lower_case_corpus]
369
+ # bm25 = BM25Okapi(tokenized_corpus)
370
+ # queries = list(df[9])
371
+ # queries = [query[:query.rindex('-')] for query in queries]
372
+ # empnos = [fetch_empno(text) for text in queries]
373
+ # new_queries = [preprocess_query(query) for query in queries]
374
+
375
+ # res_names = []
376
+ # scores = []
377
+ # for query,empno_arr in zip(new_queries,empnos):
378
+ # name = ""
379
+ # if len(empno_arr) != 0:
380
+ # for empno in empno_arr:
381
+ # names = list(master_data[master_data['cfcf']==empno]['Employer Name'])
382
+ # if len(names)!=0:
383
+ # name=names[0]
384
+ # scores.append(100)
385
+ # res_names.append(name)
386
+ # break
387
+ # if name=="":
388
+ # tokenized_query = query.split(" ")
389
+ # name = bm25.get_top_n(tokenized_query, corpus, n=1)
390
+ # doc_score = max(bm25.get_scores(tokenized_query))
391
+ # scores.append(doc_score)
392
+ # res_names.append(name[0] if doc_score>threshold else "NOT FOUND")
393
+ # not_found=0
394
+ # for score in scores:
395
+ # if score<threshold:
396
+ # not_found+=1
397
+ # res_df = generate_df(master_data=master_data,df=df,employer_names=res_names)
398
+
399
+ # return res_df
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ xlsxwriter==3.2.0
2
+ rank-bm25==0.2.2
3
+ numpy
4
+ pandas
5
+ streamlit==1.32.0
6
+ nltk==3.8.1