chandanzeon committed on
Commit
fd2307e
·
1 Parent(s): 7eab253

added string search

Browse files
Files changed (3) hide show
  1. __pycache__/helper.cpython-312.pyc +0 -0
  2. app.py +18 -6
  3. helper.py +42 -16
__pycache__/helper.cpython-312.pyc ADDED
Binary file (12.6 kB). View file
 
app.py CHANGED
@@ -17,7 +17,6 @@ def to_excel(df):
17
  bytes: The in-memory Excel file data.
18
  """
19
  output = BytesIO()
20
- # Use the Pandas ExcelWriter to write the DataFrame to an in-memory file
21
  with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
22
  df.to_excel(writer, index=False)
23
  processed_data = output.getvalue()
@@ -59,17 +58,30 @@ def process_files(excel_file, text_file):
59
  # Streamlit UI section
60
  st.title("Fetch Employer") # Application title
61
 
62
- # File uploader widgets to allow users to upload an Excel/CSV file and a text file
63
- uploaded_excel = st.file_uploader("Upload the Master file(.xls or .csv)", type=["csv", "xls", "xlsx"])
64
- uploaded_text = st.file_uploader("Upload your Text file(.txt)", type=["txt"])
 
 
65
 
66
  # Check if both files are uploaded
67
  if uploaded_excel and uploaded_text:
68
- st.write("Processing the files...") # Inform the user that the files are being processed
69
  master_data, df = process_files(uploaded_excel, uploaded_text) # Process the files
70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  st.write("Final Output") # Display the result of file processing
72
- res = get_res_df(master_data, df) # Generate the result DataFrame using the helper function
73
  st.dataframe(res) # Show the result in a table format on the web app
74
 
75
  # Convert the result DataFrame to an Excel file for download
 
17
  bytes: The in-memory Excel file data.
18
  """
19
  output = BytesIO()
 
20
  with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
21
  df.to_excel(writer, index=False)
22
  processed_data = output.getvalue()
 
58
  # Streamlit UI section
59
  st.title("Fetch Employer") # Application title
60
 
61
+ # Sidebar file uploader widgets to allow users to upload an Excel/CSV file and a text file
62
+ with st.sidebar:
63
+ st.header("Upload Files")
64
+ uploaded_excel = st.file_uploader("Upload the Master file (.xls or .csv)", type=["csv", "xls", "xlsx"])
65
+ uploaded_text = st.file_uploader("Upload your Text file (.txt)", type=["txt"])
66
 
67
  # Check if both files are uploaded
68
  if uploaded_excel and uploaded_text:
 
69
  master_data, df = process_files(uploaded_excel, uploaded_text) # Process the files
70
 
71
+ res, fbdm, fben, fbbm25, nf = get_res_df(master_data, df) # Generate the result DataFrame using the helper function
72
+
73
+ # Create a layout with four columns to display the metrics
74
+ col1, col2, col3, col4 = st.columns(4)
75
+ with col1:
76
+ st.metric("Direct Match", fbdm)
77
+ with col2:
78
+ st.metric("Employer Number", fben)
79
+ with col3:
80
+ st.metric("BM25 Match", fbbm25)
81
+ with col4:
82
+ st.metric("Rejected For Threshold", nf)
83
+
84
  st.write("Final Output") # Display the result of file processing
 
85
  st.dataframe(res) # Show the result in a table format on the web app
86
 
87
  # Convert the result DataFrame to an Excel file for download
helper.py CHANGED
@@ -205,32 +205,58 @@ def get_res_df(master_data, df):
205
  empnos = [fetch_empno(text) for text in queries]
206
  new_queries = [preprocess_query(query) for query in queries]
207
 
208
- res_names, scores = [], []
209
-
 
 
 
 
 
 
 
 
 
 
 
 
210
  # Match each query to an employer
211
- for query, empno_arr in zip(new_queries, empnos):
212
  name = ""
213
- if len(empno_arr) != 0:
214
- # Try to find an employer using the employee number
 
 
 
 
 
 
 
 
215
  for empno in empno_arr:
216
- names = list(master_data[master_data['Employer Number'] == empno]['Employer Name'])
217
- if len(names) != 0:
218
- name = names[0]
219
- scores.append(100) # Perfect match with employee number
 
 
220
  res_names.append(name)
221
  break
222
- if name == "":
223
- # Fall back to BM25 matching if employee number fails
224
  tokenized_query = query.split(" ")
225
  name = bm25.get_top_n(tokenized_query, corpus, n=1)
226
  doc_score = max(bm25.get_scores(tokenized_query))
227
  scores.append(doc_score)
228
- res_names.append(name[0] if doc_score > threshold else "NOT FOUND")
229
-
230
- # Count the number of unmatched results
231
- not_found = sum(score < threshold for score in scores)
 
 
 
 
232
 
233
  # Generate the final result DataFrame
234
  res_df = generate_df(master_data=master_data, df=df, employer_names=res_names)
235
 
236
- return res_df
 
205
  empnos = [fetch_empno(text) for text in queries]
206
  new_queries = [preprocess_query(query) for query in queries]
207
 
208
+ exact_matches = []
209
+ for query in queries:
210
+ match_found = False
211
+ for j, comp in enumerate(corpus):
212
+ if comp.lower().strip() in query.lower().strip():
213
+ exact_matches.append(corpus[j])
214
+ match_found = True
215
+ break
216
+ if not match_found:
217
+ exact_matches.append('')
218
+
219
+ res_names, found_by, scores = [], [], []
220
+ found_by_direct_search, found_by_emp_no, found_by_bm5, not_found = 0, 0, 0, 0
221
+
222
  # Match each query to an employer
223
+ for query,empno_arr,exact_match in zip(new_queries,empnos,exact_matches):
224
  name = ""
225
+ # Find Employer by Direct Search
226
+ if exact_match!='':
227
+ name = exact_match
228
+ scores.append(100)
229
+ found_by_direct_search+=1
230
+ found_by.append("Direct Search")
231
+ res_names.append(name)
232
+
233
+ # Try to find an employer using the employer number if Direct Search fails
234
+ elif len(empno_arr) != 0:
235
  for empno in empno_arr:
236
+ names = list(master_data[master_data['Employer Number']==empno]['Employer Name'])
237
+ if len(names)!=0:
238
+ name=names[0]
239
+ scores.append(100) # Perfect match with employee number
240
+ found_by_emp_no+=1
241
+ found_by.append("Employer Number")
242
  res_names.append(name)
243
  break
244
+ # Fall back to BM25 matching if the employer number lookup fails
245
+ if name=="":
246
  tokenized_query = query.split(" ")
247
  name = bm25.get_top_n(tokenized_query, corpus, n=1)
248
  doc_score = max(bm25.get_scores(tokenized_query))
249
  scores.append(doc_score)
250
+ if doc_score>threshold:
251
+ found_by_bm5 += 1
252
+ res_names.append(name[0])
253
+ found_by.append("BM25")
254
+ else:
255
+ not_found+=1
256
+ res_names.append("NOT FOUND")
257
+ found_by.append("NOT FOUND")
258
 
259
  # Generate the final result DataFrame
260
  res_df = generate_df(master_data=master_data, df=df, employer_names=res_names)
261
 
262
+ return res_df, found_by_direct_search, found_by_emp_no, found_by_bm5, not_found