Commit fd2307e · added string search
Parent(s): 7eab253
Files changed:
- __pycache__/helper.cpython-312.pyc  +0 -0 (added; binary file, 12.6 kB)
- app.py  +18 -6
- helper.py  +42 -16
app.py CHANGED

@@ -17,7 +17,6 @@ def to_excel(df):
         bytes: The in-memory Excel file data.
     """
     output = BytesIO()
-    # Use the Pandas ExcelWriter to write the DataFrame to an in-memory file
     with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
         df.to_excel(writer, index=False)
     processed_data = output.getvalue()

@@ -59,17 +58,30 @@ def process_files(excel_file, text_file):
 # Streamlit UI section
 st.title("Fetch Employer") # Application title
 
-#
-
-
+# Sidebar file uploader widgets to allow users to upload an Excel/CSV file and a text file
+with st.sidebar:
+    st.header("Upload Files")
+    uploaded_excel = st.file_uploader("Upload the Master file (.xls or .csv)", type=["csv", "xls", "xlsx"])
+    uploaded_text = st.file_uploader("Upload your Text file (.txt)", type=["txt"])
 
 # Check if both files are uploaded
 if uploaded_excel and uploaded_text:
-    st.write("Processing the files...") # Inform the user that the files are being processed
     master_data, df = process_files(uploaded_excel, uploaded_text) # Process the files
 
+    res, fbdm, fben, fbbm25, nf = get_res_df(master_data, df) # Generate the result DataFrame using the helper function
+
+    # Create a layout with four columns to display the metrics
+    col1, col2, col3, col4 = st.columns(4)
+    with col1:
+        st.metric("Direct Match", fbdm)
+    with col2:
+        st.metric("Employer Number", fben)
+    with col3:
+        st.metric("BM25 Match", fbbm25)
+    with col4:
+        st.metric("Rejected For Threshold", nf)
+
     st.write("Final Output") # Display the result of file processing
-    res = get_res_df(master_data, df) # Generate the result DataFrame using the helper function
     st.dataframe(res) # Show the result in a table format on the web app
 
     # Convert the result DataFrame to an Excel file for download
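For reference, the snippet below is a minimal, self-contained sketch of the Streamlit pattern the updated app.py relies on: sidebar file uploaders, a four-column metrics row built with st.columns and st.metric, and an in-memory Excel file produced with pd.ExcelWriter. Only the to_excel body and the widget labels mirror the diff; the dummy result table, the hard-coded metric values, and the st.download_button call are illustrative assumptions standing in for process_files, get_res_df, and the download section that is not shown here.

# Minimal sketch of the UI flow added in this commit; helpers are stubbed so it runs standalone.
from io import BytesIO

import pandas as pd
import streamlit as st


def to_excel(df):
    """Write a DataFrame to an in-memory Excel file and return its bytes."""
    output = BytesIO()
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
        df.to_excel(writer, index=False)
    return output.getvalue()


st.title("Fetch Employer")

# Sidebar uploaders, as in the new app.py
with st.sidebar:
    st.header("Upload Files")
    uploaded_excel = st.file_uploader("Upload the Master file (.xls or .csv)", type=["csv", "xls", "xlsx"])
    uploaded_text = st.file_uploader("Upload your Text file (.txt)", type=["txt"])

if uploaded_excel and uploaded_text:
    # Stand-ins for process_files/get_res_df: a dummy result table and match counters.
    res = pd.DataFrame({"Query": ["acme corp invoice"], "Employer Name": ["ACME Corporation"]})
    fbdm, fben, fbbm25, nf = 1, 0, 0, 0

    # Four-column metrics row, as in the new app.py
    col1, col2, col3, col4 = st.columns(4)
    col1.metric("Direct Match", fbdm)
    col2.metric("Employer Number", fben)
    col3.metric("BM25 Match", fbbm25)
    col4.metric("Rejected For Threshold", nf)

    st.write("Final Output")
    st.dataframe(res)

    # Assumed download step for the "Convert the result DataFrame to an Excel file" section
    st.download_button("Download as Excel", data=to_excel(res), file_name="result.xlsx")

Running this with streamlit run produces the same layout the commit introduces; the xlsxwriter package must be installed for the ExcelWriter engine.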
helper.py CHANGED

@@ -205,32 +205,58 @@ def get_res_df(master_data, df):
     empnos = [fetch_empno(text) for text in queries]
     new_queries = [preprocess_query(query) for query in queries]
 
-
-
+    exact_matches = []
+    for query in queries:
+        match_found = False
+        for j, comp in enumerate(corpus):
+            if comp.lower().strip() in query.lower().strip():
+                exact_matches.append(corpus[j])
+                match_found = True
+                break
+        if not match_found:
+            exact_matches.append('')
+
+    res_names, found_by, scores = [], [], []
+    found_by_direct_search, found_by_emp_no, found_by_bm5, not_found = 0, 0, 0, 0
+
     # Match each query to an employer
-    for query,
+    for query,empno_arr,exact_match in zip(new_queries,empnos,exact_matches):
         name = ""
-
-
+        # Find Employer by Direct Search
+        if exact_match!='':
+            name = exact_match
+            scores.append(100)
+            found_by_direct_search+=1
+            found_by.append("Direct Search")
+            res_names.append(name)
+
+        # Try to find an employer using the employee number if Direct Search Fails
+        elif len(empno_arr) != 0:
             for empno in empno_arr:
-                names = list(master_data[master_data['Employer Number']
-                if len(names)
-                    name
-                    scores.append(100)
+                names = list(master_data[master_data['Employer Number']==empno]['Employer Name'])
+                if len(names)!=0:
+                    name=names[0]
+                    scores.append(100) # Perfect match with employee number
+                    found_by_emp_no+=1
+                    found_by.append("Employer Number")
                     res_names.append(name)
                     break
-        if
-
+        # Fall back to BM25 matching if employee number fails
+        if name=="":
             tokenized_query = query.split(" ")
             name = bm25.get_top_n(tokenized_query, corpus, n=1)
            doc_score = max(bm25.get_scores(tokenized_query))
             scores.append(doc_score)
-
-
-
-
+            if doc_score>threshold:
+                found_by_bm5 += 1
+                res_names.append(name[0])
+                found_by.append("BM25")
+            else:
+                not_found+=1
+                res_names.append("NOT FOUND")
+                found_by.append("NOT FOUND")
 
     # Generate the final result DataFrame
     res_df = generate_df(master_data=master_data, df=df, employer_names=res_names)
 
-    return res_df
+    return res_df, found_by_direct_search, found_by_emp_no, found_by_bm5, not_found
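To make the new matching cascade in get_res_df easier to follow, here is a standalone sketch of the same three stages: a direct substring search against the employer corpus, an exact lookup on the employer number, and a BM25 fallback that rejects candidates below a score threshold. The tiny master_data frame, the threshold value, the lower-cased tokenisation, and the match_query helper are illustrative assumptions; only the rank_bm25 calls (BM25Okapi, get_scores, get_top_n) and the 'Employer Number' / 'Employer Name' columns come from the diff itself.

# Illustrative sketch of the three-stage matching introduced in get_res_df.
import pandas as pd
from rank_bm25 import BM25Okapi

# Assumed sample master sheet; the real one is uploaded by the user.
master_data = pd.DataFrame({
    "Employer Number": ["E001", "E002"],
    "Employer Name": ["ACME Corporation", "Globex Ltd"],
})
corpus = list(master_data["Employer Name"])
bm25 = BM25Okapi([name.lower().split(" ") for name in corpus])
threshold = 5  # assumed cut-off; BM25 hits scoring below this are rejected


def match_query(query, empnos):
    # 1) Direct search: does any employer name appear verbatim inside the query text?
    for comp in corpus:
        if comp.lower().strip() in query.lower().strip():
            return comp, "Direct Search"

    # 2) Employer-number lookup against the master sheet.
    for empno in empnos:
        names = list(master_data[master_data["Employer Number"] == empno]["Employer Name"])
        if names:
            return names[0], "Employer Number"

    # 3) BM25 fallback: take the best-scoring name, reject it below the threshold.
    tokenized_query = query.lower().split(" ")
    best = bm25.get_top_n(tokenized_query, corpus, n=1)[0]
    score = max(bm25.get_scores(tokenized_query))
    if score > threshold:
        return best, "BM25"
    return "NOT FOUND", "NOT FOUND"


print(match_query("payment received from acme corporation", []))
print(match_query("salary credit ref E002", ["E002"]))

The per-stage counters the real get_res_df now returns (found_by_direct_search, found_by_emp_no, found_by_bm5, not_found) are what the updated app.py surfaces through the four st.metric widgets.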