Spaces:

chandanzeon
/

Fetch_Employer_Name

Sleeping

App Files Community

chandanzeon committed on Oct 15, 2024

Commit

fd2307e

1 Parent(s): 7eab253

added string search

Browse files

Files changed (3) hide show

__pycache__/helper.cpython-312.pyc +0 -0
app.py +18 -6
helper.py +42 -16

__pycache__/helper.cpython-312.pyc ADDED Viewed

Binary file (12.6 kB). View file

app.py CHANGED Viewed

@@ -17,7 +17,6 @@ def to_excel(df):
     bytes: The in-memory Excel file data.
     """
     output = BytesIO()
-    # Use the Pandas ExcelWriter to write the DataFrame to an in-memory file
     with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
         df.to_excel(writer, index=False)
     processed_data = output.getvalue()
@@ -59,17 +58,30 @@ def process_files(excel_file, text_file):
 # Streamlit UI section
 st.title("Fetch Employer")  # Application title
-# File uploader widgets to allow users to upload an Excel/CSV file and a text file
-uploaded_excel = st.file_uploader("Upload the Master file(.xls or .csv)", type=["csv", "xls", "xlsx"])
-uploaded_text = st.file_uploader("Upload your Text file(.txt)", type=["txt"])
 # Check if both files are uploaded
 if uploaded_excel and uploaded_text:
-    st.write("Processing the files...")  # Inform the user that the files are being processed
     master_data, df = process_files(uploaded_excel, uploaded_text)  # Process the files
     st.write("Final Output")  # Display the result of file processing
-    res = get_res_df(master_data, df)  # Generate the result DataFrame using the helper function
     st.dataframe(res)  # Show the result in a table format on the web app
     # Convert the result DataFrame to an Excel file for download

     bytes: The in-memory Excel file data.
     """
     output = BytesIO()
     with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
         df.to_excel(writer, index=False)
     processed_data = output.getvalue()
 # Streamlit UI section
 st.title("Fetch Employer")  # Application title
+# Sidebar file uploader widgets to allow users to upload an Excel/CSV file and a text file
+with st.sidebar:
+    st.header("Upload Files")
+    uploaded_excel = st.file_uploader("Upload the Master file (.xls or .csv)", type=["csv", "xls", "xlsx"])
+    uploaded_text = st.file_uploader("Upload your Text file (.txt)", type=["txt"])
 # Check if both files are uploaded
 if uploaded_excel and uploaded_text:
     master_data, df = process_files(uploaded_excel, uploaded_text)  # Process the files
+    res, fbdm, fben, fbbm25, nf = get_res_df(master_data, df)  # Generate the result DataFrame using the helper function
+    # Create a layout with four columns to display the metrics
+    col1, col2, col3, col4 = st.columns(4)
+    with col1:
+        st.metric("Direct Match", fbdm)
+    with col2:
+        st.metric("Employer Number", fben)
+    with col3:
+        st.metric("BM25 Match", fbbm25)
+    with col4:
+        st.metric("Rejected For Threshold", nf)
     st.write("Final Output")  # Display the result of file processing
     st.dataframe(res)  # Show the result in a table format on the web app
     # Convert the result DataFrame to an Excel file for download

helper.py CHANGED Viewed

@@ -205,32 +205,58 @@ def get_res_df(master_data, df):
     empnos = [fetch_empno(text) for text in queries]
     new_queries = [preprocess_query(query) for query in queries]
-    res_names, scores = [], []
     # Match each query to an employer
-    for query, empno_arr in zip(new_queries, empnos):
         name = ""
-        if len(empno_arr) != 0:
-            # Try to find an employer using the employee number
             for empno in empno_arr:
-                names = list(master_data[master_data['Employer Number'] == empno]['Employer Name'])
-                if len(names) != 0:
-                    name = names[0]
-                    scores.append(100)  # Perfect match with employee number
                     res_names.append(name)
                     break
-        if name == "":
-            # Fall back to BM25 matching if employee number fails
             tokenized_query = query.split(" ")
             name = bm25.get_top_n(tokenized_query, corpus, n=1)
             doc_score = max(bm25.get_scores(tokenized_query))
             scores.append(doc_score)
-            res_names.append(name[0] if doc_score > threshold else "NOT FOUND")
-    # Count the number of unmatched results
-    not_found = sum(score < threshold for score in scores)
     # Generate the final result DataFrame
     res_df = generate_df(master_data=master_data, df=df, employer_names=res_names)
-    return res_df

     empnos = [fetch_empno(text) for text in queries]
     new_queries = [preprocess_query(query) for query in queries]
+    exact_matches = []
+    for query in queries:
+        match_found = False
+        for j, comp in enumerate(corpus):
+            if comp.lower().strip() in query.lower().strip():
+                exact_matches.append(corpus[j])
+                match_found = True
+                break
+        if not match_found:
+            exact_matches.append('')
+    res_names, found_by, scores = [], [], []
+    found_by_direct_search, found_by_emp_no, found_by_bm5, not_found = 0, 0, 0, 0
     # Match each query to an employer
+    for query,empno_arr,exact_match in zip(new_queries,empnos,exact_matches):
         name = ""
+        # Find Employer by Direct Search
+        if exact_match!='':
+            name = exact_match
+            scores.append(100)
+            found_by_direct_search+=1
+            found_by.append("Direct Search")
+            res_names.append(name)
+        # Try to find an employer using the employer number if direct search fails
+        elif len(empno_arr) != 0:
             for empno in empno_arr:
+                names = list(master_data[master_data['Employer Number']==empno]['Employer Name'])
+                if len(names)!=0:
+                    name=names[0]
+                    scores.append(100) # Perfect match with employee number
+                    found_by_emp_no+=1
+                    found_by.append("Employer Number")
                     res_names.append(name)
                     break
+        # Fall back to BM25 matching if neither direct search nor the employer number lookup found a name
+        if name=="":
             tokenized_query = query.split(" ")
             name = bm25.get_top_n(tokenized_query, corpus, n=1)
             doc_score = max(bm25.get_scores(tokenized_query))
             scores.append(doc_score)
+            if doc_score>threshold:
+                found_by_bm5 += 1
+                res_names.append(name[0])
+                found_by.append("BM25")
+            else:
+                not_found+=1
+                res_names.append("NOT FOUND")
+                found_by.append("NOT FOUND")
     # Generate the final result DataFrame
     res_df = generate_df(master_data=master_data, df=df, employer_names=res_names)
+    return res_df, found_by_direct_search, found_by_emp_no, found_by_bm5, not_found