Commit · 2d3bc6e
Parent(s): 54b1517
First Commit

- app.py +125 -0
- helper.py +399 -0
- requirements.txt +6 -0
app.py
ADDED
@@ -0,0 +1,125 @@
import streamlit as st
import pandas as pd
from io import BytesIO
from helper import get_res_df

def to_excel(df):
    """
    Convert a pandas DataFrame to an Excel file in memory.

    Parameters:
        df (DataFrame): The DataFrame to be converted to Excel format.

    Returns:
        bytes: The in-memory Excel file data.
    """
    output = BytesIO()
    # Use the pandas ExcelWriter to write the DataFrame to an in-memory file
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
        df.to_excel(writer, index=False)
    processed_data = output.getvalue()
    return processed_data

def process_files(excel_file, text_file):
    """
    Process the uploaded Excel/CSV and text files and return cleaned DataFrames.

    Parameters:
        excel_file (UploadedFile): The uploaded Excel or CSV file.
        text_file (UploadedFile): The uploaded text file.

    Returns:
        Tuple[DataFrame, DataFrame]: A tuple containing the cleaned DataFrame from the Excel/CSV file
        and a DataFrame created from the text file data.
    """
    print(excel_file, text_file)  # Debugging information

    # Read the Excel/CSV file into a DataFrame
    if excel_file.name.endswith('.csv'):
        df_excel = pd.read_csv(excel_file)
    else:
        df_excel = pd.read_excel(excel_file)

    # Ensure the 'cfcf' column values are formatted as zero-padded 6-digit strings
    df_excel['cfcf'] = [str(number).zfill(6) for number in df_excel['cfcf']]

    # Read and process the text file content into a list of lines
    lines = text_file.read().decode('utf-8').splitlines()
    data = [line.strip().split(',') for line in lines]  # Split each line by commas

    # Create a DataFrame from the parsed text file data
    df = pd.DataFrame(data)

    return df_excel, df


# Streamlit UI section
st.title("Fetch Employer")  # Application title

# File uploader widgets to allow users to upload an Excel/CSV file and a text file
uploaded_excel = st.file_uploader("Upload the Master file(.xls or .csv)", type=["csv", "xls", "xlsx"])
uploaded_text = st.file_uploader("Upload your Text file(.txt)", type=["txt"])

# Check that both files have been uploaded
if uploaded_excel and uploaded_text:
    st.write("Processing the files...")  # Inform the user that the files are being processed
    master_data, df = process_files(uploaded_excel, uploaded_text)  # Process the files

    st.write("Final Output")  # Display the result of file processing
    res = get_res_df(master_data, df)  # Generate the result DataFrame using the helper function
    st.dataframe(res)  # Show the result as a table on the web app

    # Convert the result DataFrame to an Excel file for download
    excel_data = to_excel(res)

    # Provide a button for the user to download the result as an Excel file
    st.download_button(label="Download Excel",
                       data=excel_data,
                       file_name='Fetch_Employer_Output.xlsx',
                       mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')
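As a quick sanity check of the in-memory Excel round trip that to_excel relies on, here is a minimal standalone sketch (not part of the commit; the sample DataFrame and column values are invented, and reading the bytes back assumes an xlsx reader such as openpyxl is installed):

import pandas as pd
from io import BytesIO

def to_excel(df):
    # Same pattern as app.py: write the DataFrame to an in-memory .xlsx file
    output = BytesIO()
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
        df.to_excel(writer, index=False)
    return output.getvalue()

# Hypothetical sample; 'cfcf' codes are zero-padded to 6 digits as in process_files
sample = pd.DataFrame({'cfcf': ['000713', '068021'],
                       'Employer Name': ['Acme Limited', 'Beta Corporation']})
excel_bytes = to_excel(sample)

# Reading the bytes back verifies the round trip (keeps 'cfcf' as strings)
restored = pd.read_excel(BytesIO(excel_bytes), dtype={'cfcf': str})
print(restored)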
helper.py
ADDED
@@ -0,0 +1,399 @@
import pandas as pd
import numpy as np
from rank_bm25 import BM25Okapi
import re
from nltk.stem import WordNetLemmatizer, PorterStemmer
from datetime import datetime

lemmatizer = WordNetLemmatizer()

threshold = 11.6  # Minimum BM25 score required to accept an employer match

def clean_text(text):
    """
    Cleans and normalizes the input text by performing the following operations:
    - Lowercases the text
    - Removes special characters and long digit runs
    - Replaces abbreviations with full words (e.g., 'pvt' -> 'private', 'ltd' -> 'limited')
    - Lemmatizes the words for normalization

    Parameters:
        text (str): The input text string to be cleaned.

    Returns:
        str: The cleaned and lemmatized text.
    """
    cleaned_text = text.lower()
    cleaned_text = re.sub(r'[^A-Za-z0-9\s./]', ' ', cleaned_text)  # Replace special characters with spaces
    cleaned_text = re.sub(r'\.', '', cleaned_text)  # Remove periods
    cleaned_text = re.sub(r'\/', '', cleaned_text)  # Remove slashes
    cleaned_text = re.sub(r'\d{3,}', '', cleaned_text)  # Remove numbers with three or more digits
    cleaned_text = re.sub('pvt', 'private', cleaned_text)  # Replace 'pvt' with 'private'
    cleaned_text = re.sub('ltd', 'limited', cleaned_text)  # Replace 'ltd' with 'limited'
    cleaned_text = re.sub(r'(?<!\w)dev(?!\w)', 'development', cleaned_text)  # Replace standalone 'dev' with 'development'
    cleaned_text = re.sub(r'(?<!\w)co(?!\w)', 'corporation', cleaned_text)  # Replace standalone 'co' with 'corporation'
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)  # Collapse repeated whitespace
    cleaned_text = ' '.join([lemmatizer.lemmatize(word) for word in cleaned_text.split()])  # Lemmatize the words
    return cleaned_text.strip()

def fetch_empno(text):
    """
    Extracts 6-digit employee numbers from the input text using a regular expression.

    Parameters:
        text (str): The input text from which to extract employee numbers.

    Returns:
        list: A list of extracted 6-digit employee numbers (as strings).
    """
    return re.findall(r'\b\d{6}\b', text)

def preprocess_query(query):
    """
    Preprocesses the input query by cleaning and extracting the meaningful part of the text.
    - Keeps only the text after '||' or from the first '-' when one of those separators is present
    - Cleans the result using the `clean_text` function

    Parameters:
        query (str): The raw query text to preprocess.

    Returns:
        str: The cleaned and processed query text.
    """
    new_query = query
    # Extract the part of the query after '||' or from the first '-'
    if '||' in query:
        ind = query.find('||')
        new_query = query[ind + 2:]
    elif '-' in query:
        ind = query.find('-')
        new_query = query[ind:]
    if len(new_query) < 20:
        new_query = query  # Restore the original query if the extracted part is too short
    new_query = clean_text(new_query)
    return new_query

def parse_date(date_str):
    """
    Parses a date string and converts it to the format 'DD/MM/YYYY'.
    Handles multiple input date formats.

    Parameters:
        date_str (str): The input date string.

    Returns:
        str: The date formatted as 'DD/MM/YYYY', or the original string if it cannot be parsed.
    """
    try:
        return datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S').strftime('%d/%m/%Y')
    except ValueError:
        try:
            return datetime.strptime(date_str, '%m/%d/%Y').strftime('%d/%m/%Y')
        except ValueError:
            return date_str  # Return the original string if parsing fails

def generate_df(master_data, df, employer_names):
    """
    Generates a DataFrame by combining employer information from the master data
    with transaction data from the input DataFrame.

    Parameters:
        master_data (DataFrame): The master data containing employer information.
        df (DataFrame): The input data with transaction details.
        employer_names (list): List of employer names matched against the master data.

    Returns:
        DataFrame: A DataFrame combining transaction details with the corresponding employer information.
    """
    dates = [datetime.strptime(date_str, '%d%m%y').strftime('%d/%m/%Y') for date_str in df[4]]
    bank_desc = list(df[9])
    accounts = ['NASA' if i == '713' else 'EDAS' if i == '068' else None for i in df[0]]
    credits = list(df[7])

    # Initialize lists for employer-related fields
    employer_codes, bank_statemnt_ref, account_mgr = [], [], []
    emp_province, region, industry, contributing_stts = [], [], [], []
    date_joined, termination_date, email_addr = [], [], []

    # Iterate through each employer name and retrieve details from the master data
    for name in employer_names:
        if name == "NOT FOUND":
            employer_codes.append(np.nan)
            bank_statemnt_ref.append(np.nan)
            account_mgr.append(np.nan)
            emp_province.append(np.nan)
            region.append(np.nan)
            industry.append(np.nan)
            contributing_stts.append(np.nan)
            date_joined.append(np.nan)
            termination_date.append(np.nan)
            email_addr.append(np.nan)
        else:
            tmp = master_data[master_data['Employer Name'] == name]
            if tmp.empty:
                employer_codes.append(np.nan)
                bank_statemnt_ref.append(np.nan)
                account_mgr.append(np.nan)
                emp_province.append(np.nan)
                region.append(np.nan)
                industry.append(np.nan)
                contributing_stts.append(np.nan)
                date_joined.append(np.nan)
                termination_date.append(np.nan)
                email_addr.append(np.nan)
            else:
                employer_codes.append(list(tmp['cfcf'])[-1])
                bank_statemnt_ref.append(list(tmp['Bank Statement Reference'])[-1])
                account_mgr.append(list(tmp['NASFUNDContact'])[-1])
                emp_province.append(list(tmp['Employer Province'])[-1])
                region.append(list(tmp['Region'])[-1])
                industry.append(list(tmp['Industry'])[-1])
                contributing_stts.append(list(tmp['Contributing Status'])[-1])
                date = str(list(tmp['Date Joined Plan'])[-1])
                date_joined.append(parse_date(date))
                termination_date.append(list(tmp['Termination Date'])[-1])
                email_addr.append(list(tmp['Email Addresses'])[-1])

    # Construct the final DataFrame
    res_df = pd.DataFrame({
        'Receipt Date': dates,
        'Bank Description': bank_desc,
        'Account': accounts,
        ' Credit ': credits,
        'Employer Code': employer_codes,
        'Employer Name': employer_names,
        'Bank Statement Reference': bank_statemnt_ref,
        'Account Manager': account_mgr,
        'Employer Province': emp_province,
        'Region': region,
        'Industry': industry,
        'Contributing Status': contributing_stts,
        'Date Joined Plan': date_joined,
        'Termination Date': termination_date,
        'Email Addresses': email_addr,
        'First Name': np.nan,
        'Surname': np.nan,
        'Membership#': np.nan
    })

    return res_df

def get_res_df(master_data, df):
    """
    Retrieves the result DataFrame by matching employer names using the BM25 algorithm
    and extracted employee numbers.

    Parameters:
        master_data (DataFrame): The master data containing employer information.
        df (DataFrame): The input data with transaction details.

    Returns:
        DataFrame: A DataFrame containing matched employer data and transaction details.
    """
    # Preprocess master data
    corpus = list(master_data['Employer Name'])
    lower_case_corpus = [clean_text(name) for name in corpus]
    corpus = corpus[1:]  # Exclude the first row if it's a header
    lower_case_corpus = lower_case_corpus[1:]
    tokenized_corpus = [doc.split(' ') for doc in lower_case_corpus]

    bm25 = BM25Okapi(tokenized_corpus)  # BM25 model for employer name matching

    # Preprocess queries from transaction data
    queries = list(df[9])
    queries = [query[:query.rindex('-')] for query in queries]  # Keep the part of each query before the last '-'
    empnos = [fetch_empno(text) for text in queries]
    new_queries = [preprocess_query(query) for query in queries]

    res_names, scores = [], []

    # Match each query to an employer
    for query, empno_arr in zip(new_queries, empnos):
        name = ""
        if len(empno_arr) != 0:
            # Try to find an employer using the employee number
            for empno in empno_arr:
                names = list(master_data[master_data['cfcf'] == empno]['Employer Name'])
                if len(names) != 0:
                    name = names[0]
                    scores.append(100)  # Treat an employee-number hit as a perfect match
                    res_names.append(name)
                    break
        if name == "":
            # Fall back to BM25 matching if the employee-number lookup fails
            tokenized_query = query.split(" ")
            name = bm25.get_top_n(tokenized_query, corpus, n=1)
            doc_score = max(bm25.get_scores(tokenized_query))
            scores.append(doc_score)
            res_names.append(name[0] if doc_score > threshold else "NOT FOUND")

    # Count the number of unmatched results (currently informational only)
    not_found = sum(score < threshold for score in scores)

    # Generate the final result DataFrame
    res_df = generate_df(master_data=master_data, df=df, employer_names=res_names)

    return res_df
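For intuition about the BM25 fallback in get_res_df, a minimal standalone sketch with an invented three-name corpus (the threshold of 1.0 below is illustrative only; helper.py uses 11.6 against its real master list):

from rank_bm25 import BM25Okapi

# Toy stand-in for master_data['Employer Name'] after clean_text()
corpus = ["pacific coffee limited",
          "highlands development corporation",
          "island logistics private limited"]
bm25 = BM25Okapi([doc.split(" ") for doc in corpus])

# A cleaned bank-description query ('ltd' already expanded to 'limited')
tokenized_query = "pacific coffee limited payment".split(" ")

best_name = bm25.get_top_n(tokenized_query, corpus, n=1)[0]
best_score = max(bm25.get_scores(tokenized_query))

threshold = 1.0  # illustrative value for this toy corpus
print(best_name if best_score > threshold else "NOT FOUND", best_score)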
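The employee-number path can be exercised on its own as well; a small illustration with an invented bank-description string (the 6-digit code found here is what gets compared against the zero-padded 'cfcf' column):

import re

def fetch_empno(text):
    # Same regex as helper.py: standalone 6-digit runs
    return re.findall(r'\b\d{6}\b', text)

desc = "CONTRIB 000713 || ACME LIMITED - JAN"   # invented example
print(fetch_empno(desc))  # ['000713'] -> looked up in master_data['cfcf']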
requirements.txt
ADDED
@@ -0,0 +1,6 @@
xlsxwriter==3.2.0
rank-bm25==0.2.2
numpy
pandas
streamlit==1.32.0
nltk==3.8.1
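One environment note (an assumption about deployment, not part of the commit): WordNetLemmatizer needs the NLTK wordnet corpus at runtime, and pandas needs an Excel engine such as openpyxl or xlrd to read uploaded .xls/.xlsx files; neither is pinned above. A guarded one-time download along these lines would cover the lemmatizer:

import nltk

# Fetch the wordnet corpus once if it is not already available (assumed setup step)
try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')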