# Fetch_Employer_Name / helper.py
# First Commit by chandanzeon (2d3bc6e)
import pandas as pd
import numpy as np
from rank_bm25 import BM25Okapi
import re
from nltk.stem import WordNetLemmatizer, PorterStemmer
from datetime import datetime
lemmatizer = WordNetLemmatizer()
threshold = 11.6 # Threshold score for employer match
def clean_text(text):
    """
    Clean and normalize the input text for employer-name matching.

    Steps:
    - Lowercase the text
    - Remove special characters, periods, and slashes
    - Drop digit runs of 3 or more characters (account numbers, etc.)
    - Expand abbreviations as whole words only ('pvt' -> 'private',
      'ltd' -> 'limited', 'dev' -> 'development', 'co' -> 'corporation')
    - Collapse whitespace and lemmatize each word

    Parameters:
    text (str): The input text string to be cleaned.

    Returns:
    str: The cleaned and lemmatized text.
    """
    cleaned_text = text.lower()
    cleaned_text = re.sub(r'[^A-Za-z0-9\s./]', ' ', cleaned_text)  # Remove special characters
    cleaned_text = re.sub(r'\.', '', cleaned_text)  # Remove periods
    cleaned_text = re.sub(r'\/', '', cleaned_text)  # Remove slashes
    cleaned_text = re.sub(r'\d{3,}', '', cleaned_text)  # Remove numbers with 3+ digits
    # Word-boundary guards so the abbreviation expansions cannot fire on
    # substrings inside other words (the unguarded 'pvt'/'ltd' substitutions
    # previously corrupted any word that happened to contain them).
    cleaned_text = re.sub(r'(?<!\w)pvt(?!\w)', 'private', cleaned_text)
    cleaned_text = re.sub(r'(?<!\w)ltd(?!\w)', 'limited', cleaned_text)
    cleaned_text = re.sub(r'(?<!\w)dev(?!\w)', 'development', cleaned_text)
    cleaned_text = re.sub(r'(?<!\w)co(?!\w)', 'corporation', cleaned_text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)  # Collapse runs of whitespace
    cleaned_text = ' '.join(lemmatizer.lemmatize(word) for word in cleaned_text.split())  # Lemmatize
    return cleaned_text.strip()
def fetch_empno(text):
    """
    Extract every standalone 6-digit employee number from *text*.

    Parameters:
    text (str): The input text from which to extract employee numbers.

    Returns:
    list: All 6-digit employee numbers found, in order of appearance.
    """
    empno_pattern = re.compile(r'\b\d{6}\b')
    return empno_pattern.findall(text)
def preprocess_query(query):
    """
    Preprocess a raw query by extracting its meaningful part and cleaning it.

    If the query contains '||', the text after it is kept; otherwise if it
    contains '-', the text from the dash onward is kept.  When the extracted
    piece is shorter than 20 characters the full query is used instead.  The
    selected text is then normalized with `clean_text`.

    Parameters:
    query (str): The raw query text to preprocess.

    Returns:
    str: The cleaned and processed query text.
    """
    candidate = query
    if '||' in query:
        candidate = query[query.find('||') + 2:]
    elif '-' in query:
        candidate = query[query.find('-'):]
    if len(candidate) < 20:
        candidate = query  # Extracted part too short; fall back to the full query
    return clean_text(candidate)
def parse_date(date_str):
    """
    Convert a date value to the 'DD/MM/YYYY' string format.

    Tries, in order:
    1. 'YYYY-MM-DD HH:MM:SS' strings (pandas timestamp repr),
    2. 'MM/DD/YYYY' strings,
    3. datetime-like objects (formatted directly),
    and finally returns the value unchanged when nothing applies.

    Parameters:
    date_str (str or datetime): The input date value.

    Returns:
    str: The date formatted as 'DD/MM/YYYY', or the original value when it
    cannot be parsed.
    """
    for fmt in ('%Y-%m-%d %H:%M:%S', '%m/%d/%Y'):
        try:
            return datetime.strptime(date_str, fmt).strftime('%d/%m/%Y')
        except (ValueError, TypeError):
            # TypeError covers non-string inputs (e.g. a datetime object),
            # which strptime rejects.
            pass
    # The original fallback called date_str.strftime() unconditionally, which
    # raises AttributeError for plain strings (callers pass str(...)), and
    # used '%m/%d/%Y', contradicting the documented DD/MM/YYYY contract.
    if hasattr(date_str, 'strftime'):
        return date_str.strftime('%d/%m/%Y')
    return date_str  # Unparseable string: return it unchanged
def generate_df(master_data, df, employer_names):
    """
    Build the result DataFrame combining transaction rows with employer
    details looked up in the master data.

    Parameters:
    master_data (DataFrame): Master data with one row per employer; must
        contain 'Employer Name', 'cfcf', 'Bank Statement Reference',
        'NASFUNDContact', 'Employer Province', 'Region', 'Industry',
        'Contributing Status', 'Date Joined Plan', 'Termination Date',
        and 'Email Addresses'.
    df (DataFrame): Raw transaction data addressed by positional column
        (0 = account code, 4 = DDMMYY date, 7 = credit amount,
        9 = bank description).
    employer_names (list): Matched employer name per transaction
        ("NOT FOUND" when no match was made).

    Returns:
    DataFrame: One row per transaction, with employer columns filled in
    where a match was found and NaN otherwise.
    """
    dates = [datetime.strptime(date_str, '%d%m%y').strftime('%d/%m/%Y') for date_str in df[4]]
    bank_desc = list(df[9])
    # Account code mapping: '713' -> NASA, '068' -> EDAS, otherwise None
    accounts = ['NASA' if i == '713' else 'EDAS' if i == '068' else None for i in df[0]]
    credits = list(df[7])

    # Employer-detail columns, filled one entry per transaction.
    employer_codes, bank_statemnt_ref, account_mgr = [], [], []
    emp_province, region, industry, contributing_stts = [], [], [], []
    date_joined, termination_date, email_addr = [], [], []
    detail_lists = (employer_codes, bank_statemnt_ref, account_mgr,
                    emp_province, region, industry, contributing_stts,
                    date_joined, termination_date, email_addr)

    for name in employer_names:
        tmp = None if name == "NOT FOUND" else master_data[master_data['Employer Name'] == name]
        if tmp is None or tmp.empty:
            # No master-data row for this transaction: pad every column with NaN
            for lst in detail_lists:
                lst.append(np.nan)
            continue
        row = tmp.iloc[-1]  # Keep the last matching entry, as the original did
        employer_codes.append(row['cfcf'])
        bank_statemnt_ref.append(row['Bank Statement Reference'])
        account_mgr.append(row['NASFUNDContact'])
        emp_province.append(row['Employer Province'])
        region.append(row['Region'])
        industry.append(row['Industry'])
        contributing_stts.append(row['Contributing Status'])
        date_joined.append(parse_date(str(row['Date Joined Plan'])))
        termination_date.append(row['Termination Date'])
        email_addr.append(row['Email Addresses'])

    # Construct the final DataFrame.  The ' Credit ' header keeps its
    # surrounding spaces: downstream consumers expect that exact name.
    return pd.DataFrame({
        'Receipt Date': dates,
        'Bank Description': bank_desc,
        'Account': accounts,
        ' Credit ': credits,
        'Employer Code': employer_codes,
        'Employer Name': employer_names,
        'Bank Statement Reference': bank_statemnt_ref,
        'Account Manager': account_mgr,
        'Employer Province': emp_province,
        'Region': region,
        'Industry': industry,
        'Contributing Status': contributing_stts,
        'Date Joined Plan': date_joined,
        'Termination Date': termination_date,
        'Email Addresses': email_addr,
        'First Name': np.nan,
        'Surname': np.nan,
        'Membership#': np.nan
    })
def get_res_df(master_data, df):
    """
    Match each transaction to an employer and build the result DataFrame.

    Matching strategy, per transaction description:
    1. If a 6-digit employee number is present and found in the master
       data ('cfcf' column), use that employer (perfect match).
    2. Otherwise score the cleaned description against all employer names
       with BM25 and accept the best candidate only when its score exceeds
       the module-level `threshold`; otherwise record "NOT FOUND".

    Parameters:
    master_data (DataFrame): Master data with 'Employer Name' and 'cfcf'.
    df (DataFrame): Raw transaction data (column 9 holds descriptions).

    Returns:
    DataFrame: Matched employer data joined with transaction details.
    """
    # Build the BM25 index over cleaned employer names.
    corpus = list(master_data['Employer Name'])
    lower_case_corpus = [clean_text(name) for name in corpus]
    corpus = corpus[1:]  # First row is a header artifact; drop it
    lower_case_corpus = lower_case_corpus[1:]
    tokenized_corpus = [doc.split(' ') for doc in lower_case_corpus]
    bm25 = BM25Okapi(tokenized_corpus)

    # Preprocess the transaction descriptions.
    # NOTE(review): rindex raises ValueError when a description contains no
    # '-'; upstream data appears to guarantee one — confirm.
    queries = [query[:query.rindex('-')] for query in df[9]]
    empnos = [fetch_empno(text) for text in queries]
    new_queries = [preprocess_query(query) for query in queries]

    res_names = []
    for query, empno_arr in zip(new_queries, empnos):
        name = ""
        # First try a direct employee-number lookup in the master data.
        for empno in empno_arr:
            names = list(master_data[master_data['cfcf'] == empno]['Employer Name'])
            if names:
                name = names[0]
                res_names.append(name)
                break
        if name == "":
            # Fall back to BM25 fuzzy matching on the cleaned text.
            tokenized_query = query.split(" ")
            top = bm25.get_top_n(tokenized_query, corpus, n=1)
            doc_score = max(bm25.get_scores(tokenized_query))
            res_names.append(top[0] if doc_score > threshold else "NOT FOUND")

    # (The original also tallied an unused `not_found` count here; removed.)
    return generate_df(master_data=master_data, df=df, employer_names=res_names)
# import pandas as pd
# import numpy as np
# from rank_bm25 import BM25Okapi
# import re
# from nltk.stem import WordNetLemmatizer,PorterStemmer
# from datetime import datetime
# lemmatizer = WordNetLemmatizer()
# threshold = 11
# def clean_text(text):
# cleaned_text = text.lower()
# cleaned_text = re.sub(r'[^A-Za-z0-9\s./]', ' ', cleaned_text)
# cleaned_text = re.sub(r'\.', '', cleaned_text)
# cleaned_text = re.sub(r'\/', '', cleaned_text)
# cleaned_text = re.sub(r'\d{3,}', '', cleaned_text)
# cleaned_text = re.sub('pvt','private',cleaned_text)
# cleaned_text = re.sub('ltd','limited',cleaned_text)
# cleaned_text = re.sub(r'(?<!\w)dev(?!\w)', 'development',cleaned_text)
# cleaned_text = re.sub(r'(?<!\w)co(?!\w)', 'corporation',cleaned_text)
# cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
# cleaned_text = ' '.join([lemmatizer.lemmatize(word) for word in cleaned_text.split()])
# # cleaned_text = ' '.join([stemmer.stem(word) for word in cleaned_text.split()])
# return cleaned_text.strip()
# def fetch_empno(text):
# return re.findall(r'\b\d{6}\b', text)
# def preprocess_query(query):
# new_query = query
# if '||' in query:
# ind = query.find('||')
# new_query=query[ind+2:]
# elif '-' in query:
# ind = query.find('-')
# new_query=query[ind:]
# if len(new_query) < 20:
# new_query = query
# new_query = clean_text(new_query)
# return new_query
# def parse_date(date_str):
# try:
# return datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S').strftime('%d/%m/%Y')
# except ValueError:
# try:
# return datetime.strptime(date_str, '%m/%d/%Y').strftime('%d/%m/%Y')
# except ValueError:
# return date_str.strftime('%m/%d/%Y')
# def generate_df(master_data, df, employer_names):
# dates = [datetime.strptime(date_str, '%d%m%y').strftime('%d/%m/%Y') for date_str in df[4]]
# bank_desc = list(df[9])
# accounts = ['NASA' if i == '713' else 'EDAS' if i == '068' else None for i in df[0]]
# credits = list(df[7])
# employer_codes = []
# bank_statemnt_ref = []
# account_mgr = []
# emp_province = []
# region = []
# industry = []
# contributing_stts = []
# date_joined = []
# termination_date = []
# email_addr = []
# for name in employer_names:
# if name=="NOT FOUND":
# employer_codes.append(np.nan)
# bank_statemnt_ref.append(np.nan)
# account_mgr.append(np.nan)
# emp_province.append(np.nan)
# region.append(np.nan)
# industry.append(np.nan)
# contributing_stts.append(np.nan)
# date_joined.append(np.nan)
# termination_date.append(np.nan)
# email_addr.append(np.nan)
# else:
# tmp = master_data[master_data['Employer Name']==name]
# if tmp.empty:
# employer_codes.append(np.nan)
# bank_statemnt_ref.append(np.nan)
# account_mgr.append(np.nan)
# emp_province.append(np.nan)
# region.append(np.nan)
# industry.append(np.nan)
# contributing_stts.append(np.nan)
# date_joined.append(np.nan)
# termination_date.append(np.nan)
# email_addr.append(np.nan)
# else:
# employer_codes.append(list(tmp['cfcf'])[-1])
# bank_statemnt_ref.append(list(tmp['Bank Statement Reference'])[-1])
# account_mgr.append(list(tmp['NASFUNDContact'])[-1])
# emp_province.append(list(tmp['Employer Province'])[-1])
# region.append(list(tmp['Region'])[-1])
# industry.append(list(tmp['Industry'])[-1])
# contributing_stts.append(list(tmp['Contributing Status'])[-1])
# date = str(list(tmp['Date Joined Plan'])[-1])
# date_joined.append(parse_date(date))
# termination_date.append(list(tmp['Termination Date'])[-1])
# email_addr.append(list(tmp['Email Addresses'])[-1])
# res_df = pd.DataFrame()
# res_df['Receipt Date'] = dates
# res_df['Bank Description'] = bank_desc
# res_df['Account'] = accounts
# res_df[' Credit '] = credits
# res_df['Employer Code'] = employer_codes
# res_df['Employer Name'] = employer_names
# res_df['Bank Statement Reference'] = bank_statemnt_ref
# res_df['Account Manager'] = account_mgr
# res_df['Employer Province'] = emp_province
# res_df['Region'] = region
# res_df['Industry'] = industry
# res_df['Contributing Status'] = contributing_stts
# res_df['Date Joined Plan'] = date_joined
# res_df['Termination Date'] = termination_date
# res_df['Email Addresses'] = email_addr
# res_df['First Name'] = np.nan
# res_df['Surname'] = np.nan
# res_df['Membership#'] = np.nan
# return res_df
# def get_res_df(master_data,df):
# corpus = list(master_data['Employer Name'])
# lower_case_corpus = [clean_text(name) for name in corpus]
# corpus = corpus[1:]
# lower_case_corpus = lower_case_corpus[1:]
# tokenized_corpus = [doc.split(' ') for doc in lower_case_corpus]
# bm25 = BM25Okapi(tokenized_corpus)
# queries = list(df[9])
# queries = [query[:query.rindex('-')] for query in queries]
# empnos = [fetch_empno(text) for text in queries]
# new_queries = [preprocess_query(query) for query in queries]
# res_names = []
# scores = []
# for query,empno_arr in zip(new_queries,empnos):
# name = ""
# if len(empno_arr) != 0:
# for empno in empno_arr:
# names = list(master_data[master_data['cfcf']==empno]['Employer Name'])
# if len(names)!=0:
# name=names[0]
# scores.append(100)
# res_names.append(name)
# break
# if name=="":
# tokenized_query = query.split(" ")
# name = bm25.get_top_n(tokenized_query, corpus, n=1)
# doc_score = max(bm25.get_scores(tokenized_query))
# scores.append(doc_score)
# res_names.append(name[0] if doc_score>threshold else "NOT FOUND")
# not_found=0
# for score in scores:
# if score<threshold:
# not_found+=1
# res_df = generate_df(master_data=master_data,df=df,employer_names=res_names)
# return res_df