# Spaces: Sleeping
# (Hugging Face Spaces status banner captured during extraction — not Python code)
import re
from datetime import datetime

import pandas as pd
import numpy as np
from rank_bm25 import BM25Okapi
from nltk.stem import WordNetLemmatizer, PorterStemmer

lemmatizer = WordNetLemmatizer()

threshold = 11.6  # Minimum BM25 score to accept an employer match
def clean_text(text):
    """
    Clean and normalize employer-name text for BM25 matching.

    Steps:
    - Lowercase the text.
    - Replace special characters with spaces; drop periods and slashes.
    - Remove digit runs of 3 or more digits (reference/account numbers).
    - Expand common abbreviations as whole words only:
      'pvt' -> 'private', 'ltd' -> 'limited',
      'dev' -> 'development', 'co' -> 'corporation'.
    - Collapse whitespace and lemmatize every word.

    Parameters:
        text (str): The input text string to be cleaned.

    Returns:
        str: The cleaned and lemmatized text.
    """
    cleaned_text = text.lower()
    cleaned_text = re.sub(r'[^A-Za-z0-9\s./]', ' ', cleaned_text)  # Replace special characters with spaces
    cleaned_text = re.sub(r'\.', '', cleaned_text)  # Remove periods
    cleaned_text = re.sub(r'\/', '', cleaned_text)  # Remove slashes
    cleaned_text = re.sub(r'\d{3,}', '', cleaned_text)  # Remove digit runs of 3 or more digits
    # Abbreviation expansion. All four substitutions are anchored with word
    # boundaries: the original 'pvt'/'ltd' patterns were unanchored (unlike
    # 'dev'/'co') and could corrupt substrings inside longer words.
    cleaned_text = re.sub(r'(?<!\w)pvt(?!\w)', 'private', cleaned_text)
    cleaned_text = re.sub(r'(?<!\w)ltd(?!\w)', 'limited', cleaned_text)
    cleaned_text = re.sub(r'(?<!\w)dev(?!\w)', 'development', cleaned_text)
    cleaned_text = re.sub(r'(?<!\w)co(?!\w)', 'corporation', cleaned_text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)  # Collapse runs of whitespace
    cleaned_text = ' '.join(lemmatizer.lemmatize(word) for word in cleaned_text.split())
    return cleaned_text.strip()
def fetch_empno(text):
    """
    Extract every standalone 6-digit employee number from *text*.

    Parameters:
        text (str): The text to scan for employee numbers.

    Returns:
        list: All 6-digit substrings found, in order of appearance.
    """
    empno_pattern = re.compile(r'\b\d{6}\b')
    return [match.group(0) for match in empno_pattern.finditer(text)]
def preprocess_query(query):
    """
    Preprocess a raw bank-description query before matching.

    If the query contains '||', keep only the text after it; otherwise, if it
    contains '-', keep the text from the first '-' onward. When the extracted
    part is shorter than 20 characters the full original query is used
    instead. The selected text is then normalized with `clean_text`.

    Parameters:
        query (str): The raw query text to preprocess.

    Returns:
        str: The cleaned and processed query text.
    """
    candidate = query
    if '||' in query:
        # Keep everything after the first '||' delimiter.
        candidate = query.partition('||')[2]
    elif '-' in query:
        # Keep everything from the first '-' onward (dash included).
        candidate = query[query.index('-'):]
    if len(candidate) < 20:
        candidate = query  # Extracted part too short — fall back to the full query
    return clean_text(candidate)
def parse_date(date_str):
    """
    Convert a date value to the string format 'DD/MM/YYYY'.

    Accepts strings in '%Y-%m-%d %H:%M:%S' or '%m/%d/%Y' form, as well as
    datetime-like objects (e.g. a pandas Timestamp). Values that cannot be
    parsed are returned unchanged rather than raising.

    Parameters:
        date_str: The input date string or datetime-like object.

    Returns:
        str: The date formatted as 'DD/MM/YYYY', or the original value when
        it cannot be interpreted.
    """
    if not isinstance(date_str, str):
        # datetime-like input: format it directly. The original fallback
        # called .strftime on a str (AttributeError) and used '%m/%d/%Y',
        # contradicting the documented DD/MM/YYYY contract; datetime objects
        # also raised TypeError at strptime, which 'except ValueError' missed.
        try:
            return date_str.strftime('%d/%m/%Y')
        except AttributeError:
            return date_str
    for fmt in ('%Y-%m-%d %H:%M:%S', '%m/%d/%Y'):
        try:
            return datetime.strptime(date_str, fmt).strftime('%d/%m/%Y')
        except ValueError:
            continue
    return date_str  # Unrecognized format: hand back the original string
def generate_df(master_data, df, employer_names):
    """
    Build the result DataFrame by joining transaction rows with employer
    details looked up in the master data.

    Parameters:
        master_data (DataFrame): Master employer data; must contain
            'Employer Name' plus the columns listed in `detail_columns`.
        df (DataFrame): Transaction data with positional columns
            (0: account code, 4: date 'ddmmyy', 7: credit, 9: description).
        employer_names (list): Matched employer name per transaction row,
            with "NOT FOUND" marking unmatched rows.

    Returns:
        DataFrame: One row per transaction with employer details attached
        (NaN in every employer field where no match exists).
    """
    dates = [datetime.strptime(date_str, '%d%m%y').strftime('%d/%m/%Y') for date_str in df[4]]
    bank_desc = list(df[9])
    # Account code '713' -> NASA, '068' -> EDAS; anything else is unmapped.
    accounts = ['NASA' if i == '713' else 'EDAS' if i == '068' else None for i in df[0]]
    credits = list(df[7])
    # Master-data columns copied for each matched employer. Driving the
    # lookup from this list removes the two duplicated 10-line NaN-append
    # branches of the original implementation.
    detail_columns = ['cfcf', 'Bank Statement Reference', 'NASFUNDContact',
                      'Employer Province', 'Region', 'Industry',
                      'Contributing Status', 'Date Joined Plan',
                      'Termination Date', 'Email Addresses']
    details = {col: [] for col in detail_columns}
    for name in employer_names:
        record = None
        if name != "NOT FOUND":
            tmp = master_data[master_data['Employer Name'] == name]
            if not tmp.empty:
                record = tmp.iloc[-1]  # Use the last (most recent) entry for this employer
        for col in detail_columns:
            if record is None:
                details[col].append(np.nan)  # Unmatched or missing employer
            elif col == 'Date Joined Plan':
                details[col].append(parse_date(str(record[col])))  # Normalize to DD/MM/YYYY
            else:
                details[col].append(record[col])
    # Construct the final DataFrame (column names, including the padded
    # ' Credit ' key, are preserved for downstream consumers).
    res_df = pd.DataFrame({
        'Receipt Date': dates,
        'Bank Description': bank_desc,
        'Account': accounts,
        ' Credit ': credits,
        'Employer Code': details['cfcf'],
        'Employer Name': employer_names,
        'Bank Statement Reference': details['Bank Statement Reference'],
        'Account Manager': details['NASFUNDContact'],
        'Employer Province': details['Employer Province'],
        'Region': details['Region'],
        'Industry': details['Industry'],
        'Contributing Status': details['Contributing Status'],
        'Date Joined Plan': details['Date Joined Plan'],
        'Termination Date': details['Termination Date'],
        'Email Addresses': details['Email Addresses'],
        'First Name': np.nan,
        'Surname': np.nan,
        'Membership#': np.nan
    })
    return res_df
def get_res_df(master_data, df):
    """
    Match each transaction's bank description to an employer and build the
    combined result DataFrame.

    Matching strategy, per transaction:
    1. If the description contains a 6-digit employee number present in
       master data ('cfcf' column), use that employer (score 100).
    2. Otherwise rank employer names with BM25 and accept the top hit only
       when its score exceeds the module-level `threshold`; below threshold
       the row is marked "NOT FOUND".

    Parameters:
        master_data (DataFrame): Master data with 'Employer Name' and 'cfcf'.
        df (DataFrame): Transaction data (column 9 holds bank descriptions).

    Returns:
        DataFrame: Matched employer data joined with transaction details.
    """
    # Build the BM25 corpus from cleaned employer names.
    corpus = list(master_data['Employer Name'])
    lower_case_corpus = [clean_text(name) for name in corpus]
    corpus = corpus[1:]  # Skip the first row (header artifact in master data)
    lower_case_corpus = lower_case_corpus[1:]
    tokenized_corpus = [doc.split(' ') for doc in lower_case_corpus]
    bm25 = BM25Okapi(tokenized_corpus)  # BM25 model for employer-name matching
    # Keep only the part of each description before the last '-'. The original
    # used query.rindex('-') unconditionally, which raised ValueError for any
    # description without a dash.
    queries = list(df[9])
    queries = [query[:query.rindex('-')] if '-' in query else query for query in queries]
    empnos = [fetch_empno(text) for text in queries]
    new_queries = [preprocess_query(query) for query in queries]
    res_names, scores = [], []
    for query, empno_arr in zip(new_queries, empnos):
        name = ""
        # First try an exact lookup by 6-digit employee number.
        for empno in empno_arr:
            names = list(master_data[master_data['cfcf'] == empno]['Employer Name'])
            if names:
                name = names[0]
                scores.append(100)  # Employee-number hit counts as a perfect match
                res_names.append(name)
                break
        if name == "":
            # Fall back to BM25 fuzzy matching on the cleaned description.
            tokenized_query = query.split(" ")
            top_names = bm25.get_top_n(tokenized_query, corpus, n=1)
            doc_score = max(bm25.get_scores(tokenized_query))
            scores.append(doc_score)
            res_names.append(top_names[0] if doc_score > threshold else "NOT FOUND")
    # (The original computed an unused 'not_found' tally here; removed as dead code.)
    res_df = generate_df(master_data=master_data, df=df, employer_names=res_names)
    return res_df
# ---------------------------------------------------------------------------
# NOTE: Everything below is a commented-out earlier revision of this module
# (threshold=11, no docstrings); retained verbatim for reference.
# ---------------------------------------------------------------------------
# import pandas as pd | |
# import numpy as np | |
# from rank_bm25 import BM25Okapi | |
# import re | |
# from nltk.stem import WordNetLemmatizer,PorterStemmer | |
# from datetime import datetime | |
# lemmatizer = WordNetLemmatizer() | |
# threshold = 11 | |
# def clean_text(text): | |
# cleaned_text = text.lower() | |
# cleaned_text = re.sub(r'[^A-Za-z0-9\s./]', ' ', cleaned_text) | |
# cleaned_text = re.sub(r'\.', '', cleaned_text) | |
# cleaned_text = re.sub(r'\/', '', cleaned_text) | |
# cleaned_text = re.sub(r'\d{3,}', '', cleaned_text) | |
# cleaned_text = re.sub('pvt','private',cleaned_text) | |
# cleaned_text = re.sub('ltd','limited',cleaned_text) | |
# cleaned_text = re.sub(r'(?<!\w)dev(?!\w)', 'development',cleaned_text) | |
# cleaned_text = re.sub(r'(?<!\w)co(?!\w)', 'corporation',cleaned_text) | |
# cleaned_text = re.sub(r'\s+', ' ', cleaned_text) | |
# cleaned_text = ' '.join([lemmatizer.lemmatize(word) for word in cleaned_text.split()]) | |
# # cleaned_text = ' '.join([stemmer.stem(word) for word in cleaned_text.split()]) | |
# return cleaned_text.strip() | |
# def fetch_empno(text): | |
# return re.findall(r'\b\d{6}\b', text) | |
# def preprocess_query(query): | |
# new_query = query | |
# if '||' in query: | |
# ind = query.find('||') | |
# new_query=query[ind+2:] | |
# elif '-' in query: | |
# ind = query.find('-') | |
# new_query=query[ind:] | |
# if len(new_query) < 20: | |
# new_query = query | |
# new_query = clean_text(new_query) | |
# return new_query | |
# def parse_date(date_str): | |
# try: | |
# return datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S').strftime('%d/%m/%Y') | |
# except ValueError: | |
# try: | |
# return datetime.strptime(date_str, '%m/%d/%Y').strftime('%d/%m/%Y') | |
# except ValueError: | |
# return date_str.strftime('%m/%d/%Y') | |
# def generate_df(master_data, df, employer_names): | |
# dates = [datetime.strptime(date_str, '%d%m%y').strftime('%d/%m/%Y') for date_str in df[4]] | |
# bank_desc = list(df[9]) | |
# accounts = ['NASA' if i == '713' else 'EDAS' if i == '068' else None for i in df[0]] | |
# credits = list(df[7]) | |
# employer_codes = [] | |
# bank_statemnt_ref = [] | |
# account_mgr = [] | |
# emp_province = [] | |
# region = [] | |
# industry = [] | |
# contributing_stts = [] | |
# date_joined = [] | |
# termination_date = [] | |
# email_addr = [] | |
# for name in employer_names: | |
# if name=="NOT FOUND": | |
# employer_codes.append(np.nan) | |
# bank_statemnt_ref.append(np.nan) | |
# account_mgr.append(np.nan) | |
# emp_province.append(np.nan) | |
# region.append(np.nan) | |
# industry.append(np.nan) | |
# contributing_stts.append(np.nan) | |
# date_joined.append(np.nan) | |
# termination_date.append(np.nan) | |
# email_addr.append(np.nan) | |
# else: | |
# tmp = master_data[master_data['Employer Name']==name] | |
# if tmp.empty: | |
# employer_codes.append(np.nan) | |
# bank_statemnt_ref.append(np.nan) | |
# account_mgr.append(np.nan) | |
# emp_province.append(np.nan) | |
# region.append(np.nan) | |
# industry.append(np.nan) | |
# contributing_stts.append(np.nan) | |
# date_joined.append(np.nan) | |
# termination_date.append(np.nan) | |
# email_addr.append(np.nan) | |
# else: | |
# employer_codes.append(list(tmp['cfcf'])[-1]) | |
# bank_statemnt_ref.append(list(tmp['Bank Statement Reference'])[-1]) | |
# account_mgr.append(list(tmp['NASFUNDContact'])[-1]) | |
# emp_province.append(list(tmp['Employer Province'])[-1]) | |
# region.append(list(tmp['Region'])[-1]) | |
# industry.append(list(tmp['Industry'])[-1]) | |
# contributing_stts.append(list(tmp['Contributing Status'])[-1]) | |
# date = str(list(tmp['Date Joined Plan'])[-1]) | |
# date_joined.append(parse_date(date)) | |
# termination_date.append(list(tmp['Termination Date'])[-1]) | |
# email_addr.append(list(tmp['Email Addresses'])[-1]) | |
# res_df = pd.DataFrame() | |
# res_df['Receipt Date'] = dates | |
# res_df['Bank Description'] = bank_desc | |
# res_df['Account'] = accounts | |
# res_df[' Credit '] = credits | |
# res_df['Employer Code'] = employer_codes | |
# res_df['Employer Name'] = employer_names | |
# res_df['Bank Statement Reference'] = bank_statemnt_ref | |
# res_df['Account Manager'] = account_mgr | |
# res_df['Employer Province'] = emp_province | |
# res_df['Region'] = region | |
# res_df['Industry'] = industry | |
# res_df['Contributing Status'] = contributing_stts | |
# res_df['Date Joined Plan'] = date_joined | |
# res_df['Termination Date'] = termination_date | |
# res_df['Email Addresses'] = email_addr | |
# res_df['First Name'] = np.nan | |
# res_df['Surname'] = np.nan | |
# res_df['Membership#'] = np.nan | |
# return res_df | |
# def get_res_df(master_data,df): | |
# corpus = list(master_data['Employer Name']) | |
# lower_case_corpus = [clean_text(name) for name in corpus] | |
# corpus = corpus[1:] | |
# lower_case_corpus = lower_case_corpus[1:] | |
# tokenized_corpus = [doc.split(' ') for doc in lower_case_corpus] | |
# bm25 = BM25Okapi(tokenized_corpus) | |
# queries = list(df[9]) | |
# queries = [query[:query.rindex('-')] for query in queries] | |
# empnos = [fetch_empno(text) for text in queries] | |
# new_queries = [preprocess_query(query) for query in queries] | |
# res_names = [] | |
# scores = [] | |
# for query,empno_arr in zip(new_queries,empnos): | |
# name = "" | |
# if len(empno_arr) != 0: | |
# for empno in empno_arr: | |
# names = list(master_data[master_data['cfcf']==empno]['Employer Name']) | |
# if len(names)!=0: | |
# name=names[0] | |
# scores.append(100) | |
# res_names.append(name) | |
# break | |
# if name=="": | |
# tokenized_query = query.split(" ") | |
# name = bm25.get_top_n(tokenized_query, corpus, n=1) | |
# doc_score = max(bm25.get_scores(tokenized_query)) | |
# scores.append(doc_score) | |
# res_names.append(name[0] if doc_score>threshold else "NOT FOUND") | |
# not_found=0 | |
# for score in scores: | |
# if score<threshold: | |
# not_found+=1 | |
# res_df = generate_df(master_data=master_data,df=df,employer_names=res_names) | |
# return res_df |