import pandas as pd import numpy as np from rank_bm25 import BM25Okapi import re from nltk.stem import WordNetLemmatizer, PorterStemmer from datetime import datetime lemmatizer = WordNetLemmatizer() threshold = 11.6 # Threshold score for employer match def clean_text(text): """ Cleans and normalizes the input text by performing the following operations: - Lowercases the text - Removes special characters and digits - Replaces abbreviations with full words (e.g., 'pvt' -> 'private', 'ltd' -> 'limited') - Lemmatizes the words for normalization Parameters: text (str): The input text string to be cleaned. Returns: str: The cleaned and lemmatized text. """ cleaned_text = text.lower() cleaned_text = re.sub(r'[^A-Za-z0-9\s./]', ' ', cleaned_text) # Remove special characters cleaned_text = re.sub(r'\.', '', cleaned_text) # Remove periods cleaned_text = re.sub(r'\/', '', cleaned_text) # Remove slashes cleaned_text = re.sub(r'\d{3,}', '', cleaned_text) # Remove numbers with more than 3 digits cleaned_text = re.sub('pvt', 'private', cleaned_text) # Replace 'pvt' with 'private' cleaned_text = re.sub('ltd', 'limited', cleaned_text) # Replace 'ltd' with 'limited' cleaned_text = re.sub(r'(?threshold: found_by_bm5 += 1 res_names.append(name[0]) found_by.append("BM25") else: not_found+=1 res_names.append("NOT FOUND") found_by.append("NOT FOUND") # Generate the final result DataFrame res_df = generate_df(master_data=master_data, df=df, employer_names=res_names) print(f"{found_by_direct_search=},{found_by_emp_no=},{found_by_bm5=},{not_found=},{edas=}") return res_df, found_by_direct_search, found_by_emp_no, found_by_bm5, not_found