import pandas as pd import numpy as np from rank_bm25 import BM25Okapi import re from nltk.stem import WordNetLemmatizer, PorterStemmer from datetime import datetime lemmatizer = WordNetLemmatizer() threshold = 11.6 # Threshold score for employer match def clean_text(text): """ Cleans and normalizes the input text by performing the following operations: - Lowercases the text - Removes special characters and digits - Replaces abbreviations with full words (e.g., 'pvt' -> 'private', 'ltd' -> 'limited') - Lemmatizes the words for normalization Parameters: text (str): The input text string to be cleaned. Returns: str: The cleaned and lemmatized text. """ cleaned_text = text.lower() cleaned_text = re.sub(r'[^A-Za-z0-9\s./]', ' ', cleaned_text) # Remove special characters cleaned_text = re.sub(r'\.', '', cleaned_text) # Remove periods cleaned_text = re.sub(r'\/', '', cleaned_text) # Remove slashes cleaned_text = re.sub(r'\d{3,}', '', cleaned_text) # Remove numbers with more than 3 digits cleaned_text = re.sub('pvt', 'private', cleaned_text) # Replace 'pvt' with 'private' cleaned_text = re.sub('ltd', 'limited', cleaned_text) # Replace 'ltd' with 'limited' cleaned_text = re.sub(r'(? threshold else "NOT FOUND") # Count the number of unmatched results not_found = sum(score < threshold for score in scores) # Generate the final result DataFrame res_df = generate_df(master_data=master_data, df=df, employer_names=res_names) return res_df # import pandas as pd # import numpy as np # from rank_bm25 import BM25Okapi # import re # from nltk.stem import WordNetLemmatizer,PorterStemmer # from datetime import datetime # lemmatizer = WordNetLemmatizer() # threshold = 11 # def clean_text(text): # cleaned_text = text.lower() # cleaned_text = re.sub(r'[^A-Za-z0-9\s./]', ' ', cleaned_text) # cleaned_text = re.sub(r'\.', '', cleaned_text) # cleaned_text = re.sub(r'\/', '', cleaned_text) # cleaned_text = re.sub(r'\d{3,}', '', cleaned_text) # cleaned_text = re.sub('pvt','private',cleaned_text) # cleaned_text = re.sub('ltd','limited',cleaned_text) # cleaned_text = re.sub(r'(?threshold else "NOT FOUND") # not_found=0 # for score in scores: # if score