Spaces:

WebashalarForML
/

ResumeExtractor2

Running

File size: 21,407 Bytes

71c6bbf

# mistral.py
import os
import json
import logging
from huggingface_hub import InferenceClient
from huggingface_hub.utils._errors import BadRequestError
from dotenv import load_dotenv
from utils.fileTotext import extract_text_based_on_format
import re
from utils.spacy import Parser_from_model

# Load environment variables from .env file
load_dotenv()

# Authenticate with Hugging Face.
# The token is read from the HF_TOKEN environment variable; importing this
# module raises ValueError when the variable is missing or empty.
HFT = os.getenv('HF_TOKEN') 
if not HFT:
    raise ValueError("Hugging Face token is not set in environment variables.")
# Module-level inference client for the Mistral-Nemo instruct model; it is the
# `client` that process_resume_data hands to the Model_* functions below.
client = InferenceClient(model="mistralai/Mistral-Nemo-Instruct-2407", token=HFT)

# Function to clean model output
def Data_Cleaner(text):
    """Strip chat-model wrapping from a reply so it can be fed to ``json.loads``.

    The following noise is removed, in order:

    * any preamble up to and including the first ``format:`` marker, but only
      when that marker appears before the first ``{`` (so a ``format:`` that
      lives inside the JSON payload itself is left alone);
    * surrounding whitespace and Markdown code-fence backticks;
    * a leading ``json`` language tag left behind by a fenced block.

    Args:
        text: Raw model reply. ``None`` or an empty string yields ``""``.

    Returns:
        The cleaned text. It is not guaranteed to be valid JSON; the caller
        is still responsible for handling ``json.JSONDecodeError``.
    """
    if not text:
        return ""

    marker = "format:"
    marker_pos = text.find(marker)
    if marker_pos != -1:
        payload_pos = text.find("{")
        # Only treat "format:" as preamble when it precedes the JSON object;
        # otherwise we would cut the payload in half.
        if payload_pos == -1 or marker_pos < payload_pos:
            text = text[marker_pos + len(marker):]

    cleaned = text.strip().strip('`').strip()

    # Remove the literal "json" fence tag. str.strip('json') would instead
    # strip any of the characters j/s/o/n from both ends of the payload.
    if cleaned[:4].lower() == "json":
        cleaned = cleaned[4:]

    return cleaned.strip()

# Function to call Mistral and process output
def Model_ProfessionalDetails_Output(resume, client):
    """Ask the chat model to extract professional details from a resume.

    Args:
        resume: Resume text; it is interpolated verbatim into the user prompt.
        client: Object exposing ``chat_completion(messages=..., max_tokens=...,
            stream=..., temperature=...)`` that yields streamed chunks whose
            text lives at ``chunk.choices[0].delta.content``.

    Returns:
        The decoded JSON object as a ``dict`` (the prompt asks for a single
        top-level ``"professional"`` key), or ``{}`` when the reply cannot be
        decoded as JSON or is not a JSON object.
    """
    system_role = {
        "role": "system",
        "content": "You are a skilled resume parser. Your task is to extract professional details from resumes in a structured JSON format defined by the User. Ensure accuracy and completeness while maintaining the format provided and if field are missing just return 'not found'."
    }
    user_prompt = {
        "role": "user",
        "content": f'''Act as a resume parser for the following text given in text: {resume}

    Extract the text in the following output JSON string as:

    {{

        "professional": {{

            "technical_skills": "List all technical skills, programming languages, frameworks, and technologies mentioned in the resume, ensuring they are not mixed with other skill types.",

            "non_technical_skills": "Identify and list non-technical skills such as leadership, teamwork, and communication skills, ensuring they are not mixed with technical skills.",

            "tools": "Enumerate all software tools, platforms, and applications (e.g., Figma, Unity, MS Office, etc.) referenced in the resume, distinctly separate from skills.",

            "projects": "Extract the names or titles of all projects mentioned in the resume.",

            "projects_experience": "Summarize overall project experiences, providing a brief description of each project as detailed in the resume.",

            "experience": "Calculate total professional work experience in years and months based on the resume.",

            "companies_worked_at": "List the names of all companies where employment is mentioned in the resume.",

            "certifications": "Extract and list all certifications obtained as stated in the resume.",

            "roles": "Include the names of all job titles or roles held as indicated in the resume.",

            "qualifications": "List educational qualifications (e.g., B.Tech) from the resume. If none are found, return 'No education listed'.",

            "courses": "Extract the names of completed courses based on the resume. If none are found, return 'No courses listed'.",

            "university": "Identify the name of the university, college, or institute attended, based on the resume. If not found, return 'No university listed'.",

            "year_of_graduation": "Extract the year of graduation from the resume. If not found, return 'No year of graduation listed'."

        }}

    }}

    Json Output:

    '''
    }

    # Collect the streamed reply. A chunk may carry no text (content is None
    # or the choices list is empty); concatenating None would raise TypeError.
    pieces = []
    for message in client.chat_completion(
        messages=[system_role, user_prompt],
        max_tokens=3000,
        stream=True,
        temperature=0.35,
    ):
        if not message.choices:
            continue
        piece = message.choices[0].delta.content
        if piece:
            pieces.append(piece)
    response = "".join(pieces)

    try:
        clean_response = Data_Cleaner(response)
        parsed_response = json.loads(clean_response)
    except json.JSONDecodeError as e:
        logging.error(f"JSON Decode Error: {e}")
        return {}

    # Callers use dict.get() on the result; treat any other JSON value
    # (list, string, number, null) as a failed extraction.
    if not isinstance(parsed_response, dict):
        logging.error("Model reply was valid JSON but not an object; ignoring it.")
        return {}

    return parsed_response

def Model_PersonalDetails_Output(resume, client):
    """Ask the chat model to extract personal/contact details from a resume.

    Args:
        resume: Resume text; it is interpolated verbatim into the user prompt.
        client: Object exposing ``chat_completion(messages=..., max_tokens=...,
            stream=..., temperature=...)`` that yields streamed chunks whose
            text lives at ``chunk.choices[0].delta.content``.

    Returns:
        The decoded JSON object as a ``dict`` (the prompt asks for a single
        top-level ``"personal"`` key), or ``{}`` when the reply cannot be
        decoded as JSON or is not a JSON object.
    """
    # NOTE(review): the system prompt says "professional details" although this
    # function extracts personal details; it looks copied from the sibling
    # function. Left unchanged because editing the prompt alters model output.
    system_role = {
        "role": "system",
        "content": "You are a skilled resume parser. Your task is to extract professional details from resumes in a structured JSON format defined by the User. Ensure accuracy and completeness while maintaining the format provided and if field are missing just return 'not found'."
    }
    user_prompt = {
        "role": "user",
        "content": f'''Act as a resume parser for the following text given in text: {resume}

    Extract the text in the following output JSON string as:

    {{

        "personal": {{

            "name": "Extract the full name based on the resume. If not found, return 'No name listed'.",

            "contact_number": "Extract the contact number from the resume. If not found, return 'No contact number listed'.",

            "email": "Extract the email address from the resume. If not found, return 'No email listed'.",

            "Address": "Extract the Address or address from the resume. If not found, return 'No Address listed'.",

            "link": "Extract any relevant links (e.g., portfolio, LinkedIn) from the resume. If not found, return 'No link listed'."

        }}

    }} 

    output:

    '''
    }

    # Collect the streamed reply. A chunk may carry no text (content is None
    # or the choices list is empty); concatenating None would raise TypeError.
    pieces = []
    for message in client.chat_completion(
        messages=[system_role, user_prompt],
        max_tokens=3000,
        stream=True,
        temperature=0.35,
    ):
        if not message.choices:
            continue
        piece = message.choices[0].delta.content
        if piece:
            pieces.append(piece)
    response = "".join(pieces)

    # Handle cases where the response might have formatting issues
    try:
        clean_response = Data_Cleaner(response)
        parsed_response = json.loads(clean_response)
    except json.JSONDecodeError as e:
        # Reported through logging, like the professional-details function.
        logging.error(f"JSON Decode Error: {e}")
        logging.error(f"Raw Response: {response}")
        return {}

    # Callers use dict.get() on the result; treat any other JSON value
    # (list, string, number, null) as a failed extraction.
    if not isinstance(parsed_response, dict):
        logging.error("Model reply was valid JSON but not an object; ignoring it.")
        return {}

    return parsed_response


# Regex patterns for recognising links and contact details.

# LinkedIn and GitHub URLs: http or https, optional "www.", followed by a path
# made of word characters, '-', '_' and '/'. Used by extract_links().
linkedin_pattern = r"https?://(?:www\.)?linkedin\.com/[\w\-_/]+"
github_pattern = r"https?://(?:www\.)?github\.com/[\w\-_/]+"
# Whole-string (^...$) patterns for an email address and a 7-15 character
# phone-like string of digits, spaces, '-', '(' and ')' with an optional '+'.
# NOTE(review): neither constant is referenced in the visible part of this
# file — confirm whether another module imports them.
email_pattern = r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$"
contact_pattern = r"^\+?[\d\s\-()]{7,15}$"

def extract_links(hyperlinks):
    """Split *hyperlinks* into LinkedIn URLs and GitHub URLs.

    Each link is tested against ``linkedin_pattern`` first and
    ``github_pattern`` second (both anchored at the start of the string via
    ``re.match``); links matching neither are ignored.

    Returns:
        A ``(linkedin_links, github_links)`` tuple of lists, each preserving
        the input order.
    """
    linkedin_found, github_found = [], []

    for url in hyperlinks:
        if re.match(linkedin_pattern, url):
            bucket = linkedin_found
        elif re.match(github_pattern, url):
            bucket = github_found
        else:
            # Neither a LinkedIn nor a GitHub URL: not collected here.
            continue
        bucket.append(url)

    return linkedin_found, github_found

def is_valid_email(email):
    """Return True when *email* is a string shaped like ``local@domain.tld``.

    Non-string values (for example ``None`` or a list decoded from the model's
    JSON) are reported as invalid instead of raising ``TypeError``. The whole
    string must match, so a trailing newline is rejected as well.
    """
    if not isinstance(email, str):
        return False
    email_regex = r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+'
    # fullmatch instead of ^...$ because "$" also matches before a final "\n".
    return re.fullmatch(email_regex, email) is not None

# Accepted phone-number shapes. Each one must match the ENTIRE (stripped)
# input, so the patterns carry no ^/$ anchors and no trailing spaces.
# Exact duplicates from the original list have been removed.
_CONTACT_PATTERNS = (
    # India
    r'\+91[\s\.\-\/]?\(?0?\)?[\s\-\.\/]?\d{5}[\s\-\.\/]?\d{5}',  # +91, optional (0), 5+5 digits
    r'\+91[\s\.\-\/]?\d{5}[\s\-\.\/]?\d{5}',                     # +91 XXXXX XXXXX
    r'\d{5}[\s\-\.\/]?\d{5}',                                    # XXXXX XXXXX (no country code)
    r'\+91[\s\.\-\/]?\d{10}',                                    # +91 XXXXXXXXXX
    r'\d{10}',                                                   # XXXXXXXXXX
    # NOTE(review): the next pattern requires 15 digits after +91 (5+5+5);
    # kept from the original list — confirm it is intended.
    r'\+91[\s\.\-\/]?\(?\d{5}\)?[\s\-\.\/]?\d{5}[\s\-\.\/]?\d{5}',
    r'\+91\s\d{5}-\d{5}',                     # +91 XXXXX-XXXXX
    r'\+91\s\d{4}-\d{6}',                     # +91 XXXX-XXXXXX
    r'\+91\s\d{10}',                          # +91 XXXXXXXXXX
    r'0\d{2}-\d{7}',                          # 0XX-XXXXXXX
    r'\+91\d{10}',                            # +91XXXXXXXXXX
    r'\d{5}\s\d{5}',                          # XXXXX XXXXX
    # USA / Canada
    r'\+1\s\(\d{3}\)\s\d{3}-\d{4}',           # +1 (XXX) XXX-XXXX
    r'\(\d{3}\)\s\d{3}-\d{4}',                # (XXX) XXX-XXXX
    r'\(\d{3}\)\s\d{3}\s\d{4}',               # (XXX) XXX XXXX
    r'\(\d{3}\)\s\d{3}\s\d{3}',               # (XXX) XXX XXX
    r'\+1\d{10}',                             # +1XXXXXXXXXX
    r'\+1\s\d{3}\s\d{3}\s\d{4}',              # +1 XXX XXX XXXX
    # United Kingdom
    r'\+44\s\d{4}\s\d{6}',                    # +44 XXXX XXXXXX
    r'\+44\s\d{3}\s\d{3}\s\d{4}',             # +44 XXX XXX XXXX
    r'0\d{4}\s\d{6}',                         # 0XXXX XXXXXX
    r'0\d{3}\s\d{3}\s\d{4}',                  # 0XXX XXX XXXX (also Nigeria)
    r'\+44\d{10}',                            # +44XXXXXXXXXX
    # Australia
    r'\+61\s\d\s\d{4}\s\d{4}',                # +61 X XXXX XXXX
    r'0\d\s\d{4}\s\d{4}',                     # 0X XXXX XXXX (also Japan)
    r'\+61\d{9}',                             # +61XXXXXXXXX
    # Germany
    r'\+49\s\d{4}\s\d{8}',                    # +49 XXXX XXXXXXXX
    r'\+49\s\d{3}\s\d{7}',                    # +49 XXX XXXXXXX
    r'0\d{3}\s\d{8}',                         # 0XXX XXXXXXXX
    r'\+49\d{12}',                            # +49XXXXXXXXXXXX
    r'\+49\d{10}',                            # +49XXXXXXXXXX
    # China
    r'\+86\s\d{3}\s\d{4}\s\d{4}',             # +86 XXX XXXX XXXX
    r'0\d{3}\s\d{4}\s\d{4}',                  # 0XXX XXXX XXXX
    r'\+86\d{11}',                            # +86XXXXXXXXXXX
    # Japan
    r'\+81\s\d\s\d{4}\s\d{4}',                # +81 X XXXX XXXX
    r'\+81\s\d{2}\s\d{4}\s\d{4}',             # +81 XX XXXX XXXX
    r'\+81\d{10}',                            # +81XXXXXXXXXX
    r'\+81\d{9}',                             # +81XXXXXXXXX
    # Brazil
    r'\+55\s\d{2}\s\d{5}-\d{4}',              # +55 XX XXXXX-XXXX
    r'\+55\s\d{2}\s\d{4}-\d{4}',              # +55 XX XXXX-XXXX
    r'0\d{2}\s\d{4}\s\d{4}',                  # 0XX XXXX XXXX
    r'\+55\d{11}',                            # +55XXXXXXXXXXX
    r'\+55\d{10}',                            # +55XXXXXXXXXX
    # France
    r'\+33\s\d\s\d{2}\s\d{2}\s\d{2}\s\d{2}',  # +33 X XX XX XX XX
    r'0\d\s\d{2}\s\d{2}\s\d{2}\s\d{2}',       # 0X XX XX XX XX
    r'\+33\d{9}',                             # +33XXXXXXXXX
    # Russia
    r'\+7\s\d{3}\s\d{3}-\d{2}-\d{2}',         # +7 XXX XXX-XX-XX
    r'8\s\d{3}\s\d{3}-\d{2}-\d{2}',           # 8 XXX XXX-XX-XX
    r'\+7\d{10}',                             # +7XXXXXXXXXX
    r'8\d{10}',                               # 8XXXXXXXXXX
    # South Africa
    r'\+27\s\d{2}\s\d{3}\s\d{4}',             # +27 XX XXX XXXX
    r'0\d{2}\s\d{3}\s\d{4}',                  # 0XX XXX XXXX
    r'\+27\d{9}',                             # +27XXXXXXXXX
    # Mexico
    r'\+52\s\d{3}\s\d{3}\s\d{4}',             # +52 XXX XXX XXXX
    r'\+52\s\d{2}\s\d{4}\s\d{4}',             # +52 XX XXXX XXXX
    r'01\s\d{3}\s\d{4}',                      # 01 XXX XXXX
    r'\+52\d{10}',                            # +52XXXXXXXXXX
    r'01\d{7}',                               # 01XXXXXXX
    # Nigeria
    r'\+234\s\d{3}\s\d{3}\s\d{4}',            # +234 XXX XXX XXXX
    r'\+234\d{10}',                           # +234XXXXXXXXXX
    # UAE / Saudi Arabia
    r'\+971\s\d\s\d{3}\s\d{4}',               # +971 X XXX XXXX
    r'\+966\s\d\s\d{3}\s\d{4}',               # +966 X XXX XXXX
    r'0\d\s\d{3}\s\d{4}',                     # 0X XXX XXXX
    r'\+971\d{8}',                            # +971XXXXXXXX
    r'\+966\d{8}',                            # +966XXXXXXXX
    # Argentina
    r'\+54\s9\s\d{3}\s\d{3}\s\d{4}',          # +54 9 XXX XXX XXXX
    r'\+54\s\d{1}\s\d{4}\s\d{4}',             # +54 X XXXX XXXX
    r'0\d{3}\s\d{4}',                         # 0XXX XXXX
    r'\+54\d{10}',                            # +54XXXXXXXXXX
    r'\+54\d{9}',                             # +54XXXXXXXXX
    r'\+54\s9\s\d{10}',                       # +54 9 XXXXXXXXXX
    # Domestic forms with a leading 0, shared by several countries above
    r'0\d{7}',
    r'0\d{8}',
    r'0\d{9}',
    r'0\d{10}',
    r'0\d{11}',
)

# One alternation compiled once at import time instead of per call.
_CONTACT_REGEX = re.compile('|'.join('(?:%s)' % p for p in _CONTACT_PATTERNS))


def is_valid_contact(contact):
    """Return True when *contact* matches one of the known phone-number shapes.

    Leading/trailing whitespace is ignored; the remaining text must match a
    pattern in ``_CONTACT_PATTERNS`` in full. Non-string values (for example
    ``None`` or a list decoded from the model's JSON) are reported as invalid.

    Fixes relative to the previous implementation:

    * ``any(...) is not None`` was always True, so every input was "valid";
    * a missing comma fused two patterns into one that could never match;
    * most patterns ended in a literal space and so required a trailing space
      in the input.
    """
    if not isinstance(contact, str):
        return False
    return _CONTACT_REGEX.fullmatch(contact.strip()) is not None


def validate_contact_email(personal_data):
    """Validate the ``contact`` and ``email`` entries of *personal_data*.

    A missing key, or the placeholder value ``'Not found'``, counts as invalid
    without consulting the validators.

    Returns:
        A 4-tuple ``(valid_contact, contact_label, valid_email, email_label)``
        where the flags are the validator results and the labels are
        ``'Valid ...'`` / ``'Invalid ...'`` strings.
    """
    placeholder = 'Not found'
    contact_value = personal_data.get('contact', placeholder)
    email_value = personal_data.get('email', placeholder)

    contact_ok = False
    if contact_value != placeholder:
        contact_ok = is_valid_contact(contact_value)

    email_ok = False
    if email_value != placeholder:
        email_ok = is_valid_email(email_value)

    contact_label = 'Valid contact' if contact_ok else 'Invalid contact'
    email_label = 'Valid email' if email_ok else 'Invalid email'

    return contact_ok, contact_label, email_ok, email_label


# Sentinel distinguishing "key absent" from a key whose value is None.
_FIELD_MISSING = object()


def _first_field(section, *keys):
    """Return the value of the first of *keys* present in *section*, else 'Not found'."""
    for key in keys:
        value = section.get(key, _FIELD_MISSING)
        if value is not _FIELD_MISSING:
            return value
    return 'Not found'


def process_resume_data(file_path):
    """Parse the resume at *file_path* with Mistral, falling back to SpaCy.

    Text and hyperlinks are extracted from the file, personal and professional
    details are requested from the chat model, and both are merged into one
    structure together with contact/email validation flags.

    Returns:
        * ``{"error": "Text extraction failed"}`` when no text was extracted;
        * a dict with ``"personal"`` and ``"professional"`` sections when the
          model produced output for at least one of them;
        * otherwise whatever ``Parser_from_model(file_path)`` returns.
    """
    resume_text, hyperlinks = extract_text_based_on_format(file_path)
    print("Resume converted to text successfully.")

    if not resume_text:
        return {"error": "Text extraction failed"}

    # Extract LinkedIn and GitHub links
    linkedin_links, github_links = extract_links(hyperlinks)

    # Attempt to use Mistral model for parsing
    try:
        per_data = Model_PersonalDetails_Output(resume_text, client)
        pro_data = Model_ProfessionalDetails_Output(resume_text, client)

        if not per_data:
            logging.warning("Mistral personal data extraction failed.")
            per_data = {}

        if not pro_data:
            logging.warning("Mistral professional data extraction failed.")
            pro_data = {}

        # Nothing usable from either call: hand over to the SpaCy fallback.
        if not (per_data or pro_data):
            raise ValueError("Mistral returned no output")

        personal = per_data.get('personal', {})
        professional = pro_data.get('professional', {})

        # Combine both personal and professional details into a structured output
        result = {
            "personal": {
                "name": personal.get('name', 'Not found'),
                "contact": personal.get('contact_number', 'Not found'),
                "email": personal.get('email', 'Not found'),
                "location": personal.get('Address', 'Not found'),
                "linkedin": linkedin_links,
                "github": github_links,
                # Holds every extracted hyperlink, LinkedIn/GitHub ones included.
                "other_links": hyperlinks
            },
            "professional": {
                "technical_skills": professional.get('technical_skills', 'Not found'),
                "non_technical_skills": professional.get('non_technical_skills', 'Not found'),
                "tools": professional.get('tools', 'Not found'),
                "experience": [
                    {
                        "company": professional.get('companies_worked_at', 'Not found'),
                        "projects": professional.get('projects', 'Not found'),
                        # The prompt asks for "roles"; the previously used key
                        # "worked_as" is kept as a fallback.
                        "role": _first_field(professional, 'roles', 'worked_as'),
                        "years": professional.get('experience', 'Not found'),
                        "project_experience": professional.get('projects_experience', 'Not found')
                    }
                ],
                "education": [
                    {
                        # The prompt uses the plural keys; the singular ones
                        # read before are kept as fallbacks.
                        "qualification": _first_field(professional, 'qualifications', 'qualification'),
                        "university": professional.get('university', 'Not found'),
                        "course": _first_field(professional, 'courses', 'course'),
                        "certificate": _first_field(professional, 'certifications', 'certification')
                    }
                ]
            }
        }

        # Validate contact and email
        valid_contact, invalid_contact, valid_email, invalid_email = validate_contact_email(result['personal'])
        result['personal']['valid_contact'] = valid_contact
        result['personal']['invalid_contact'] = invalid_contact
        result['personal']['valid_email'] = valid_email
        result['personal']['invalid_email'] = invalid_email

        logging.info("Successfully extracted data using Mistral.")
        print("---------Mistral-------")
        return result

    # Handle HuggingFace API or Mistral model errors
    except BadRequestError as e:
        logging.error(f"HuggingFace API error: {e}. Falling back to SpaCy.")
        print(f"HuggingFace API error: {e}. Falling back to SpaCy.")
    except Exception as e:
        # Deliberately broad: any failure on the Mistral path must still
        # produce a result through the SpaCy parser below.
        logging.error(f"An error occurred while processing with Mistral: {e}. Falling back to SpaCy.")
        print(f"An error occurred while processing with Mistral: {e}. Falling back to SpaCy.")

    # Fallback to SpaCy if Mistral fails
    logging.warning("Mistral failed, switching to SpaCy.")
    print("---------SpaCy-------")
    return Parser_from_model(file_path)