import gradio as gr
import pandas as pd
import spacy
from spacy import displacy
import plotly.express as px
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources needed for stopword removal and lemmatization
nltk.download(["stopwords", "wordnet", "omw-1.4"])
# Load the CSV file into a DataFrame
dataset_path = "Resume.csv"
df = pd.read_csv(dataset_path)

# Shuffle the rows and keep the first 500 resumes to keep processing time manageable
df = df.reindex(np.random.permutation(df.index))
data = df.copy().iloc[:500]
# Load the spaCy English language model with a large vocabulary and pre-trained word vectors
spacy_model = spacy.load("en_core_web_lg")

# Path to the file containing skill patterns in JSONL format (2129 skills)
skill_pattern_path = "jz_skill_patterns.jsonl"

# Add an entity ruler to the spaCy pipeline and load the skill patterns from disk
ruler = spacy_model.add_pipe("entity_ruler")
ruler.from_disk(skill_pattern_path)
def get_unique_skills(text):
    """Return the unique SKILL entities found in the given text."""
    doc = spacy_model(text)
    skills = set()
    for ent in doc.ents:
        if ent.label_ == "SKILL":
            skills.add(ent.text)
    return list(skills)
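# Illustrative usage (commented out so app startup is unaffected). Which
# phrases get tagged as SKILL depends entirely on the patterns in
# jz_skill_patterns.jsonl; "python" and "sql" are assumed examples here.
# print(get_unique_skills("experienced in python and sql development"))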
def preprocess_resume(resume_str):
    # Remove Twitter mentions, special characters, URLs, and retweet markers
    review = re.sub(
        r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?",
        " ",
        resume_str,
    )
    # Convert to lowercase and tokenize on whitespace
    review = review.lower().split()
    # Lemmatize and remove stopwords (build the stopword set once, not per word)
    lm = WordNetLemmatizer()
    stop_words = set(stopwords.words("english"))
    review = [lm.lemmatize(word) for word in review if word not in stop_words]
    # Join the words back into a single string
    return " ".join(review)
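# Illustrative usage (commented out). The exact output depends on the NLTK
# lemmatizer and stopword list; the input string is an assumed example.
# print(preprocess_resume("Senior engineer, skilled in APIs! http://example.com"))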
# Apply preprocess_resume to each resume string and store the result in a new column
data["Clean_Resume"] = data["Resume_str"].apply(preprocess_resume)

# Extract skills from each preprocessed resume and store them in a new column
data["skills"] = data["Clean_Resume"].str.lower().apply(get_unique_skills)
def get_skills_distribution(Job_Category):
    """Build a histogram of skill frequencies for one job category (or "ALL")."""
    if Job_Category != "ALL":
        filtered_data = data[data["Category"] == Job_Category]["skills"]
    else:
        filtered_data = data["skills"]
    # Flatten the per-resume skill lists into one list of skill occurrences
    total_skills = [skill for sublist in filtered_data for skill in sublist]
    fig = px.histogram(
        x=total_skills,
        labels={"x": "Skills"},
        title=f"{Job_Category} Distribution of Skills",
    ).update_xaxes(categoryorder="total descending")
    # Return the figure itself; fig.show() displays it and returns None
    return fig
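# Illustrative usage (commented out). "INFORMATION-TECHNOLOGY" is an assumed
# category name; the real values come from the Category column of Resume.csv.
# fig = get_skills_distribution("INFORMATION-TECHNOLOGY")
# fig.write_html("skills_distribution.html")  # or fig.show() in a notebook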
# Register each unique job category as a "Job-Category" pattern in the entity ruler
for category in data["Category"].unique():
    ruler.add_patterns([{"label": "Job-Category", "pattern": category}])
# Entity labels that displacy should highlight
options = {
    "ents": [
        "Job-Category",
        "SKILL",
        "ORG",
        "PERSON",
        "GPE",
        "DATE",
        "ORDINAL",
        "PRODUCT",
    ],
}
def highlight_entities(resume_text):
    """Run the spaCy pipeline on the resume text and return displacy's entity-highlighted HTML."""
    doc = spacy_model(resume_text)
    html = displacy.render(doc, style="ent", options=options, jupyter=False)
    return html
def calculate_semantic_similarity(required_skills, resume_skills):
    """Calculate the semantic similarity between required skills and resume skills."""
    required_skills_str = " ".join(required_skills)
    resume_skills_str = " ".join(resume_skills)
    required_skills_doc = spacy_model(required_skills_str)
    resume_skills_doc = spacy_model(resume_skills_str)
    return required_skills_doc.similarity(resume_skills_doc)
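# Note: if either document ends up with an empty word vector (e.g. an empty
# skills list), spaCy emits warning W008 and similarity() returns 0.0. A
# minimal guard, assuming that behavior, could look like this:
# def safe_similarity(required_skills, resume_skills):
#     if not required_skills or not resume_skills:
#         return 0.0
#     return calculate_semantic_similarity(required_skills, resume_skills)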
def find_matching_resumes(input_skills, n=5):
    """Find and rank the top n matching resumes based on the input skills."""
    # Split the comma-separated input and strip surrounding whitespace
    req_skills = [skill.strip() for skill in input_skills.lower().split(",")]
    ranked_resumes = []
    for _, row in data.iterrows():
        similarity_score = calculate_semantic_similarity(req_skills, row["skills"])
        ranked_resumes.append((row["Resume_str"], similarity_score))
    # Sort resumes by similarity score in descending order and keep the top n
    ranked_resumes.sort(key=lambda x: x[1], reverse=True)
    top_matching_resumes = ranked_resumes[:n]
    # Join the results into one string so they render cleanly in a Textbox
    output = [
        f"Similarity Score: {score:.4f}\nResume: {resume_str}"
        for resume_str, score in top_matching_resumes
    ]
    return "\n\n".join(output)
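# Illustrative usage (commented out); the skill list is an assumed example.
# print(find_matching_resumes("python, sql, project management", n=3))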
with gr.Blocks() as demo:
    gr.Markdown("Enter your resume text and perform NER, or enter the required skills and find the top matching resumes.")
    with gr.Tab("Enter your resume text and perform NER"):
        text_input = gr.Textbox(lines=10, label="Input Resume Text")
        text_output = gr.HTML(label="Highlighted Entities")
        text_button = gr.Button("Submit")
    with gr.Tab("Enter the required skills (comma-separated) and find the top matching resumes"):
        text_input2 = gr.Textbox(lines=5, label="Input Required Skills (comma-separated)")
        text_output2 = gr.Textbox(label="Top Matching Resumes")
        text_button2 = gr.Button("Submit")
    text_button.click(highlight_entities, inputs=text_input, outputs=text_output)
    text_button2.click(find_matching_resumes, inputs=text_input2, outputs=text_output2)

demo.launch()