File size: 3,852 Bytes
1b50b66
895141b
324d859
2ee3ecc
c05213f
2ee3ecc
 
76aff4b
 
2ee3ecc
 
1b50b66
e27efab
7711d36
1b50b66
76aff4b
b4628ad
d605d91
76aff4b
 
a1a24b4
d605d91
76aff4b
 
 
 
 
a1a24b4
76aff4b
75a0105
76aff4b
 
 
 
 
2ee3ecc
75a0105
870e773
8f9d718
 
ba8654a
 
8f9d718
 
d605d91
6297210
324d859
75a0105
2ee3ecc
 
d605d91
a1a24b4
75a0105
ba8654a
75a0105
ba8654a
a1a24b4
 
75a0105
 
a1a24b4
8f9d718
d605d91
 
 
 
 
 
 
8f9d718
 
 
 
 
 
d605d91
 
8f9d718
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import streamlit as st
import pandas as pd
from transformers import pipeline
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK resources (the stop-word list and WordNet data that
# preprocess_text relies on).
nltk.download('stopwords')
nltk.download('wordnet')


@st.cache_resource
def _load_classifier():
    """Build the zero-shot classification pipeline once and reuse it.

    Streamlit re-executes this script from top to bottom on every widget
    interaction. Without caching, the facebook/bart-large-mnli pipeline would
    be constructed again on each rerun (every slider move or button press).

    Returns:
        The object returned by ``transformers.pipeline`` for the
        "zero-shot-classification" task.
    """
    return pipeline("zero-shot-classification", model="facebook/bart-large-mnli")


# Initialize the zero-shot classification pipeline
classifier = _load_classifier()

# Streamlit interface setup
st.title("Resume-based Personality Prediction by Serikov Ayanbek")
# NOTE(review): resume_text is collected here but not read by the prediction
# code visible in this file.
resume_text = st.text_area("Enter Resume Text Here", height=300)

# Load data from Excel
data = pd.read_excel("ResponseTest.xlsx")  # Replace 'ResponseTest.xlsx' with your actual file name
data_open = pd.read_excel("ResponseOpen.xlsx")  # Replace 'ResponseOpen.xlsx' with your actual file name
# Preprocess text function
def preprocess_text(text):
    """Normalise free text into a space-separated string of lemmas.

    Steps, in order:
      1. Convert ``text`` with ``str`` and replace every non-word character
         (regex ``\\W``) with a space.
      2. Lower-case the result.
      3. Drop single ``a``-``z`` letters that stand alone between whitespace
         or at the very start of the string.
      4. Collapse whitespace runs to a single space.
      5. Split on whitespace, discard NLTK English stop words and lemmatise
         the remaining tokens with ``WordNetLemmatizer``.

    Args:
        text: Any value; it is passed through ``str`` before processing.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    # Applied after lower-casing, in this exact order.
    substitutions = (
        (r'\s+[a-z]\s+', ' '),
        (r'^[a-z]\s+', ' '),
        (r'\s+', ' '),
    )

    cleaned = re.sub(r'\W', ' ', str(text)).lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    english_stop_words = set(stopwords.words('english'))
    lemmatize = WordNetLemmatizer().lemmatize

    kept = []
    for token in cleaned.split():
        if token in english_stop_words:
            continue
        kept.append(lemmatize(token))
    return ' '.join(kept)

# Combine relevant text columns for processing
question_columns = [f'Q{i}' for i in range(1, 37)]  # Q1..Q36; adjust the range based on your data columns


def _join_row_text(frame, columns, separator):
    """Concatenate the given columns of every row into a single string.

    Excel cells may be blank (NaN) or numeric; ``str.join`` raises
    ``TypeError`` on such values, so missing cells are mapped to an empty
    string and everything else is converted with ``str`` before joining.

    Args:
        frame: DataFrame holding the columns to combine.
        columns: List of column names, joined in this order.
        separator: String placed between consecutive cell values.

    Returns:
        One joined string per row of ``frame``.
    """
    def _as_text(value):
        return '' if pd.isna(value) else str(value)

    return frame[columns].agg(
        lambda row: separator.join(_as_text(value) for value in row),
        axis=1,
    )


# Resume text followed by all questionnaire answers, comma-separated.
# Chain ``.apply(preprocess_text)`` here to feed cleaned text to the classifier.
data['processed_text'] = _join_row_text(data, ['CV/Resume'] + question_columns, ', ')

# Two text variants for the open question: Demo_F + Question and
# Demo_M + Question. An earlier 'CV/Resume' + 'Question' assignment to
# 'processed_text_open' was immediately overwritten and has been removed.
data_open['processed_text_open'] = _join_row_text(data_open, ['Demo_F', 'Question'], ' ')
data_open['processed_text_mopen'] = _join_row_text(data_open, ['Demo_M', 'Question'], ' ')

labels = ["Peacemaker", "Loyalist", "Achiever", "Reformer", "Individualist", "Helper", "Challenger", "Investigator", "Enthusiast"]

# Prediction confidence threshold
# NOTE(review): the slider value is not consulted by the prediction code
# visible in this file - confirm whether low-score predictions should be
# filtered with it.
confidence_threshold = st.slider("Confidence Threshold", 0.0, 1.0, 0.5)

if st.button("Predict Personality by Test"):
    def get_predictions(row):
        """Return the best-scoring label for one row of ``data``.

        The candidate labels are taken from the row's own MAX1, MAX2 and MAX3
        cells, and the row's 'processed_text' is classified against them.
        Assumes the classifier returns its labels sorted by score, highest
        first, so the first entry is the winner.
        """
        candidates = [row[column] for column in ('MAX1', 'MAX2', 'MAX3')]
        outcome = classifier(row['processed_text'], candidates)
        return outcome['labels'][0]

    # Classify every row, then show the prediction next to the reference columns.
    data['Predicted'] = data.apply(get_predictions, axis=1)
    shown_columns = ['True_label', 'MAX1', 'MAX2', 'MAX3', 'Predicted']
    st.dataframe(data[shown_columns])

if st.button("Predict Personality by Open Question"):
    def get_predictions(row, text_column='processed_text_open'):
        """Return the best-scoring label from ``labels`` for one row.

        Args:
            row: A row of ``data_open``.
            text_column: Name of the column holding the text to classify.
                Defaults to 'processed_text_open' (built from Demo_F).

        Returns:
            The first entry of the classifier's 'labels' list. Assumes the
            labels are sorted by score, highest first.
        """
        result = classifier(row[text_column], labels)
        return result['labels'][0]

    def get_predictionsM(row):
        """Same as ``get_predictions`` but for 'processed_text_mopen' (built from Demo_M)."""
        return get_predictions(row, text_column='processed_text_mopen')

    # Apply predictions across all rows.
    # The original assignments were crossed: Predicted_M was computed from the
    # Demo_F text and Predicted_F from the Demo_M text. Each result column is
    # now derived from the matching text column.
    data_open['Predicted_M'] = data_open.apply(get_predictionsM, axis=1)
    data_open['Predicted_F'] = data_open.apply(get_predictions, axis=1)
    st.dataframe(data_open[['True_label', 'Predicted_F', 'Predicted_M']])