personality / app.py
Fralet's picture
Update app.py
ceeef94 verified
raw
history blame
4.88 kB
"""import streamlit as st
import pandas as pd
from transformers import pipeline
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import logging
# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
# Initialize the zero-shot classification pipeline
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
# Streamlit interface setup
st.title("Resume-based Personality Prediction by Serikov Ayanbek")
resume_text = st.text_area("Enter Resume Text Here", height=300)
# Load data from Excel
data = pd.read_excel("ResponseTest.xlsx")
data_open = pd.read_excel("ResponseOpen.xlsx")
# Define preprocessing function
def preprocess_text(text):
text = re.sub(r'\W', ' ', str(text))
text = text.lower()
text = re.sub(r'\s+[a-z]\s+', ' ', text)
text = re.sub(r'^[a-z]\s+', ' ', text)
text = re.sub(r'\s+', ' ', text)
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
tokens = text.split()
tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
return ' '.join(tokens)
# Prepare the data for prediction
data['processed_text'] = data[['CV/Resume'] + [f'Q{i}' for i in range(1, 37)]].agg(lambda x: ', '.join(x), axis=1).apply(preprocess_text)
data_open['processed_text_open'] = data_open[['Demo_F', 'Question']].agg(' '.join, axis=1).apply(preprocess_text)
data_open['processed_text_mopen'] = data_open[['Demo_M', 'Question']].agg(' '.join, axis=1).apply(preprocess_text)
labels = ["Peacemaker", "Loyalist", "Achiever", "Reformer", "Individualist", "Helper", "Challenger", "Investigator", "Enthusiast"]
# Function to predict personality and log the predictions
def predict_and_log(data, prediction_column, process_text_column, true_label_column=None, custom_labels=None):
for index, row in data.iterrows():
processed_text = row[process_text_column]
if custom_labels:
result = classifier(processed_text, [row[label] for label in custom_labels])
else:
result = classifier(processed_text, labels)
highest_score_label = result['labels'][0]
data.at[index, prediction_column] = highest_score_label
true_label = row[true_label_column] if true_label_column else 'Not available'
data_id = row['id']
logging.info(f"Row {data_id}: True Label - {true_label}, {prediction_column} - {highest_score_label}")
# Predict and log results for each DataFrame
# predict_and_log(data, 'Predicted', 'processed_text', true_label_column='True_label', custom_labels=['MAX1', 'MAX2', 'MAX3'])
predict_and_log(data_open, 'Predicted_F', 'processed_text_open', true_label_column='True_label')
predict_and_log(data_open, 'Predicted_M', 'processed_text_mopen', true_label_column='True_label')
# Optionally display a confirmation message
st.write("Predictions have been logged. Check your logs for details.")
"""
import pandas as pd
from transformers import pipeline
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
# Load the evaluation spreadsheet. The rest of this script reads its
# 'True_label', 'Predicted_F' and 'Predicted_M' columns.
data = pd.read_excel("ResponseOpenPredicted.xlsx")
# Calculate metrics
def calculate_metrics(true_labels, predicted_labels):
    """Compute accuracy and weighted precision/recall/F1 for one set of predictions.

    Args:
        true_labels: Ground-truth class labels, one per sample.
        predicted_labels: Predicted class labels, aligned with ``true_labels``.

    Returns:
        tuple: ``(accuracy, precision, recall, f1_score)``. Precision, recall
        and F1 are computed with ``average='weighted'``.
    """
    accuracy = accuracy_score(true_labels, predicted_labels)
    # zero_division=0 yields the same 0.0 that scikit-learn substitutes by
    # default for a class with no predicted (or no true) samples, but without
    # raising an UndefinedMetricWarning on every call.
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels,
        predicted_labels,
        average='weighted',
        zero_division=0,
    )
    return accuracy, precision, recall, f1
# Score each prediction column against the ground-truth labels.
accuracy_f, precision_f, recall_f, f1_score_f = calculate_metrics(data['True_label'], data['Predicted_F'])
accuracy_m, precision_m, recall_m, f1_score_m = calculate_metrics(data['True_label'], data['Predicted_M'])
# Report the scores so the computed metrics are actually visible to the user.
print(
    f"Predicted_F: accuracy={accuracy_f:.4f}, precision={precision_f:.4f}, "
    f"recall={recall_f:.4f}, f1={f1_score_f:.4f}"
)
print(
    f"Predicted_M: accuracy={accuracy_m:.4f}, precision={precision_m:.4f}, "
    f"recall={recall_m:.4f}, f1={f1_score_m:.4f}"
)
# Confusion matrices visualization
# Build one shared, sorted class list so both matrices use the same row/column
# order (making the two panels comparable) and the heatmap ticks can be named.
class_labels = sorted(
    set(data['True_label']) | set(data['Predicted_F']) | set(data['Predicted_M'])
)
conf_matrix_f = confusion_matrix(data['True_label'], data['Predicted_F'], labels=class_labels)
conf_matrix_m = confusion_matrix(data['True_label'], data['Predicted_M'], labels=class_labels)
fig, ax = plt.subplots(1, 2, figsize=(12, 6))
sns.heatmap(
    conf_matrix_f, annot=True, fmt="d", cmap="Blues", ax=ax[0],
    xticklabels=class_labels, yticklabels=class_labels,
)
ax[0].set_title('Confusion Matrix for Predicted_F')
# Rows of a scikit-learn confusion matrix are true classes, columns predicted.
ax[0].set_xlabel('Predicted Labels')
ax[0].set_ylabel('True Labels')
sns.heatmap(
    conf_matrix_m, annot=True, fmt="d", cmap="Purples", ax=ax[1],
    xticklabels=class_labels, yticklabels=class_labels,
)
ax[1].set_title('Confusion Matrix for Predicted_M')
ax[1].set_xlabel('Predicted Labels')
ax[1].set_ylabel('True Labels')
# Lay out this figure now: `fig` is rebound below, and a later
# plt.tight_layout() would only affect the most recently created figure.
fig.tight_layout()
# Distribution of prediction results
fig, ax = plt.subplots(1, 2, figsize=(12, 6))
data['Predicted_F'].value_counts().plot(kind='bar', ax=ax[0], color='blue')
ax[0].set_title('Distribution of Predictions for Female Inputs')
ax[0].set_xlabel('Predicted Labels')
ax[0].set_ylabel('Frequency')
data['Predicted_M'].value_counts().plot(kind='bar', ax=ax[1], color='purple')
ax[1].set_title('Distribution of Predictions for Male Inputs')
ax[1].set_xlabel('Predicted Labels')
ax[1].set_ylabel('Frequency')
fig.tight_layout()
# Display both figures (confusion matrices and prediction distributions).
plt.show()