import streamlit as st
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from transformers import pipeline

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize the zero-shot classification pipeline
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Streamlit interface setup
st.title("Resume-based Personality Prediction by Serikov Ayanbek")
resume_text = st.text_area("Enter Resume Text Here", height=300)

# Load the survey/resume data from Excel
data = pd.read_excel("ResponseTest.xlsx")
data_open = pd.read_excel("ResponseOpen.xlsx")

# Preprocessing: strip non-word characters, lowercase, drop stray single letters,
# collapse whitespace, remove stopwords, and lemmatize
def preprocess_text(text):
    text = re.sub(r'\W', ' ', str(text))
    text = text.lower()
    text = re.sub(r'\s+[a-z]\s+', ' ', text)
    text = re.sub(r'^[a-z]\s+', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Prepare the data for prediction (cast to str first so numeric cells don't break the join)
data['processed_text'] = (
    data[['CV/Resume'] + [f'Q{i}' for i in range(1, 37)]]
    .astype(str)
    .agg(', '.join, axis=1)
    .apply(preprocess_text)
)
data_open['processed_text_open'] = (
    data_open[['Demo_F', 'Question']].astype(str).agg(' '.join, axis=1).apply(preprocess_text)
)
data_open['processed_text_mopen'] = (
    data_open[['Demo_M', 'Question']].astype(str).agg(' '.join, axis=1).apply(preprocess_text)
)

# Candidate personality labels for zero-shot classification
labels = ["Peacemaker", "Loyalist", "Achiever", "Reformer", "Individualist",
          "Helper", "Challenger", "Investigator", "Enthusiast"]

confidence_threshold = st.slider("Confidence Threshold", 0.0, 1.0, 0.5)

# Automatic prediction on resume text input
if resume_text:
    processed_resume = preprocess_text(resume_text)
    resume_prediction = classifier(processed_resume, labels)
    highest_score_label = resume_prediction['labels'][0]
    st.write("Predicted Personality for the given resume:", highest_score_label)

# Automatic prediction for each row in the DataFrame
for index, row in data.iterrows():
    result = classifier(row['processed_text'], labels)
    data.at[index, 'Predicted'] = result['labels'][0]
st.dataframe(data[['True_label', 'Predicted']])

# Separate predictions for the Female and Male question variants
for index, row in data_open.iterrows():
    result_f = classifier(row['processed_text_open'], labels)
    result_m = classifier(row['processed_text_mopen'], labels)
    data_open.at[index, 'Predicted_F'] = result_f['labels'][0]
    data_open.at[index, 'Predicted_M'] = result_m['labels'][0]
st.dataframe(data_open[['True_label', 'Predicted_F', 'Predicted_M']])
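
# Optional sanity check, a minimal sketch: assumes the 'True_label' columns hold the
# same nine label strings used in `labels` above, so exact string comparison is meaningful.
accuracy = (data['True_label'] == data['Predicted']).mean()
accuracy_f = (data_open['True_label'] == data_open['Predicted_F']).mean()
accuracy_m = (data_open['True_label'] == data_open['Predicted_M']).mean()
st.write(f"Agreement with True_label (ResponseTest): {accuracy:.2%}")
st.write(f"Agreement with True_label (ResponseOpen, female demo): {accuracy_f:.2%}")
st.write(f"Agreement with True_label (ResponseOpen, male demo): {accuracy_m:.2%}")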