"""Streamlit app: predict which tenders to select with a pre-trained random forest.

Loads a pickled model and its preprocessing artifacts (scaler, label encoders),
preprocesses an uploaded CSV the same way the training data was prepared, and
displays the rows of the tenders the model predicts as selected.
"""

import pickle

import numpy as np
import pandas as pd
import streamlit as st
from sklearn.impute import SimpleImputer
from sklearn.utils.validation import check_is_fitted

# Load the trained model and preprocessing objects.
# NOTE(security): pickle.load executes arbitrary code on load — only use
# artifact files from a trusted source.
with open('random_forest_model.pkl', 'rb') as f:
    random_forest_model = pickle.load(f)
with open('scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)
with open('label_encoders.pkl', 'rb') as f:
    label_encoders = pickle.load(f)

# Map historical / alternate state names onto their current official names,
# and bucket non-geographic values into 'other'.
state_corrections = {
    'uttaranchal': 'uttarakhand',
    'orissa (odisha)': 'odisha',
    'kashmir': 'jammu and kashmir',
    'multi state': 'other',
    'not classified': 'other'
}

# Canonical lower-cased names of Indian states and union territories.
valid_states_uts = [
    'andhra pradesh', 'arunachal pradesh', 'assam', 'bihar', 'chhattisgarh',
    'goa', 'gujarat', 'haryana', 'himachal pradesh', 'jharkhand', 'karnataka',
    'kerala', 'madhya pradesh', 'maharashtra', 'manipur', 'meghalaya',
    'mizoram', 'nagaland', 'odisha', 'punjab', 'rajasthan', 'sikkim',
    'tamil nadu', 'telangana', 'tripura', 'uttar pradesh', 'uttarakhand',
    'west bengal', 'andaman and nicobar islands', 'chandigarh',
    'dadra and nagar haveli and daman and diu', 'lakshadweep', 'delhi',
    'puducherry', 'jammu and kashmir', 'ladakh'
]


def extract_city(x):
    """Return the lower-cased city part of a 'city - ... - state - country' string.

    A 4-segment location keeps its first two segments joined with a space
    (the city name itself contained a '-'); non-string input maps to 'other'.
    """
    if not isinstance(x, str):
        return "other"
    parts = x.split("-")
    if len(parts) == 4:
        return f"{parts[0].strip().lower()} {parts[1].strip().lower()}"
    return parts[0].strip().lower()


def extract_state(x):
    """Return the normalized state/UT name from a location string, or 'other'.

    The state is the second-to-last '-'-separated segment. Known alternate
    spellings are corrected; anything not in the official list — including
    strings with no '-' at all, which previously raised IndexError — becomes
    'other'.
    """
    if not isinstance(x, str):
        return "other"
    parts = x.split("-")
    if len(parts) < 2:
        # No state segment present (fix: the original crashed on parts[-2]).
        return "other"
    state = parts[-2].strip().lower()
    return state_corrections.get(state, state if state in valid_states_uts else 'other')


def extract_country(x):
    """Return the lower-cased last '-'-separated segment, or 'other' for non-strings."""
    if not isinstance(x, str):
        return "other"
    return x.split("-")[-1].strip().lower()


def preprocess_new_data(df):
    """Apply the training-time preprocessing pipeline to a raw tender DataFrame.

    Normalizes text columns, derives ``days_left`` and location features,
    imputes and scales numeric features, and label-encodes categoricals
    (unseen labels are mapped to 'other').

    Returns a new DataFrame with the model's feature columns plus 'Ref No';
    the caller's DataFrame is left untouched.
    """
    # Work on a copy: the original mutated the caller's frame in place, which
    # polluted the table displayed later in the UI with derived columns.
    df = df.copy()

    df['Ownership'] = df['Ownership'].str.lower().str.strip()
    df[' Type of Tender '] = df[' Type of Tender '].str.lower().str.strip()

    def parse_closing_date(date_str):
        # Some closing dates are ranges like "X to Y"; keep the end date.
        try:
            return pd.to_datetime(date_str)
        except Exception:
            if isinstance(date_str, str) and " to " in date_str:
                date_str = date_str.split(" to ")[-1]
            # errors='coerce' yields NaT for anything still unparseable.
            return pd.to_datetime(date_str, errors='coerce')

    df['Closing Date'] = df['Closing Date'].apply(parse_closing_date)
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
    df['days_left'] = (df['Closing Date'] - df['Date']).dt.days

    df['city'] = df['Location'].apply(extract_city)
    df['state'] = df['Location'].apply(extract_state)
    df['country'] = df['Location'].apply(extract_country)
    df['city'] = df['city'].fillna("other")
    df['state'] = df['state'].fillna("other")
    df['country'] = df['country'].fillna("other")

    # Remove thousands separators and convert numerical columns to floats.
    numerical_columns = ['Earnest Money', 'Estimated Cost', 'DocFees']
    for col in numerical_columns:
        df[col] = df[col].replace({',': ''}, regex=True).astype(float)

    df = df[['Ref No', 'Earnest Money', 'Estimated Cost', 'DocFees',
             'Ownership', ' Type of Tender ', 'days_left', 'city', 'state',
             'country']].copy()

    # NOTE(review): the imputer is fitted on the *uploaded* data, so the
    # median may differ from the training-time median — confirm intended.
    imputer = SimpleImputer(strategy='median')
    df['days_left'] = imputer.fit_transform(df[['days_left']])

    for column in ['Ownership', ' Type of Tender ', 'city', 'state', 'country']:
        le = label_encoders[column]
        # Add 'other' to the encoder's classes if it's not already there.
        if 'other' not in le.classes_:
            le.classes_ = np.append(le.classes_, 'other')
        # Replace labels unseen at training time with 'other', then encode.
        df[column] = df[column].apply(lambda x: x if x in le.classes_ else 'other')
        df[column] = le.transform(df[column])

    numerical_features = ['Earnest Money', 'Estimated Cost', 'DocFees', 'days_left']
    df[numerical_features] = scaler.transform(df[numerical_features])
    return df


def predict_new_data(new_data):
    """Preprocess ``new_data`` and return a DataFrame of ('Ref No', 'predictions')."""
    preprocessed_data = preprocess_new_data(new_data)
    X_new = preprocessed_data.drop(columns=['Ref No'])
    tender_ref_numbers_new = preprocessed_data['Ref No']
    predictions = random_forest_model.predict(X_new)
    results = pd.DataFrame({
        'Ref No': tender_ref_numbers_new,
        'predictions': predictions
    })
    return results


st.title("Tender Selection Prediction")

uploaded_file = st.file_uploader("Upload your CSV file", type=["csv"])
if uploaded_file is not None:
    new_data = pd.read_csv(uploaded_file)
    prediction_results = predict_new_data(new_data)
    selected_tenders = (
        prediction_results[prediction_results['predictions'] == "yes"]['Ref No']
        .astype(str)
        .to_list()
    )
    new_data['Ref No'] = new_data['Ref No'].astype(str)
    st.write("Selected Tenders:")
    selected_rows = new_data[new_data['Ref No'].isin(selected_tenders)]
    # 'Unnamed: 0' exists only when the CSV was saved with its index; dropping
    # with errors='ignore' avoids a KeyError when it is absent.
    st.write(
        selected_rows.drop(columns=['Unnamed: 0'], errors='ignore')
        .reset_index(drop=True)
    )