"""Streamlit app: predict which tenders to select with a pre-trained random forest.

Loads a pickled model and its preprocessing artifacts (scaler, label encoders),
preprocesses an uploaded CSV the same way the training data was prepared, and
displays the rows of the tenders the model predicts as selected.
"""

import pickle

import numpy as np
import pandas as pd
import streamlit as st
from sklearn.impute import SimpleImputer
from sklearn.utils.validation import check_is_fitted

# Load the trained model and preprocessing objects.
# NOTE(security): pickle.load executes arbitrary code on load — only use
# artifact files from a trusted source.
with open('random_forest_model.pkl', 'rb') as f:
    random_forest_model = pickle.load(f)
with open('scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)
with open('label_encoders.pkl', 'rb') as f:
    label_encoders = pickle.load(f)

# Map historical / alternate state names onto their current official names,
# and bucket non-geographic values into 'other'.
state_corrections = {
    'uttaranchal': 'uttarakhand',
    'orissa (odisha)': 'odisha',
    'kashmir': 'jammu and kashmir',
    'multi state': 'other',
    'not classified': 'other'
}

# Canonical lower-cased names of Indian states and union territories.
valid_states_uts = [
    'andhra pradesh', 'arunachal pradesh', 'assam', 'bihar', 'chhattisgarh',
    'goa', 'gujarat', 'haryana', 'himachal pradesh', 'jharkhand', 'karnataka',
    'kerala', 'madhya pradesh', 'maharashtra', 'manipur', 'meghalaya',
    'mizoram', 'nagaland', 'odisha', 'punjab', 'rajasthan', 'sikkim',
    'tamil nadu', 'telangana', 'tripura', 'uttar pradesh', 'uttarakhand',
    'west bengal', 'andaman and nicobar islands', 'chandigarh',
    'dadra and nagar haveli and daman and diu', 'lakshadweep', 'delhi',
    'puducherry', 'jammu and kashmir', 'ladakh'
]


def extract_city(x):
    """Return the lower-cased city part of a 'city - ... - state - country' string.

    A 4-segment location keeps its first two segments joined with a space
    (the city name itself contained a '-'); non-string input maps to 'other'.
    """
    if not isinstance(x, str):
        return "other"
    parts = x.split("-")
    if len(parts) == 4:
        return f"{parts[0].strip().lower()} {parts[1].strip().lower()}"
    return parts[0].strip().lower()


def extract_state(x):
    """Return the normalized state/UT name from a location string, or 'other'.

    The state is the second-to-last '-'-separated segment. Known alternate
    spellings are corrected; anything not in the official list — including
    strings with no '-' at all, which previously raised IndexError — becomes
    'other'.
    """
    if not isinstance(x, str):
        return "other"
    parts = x.split("-")
    if len(parts) < 2:
        # No state segment present (fix: the original crashed on parts[-2]).
        return "other"
    state = parts[-2].strip().lower()
    return state_corrections.get(state, state if state in valid_states_uts else 'other')


def extract_country(x):
    """Return the lower-cased last '-'-separated segment, or 'other' for non-strings."""
    if not isinstance(x, str):
        return "other"
    return x.split("-")[-1].strip().lower()


def preprocess_new_data(df):
    """Apply the training-time preprocessing pipeline to a raw tender DataFrame.

    Normalizes text columns, derives ``days_left`` and location features,
    imputes and scales numeric features, and label-encodes categoricals
    (unseen labels are mapped to 'other').

    Returns a new DataFrame with the model's feature columns plus 'Ref No';
    the caller's DataFrame is left untouched.
    """
    # Work on a copy: the original mutated the caller's frame in place, which
    # polluted the table displayed later in the UI with derived columns.
    df = df.copy()

    df['Ownership'] = df['Ownership'].str.lower().str.strip()
    df[' Type of Tender '] = df[' Type of Tender '].str.lower().str.strip()

    def parse_closing_date(date_str):
        # Some closing dates are ranges like "X to Y"; keep the end date.
        try:
            return pd.to_datetime(date_str)
        except Exception:
            if isinstance(date_str, str) and " to " in date_str:
                date_str = date_str.split(" to ")[-1]
            # errors='coerce' yields NaT for anything still unparseable.
            return pd.to_datetime(date_str, errors='coerce')

    df['Closing Date'] = df['Closing Date'].apply(parse_closing_date)
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
    df['days_left'] = (df['Closing Date'] - df['Date']).dt.days

    df['city'] = df['Location'].apply(extract_city)
    df['state'] = df['Location'].apply(extract_state)
    df['country'] = df['Location'].apply(extract_country)
    df['city'] = df['city'].fillna("other")
    df['state'] = df['state'].fillna("other")
    df['country'] = df['country'].fillna("other")

    # Remove thousands separators and convert numerical columns to floats.
    numerical_columns = ['Earnest Money', 'Estimated Cost', 'DocFees']
    for col in numerical_columns:
        df[col] = df[col].replace({',': ''}, regex=True).astype(float)

    df = df[['Ref No', 'Earnest Money', 'Estimated Cost', 'DocFees',
             'Ownership', ' Type of Tender ', 'days_left', 'city', 'state',
             'country']].copy()

    # NOTE(review): the imputer is fitted on the *uploaded* data, so the
    # median may differ from the training-time median — confirm intended.
    imputer = SimpleImputer(strategy='median')
    df['days_left'] = imputer.fit_transform(df[['days_left']])

    for column in ['Ownership', ' Type of Tender ', 'city', 'state', 'country']:
        le = label_encoders[column]
        # Add 'other' to the encoder's classes if it's not already there.
        if 'other' not in le.classes_:
            le.classes_ = np.append(le.classes_, 'other')
        # Replace labels unseen at training time with 'other', then encode.
        df[column] = df[column].apply(lambda x: x if x in le.classes_ else 'other')
        df[column] = le.transform(df[column])

    numerical_features = ['Earnest Money', 'Estimated Cost', 'DocFees', 'days_left']
    df[numerical_features] = scaler.transform(df[numerical_features])
    return df


def predict_new_data(new_data):
    """Preprocess ``new_data`` and return a DataFrame of ('Ref No', 'predictions')."""
    preprocessed_data = preprocess_new_data(new_data)
    X_new = preprocessed_data.drop(columns=['Ref No'])
    tender_ref_numbers_new = preprocessed_data['Ref No']
    predictions = random_forest_model.predict(X_new)
    results = pd.DataFrame({
        'Ref No': tender_ref_numbers_new,
        'predictions': predictions
    })
    return results


st.title("Tender Selection Prediction")

uploaded_file = st.file_uploader("Upload your CSV file", type=["csv"])
if uploaded_file is not None:
    new_data = pd.read_csv(uploaded_file)
    prediction_results = predict_new_data(new_data)
    selected_tenders = (
        prediction_results[prediction_results['predictions'] == "yes"]['Ref No']
        .astype(str)
        .to_list()
    )
    new_data['Ref No'] = new_data['Ref No'].astype(str)
    st.write("Selected Tenders:")
    selected_rows = new_data[new_data['Ref No'].isin(selected_tenders)]
    # 'Unnamed: 0' exists only when the CSV was saved with its index; dropping
    # with errors='ignore' avoids a KeyError when it is absent.
    st.write(
        selected_rows.drop(columns=['Unnamed: 0'], errors='ignore')
        .reset_index(drop=True)
    )