# Streamlit app: tender-selection prediction with a pickled RandomForest model.
import streamlit as st | |
import pandas as pd | |
import pickle | |
from sklearn.impute import SimpleImputer | |
from sklearn.utils.validation import check_is_fitted | |
import numpy as np | |
# Load the trained model and fitted preprocessing objects from disk.
# NOTE: pickle deserialization executes arbitrary code — only load
# artifacts produced by the trusted training pipeline.
def _load_pickle(path):
    """Deserialize and return one pickled object from *path*."""
    with open(path, 'rb') as fh:
        return pickle.load(fh)

random_forest_model = _load_pickle('random_forest_model.pkl')
scaler = _load_pickle('scaler.pkl')
label_encoders = _load_pickle('label_encoders.pkl')
# State corrections and valid states/UTs
# Maps legacy/alternate state spellings seen in the data to their
# canonical lower-case names; unmappable values collapse to 'other'.
state_corrections = {
    'uttaranchal': 'uttarakhand',
    'orissa (odisha)': 'odisha',
    'kashmir': 'jammu and kashmir',
    'multi state': 'other',
    'not classified': 'other'
}
# Canonical lower-case names of Indian states and union territories;
# extract_state() treats anything outside this list as 'other'.
valid_states_uts = [
    'andhra pradesh', 'arunachal pradesh', 'assam', 'bihar', 'chhattisgarh', 'goa',
    'gujarat', 'haryana', 'himachal pradesh', 'jharkhand', 'karnataka', 'kerala',
    'madhya pradesh', 'maharashtra', 'manipur', 'meghalaya', 'mizoram', 'nagaland',
    'odisha', 'punjab', 'rajasthan', 'sikkim', 'tamil nadu', 'telangana', 'tripura',
    'uttar pradesh', 'uttarakhand', 'west bengal', 'andaman and nicobar islands',
    'chandigarh', 'dadra and nagar haveli and daman and diu', 'lakshadweep', 'delhi',
    'puducherry', 'jammu and kashmir', 'ladakh'
]
# Extract city, state, and country | |
def extract_city(x):
    """Return the lower-cased city from a '-'-delimited location string.

    Non-string input yields the sentinel 'other'. A four-part location
    carries a two-token city name split across the first two fields, so
    those fields are rejoined with a space.
    """
    if not isinstance(x, str):
        return "other"
    parts = x.split("-")
    if len(parts) == 4:
        # Two-word city split across the first two '-' fields.
        return f"{parts[0].strip().lower()} {parts[1].strip().lower()}"
    return parts[0].strip().lower()
def extract_state(x):
    """Return the normalised state/UT from a '-'-delimited location string.

    The state is taken as the second-to-last '-' field, lower-cased, run
    through state_corrections, and collapsed to 'other' when it is not a
    recognised state/UT. Non-strings and strings without a state field
    also yield 'other'.
    """
    if not isinstance(x, str):
        return "other"
    parts = x.split("-")
    if len(parts) < 2:
        # A bare token (e.g. just a country) has no state field; the old
        # parts[-2] lookup raised IndexError here — treat as unknown.
        return "other"
    state = parts[-2].strip().lower()
    return state_corrections.get(state, state if state in valid_states_uts else 'other')
def extract_country(x):
    """Return the lower-cased country (last '-' field), or 'other' for non-strings."""
    return x.split("-")[-1].strip().lower() if isinstance(x, str) else "other"
def preprocess_new_data(df):
    """Transform a raw tender DataFrame into the model's feature matrix.

    Mutates *df* in place (normalised text columns, parsed dates), then
    returns a new frame containing 'Ref No' plus the encoded and scaled
    model features.
    """
    # Normalise free-text categoricals. The padded column name
    # ' Type of Tender ' is intentional — it matches the training header.
    df['Ownership'] = df['Ownership'].str.lower().str.strip()
    df[' Type of Tender '] = df[' Type of Tender '].str.lower().str.strip()

    def parse_closing_date(date_str):
        # Closing dates are either a single date or a "start to end"
        # range; ranges resolve to the end date. Unparseable values → NaT.
        try:
            return pd.to_datetime(date_str)
        except Exception:
            # Guard the substring test: non-string values (e.g. NaN read
            # from a malformed cell) used to raise TypeError here.
            if isinstance(date_str, str) and " to " in date_str:
                return pd.to_datetime(date_str.split(" to ")[-1], errors='coerce')
            return pd.NaT

    df['Closing Date'] = df['Closing Date'].apply(parse_closing_date)
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
    df['days_left'] = (df['Closing Date'] - df['Date']).dt.days

    # Derive location features. Assign-back instead of the old chained
    # `df[col].fillna(..., inplace=True)`, which is deprecated and breaks
    # under pandas copy-on-write.
    df['city'] = df['Location'].apply(extract_city).fillna("other")
    df['state'] = df['Location'].apply(extract_state).fillna("other")
    df['country'] = df['Location'].apply(extract_country).fillna("other")

    # Strip thousands separators and coerce currency columns to floats.
    numerical_columns = ['Earnest Money', 'Estimated Cost', 'DocFees']
    for col in numerical_columns:
        df[col] = df[col].replace({',': ''}, regex=True).astype(float)

    df = df[['Ref No', 'Earnest Money', 'Estimated Cost', 'DocFees', 'Ownership',
             ' Type of Tender ', 'days_left', 'city', 'state', 'country']]

    # NOTE(review): the imputer is fit on the *inference* batch rather than
    # reusing a training-time median — confirm this matches how the model
    # was trained before changing it.
    imputer = SimpleImputer(strategy='median')
    df['days_left'] = imputer.fit_transform(df[['days_left']])

    for column in ['Ownership', ' Type of Tender ', 'city', 'state', 'country']:
        le = label_encoders[column]
        # Ensure 'other' is a known class so unseen labels can map onto it.
        if 'other' not in le.classes_:
            le.classes_ = np.append(le.classes_, 'other')
        # Replace unseen labels with 'other', then integer-encode.
        df[column] = df[column].apply(lambda x: x if x in le.classes_ else 'other')
        df[column] = le.transform(df[column])

    # Scale numeric features with the scaler fitted at training time.
    numerical_features = ['Earnest Money', 'Estimated Cost', 'DocFees', 'days_left']
    df[numerical_features] = scaler.transform(df[numerical_features])
    return df
def predict_new_data(new_data):
    """Run the tender-selection model over raw input rows.

    Returns a DataFrame pairing each tender's 'Ref No' with the model's
    prediction for it.
    """
    features = preprocess_new_data(new_data)
    ref_numbers = features['Ref No']
    predictions = random_forest_model.predict(features.drop(columns=['Ref No']))
    return pd.DataFrame({'Ref No': ref_numbers, 'predictions': predictions})
# --- Streamlit UI: upload a tender CSV, predict, and show selected rows ---
st.title("Tender Selection Prediction")
uploaded_file = st.file_uploader("Upload your CSV file", type=["csv"])
if uploaded_file is not None:
    new_data = pd.read_csv(uploaded_file)
    prediction_results = predict_new_data(new_data)
    # Keep only tenders the model marked "yes"; compare Ref No as strings
    # so numeric-looking references still match after the round trip.
    selected_tenders = prediction_results[prediction_results['predictions'] == "yes"]['Ref No'].astype(str).to_list()
    new_data['Ref No'] = new_data['Ref No'].astype(str)
    st.write("Selected Tenders:")
    selected_rows = new_data[new_data['Ref No'].isin(selected_tenders)]
    # errors='ignore': CSVs without a stray 'Unnamed: 0' index column used
    # to raise KeyError here. reset_index(drop=True) replaces the old
    # reset_index() + drop(columns=['index']) round trip.
    st.write(selected_rows.drop(columns=['Unnamed: 0'], errors='ignore').reset_index(drop=True))