Spaces:
Sleeping
Sleeping
File size: 5,094 Bytes
0836746 466d49d 0836746 5fac565 0836746 466d49d 0836746 466d49d 0836746 466d49d 0836746 5fac565 0836746 5fac565 0836746 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 |
import streamlit as st
import pandas as pd
import pickle
from sklearn.impute import SimpleImputer
from sklearn.utils.validation import check_is_fitted
import numpy as np
# Load the trained model and preprocessing artifacts produced by the
# training pipeline.
# NOTE(review): pickle.load assumes these files are trusted local
# artifacts — never point these paths at untrusted input.
def _load_pickle(path):
    """Deserialize one training artifact from *path*."""
    with open(path, 'rb') as fh:
        return pickle.load(fh)

random_forest_model = _load_pickle('random_forest_model.pkl')
scaler = _load_pickle('scaler.pkl')
label_encoders = _load_pickle('label_encoders.pkl')
# Legacy / alternate spellings mapped onto the canonical state name used
# at training time; anything unmappable collapses to 'other'.
state_corrections = {
    'uttaranchal': 'uttarakhand',
    'orissa (odisha)': 'odisha',
    'kashmir': 'jammu and kashmir',
    'multi state': 'other',
    'not classified': 'other',
}

# Canonical lower-cased names of Indian states and union territories;
# membership here decides whether an extracted state is kept or replaced
# by 'other'.
valid_states_uts = [
    # States
    'andhra pradesh', 'arunachal pradesh', 'assam', 'bihar',
    'chhattisgarh', 'goa', 'gujarat', 'haryana', 'himachal pradesh',
    'jharkhand', 'karnataka', 'kerala', 'madhya pradesh', 'maharashtra',
    'manipur', 'meghalaya', 'mizoram', 'nagaland', 'odisha', 'punjab',
    'rajasthan', 'sikkim', 'tamil nadu', 'telangana', 'tripura',
    'uttar pradesh', 'uttarakhand', 'west bengal',
    # Union territories
    'andaman and nicobar islands', 'chandigarh',
    'dadra and nagar haveli and daman and diu', 'lakshadweep', 'delhi',
    'puducherry', 'jammu and kashmir', 'ladakh',
]
# Extract city, state, and country
def extract_city(x):
    """Return the lower-cased city from a '-'-separated location string.

    A four-part location (e.g. 'Navi - Mumbai - Maharashtra - India') is
    treated as a two-word city name; any non-string input maps to 'other'.
    """
    if not isinstance(x, str):
        return "other"
    parts = [piece.strip().lower() for piece in x.split("-")]
    if len(parts) == 4:
        return " ".join(parts[:2])
    return parts[0]
def extract_state(x):
    """Return the lower-cased state/UT from a '-'-separated location string.

    The state is taken as the second-to-last '-'-separated part, run through
    ``state_corrections`` for legacy spellings, and collapsed to 'other'
    when not listed in ``valid_states_uts``. Non-string input — or a string
    with fewer than two parts, which has no state position — maps to 'other'.
    """
    if not isinstance(x, str):
        return "other"
    parts = x.split("-")
    # Bug fix: the original indexed parts[-2] unconditionally, which raises
    # IndexError for any location without a '-' separator (e.g. 'Delhi').
    if len(parts) < 2:
        return "other"
    state = parts[-2].strip().lower()
    return state_corrections.get(state, state if state in valid_states_uts else 'other')
def extract_country(x):
    """Return the lower-cased country (last '-'-separated part) or 'other'."""
    return x.split("-")[-1].strip().lower() if isinstance(x, str) else "other"
def preprocess_new_data(df):
    """Transform a raw tender DataFrame into the model's feature matrix.

    Expects the raw columns 'Ref No', 'Ownership', ' Type of Tender '
    (note the padded spaces), 'Closing Date', 'Date', 'Location',
    'Earnest Money', 'Estimated Cost' and 'DocFees'. Uses the
    module-level ``scaler`` and ``label_encoders`` fitted at training
    time, plus the ``extract_*`` helpers for location parsing.
    Mutates the text/date columns of *df* in place, then returns a new
    DataFrame of 'Ref No' plus the encoded and scaled feature columns.
    """
    # Normalize the free-text categorical columns to lower-case/stripped
    # form so they match the label-encoder vocabularies.
    df['Ownership'] = df['Ownership'].str.lower().str.strip()
    df[' Type of Tender '] = df[' Type of Tender '].str.lower().str.strip()

    def parse_closing_date(date_str):
        """Parse a closing date; for 'X to Y' ranges keep only the end date."""
        try:
            return pd.to_datetime(date_str)
        except Exception:
            if " to " in date_str:
                date_str = date_str.split(" to ")[-1]
                return pd.to_datetime(date_str, errors='coerce')
            return pd.NaT

    df['Closing Date'] = df['Closing Date'].apply(parse_closing_date)
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
    df['days_left'] = (df['Closing Date'] - df['Date']).dt.days

    # Derive location features; the extract_* helpers already return
    # 'other' for non-strings, and residual NaN is mapped to 'other' too.
    df['city'] = df['Location'].apply(extract_city)
    df['state'] = df['Location'].apply(extract_state)
    df['country'] = df['Location'].apply(extract_country)
    for col in ('city', 'state', 'country'):
        # Fix: assign the result instead of chained
        # `df[col].fillna(..., inplace=True)`, which is deprecated in
        # pandas 2.x and may silently operate on a temporary copy.
        df[col] = df[col].fillna("other")

    # Strip thousands separators and convert the money columns to floats.
    numerical_columns = ['Earnest Money', 'Estimated Cost', 'DocFees']
    for col in numerical_columns:
        df[col] = df[col].replace({',': ''}, regex=True).astype(float)

    # Fix: take an explicit copy so the column assignments below write to
    # this frame rather than a view (avoids SettingWithCopyWarning and
    # potential silent no-ops).
    df = df[['Ref No', 'Earnest Money', 'Estimated Cost', 'DocFees',
             'Ownership', ' Type of Tender ', 'days_left',
             'city', 'state', 'country']].copy()

    # NOTE(review): the imputer is re-fitted on the incoming batch, so the
    # median comes from the uploaded file rather than the training data —
    # confirm this matches how the model was trained.
    imputer = SimpleImputer(strategy='median')
    df['days_left'] = imputer.fit_transform(df[['days_left']])

    for column in ['Ownership', ' Type of Tender ', 'city', 'state', 'country']:
        le = label_encoders[column]
        # Extend the fitted encoder with an 'other' class if absent, then
        # funnel unseen labels into it so transform() cannot fail.
        if 'other' not in le.classes_:
            le.classes_ = np.append(le.classes_, 'other')
        df[column] = df[column].apply(lambda x: x if x in le.classes_ else 'other')
        df[column] = le.transform(df[column])

    # Scale the numeric features with the scaler fitted at training time.
    numerical_features = ['Earnest Money', 'Estimated Cost', 'DocFees', 'days_left']
    df[numerical_features] = scaler.transform(df[numerical_features])
    return df
def predict_new_data(new_data):
    """Run the trained random-forest model on a raw tender DataFrame.

    Preprocesses *new_data* (note: this mutates some of its columns in
    place) and returns a DataFrame pairing each 'Ref No' with the model's
    prediction for that tender.
    """
    features = preprocess_new_data(new_data)
    ref_numbers = features['Ref No']
    predictions = random_forest_model.predict(features.drop(columns=['Ref No']))
    return pd.DataFrame({'Ref No': ref_numbers, 'predictions': predictions})
# Streamlit UI: upload a tender CSV, predict, and show the selected rows.
st.title("Tender Selection Prediction")
uploaded_file = st.file_uploader("Upload your CSV file", type=["csv"])
if uploaded_file is not None:
    new_data = pd.read_csv(uploaded_file)
    prediction_results = predict_new_data(new_data)
    # Keep only the tenders the model classifies as selected ('yes');
    # compare ref numbers as strings on both sides to avoid dtype mismatches.
    selected_tenders = prediction_results[
        prediction_results['predictions'] == "yes"
    ]['Ref No'].astype(str).to_list()
    new_data['Ref No'] = new_data['Ref No'].astype(str)
    st.write("Selected Tenders:")
    # Fix: errors='ignore' so a CSV without the 'Unnamed: 0' index column
    # no longer raises KeyError; reset_index(drop=True) replaces the
    # reset_index().drop(columns=['index']) round-trip.
    st.write(
        new_data[new_data['Ref No'].isin(selected_tenders)]
        .drop(columns=['Unnamed: 0'], errors='ignore')
        .reset_index(drop=True)
    )
|