import streamlit as st |
from transformers import AutoTokenizer, AutoModelForSequenceClassification |
import torch |
import string |
import plotly.express as px |
import pandas as pd |
import nltk |
from nltk.tokenize import sent_tokenize |
# Streamlit re-executes this script on every interaction, so probe the local
# NLTK data directories first and only download what is actually missing.
# Both Punkt packages are requested: `sent_tokenize` reads "punkt" on older
# NLTK releases and "punkt_tab" on newer ones.
for _punkt_package in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{_punkt_package}")
    except LookupError:
        nltk.download(_punkt_package)

# All ASCII punctuation characters as one string; prep_text() tests tokens
# against it to drop punctuation.
punctuations = string.punctuation
def prep_text(text):
    """Lower-case *text* and drop punctuation tokens, sentence by sentence.

    The input is coerced to ``str``, split into sentences with
    ``sent_tokenize``, and each sentence is split on whitespace. Tokens are
    lower-cased and those found in the module-level ``punctuations`` string
    are removed. The surviving tokens are joined with single spaces, the
    sentences are joined with single spaces, and leading/trailing spaces are
    stripped from the result.

    Returns:
        The cleaned text as one string (empty when nothing survives).
    """
    cleaned_sentences = []
    for sentence in sent_tokenize(str(text)):
        lowered_tokens = (token.lower() for token in sentence.split())
        # NOTE(review): `punctuations` is a str, so `in` is a substring test.
        # Single punctuation marks are dropped, and so is any multi-character
        # token that happens to be a contiguous slice of it (e.g. "()").
        # Kept as-is so the text matches what the models presumably saw in
        # training -- confirm before tightening.
        kept_tokens = [token for token in lowered_tokens if token not in punctuations]
        cleaned_sentences.append(' '.join(kept_tokens))
    return ' '.join(cleaned_sentences).strip(' ')
# Identifiers passed to `from_pretrained` by the loader functions below:
# one checkpoint per classifier (sub-category, extra-over flag, conversion
# factor).
checkpoint_1 = "Highway/SubCat"
checkpoint_2 = "Highway/ExtraOver"
checkpoint_3 = "Highway/Conversion"
@st.cache(allow_output_mutation=True)
def load_model_1():
    """Return the sequence-classification model for ``checkpoint_1``.

    Cached by Streamlit so the weights are loaded once and reused across
    script reruns; ``allow_output_mutation=True`` disables Streamlit's check
    that the cached object was left unmodified.
    """
    # NOTE(review): newer Streamlit releases deprecate `st.cache` in favour of
    # `st.cache_resource` -- confirm the pinned version before switching.
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint_1)
    return model
@st.cache(allow_output_mutation=True)
def load_tokenizer_1():
    """Return the tokenizer that belongs to ``checkpoint_1``.

    Cached by Streamlit so the tokenizer is built once and reused across
    script reruns; ``allow_output_mutation=True`` disables Streamlit's check
    that the cached object was left unmodified.
    """
    # NOTE(review): newer Streamlit releases deprecate `st.cache` in favour of
    # `st.cache_resource` -- confirm the pinned version before switching.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint_1)
    return tokenizer
@st.cache(allow_output_mutation=True)
def load_model_2():
    """Return the sequence-classification model for ``checkpoint_2``.

    Cached by Streamlit so the weights are loaded once and reused across
    script reruns; ``allow_output_mutation=True`` disables Streamlit's check
    that the cached object was left unmodified.
    """
    # NOTE(review): newer Streamlit releases deprecate `st.cache` in favour of
    # `st.cache_resource` -- confirm the pinned version before switching.
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint_2)
    return model
@st.cache(allow_output_mutation=True)
def load_tokenizer_2():
    """Return the tokenizer that belongs to ``checkpoint_2``.

    Cached by Streamlit so the tokenizer is built once and reused across
    script reruns; ``allow_output_mutation=True`` disables Streamlit's check
    that the cached object was left unmodified.
    """
    # NOTE(review): newer Streamlit releases deprecate `st.cache` in favour of
    # `st.cache_resource` -- confirm the pinned version before switching.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint_2)
    return tokenizer
@st.cache(allow_output_mutation=True)
def load_model_3():
    """Return the sequence-classification model for ``checkpoint_3``.

    Cached by Streamlit so the weights are loaded once and reused across
    script reruns; ``allow_output_mutation=True`` disables Streamlit's check
    that the cached object was left unmodified.
    """
    # NOTE(review): newer Streamlit releases deprecate `st.cache` in favour of
    # `st.cache_resource` -- confirm the pinned version before switching.
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint_3)
    return model
@st.cache(allow_output_mutation=True)
def load_tokenizer_3():
    """Return the tokenizer that belongs to ``checkpoint_3``.

    Cached by Streamlit so the tokenizer is built once and reused across
    script reruns; ``allow_output_mutation=True`` disables Streamlit's check
    that the cached object was left unmodified.
    """
    # NOTE(review): newer Streamlit releases deprecate `st.cache` in favour of
    # `st.cache_resource` -- confirm the pinned version before switching.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint_3)
    return tokenizer
# Browser-tab title/icon and overall page layout for the app.
_page_settings = {
    "page_title": "Cost Data Classifier",
    "page_icon": "💷",
    "layout": "wide",
    "initial_sidebar_state": "auto",
}
st.set_page_config(**_page_settings)
# Page heading.
st.title("🚦 AI Infrastructure Cost Data Classifier")

# Text shown inside the "About this app" expander. The bullet lines are kept
# flush-left inside the literal so the string content is unchanged.
_ABOUT_MARKDOWN = """
- Artificial Intelligence (AI) and Machine learning (ML) tool for automatic classification of infrastructure cost data for benchmarking
- Classifies cost descriptions from documents such as Bills of Quantities (BOQs) and Schedule of Rates
- Can be trained to classify granular and itemised cost descriptions into any predefined categories for benchmarking
- Contact research team to discuss your data structures and suitability for the app
- It is best to use this app on a laptop or desktop computer
"""

# Collapsed by default so the input form stays near the top of the page.
with st.expander("About this app", expanded=False):
    st.write(_ABOUT_MARKDOWN)
# Input form: a free-text box plus a submit button. `Text_entry` holds the
# text-area contents and `submitted` is the submit button's return value; the
# prediction code further down reads both names.
st.markdown("##### Description")

_input_prompt = "Paste or type infrastructure cost description in the text box below (i.e., input)"
_submit_label = "👉 Get SubCat and ExtraOver!"

with st.form(key="my_form"):
    Text_entry = st.text_area(_input_prompt)
    submitted = st.form_submit_button(label=_submit_label)
# Class names for each classifier. Predicted probabilities are matched to
# these names by position -- presumably each model's class-index order; verify
# against the checkpoints' label mappings.
label_list_1 = [
    'Arrow, Triangle, Circle, Letter, Numeral, Symbol and Sundries',
    'Binder',
    'Cable',
    'Catman Other Adjustment',
    'Cold Milling',
    'Disposal of Acceptable/Unacceptable Material',
    'Drain/Service Duct In Trench',
    'Erection & Dismantling of Temporary Accommodation/Facilities (All Types)',
    'Excavate And Replace Filter Material/Recycle Filter Material',
    'Excavation',
    'General TM Item',
    'Information boards',
    'Joint/Termination',
    'Line, Ancillary Line, Solid Area',
    'Loop Detector Installation',
    'Minimum Lining Visit Charge',
    'Node Marker',
    'PCC Kerb',
    'Provision of Mobile Welfare Facilities',
    'Removal of Deformable Safety Fence',
    'Removal of Line, Ancillary Line, Solid Area',
    'Removal of Traffic Sign and post(s)',
    'Road Stud',
    'Safety Barrier Or Bifurcation (Non-Concrete)',
    'Servicing of Temporary Accommodation/Facilities (All Types) (day)',
    'Tack Coat',
    'Temporary Road Markings',
    'Thin Surface Course',
    'Traffic Sign - Unknown specification',
    'Vegetation Clearance/Weed Control (m2)',
    'Others',
]
label_list_2 = ["False", "True"]
label_list_3 = ['0.04', '0.045', '0.05', '0.1', '0.15', '0.2', '1.0', '7.0', '166.67', 'Others']


def _rank_labels(text, tokenizer, model, labels):
    """Classify *text* and return ``(label, probability)`` pairs, best first.

    Args:
        text: Pre-processed description to classify.
        tokenizer: Tokenizer callable matching *model*.
        model: Sequence-classification model whose output exposes ``.logits``.
        labels: Class names, paired with the probabilities by position.

    Returns:
        A list of ``(label, probability)`` tuples sorted by descending
        probability; probabilities are rounded to three decimals. If the
        model emits fewer or more scores than there are labels, the extra
        entries are silently dropped by ``zip``.
    """
    # truncation=True keeps over-long descriptions within the model's maximum
    # input length instead of failing inside the forward pass.
    encoded = tokenizer(text, return_tensors="pt", truncation=True)
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        logits = model(**encoded).logits
    probabilities = [round(p, 3) for p in torch.softmax(logits, dim=1)[0].tolist()]
    return sorted(zip(labels, probabilities), key=lambda pair: pair[1], reverse=True)


def _likelihood_frame(ranked, label_column):
    """Build a two-column DataFrame (*label_column*, ``Likelihood``) from *ranked*."""
    return pd.DataFrame(
        {
            label_column: [label for label, _ in ranked],
            "Likelihood": [score for _, score in ranked],
        }
    )


def _show_likelihood_chart(frame, label_column, width, height, yaxis_title=None):
    """Render a header and a horizontal likelihood bar chart for one classifier.

    Args:
        frame: DataFrame with *label_column* and ``Likelihood`` columns.
        label_column: Column plotted on the y axis; also used as the header.
        width: Figure width in pixels.
        height: Figure height in pixels.
        yaxis_title: Y-axis title; defaults to *label_column*.
    """
    tick_font = dict(family='Arial', color='black', size=14)
    st.header(label_column)
    fig = px.bar(frame, x="Likelihood", y=label_column, orientation="h")
    fig.update_layout(
        template='ggplot2',
        font=dict(family="Arial", size=14, color="black"),
        autosize=False,
        width=width,
        height=height,
        xaxis_title=f"Likelihood of {label_column}",
        yaxis_title=label_column if yaxis_title is None else yaxis_title,
    )
    fig.update_xaxes(tickangle=0, tickfont=tick_font)
    fig.update_yaxes(tickangle=0, tickfont=tick_font)
    fig.update_annotations(font_size=14)
    st.plotly_chart(fig, use_container_width=False)


if submitted:
    # Clean the input once and share it between the three classifiers.
    joined_clean_sents = prep_text(Text_entry)

    if not joined_clean_sents:
        # Nothing to classify: ask for input rather than scoring an empty string.
        st.warning("Please enter a cost description before requesting a prediction.")
    else:
        sorted_preds_1 = _rank_labels(
            joined_clean_sents, load_tokenizer_1(), load_model_1(), label_list_1
        )
        sorted_preds_2 = _rank_labels(
            joined_clean_sents, load_tokenizer_2(), load_model_2(), label_list_2
        )
        sorted_preds_3 = _rank_labels(
            joined_clean_sents, load_tokenizer_3(), load_model_3(), label_list_3
        )

        df2 = _likelihood_frame(sorted_preds_1, 'SubCatName')
        df3 = _likelihood_frame(sorted_preds_2, 'ExtraOver')
        df4 = _likelihood_frame(sorted_preds_3, 'Conversion_factor')

        st.empty()
        tab1, tab2, tab3, tab4 = st.tabs(
            ["Subcategory", "Extra Over", "Conversion Factor", "Summary"]
        )

        with tab1:
            _show_likelihood_chart(
                df2, "SubCatName", width=900, height=1000, yaxis_title="SubCatNames"
            )
        with tab2:
            _show_likelihood_chart(df3, "ExtraOver", width=500, height=200)
        with tab3:
            _show_likelihood_chart(df4, "Conversion_factor", width=500, height=500)
        with tab4:
            # Top-ranked label and its confidence for each classifier.
            summary_rows = (
                ("➡️ Predicted SubCatName", sorted_preds_1),
                ("➡️ Predicted ExtraOver", sorted_preds_2),
                ("➡️ Predicted Conversion_factor", sorted_preds_3),
            )
            for metric_label, ranked in summary_rows:
                top_label, top_score = ranked[0]
                st.header("")
                st.metric(metric_label, top_label)
                st.metric("Prediction confidence", f"{round(top_score * 100, 1)}%")

        st.success("Great! Predictions successfully completed. ", icon="✅")