|
|
|
import streamlit as st |
|
import streamlit.components.v1 as components |
|
|
|
import pandas as pd |
|
import numpy as np |
|
import re |
|
from sklearn.metrics.pairwise import cosine_similarity |
|
import string |
|
import nltk |
|
from nltk.corpus import stopwords |
|
from nltk.stem import WordNetLemmatizer |
|
nltk.download("stopwords") |
|
nltk.download('wordnet') |
|
from sentence_transformers import SentenceTransformer |
|
import plotly.express as px |
|
import pandas as pd |
|
from sklearn.decomposition import PCA |
|
import time |
|
|
|
# Configure the browser-tab title and icon for the app.
# NOTE(review): the page_icon value looks like a mis-decoded emoji —
# confirm the intended icon against the original file encoding.
st.set_page_config(
    page_title="Mental disorder by description",
    page_icon="π€",
)
|
|
|
# Matches floats that contain a decimal point, with an optional sign and an
# optional exponent: "0.5", "-1.25", "3.", ".5", "1.5e-03".  A decimal point
# is required so that bare integers embedded in the text are not picked up.
_FLOAT_PATTERN = re.compile(r'[-+]?(?:\d+\.\d*|\.\d+)(?:[eE][-+]?\d+)?')


def convert_string_to_numpy_array(s):
    """Parse every floating-point literal found in *s* into a NumPy array.

    Args:
        s: Text containing float literals (for example the printed form of
            an array, such as ``"[ 0.5 -1.25e-03  3. ]"``).

    Returns:
        A 1-D ``np.float64`` array with the numbers in order of appearance;
        empty when *s* contains no float literal.
    """
    # The exponent and a trailing-dot mantissa must be captured as part of
    # the number: otherwise "1.5e-03" would be read as 1.5 and "3." would be
    # skipped entirely.
    numbers_list = _FLOAT_PATTERN.findall(s)
    return np.array(numbers_list, dtype=np.float64)
|
|
|
|
|
@st.cache_resource
def get_models():
    """Build the sentence encoder and the lemmatizer used by the app.

    Writes a status line to the page before and after the encoder is
    constructed. Wrapped in ``st.cache_resource``.

    Returns:
        A ``(model, lemmatizer)`` tuple: the ``SentenceTransformer`` created
        from the "stsb-bert-large" name and a ``WordNetLemmatizer``.
    """
    st.write('*Loading the model...*')
    encoder = SentenceTransformer("stsb-bert-large")
    st.write("*The app is loaded and ready to use! :tada:*")
    return encoder, WordNetLemmatizer()
|
|
|
# Module-level resources shared by the rest of the script: the sentence
# encoder, the lemmatizer, and the set of English stop words from NLTK.
(model, lemmatizer) = get_models()
english_stop_words = stopwords.words('english')
stop_words = set(english_stop_words)
|
|
|
|
|
@st.cache_data
def load_data():
    """Read the ICD table from 'icd_embedded.csv' and decode its embeddings.

    Each value of the 'Embeddings' column is converted with
    ``convert_string_to_numpy_array`` and stored in a new 'numpy_array'
    column; those per-row arrays are then gathered into one NumPy array.

    Returns:
        A ``(df_icd, icd_embeddings)`` tuple: the DataFrame (including the
        added 'numpy_array' column) and the array built from that column.
    """
    table = pd.read_csv('icd_embedded.csv')
    table['numpy_array'] = table['Embeddings'].apply(convert_string_to_numpy_array)

    per_row_vectors = table["numpy_array"].tolist()
    embedding_matrix = np.array(per_row_vectors)
    return table, embedding_matrix
|
|
|
# Load the ICD table and its embedding array once at import time; later
# blocks in this script read both names as module-level globals.
(df_icd, icd_embeddings) = load_data()
|
|
|
|
|
@st.cache_data
def create_disease_list():
    """Return the values of the 'Disease' column of ``df_icd`` as a list.

    Reads the module-level ``df_icd`` DataFrame; the order of the list
    follows the order of the rows.
    """
    return list(df_icd["Disease"])
|
|
|
# Names of all disorders, in the same row order as the embedding array.
disease_names = create_disease_list()

# Make sure the per-session list of descriptions exists before it is used.
if not ('descriptions' in st.session_state):
    st.session_state.descriptions = []
|
|
|
def similarity_top(descr_emb, disorder_embs):
    """Rank disorders by cosine similarity to a description embedding.

    Args:
        descr_emb: Embedding of the description; reshaped to a single row
            before comparison.
        disorder_embs: Embeddings of the disorders, one row per disorder,
            paired positionally with the module-level ``disease_names``.

    Returns:
        A list of at most five ``(disease_name, similarity_score)`` tuples,
        ordered from the highest score to the lowest.
    """
    query_row = descr_emb.reshape(1, -1)

    # One row of scores per disorder, each row holding a single value.
    score_rows = cosine_similarity(disorder_embs, query_row)

    ranked = sorted(
        zip(score_rows, disease_names),
        key=lambda pair: pair[0],
        reverse=True,
    )

    # Unwrap the single score held in each row.
    top_matches = [(label, row[0]) for row, label in ranked]
    return top_matches[:5]
|
|
|
def vis_results_2d(input_embed):
    """Plot the disorders and the user's description in 2-D PCA space.

    A two-component PCA is fitted on the module-level ``icd_embeddings``;
    the same projection is then applied to *input_embed* so both appear in
    one scatter plot, which is rendered on the Streamlit page.

    Args:
        input_embed: Embedding of the user's description; reshaped to a
            single row before being projected.
    """
    pca = PCA(n_components=2)
    disease_embeddings_2d = pca.fit_transform(icd_embeddings)

    disease_data_df = pd.DataFrame(disease_embeddings_2d, columns=['PC1', 'PC2'])
    disease_data_df['Type'] = 'Disease'
    disease_data_df['Name'] = disease_names

    input_embed_2d = pca.transform(input_embed.reshape(1, -1))

    pca_2d = pd.DataFrame(input_embed_2d, columns=['PC1', 'PC2'])
    pca_2d['Type'] = 'Character'
    # The figure labels points from the 'Name' column, so the label for the
    # user's point has to live there; otherwise it is plotted unlabelled.
    pca_2d['Name'] = 'Your character'

    combined_2d = pd.concat([disease_data_df, pca_2d], ignore_index=True)

    fig = px.scatter(
        combined_2d,
        x='PC1',
        y='PC2',
        text='Name',
        color='Type',
        symbol='Type',
        width=800,
        height=800,
    )
    # Render inside the Streamlit page; fig.show() would use Plotly's own
    # renderer instead of the app.
    st.plotly_chart(fig)
|
|
|
|
|
def vis_results_3d(input_embed):
    """Plot the disorders and the user's description in 3-D PCA space.

    A three-component PCA is fitted on the module-level ``icd_embeddings``;
    the same projection is then applied to *input_embed* so both appear in
    one 3-D scatter plot, which is rendered on the Streamlit page.

    Args:
        input_embed: Embedding of the user's description; reshaped to a
            single row before being projected.
    """
    pca = PCA(n_components=3)
    disease_embeddings_3d = pca.fit_transform(icd_embeddings)

    disease_data_df = pd.DataFrame(disease_embeddings_3d, columns=['PC1', 'PC2', 'PC3'])
    disease_data_df['Type'] = 'Disease'
    disease_data_df['Name'] = disease_names

    input_embed_3d = pca.transform(input_embed.reshape(1, -1))

    pca_3d = pd.DataFrame(input_embed_3d, columns=['PC1', 'PC2', 'PC3'])
    pca_3d['Type'] = 'Character'
    # The figure labels points from the 'Name' column, so the label for the
    # user's point has to live there; otherwise it is plotted unlabelled.
    pca_3d['Name'] = 'Your character'

    combined_3d = pd.concat([disease_data_df, pca_3d], ignore_index=True)

    fig = px.scatter_3d(
        combined_3d,
        x='PC1',
        y='PC2',
        z='PC3',
        text='Name',
        color='Type',
        symbol='Type',
        width=800,
        height=800,
    )
    # Render inside the Streamlit page; fig.show() would use Plotly's own
    # renderer instead of the app.
    st.plotly_chart(fig)
|
|
|
|
|
# ---- Page body: header, description input, and the top-5 predictions ----
st.title("Detect your character's mental disorder! :books: :mag:")
st.markdown(
    "This mini-app predicts top-5 most likely mental disorders based on your description. The more information you provide, the more informative the results will be."
)
st.caption("Note that this app can't be used for diagnostic purposes.")

# Named `description` rather than `input` so the builtin is not shadowed.
description = st.text_input(
    label="Your description",
    placeholder="Insert a description of your character",
)

if description:
    input_embed = model.encode(description)
    sim_score = similarity_top(input_embed, icd_embeddings)

    # Rank -> emoji shortcode name, so each prediction is headed by a
    # number emoji (":one:", ":two:", ...) instead of an invalid ":1:".
    nums = {1: 'one', 2: 'two', 3: 'three', 4: 'four', 5: 'five'}
    for rank, (dis, value) in enumerate(sim_score, start=1):
        st.write(f":green[*Prediction number*] :{nums[rank]}: :")
        st.write(f"{dis} (similarity score:", value, ")")

# Empty placeholder element. Indentation was lost in the reviewed source, so
# it is kept at module level — TODO confirm it was not meant to sit inside
# the `if` above.
text_spinner_placeholder = st.empty()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|