File size: 5,648 Bytes
d9d1579
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cc4b5e3
d9d1579
 
 
 
 
 
 
 
 
 
 
cc4b5e3
d9d1579
 
cc4b5e3
d9d1579
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85ad75f
d9d1579
85ad75f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cc4b5e3
d9d1579
85ad75f
d9d1579
cc4b5e3
d9d1579
cc4b5e3
85ad75f
d9d1579
 
 
cc4b5e3
 
 
 
 
 
 
 
 
85ad75f
 
 
d9d1579
 
cc4b5e3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
# Import from 3rd party libraries
import streamlit as st
import streamlit.components.v1 as components
# import streamlit_analytics
import pandas as pd
import numpy as np
import re
from sklearn.metrics.pairwise import cosine_similarity
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download("stopwords")
nltk.download('wordnet')
from sentence_transformers import SentenceTransformer
import plotly.express as px
import pandas as pd
from sklearn.decomposition import PCA
import time

# Page title and icon for the app. The icon is the robot emoji; the literal had
# been stored as mis-decoded UTF-8 bytes, which is not a valid emoji.
st.set_page_config(page_title="Mental disorder by description", page_icon="🤖")

def convert_string_to_numpy_array(s):
    """Parse every floating-point literal found in *s* into a float64 array.

    Recognised forms are plain decimals (``-0.25``), floats with an empty
    fractional part as NumPy prints them (``0.``, ``-2.``), and scientific
    notation with or without a decimal point (``1.2e-05``, ``1e-05``).
    Bare integers are still ignored, so text such as ``dtype=float32`` does
    not contribute a spurious value.

    Args:
        s: Text containing the numbers, e.g. a stringified array.

    Returns:
        A 1-D ``numpy.ndarray`` of dtype ``float64`` holding the numbers in
        the order they appear; empty when *s* contains none.
    """
    # The exponent must be consumed together with the mantissa: matching only
    # "1.2" out of "1.2e-05" would silently change the value by orders of
    # magnitude, and skipping "0." would shorten the vector.
    tokens = re.findall(
        r'-?(?:\d+\.\d*(?:[eE][-+]?\d+)?|\d+[eE][-+]?\d+)', s
    )
    return np.array([float(tok) for tok in tokens], dtype=np.float64)

# Load the sentence-embedding model and the lemmatizer
@st.cache_resource
def get_models():
    """Build the sentence encoder and the WordNet lemmatizer.

    The function is wrapped in ``st.cache_resource``. A status line is written
    to the page before the ``stsb-bert-large`` SentenceTransformer is
    constructed and another one right after.

    Returns:
        tuple: ``(model, lemmatizer)``.
    """
    st.write('*Loading the model...*')
    encoder = SentenceTransformer("stsb-bert-large")
    st.write("*The app is loaded and ready to use!*")
    return encoder, WordNetLemmatizer()

# Build the encoder and lemmatizer; get_models() is wrapped in st.cache_resource.
model, lemmatizer = get_models()
# English stop words from the NLTK corpus, held as a set.
# NOTE(review): neither `lemmatizer` nor `stop_words` is referenced by the code
# visible in this file - confirm whether they are still needed.
stop_words = set(stopwords.words('english'))

# Load the dataframe with disorder embeddings
@st.cache_data  # cache the parsed CSV
def load_data():
    """Read ``icd_embedded.csv`` and parse its stringified embeddings.

    Each value of the ``Embeddings`` column is converted with
    ``convert_string_to_numpy_array`` and stored in a new ``numpy_array``
    column; the parsed vectors are also gathered into a single NumPy array.

    Returns:
        tuple: ``(df_icd, icd_embeddings)`` - the dataframe (with the added
        ``numpy_array`` column) and the array built from all parsed vectors
        (presumably 2-D, one row per CSV row, provided every vector has the
        same length).
    """
    icd_frame = pd.read_csv('icd_embedded.csv')
    parsed_vectors = icd_frame['Embeddings'].apply(convert_string_to_numpy_array)
    icd_frame['numpy_array'] = parsed_vectors
    embedding_matrix = np.array(parsed_vectors.tolist())
    return icd_frame, embedding_matrix

# Module-level dataframe and embedding array, read as globals by the functions below.
df_icd, icd_embeddings = load_data()

# Collect the disease names into a plain list
@st.cache_data  # cache the resulting list
def create_disease_list():
    """Return the values of the ``Disease`` column of ``df_icd`` as a list.

    The names are read from the module-level ``df_icd`` dataframe and kept in
    row order.
    """
    return list(df_icd["Disease"])

# Module-level list of disease names, used for ranking results and labelling plots.
disease_names = create_disease_list()

# Create the 'descriptions' list in the session state if it is not there yet.
# NOTE(review): nothing in the visible code reads or appends to it - confirm
# it is still used.
if 'descriptions' not in st.session_state:
  st.session_state.descriptions = []

def similarity_top(descr_emb, disorder_embs):
    """Rank diseases by cosine similarity to a description embedding.

    Names are taken from the module-level ``disease_names`` list and paired
    positionally with the rows of ``disorder_embs``.

    Args:
        descr_emb: NumPy embedding of the description; reshaped to one row.
        disorder_embs: Array of disease embeddings, one row per disease.

    Returns:
        list: At most five ``(disease_name, similarity_score)`` tuples,
        highest score first; ties keep their original order.
    """
    # cosine_similarity expects 2-D inputs, so the query becomes a single row
    query = descr_emb.reshape(1, -1)
    # one similarity column per query -> flatten to one score per disease
    flat_scores = cosine_similarity(disorder_embs, query)[:, 0]

    ranked = sorted(
        zip(disease_names, flat_scores),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:5]

def vis_results_2d(input_embed):
    """Show the diseases and the described character on a 2-D PCA scatter plot.

    A two-component PCA is fitted on the module-level ``icd_embeddings``; the
    same projection is then applied to ``input_embed``. Disease points are
    labelled from the module-level ``disease_names`` list and the chart is
    rendered in the Streamlit page.

    Args:
        input_embed: NumPy embedding of the user's description. It is reshaped
            to a single row, so it must have as many values as one row of
            ``icd_embeddings``.
    """
    # performing dimensionality reduction using PCA
    pca = PCA(n_components=2)
    disease_embeddings_2d = pca.fit_transform(icd_embeddings)

    # creating a DataFrame for disease embeddings plot
    disease_data_df = pd.DataFrame(disease_embeddings_2d, columns=['PC1', 'PC2'])
    disease_data_df['Type'] = 'Disease'
    disease_data_df['Name'] = disease_names

    # projecting the description with the PCA fitted on the diseases
    input_embed_2d = pca.transform(input_embed.reshape(1, -1))

    # creating a DataFrame for character embedding plot
    character_df = pd.DataFrame(input_embed_2d, columns=['PC1', 'PC2'])
    character_df['Type'] = 'Character'
    # The label has to be in 'Name' - the column passed as `text=` below -
    # otherwise the character's point is drawn without a caption.
    character_df['Name'] = 'Your character'

    # concatenating the two DataFrames
    combined_2d = pd.concat([disease_data_df, character_df], ignore_index=True)

    # creating an interactive 2D scatter plot
    fig = px.scatter(combined_2d, x='PC1', y='PC2', text='Name', color='Type', symbol='Type', width=800, height=800)
    # Render inside the Streamlit page; fig.show() hands the figure to a
    # Plotly renderer outside the app instead.
    st.plotly_chart(fig)

    
def vis_results_3d(input_embed):
    """Show the diseases and the described character on a 3-D PCA scatter plot.

    A three-component PCA is fitted on the module-level ``icd_embeddings``;
    the same projection is then applied to ``input_embed``. Disease points are
    labelled from the module-level ``disease_names`` list and the chart is
    rendered in the Streamlit page.

    Args:
        input_embed: NumPy embedding of the user's description. It is reshaped
            to a single row, so it must have as many values as one row of
            ``icd_embeddings``.
    """
    # performing dimensionality reduction using PCA
    pca = PCA(n_components=3)
    disease_embeddings_3d = pca.fit_transform(icd_embeddings)

    # creating a DataFrame for disease embeddings plot
    disease_data_df = pd.DataFrame(disease_embeddings_3d, columns=['PC1', 'PC2', 'PC3'])
    disease_data_df['Type'] = 'Disease'
    disease_data_df['Name'] = disease_names

    # projecting the description with the PCA fitted on the diseases
    input_embed_3d = pca.transform(input_embed.reshape(1, -1))

    # creating a DataFrame for character embedding plot
    character_df = pd.DataFrame(input_embed_3d, columns=['PC1', 'PC2', 'PC3'])
    character_df['Type'] = 'Character'
    # The label has to be in 'Name' - the column passed as `text=` below -
    # otherwise the character's point is drawn without a caption.
    character_df['Name'] = 'Your character'

    # concatenating the two DataFrames
    combined_3d = pd.concat([disease_data_df, character_df], ignore_index=True)

    # creating an interactive 3D scatter plot
    fig = px.scatter_3d(combined_3d, x='PC1', y='PC2', z='PC3', text='Name', color='Type', symbol='Type', width=800, height=800)
    # Render inside the Streamlit page; fig.show() hands the figure to a
    # Plotly renderer outside the app instead.
    st.plotly_chart(fig)

# Page header, description input, top-5 predictions and PCA visualizations
st.title("Detect your character's mental disorder!")
st.markdown(
    "This mini-app predicts top-5 most likely mental disorder based on your description. The more information you provide, the more informative the results will be."
)
st.caption("Note that this app can't be used for diagnostic purposes.")
# Named `description` rather than `input` so the builtin is not shadowed.
description = st.text_input(label="Your description", placeholder="Insert a description of your character")
if description:
    # Embed the description and list the closest diseases, numbered from 1.
    input_embed = model.encode(description)
    sim_score = similarity_top(input_embed, icd_embeddings)
    for rank, (dis, value) in enumerate(sim_score, start=1):
        st.write(f":green[Prediction number] {rank}:")
        st.write(f"{dis} (similarity score:", value, ")")

    # The spinner wraps the actual plotting work, so it is visible exactly
    # while the charts are being built (no artificial delay).
    with st.spinner("Please wait while your visualizations are being generated..."):
        vis_results_2d(input_embed)
        vis_results_3d(input_embed)