File size: 3,906 Bytes
d9d1579
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# Import from 3rd party libraries
import streamlit as st
import streamlit.components.v1 as components
# import streamlit_analytics
import pandas as pd
import numpy as np
import re
from sklearn.metrics.pairwise import cosine_similarity
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download("stopwords")
nltk.download('wordnet')
from sentence_transformers import SentenceTransformer
import plotly.express as px
import pandas as pd
from sklearn.decomposition import PCA

# Browser-tab title and favicon. The icon literal had been corrupted into
# mojibake (the UTF-8 bytes of the robot emoji decoded with a single-byte
# codec); it is restored to the actual emoji character here.
st.set_page_config(page_title="Mental disorder by description", page_icon="🤖")

# Matches one float literal that contains a decimal point, with optional sign,
# optional fractional digits (NumPy prints 0.0 as "0.") and optional exponent
# (NumPy prints small values as e.g. "1.5e-05"). Requiring the decimal point
# keeps stray integers such as the "32" in "dtype=float32" from being parsed.
_FLOAT_LITERAL = re.compile(r'[-+]?\d+\.\d*(?:[eE][-+]?\d+)?')


def convert_string_to_numpy_array(s):
    """Parse every float literal found in a string into a float64 array.

    Args:
        s: text containing float literals, e.g. the printed form of a
            numeric array such as "[ 0.12 -3.4e-05  0. ]".

    Returns:
        A 1-D ``np.float64`` array holding the literals in order of
        appearance; an empty array when the string contains none.
    """
    numbers_list = _FLOAT_LITERAL.findall(s)
    return np.array(numbers_list, dtype=np.float64)

# Build the sentence encoder and the lemmatizer (cached as a shared resource)
@st.cache_resource
def get_models():
    """Create the SentenceTransformer encoder and a WordNet lemmatizer.

    Writes a progress message to the page before and after the encoder is
    constructed.

    Returns:
        A ``(model, lemmatizer)`` tuple: the "stsb-bert-large"
        SentenceTransformer and a ``WordNetLemmatizer`` instance.
    """
    st.write('Loading the model...')
    encoder = SentenceTransformer("stsb-bert-large")
    st.write("The app is loaded and ready to use!")
    return encoder, WordNetLemmatizer()

# Build (or fetch from Streamlit's resource cache) the encoder and lemmatizer.
# NOTE(review): `lemmatizer` and `stop_words` are not referenced by any other
# active code in the visible source - confirm whether they are still needed.
model, lemmatizer = get_models()
# English stop words from the NLTK "stopwords" corpus.
stop_words = set(stopwords.words('english'))

# Read the disorder table and turn its stored embeddings into a matrix
@st.cache_data  # cached by Streamlit
def load_data():
    """Load 'icd_embedded.csv' and parse its embedding column.

    Each entry of the 'Embeddings' column is parsed with
    ``convert_string_to_numpy_array`` and stored in a new 'numpy_array'
    column.

    Returns:
        A ``(df_icd, icd_embeddings)`` tuple: the dataframe including the
        parsed column, and a NumPy array built from the per-row arrays.
    """
    icd_table = pd.read_csv('icd_embedded.csv')
    parsed_vectors = icd_table['Embeddings'].apply(convert_string_to_numpy_array)
    icd_table['numpy_array'] = parsed_vectors
    embedding_matrix = np.array(icd_table["numpy_array"].tolist())
    return icd_table, embedding_matrix

# Disorder dataframe and the matching embedding array used for similarity search.
df_icd, icd_embeddings = load_data()

# Collect the disorder names in dataframe row order
@st.cache_data  # cached by Streamlit
def create_disease_list():
    """Return the values of the 'Disease' column of ``df_icd`` as a list."""
    return list(df_icd["Disease"])

# Disorder names taken from the "Disease" column; read by similarity_top().
disease_names = create_disease_list()

# Create the per-session list on first run only.
# NOTE(review): nothing in the visible source reads or appends to
# `descriptions` - confirm it is still needed.
if 'descriptions' not in st.session_state:
  st.session_state.descriptions = []

def similarity_top(descr_emb, disorder_embs, top_n=5):
    """Rank disorders by cosine similarity to a description embedding.

    Args:
        descr_emb: NumPy array holding the embedding of the description; it
            is reshaped to a single row before comparison.
        disorder_embs: 2-D array with one embedding per row. Rows are paired
            positionally with the module-level ``disease_names`` list.
        top_n: maximum number of matches to return. Defaults to 5, the
            value that used to be hard-coded.

    Returns:
        A list of ``(disease_name, similarity_score)`` tuples sorted from
        most to least similar, containing at most ``top_n`` entries.
    """
    # cosine_similarity expects 2-D inputs, so make the query a single row.
    query = descr_emb.reshape(1, -1)
    # The result has shape (n_disorders, 1); take the only column as 1-D.
    scores = cosine_similarity(disorder_embs, query)[:, 0]

    # Pair scores with names positionally and ignore any surplus on either
    # side, exactly as zip() did in the previous implementation.
    count = min(len(scores), len(disease_names))
    scores = scores[:count]

    # A stable sort on the negated scores yields descending order while
    # keeping equal scores in their original order (same as a stable
    # sorted(..., reverse=True)).
    order = np.argsort(-scores, kind="stable")[:max(top_n, 0)]
    return [(disease_names[i], scores[i]) for i in order]


# with text_spinner_placeholder:
#     with st.spinner("Please wait while your Tweet is being generated..."):
#         mood_prompt = f"{mood} " if mood else ""
#         if style:
#             twitter = twe.Tweets(account=style)
#             tweets = twitter.fetch_tweets()
#             tweets_prompt = "\n\n".join(tweets)
#             prompt = (
#                 f"Write a {mood_prompt}Tweet about {topic} in less than 120 characters "
#                 f"and in the style of the following Tweets:\n\n{tweets_prompt}\n\n"
                    
# Configure Streamlit page and state
st.title("Detect the disorder")
st.markdown(
    "This mini-app predicts a mental disorder based on your description."
)

input = st.text_input(label="Your description)", placeholder="Insert a description of a character")
if input:
    input_embed = model.encode(input)
    sim_score = similarity_top(input_embed, icd_embeddings)
    st.write(sim_score)
    
# mood = st.text_input(
#     label="Mood (e.g. inspirational, funny, serious) (optional)",
#     placeholder="inspirational",
# )
# style = st.text_input(
#     label="Twitter account handle to style-copy recent Tweets (optional, limited by Twitter's API)",
#     placeholder="elonmusk",
# )

# Empty placeholder element. Only the commented-out spinner snippet earlier in
# this file refers to it, so no active code in the visible source uses it.
text_spinner_placeholder = st.empty()