manishjaiswal
commited on
Commit
•
248d9ff
1
Parent(s):
a4181e5
Create new file
Browse files
app.py
ADDED
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import time
|
2 |
+
import re
|
3 |
+
import pandas as pd
|
4 |
+
import numpy as np
|
5 |
+
import torch
|
6 |
+
import torch.nn.functional as F
|
7 |
+
from transformers import AutoTokenizer, AutoModel
|
8 |
+
from tokenizers import Tokenizer, AddedToken
|
9 |
+
import streamlit as st
|
10 |
+
from st_click_detector import click_detector
|
11 |
+
|
12 |
+
# This lil dealio is my test of the new experimental primitives which promise to put caching in Streamlit within striking distance of simulating cognitive episodic memory (personalized feelings about a moment through space-time) and semantic memory (factual memories we are ready to share and communicate, like your email address or physical address).
|
13 |
+
# Goal of this is to solve AI problem of two types of memory and their part in cognitive AGI along with the theory of model making as functional design of intelligence :
|
14 |
+
# Type 1 Memory - Semantic Memory:
|
15 |
+
# Semantic memory is conscious long-term memory for meaning, understanding, and conceptual facts about the world. Semantic memory is one of the two main varieties of explicit, conscious, long-term memory, which is memory that can be retrieved into conscious awareness after a long delay (from several seconds to years).
|
16 |
+
# Type 2 Memory - Episodic Memory:
|
17 |
+
# Episodic memory refers to the conscious recollection of a personal experience that contains information on what has happened and also where and when it happened. Recollection from episodic memory also implies a kind of first-person subjectivity that has been termed autonoetic consciousness.
|
18 |
+
# Functional Design of Intelligence: The brain uses map-like structures to build models repeatedly as part of LTM and STM memory, creating hundreds of thousands of models of everything we know. This allows us to answer important questions about how we perceive the world, why we have a sense of self, and the origin of higher-level thought processes.
|
19 |
+
# Research Interests: AGI and ML Pipelines, Ambient IoT AI, Behavior Cognitive and Memory AI, Clinical Medical and Nursing AI, Genomics AI, GAN Gaming GAIL AR VR XR and Simulation AI, Graph Ontology KR KE AI, Languages and NLP AI, Quantum Compute GPU TPU NPU AI, Vision Image Document and Audio/Video AI
|
20 |
+
# Layman terms for interest with keyword intersection for plot search.
|
21 |
+
|
22 |
+
|
23 |
+
# callback to update query param on selectbox change
|
24 |
+
def update_params():
    """Callback wired to the radio selector's ``on_change``.

    At present it only prints the marker ``update1``; the query-parameter
    update is disabled (left commented out below). A ``ValueError`` raised
    while printing is swallowed. Always returns ``None``.
    """
    try:
        print("update1")
        # st.experimental_set_query_params(option=st.session_state.query)
    except ValueError:
        return None
    return None
|
30 |
+
|
31 |
+
# RADIO BUTTON SET PERSIST
|
32 |
+
# radio button persistence - plan is to hydrate when selected and change url along with textbox and search
|
33 |
+
# Keyword choices shown by the radio selector.
# These options come from my research interests blended with keywords across film genres.
options = [
    "artificial intelligence", "robot", "VR", "medicine", "genomics", "cure",
    "heal", "brain", "support", "friendship", "memory", "aging", "pharma",
    "virus", "nurse", "doctor", "therapist", "nutrition", "technology",
    "computer", "software", "neuroscience", "birth", "death", "soul", "space",
    "sci-fi",
]

# Pre-select the radio entry named by the ``query`` URL parameter, if present.
query_params = st.experimental_get_query_params()
ix = 0
if query_params:
    try:
        q0 = query_params['query'][0]
        ix = options.index(q0)
    except (KeyError, IndexError, ValueError):
        # KeyError: other params are present but no "query";
        # IndexError: "query" has an empty value list;
        # ValueError: the value is not one of ``options``.
        # In every case fall back to the first option (ix stays 0).
        pass
selected_option = st.radio(
    "Param", options, index=ix, key="query", on_change=update_params
)
# Inline CSS so the radio choices are laid out in a row.
st.write("<style>div.row-widget.stRadio > div{flex-direction:row;}</style>", unsafe_allow_html=True)

# Mirror the current radio choice into the URL as ``option=<choice>``.
st.experimental_set_query_params(option=selected_option)
|
50 |
+
|
51 |
+
# NOTE(review): ``query`` is only bound by the text input further down, so on a
# fresh script run this assignment raises NameError and the message below is
# printed -- confirm whether this early sync is still wanted.
try:
    st.session_state.query = query  # if set already above. this prevents two interface elements setting it first time once
except Exception:
    # Narrowed from a bare ``except:`` so BaseException-derived signals
    # (e.g. KeyboardInterrupt, SystemExit) are no longer swallowed here.
    print("Error cant set after init")


# Text Input, check the query params set the text input to query value if in session
# check if here for the first time then set the query
# NOTE(review): this widget uses the same key ("query") as the radio selector
# above -- verify that Streamlit accepts two widgets sharing one key.
if 'query' not in st.session_state:
    query = st.text_input("", value="artificial intelligence", key="query")
else:
    query = st.text_input("", value=st.session_state["query"], key="query")

try:
    query_params = st.experimental_get_query_params()
    query_option = query_params['query'][0]  # fails when the URL carries no "query" param (plain http://host:port)
    option_index = options.index(query_option)
except (KeyError, IndexError, ValueError):
    # Missing/empty "query" param or a value outside ``options``:
    # install the default param and fall back to a fixed option name.
    st.experimental_set_query_params(query="health")  # set default
    query_params = st.experimental_get_query_params()
    # The former intermediate read of query_params['query'][0] was dropped:
    # its value was overwritten immediately and the lookup could itself raise
    # inside this handler.
    query_option = "ai"
else:
    # Only reached when the lookup succeeded; kept outside the guarded region
    # so an error from the widget call itself is not silently discarded.
    option_selected = st.sidebar.selectbox('Pick option', options, index=option_index)
|
75 |
+
|
76 |
+
# Device string handed to ``.to(...)`` for the models and tokenized batches.
DEVICE = "cpu"

# sentence-transformers checkpoint names; the list position is the model id
# used when picking a tokenizer/model/embedding matrix.
MODEL_OPTIONS = [
    "msmarco-distilbert-base-tas-b",
    "all-mpnet-base-v2",
]

# Markdown blurb rendered in the sidebar.
DESCRIPTION = """
# Semantic search
**Enter your query and hit enter**
Built with 🤗 Hugging Face's [transformers](https://huggingface.co/transformers/) library, [SentenceBert](https://www.sbert.net/) models, [Streamlit](https://streamlit.io/) and 44k movie descriptions from the Kaggle [Movies Dataset](https://www.kaggle.com/rounakbanik/the-movies-dataset)
"""
|
83 |
+
|
84 |
+
# Session state - search parms
|
85 |
+
# Seed a demo entry in session state. The original repeated this check with
# attribute syntax; after the first assignment that second check can never
# fire, so a single guarded assignment is equivalent.
if 'key' not in st.session_state:
    st.session_state['key'] = 'value'

# Debug output: the demo entry, then the whole session-state mapping.
st.write(st.session_state['key'])
st.write(st.session_state)

# Wipe every session-state entry.
# NOTE(review): this also removes "last_clicked", which the click handling at
# the bottom of the script stores on the previous run -- confirm that clearing
# it on every run is intended.
for key in list(st.session_state.keys()):
    del st.session_state[key]
|
95 |
+
#st.text_input("Your name", key="name")
|
96 |
+
#st.session_state.name
|
97 |
+
|
98 |
+
@st.cache(
    show_spinner=False,
    hash_funcs={
        AutoModel: lambda _: None,
        AutoTokenizer: lambda _: None,
        dict: lambda _: None,
    },
)
def load():
    """Load tokenizers, models, precomputed embeddings and the movie table.

    For every name in ``MODEL_OPTIONS`` the ``sentence-transformers/<name>``
    tokenizer and model are fetched with ``from_pretrained`` and the model is
    moved to ``DEVICE``. The two ``.npy`` files and ``movies.csv`` are read
    from the working directory.

    Returns:
        tuple: ``(tokenizers, models, embeddings, df)`` where the first three
        are lists and ``df`` is the DataFrame read from ``movies.csv``.
    """
    tokenizers, models = [], []
    for name in MODEL_OPTIONS:
        checkpoint = f"sentence-transformers/{name}"
        tokenizers.append(AutoTokenizer.from_pretrained(checkpoint))
        models.append(AutoModel.from_pretrained(checkpoint).to(DEVICE))

    # presumably one embedding matrix per entry of MODEL_OPTIONS, in the same
    # order -- TODO confirm against how the .npy files were produced
    embeddings = [np.load(path) for path in ("embeddings.npy", "embeddings2.npy")]
    df = pd.read_csv("movies.csv")
    return tokenizers, models, embeddings, df


# Module-level handles used by the embedding and search functions below.
tokenizers, models, embeddings, df = load()
|
123 |
+
def pooling(model_output):
    """Pick the position-0 vector of every sequence in the batch.

    Args:
        model_output: object exposing ``last_hidden_state``; it is indexed as
            ``[:, 0]``, i.e. index 0 along the second axis.

    Returns:
        The slice ``model_output.last_hidden_state[:, 0]``.
    """
    hidden_states = model_output.last_hidden_state
    first_position = hidden_states[:, 0]
    return first_position
|
125 |
+
|
126 |
+
def compute_embeddings(texts):
    """Embed ``texts`` with the first tokenizer/model pair.

    The texts are tokenized with padding and truncation, run through
    ``models[0]`` with gradients disabled, and reduced with ``pooling``.

    Args:
        texts: input passed straight to ``tokenizers[0]`` (callers in this
            file pass a list of strings).

    Returns:
        NumPy array obtained from the pooled output via ``.cpu().numpy()``.
    """
    tokenizer, model = tokenizers[0], models[0]
    batch = tokenizer(
        texts, padding=True, truncation=True, return_tensors="pt"
    ).to(DEVICE)

    with torch.no_grad():  # inference only
        output = model(**batch, return_dict=True)

    pooled = pooling(output)
    return pooled.cpu().numpy()
|
136 |
+
|
137 |
+
def pooling2(model_output, attention_mask):
    """Average token embeddings over the positions enabled by the mask.

    Args:
        model_output: indexable object whose element 0 holds the token
            embeddings.
        attention_mask: tensor that, after a trailing axis is appended, is
            expanded to the size of the token embeddings.

    Returns:
        Sum over axis 1 of the masked embeddings divided by the per-row mask
        sum; the divisor is clamped to at least ``1e-9`` so an all-zero mask
        does not divide by zero.
    """
    token_embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    masked_sum = torch.sum(token_embeddings * mask, 1)
    mask_total = torch.clamp(mask.sum(1), min=1e-9)
    return masked_sum / mask_total
|
145 |
+
|
146 |
+
def compute_embeddings2(list_of_strings):
    """Embed strings with the second tokenizer/model pair.

    The input is tokenized with padding and truncation, run through
    ``models[1]`` with gradients disabled, reduced with ``pooling2`` using the
    batch's attention mask, and L2-normalized along dimension 1.

    Args:
        list_of_strings: input passed straight to ``tokenizers[1]``.

    Returns:
        NumPy array of the normalized sentence embeddings.
    """
    tokenizer, model = tokenizers[1], models[1]
    batch = tokenizer(
        list_of_strings, padding=True, truncation=True, return_tensors="pt"
    ).to(DEVICE)

    with torch.no_grad():  # inference only
        output = model(**batch)

    pooled = pooling2(output, batch["attention_mask"])
    unit_vectors = F.normalize(pooled, p=2, dim=1)
    return unit_vectors.cpu().numpy()
|
154 |
+
|
155 |
+
@st.cache(
    show_spinner=False,
    hash_funcs={Tokenizer: lambda _: None, AddedToken: lambda _: None},
)
def semantic_search(query, model_id):
    """Return an HTML ranking of the movies that best match ``query``.

    Two query forms are handled:

    * free text -- embedded with ``compute_embeddings`` when ``model_id`` is 0,
      otherwise with ``compute_embeddings2``;
    * ``"[Similar:<n>] ..."`` -- row ``n`` of ``embeddings[model_id]`` is used
      as the query vector.

    Rows of ``embeddings[model_id]`` are scored by matrix product with the
    query vector and the ten highest-scoring rows are listed, best first.

    Args:
        query: search text, or a ``[Similar:<n>]`` reference.
        model_id: index into the module-level ``embeddings`` list.

    Returns:
        str: ``""`` when the query is blank, when a ``[Similar:`` reference
        does not parse or points past the last row, or when nothing is
        ranked; otherwise an HTML fragment with the elapsed time followed by
        an ordered list of title, release date and overview.
    """
    # Local import keeps this block self-contained (stdlib only).
    from html import escape

    start = time.time()
    if not query.strip():
        return ""

    if "[Similar:" not in query:
        if model_id == 0:
            query_embedding = compute_embeddings([query])
        else:
            query_embedding = compute_embeddings2([query])
    else:
        match = re.match(r"\[Similar:(\d{1,5}).*", query)
        if not match:
            return ""
        idx = int(match.group(1))
        # Slicing (rather than indexing) yields an empty array instead of an
        # IndexError when idx is past the end.
        query_embedding = embeddings[model_id][idx : idx + 1, :]
        if query_embedding.shape[0] == 0:
            return ""

    scores = embeddings[model_id] @ np.transpose(query_embedding)[:, 0]
    indices = np.argsort(scores)[-1:-11:-1]  # top 10, highest score first
    if len(indices) == 0:
        return ""

    items = []
    for i in indices:
        row = df.iloc[i]  # one row lookup instead of three
        # Dataset text is escaped so characters such as "<" or "&" in a title
        # or overview cannot break (or inject into) the markup, and each item
        # is now closed with </li>.
        items.append(
            f"<li style='padding-top: 10px'><b>{escape(str(row.title))}</b> "
            f"({escape(str(row.release_date))}). {escape(str(row.overview))} </li>"
        )
        # items.append(f"<a id='{i}' href='#'>Similar movies</a>")
        # items.append(f"<a id='{i}' href=https://www.imdb.com/find?q={row.title}&ref_=nv_sr_sm>IMDB</a>")

    delay = "%.3f" % (time.time() - start)
    return f"<p><i>Computation time: {delay} seconds</i></p><ol>{''.join(items)}</ol>"
|
189 |
+
|
190 |
+
# Sidebar: description blurb and similarity-model picker.
st.sidebar.markdown(DESCRIPTION)

model_choice = st.sidebar.selectbox("Similarity model", options=MODEL_OPTIONS)
# Anything other than the first entry maps to model id 1.
model_id = 1 if model_choice != MODEL_OPTIONS[0] else 0

# Render the ranked results; ``clicked`` is compared against "" below to
# decide whether a link in the results was clicked.
clicked = click_detector(semantic_search(query, model_id))

if clicked != "":
    st.markdown(clicked)
    # Act only on a click that differs from the one already recorded
    # (or when no click has been recorded yet).
    change_query = (
        "last_clicked" not in st.session_state
        or clicked != st.session_state["last_clicked"]
    )
    if change_query:
        st.session_state["last_clicked"] = clicked
        # Turn the click into a "[Similar:<id>] <title>" query and rerun.
        st.session_state["query"] = f"[Similar:{clicked}] {df.iloc[int(clicked)].title}"
        st.experimental_rerun()
|