Stefano Fiorucci committed on
Commit
418ba7e
·
1 Parent(s): 6e587e4

improved style and added some questions

Browse files
Files changed (2) hide show
  1. app.py +88 -60
  2. data/questions.txt +8 -1
app.py CHANGED
@@ -2,7 +2,6 @@
2
  import time
3
  import streamlit as st
4
  import logging
5
- import pandas as pd
6
  from json import JSONDecodeError
7
  from markdown import markdown
8
  import random
@@ -20,56 +19,71 @@ from urllib.parse import unquote
20
 
21
  # FAISS index directory
22
  INDEX_DIR = 'data/index'
 
 
 
 
 
 
 
23
  # pipe=None
24
 
25
  # the following function is cached to make index and models load only at start
26
- @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None}, allow_output_mutation=True)
 
 
 
27
  def start_haystack():
28
- """
29
- load document store, retriever, reader and create pipeline
30
- """
31
- shutil.copy(f'{INDEX_DIR}/faiss_document_store.db','.')
32
- document_store = FAISSDocumentStore(
33
- faiss_index_path=f'{INDEX_DIR}/my_faiss_index.faiss',
34
- faiss_config_path=f'{INDEX_DIR}/my_faiss_index.json')
35
- print (f'Index size: {document_store.get_document_count()}')
36
- retriever = EmbeddingRetriever(
37
- document_store=document_store,
38
- embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
39
- model_format="sentence_transformers"
40
- )
41
- reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
42
  use_gpu=False,
43
- confidence_threshold=0.15)
44
- pipe = ExtractiveQAPipeline(reader, retriever)
45
- return pipe
 
46
 
47
  @st.cache()
48
  def load_questions():
49
- with open('./data/questions.txt') as fin:
50
  questions = [line.strip() for line in fin.readlines()
51
- if not line.startswith('#')]
52
- return questions
 
53
 
54
  def set_state_if_absent(key, value):
55
  if key not in st.session_state:
56
  st.session_state[key] = value
57
 
58
- pipe=start_haystack()
59
 
60
- # hash_funcs={builtins.weakref: my_hash_func}
 
 
 
 
 
61
  @st.cache(persist=True, allow_output_mutation=True)
62
- def query(question: str, retriever_top_k:int=10, reader_top_k:int=5):
63
  """Run query and get answers"""
64
- params = {"Retriever": {"top_k": retriever_top_k},
65
  "Reader": {"top_k": reader_top_k}}
66
  results = pipe.run(question, params=params)
67
  return results
68
 
69
 
70
  def main():
71
-
72
-
73
  questions = load_questions()
74
 
75
  # Persistent state
@@ -87,7 +101,7 @@ def main():
87
 
88
  # sidebar style
89
  st.markdown(
90
- """
91
  <style>
92
  [data-testid="stSidebar"][aria-expanded="true"] > div:first-child{
93
  width: 350px;
@@ -97,23 +111,25 @@ def main():
97
  margin-left: -350px;
98
  }
99
  """,
100
- unsafe_allow_html=True,
101
  )
102
  # Title
103
  st.write("# Who killed Laura Palmer?")
104
  st.write("### The first Twin Peaks Question Answering system!")
105
-
106
  st.markdown("""
107
- Ask any question about Twin Peaks [Twin Peaks] (https://twinpeaks.fandom.com/wiki/Twin_Peaks)
108
  and see if the AI ​​can find an answer...
109
 
110
  *Note: do not use keywords, but full-fledged questions.*
111
  """)
112
 
113
  # Sidebar
114
- st.sidebar.header("Who killed Laura Palmer?")
115
- st.sidebar.image("https://upload.wikimedia.org/wikipedia/it/3/39/Twin-peaks-1990.jpg")
116
- st.sidebar.markdown('<p align="center"><b>Twin Peaks Question Answering system</b></p>', unsafe_allow_html=True)
 
 
117
  st.sidebar.markdown(f"""
118
  <style>
119
  a {{
@@ -139,7 +155,8 @@ and see if the AI ​​can find an answer...
139
  <div class="haystack-footer">
140
  <p><a href="https://github.com/anakin87/who-killed-laura-palmer">GitHub</a> -
141
  Built with <a href="https://github.com/deepset-ai/haystack/">Haystack</a><br/>
142
- <small>Data crawled from <a href="https://twinpeaks.fandom.com/wiki/Twin_Peaks_Wiki">Twin Peaks Wiki</a>.</small>
 
143
  </p>
144
  <img src = 'https://static.wikia.nocookie.net/twinpeaks/images/e/ef/Laura_Palmer%2C_the_Queen_Of_Hearts.jpg'/>
145
  <br/>
@@ -150,17 +167,19 @@ and see if the AI ​​can find an answer...
150
  st.sidebar.markdown("""
151
  <p align="center">
152
  <iframe style="border-radius:12px" src="https://open.spotify.com/embed/playlist/38rrtWgflrw7grB37aMlsO?utm_source=generator" width="85%" height="380" frameBorder="0" allowfullscreen="" allow="autoplay; clipboard-write; encrypted-media; fullscreen; picture-in-picture"></iframe>
153
- </p>""", unsafe_allow_html=True)
154
 
155
  # Search bar
156
  question = st.text_input("",
157
- value=st.session_state.question,
158
- max_chars=100,
159
- on_change=reset_results
160
- )
161
  col1, col2 = st.columns(2)
162
- col1.markdown("<style>.stButton button {width:100%;}</style>", unsafe_allow_html=True)
163
- col2.markdown("<style>.stButton button {width:100%;}</style>", unsafe_allow_html=True)
 
 
164
 
165
  # Run button
166
  run_pressed = col1.button("Run")
@@ -169,22 +188,24 @@ and see if the AI ​​can find an answer...
169
  if col2.button("Random question"):
170
  reset_results()
171
  question = random.choice(questions)
172
- while question == st.session_state.question: # Avoid picking the same question twice (the change is not visible on the UI)
 
173
  question = random.choice(questions)
174
  st.session_state.question = question
175
- # st.session_state.answer = new_row["Answer"].values[0]
176
  st.session_state.random_question_requested = True
177
  # Re-runs the script setting the random question as the textbox value
178
  # Unfortunately necessary as the Random Question button is _below_ the textbox
179
- raise st.script_runner.RerunException(st.script_request_queue.RerunData(None))
 
180
  else:
181
  st.session_state.random_question_requested = False
182
-
183
- run_query = (run_pressed or question != st.session_state.question) and not st.session_state.random_question_requested
 
184
 
185
  # Get results for query
186
  if run_query and question:
187
- time_start=time.time()
188
  reset_results()
189
  st.session_state.question = question
190
 
@@ -193,11 +214,13 @@ and see if the AI ​​can find an answer...
193
 
194
  ):
195
  try:
196
- st.session_state.results = query(question)
197
- time_end=time.time()
 
198
  print(f'elapsed time: {time_end - time_start}')
199
  except JSONDecodeError as je:
200
- st.error("👓 &nbsp;&nbsp; An error occurred reading the results. Is the document store working?")
 
201
  return
202
  except Exception as e:
203
  logging.exception(e)
@@ -207,28 +230,33 @@ and see if the AI ​​can find an answer...
207
  if st.session_state.results:
208
  st.write("## Results:")
209
 
210
- alert_irrelevance=True
211
- if len(st.session_state.results['answers'])==0:
212
  st.info("🤔 &nbsp;&nbsp; Haystack is unsure whether any of the documents contain an answer to your question. Try to reformulate it!")
213
 
214
  for count, result in enumerate(st.session_state.results['answers']):
215
- result=result.to_dict()
216
  if result["answer"]:
217
- if alert_irrelevance and result['score']<0.50:
218
  alert_irrelevance = False
219
  st.write("""
220
  <h4 style='color: darkred'>Attention, the
221
  following answers have low relevance:</h4>""",
222
- unsafe_allow_html=True)
223
 
224
  answer, context = result["answer"], result["context"]
225
  start_idx = context.find(answer)
226
  end_idx = start_idx + len(answer)
227
  # Hack due to this bug: https://github.com/streamlit/streamlit/issues/3190
228
- st.write(markdown("- ..."+context[:start_idx] + str(annotation(answer, "ANSWER", "#3e1c21")) + context[end_idx:]+"..."), unsafe_allow_html=True)
 
 
229
  source = ""
230
- name = unquote(result['meta']['name']).replace('_',' ')
231
  url = result['meta']['url']
232
  source = f"[{name}]({url})"
233
- st.markdown(f"**Score:** {result['score']:.2f} - **Source:** {source}")
 
 
 
234
  main()
 
2
  import time
3
  import streamlit as st
4
  import logging
 
5
  from json import JSONDecodeError
6
  from markdown import markdown
7
  import random
 
19
 
20
  # FAISS index directory
21
  INDEX_DIR = 'data/index'
22
+ QUESTIONS_PATH = 'data/questions.txt'
23
+ RETRIEVER_MODEL = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
24
+ RETRIEVER_MODEL_FORMAT = "sentence_transformers"
25
+ READER_MODEL = "deepset/roberta-base-squad2"
26
+ READER_CONFIG_THRESHOLD = 0.15
27
+ RETRIEVER_TOP_K = 10
28
+ READER_TOP_K = 5
29
  # pipe=None
30
 
31
  # the following function is cached to make index and models load only at start
32
+
33
+
34
+ @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
35
+ allow_output_mutation=True)
36
  def start_haystack():
37
+ """
38
+ load document store, retriever, reader and create pipeline
39
+ """
40
+ shutil.copy(f'{INDEX_DIR}/faiss_document_store.db', '.')
41
+ document_store = FAISSDocumentStore(
42
+ faiss_index_path=f'{INDEX_DIR}/my_faiss_index.faiss',
43
+ faiss_config_path=f'{INDEX_DIR}/my_faiss_index.json')
44
+ print(f'Index size: {document_store.get_document_count()}')
45
+ retriever = EmbeddingRetriever(
46
+ document_store=document_store,
47
+ embedding_model=RETRIEVER_MODEL,
48
+ model_format=RETRIEVER_MODEL_FORMAT
49
+ )
50
+ reader = FARMReader(model_name_or_path=READER_MODEL,
51
  use_gpu=False,
52
+ confidence_threshold=READER_CONFIG_THRESHOLD)
53
+ pipe = ExtractiveQAPipeline(reader, retriever)
54
+ return pipe
55
+
56
 
57
  @st.cache()
58
  def load_questions():
59
+ with open(QUESTIONS_PATH) as fin:
60
  questions = [line.strip() for line in fin.readlines()
61
+ if not line.startswith('#')]
62
+ return questions
63
+
64
 
65
  def set_state_if_absent(key, value):
66
  if key not in st.session_state:
67
  st.session_state[key] = value
68
 
 
69
 
70
+ pipe = start_haystack()
71
+
72
+ # the pipeline is not included as parameter of the following function,
73
+ # because it is difficult to cache
74
+
75
+
76
  @st.cache(persist=True, allow_output_mutation=True)
77
+ def query(question: str, retriever_top_k: int = 10, reader_top_k: int = 5):
78
  """Run query and get answers"""
79
+ params = {"Retriever": {"top_k": retriever_top_k},
80
  "Reader": {"top_k": reader_top_k}}
81
  results = pipe.run(question, params=params)
82
  return results
83
 
84
 
85
  def main():
86
+
 
87
  questions = load_questions()
88
 
89
  # Persistent state
 
101
 
102
  # sidebar style
103
  st.markdown(
104
+ """
105
  <style>
106
  [data-testid="stSidebar"][aria-expanded="true"] > div:first-child{
107
  width: 350px;
 
111
  margin-left: -350px;
112
  }
113
  """,
114
+ unsafe_allow_html=True,
115
  )
116
  # Title
117
  st.write("# Who killed Laura Palmer?")
118
  st.write("### The first Twin Peaks Question Answering system!")
119
+
120
  st.markdown("""
121
+ Ask any question about [Twin Peaks] (https://twinpeaks.fandom.com/wiki/Twin_Peaks)
122
  and see if the AI ​​can find an answer...
123
 
124
  *Note: do not use keywords, but full-fledged questions.*
125
  """)
126
 
127
  # Sidebar
128
+ st.sidebar.header("Who killed Laura Palmer?")
129
+ st.sidebar.image(
130
+ "https://upload.wikimedia.org/wikipedia/it/3/39/Twin-peaks-1990.jpg")
131
+ st.sidebar.markdown('<p align="center"><b>Twin Peaks Question Answering system</b></p>',
132
+ unsafe_allow_html=True)
133
  st.sidebar.markdown(f"""
134
  <style>
135
  a {{
 
155
  <div class="haystack-footer">
156
  <p><a href="https://github.com/anakin87/who-killed-laura-palmer">GitHub</a> -
157
  Built with <a href="https://github.com/deepset-ai/haystack/">Haystack</a><br/>
158
+ <small>Data crawled from <a href="https://twinpeaks.fandom.com/wiki/Twin_Peaks_Wiki">
159
+ Twin Peaks Wiki</a>.</small>
160
  </p>
161
  <img src = 'https://static.wikia.nocookie.net/twinpeaks/images/e/ef/Laura_Palmer%2C_the_Queen_Of_Hearts.jpg'/>
162
  <br/>
 
167
  st.sidebar.markdown("""
168
  <p align="center">
169
  <iframe style="border-radius:12px" src="https://open.spotify.com/embed/playlist/38rrtWgflrw7grB37aMlsO?utm_source=generator" width="85%" height="380" frameBorder="0" allowfullscreen="" allow="autoplay; clipboard-write; encrypted-media; fullscreen; picture-in-picture"></iframe>
170
+ </p>""", unsafe_allow_html=True)
171
 
172
  # Search bar
173
  question = st.text_input("",
174
+ value=st.session_state.question,
175
+ max_chars=100,
176
+ on_change=reset_results
177
+ )
178
  col1, col2 = st.columns(2)
179
+ col1.markdown(
180
+ "<style>.stButton button {width:100%;}</style>", unsafe_allow_html=True)
181
+ col2.markdown(
182
+ "<style>.stButton button {width:100%;}</style>", unsafe_allow_html=True)
183
 
184
  # Run button
185
  run_pressed = col1.button("Run")
 
188
  if col2.button("Random question"):
189
  reset_results()
190
  question = random.choice(questions)
191
+ # Avoid picking the same question twice (the change is not visible on the UI)
192
+ while question == st.session_state.question:
193
  question = random.choice(questions)
194
  st.session_state.question = question
 
195
  st.session_state.random_question_requested = True
196
  # Re-runs the script setting the random question as the textbox value
197
  # Unfortunately necessary as the Random Question button is _below_ the textbox
198
+ raise st.script_runner.RerunException(
199
+ st.script_request_queue.RerunData(None))
200
  else:
201
  st.session_state.random_question_requested = False
202
+
203
+ run_query = (run_pressed or question != st.session_state.question) \
204
+ and not st.session_state.random_question_requested
205
 
206
  # Get results for query
207
  if run_query and question:
208
+ time_start = time.time()
209
  reset_results()
210
  st.session_state.question = question
211
 
 
214
 
215
  ):
216
  try:
217
+ st.session_state.results = query(
218
+ question, RETRIEVER_TOP_K, READER_TOP_K)
219
+ time_end = time.time()
220
  print(f'elapsed time: {time_end - time_start}')
221
  except JSONDecodeError as je:
222
+ st.error(
223
+ "👓 &nbsp;&nbsp; An error occurred reading the results. Is the document store working?")
224
  return
225
  except Exception as e:
226
  logging.exception(e)
 
230
  if st.session_state.results:
231
  st.write("## Results:")
232
 
233
+ alert_irrelevance = True
234
+ if len(st.session_state.results['answers']) == 0:
235
  st.info("🤔 &nbsp;&nbsp; Haystack is unsure whether any of the documents contain an answer to your question. Try to reformulate it!")
236
 
237
  for count, result in enumerate(st.session_state.results['answers']):
238
+ result = result.to_dict()
239
  if result["answer"]:
240
+ if alert_irrelevance and result['score'] < 0.50:
241
  alert_irrelevance = False
242
  st.write("""
243
  <h4 style='color: darkred'>Attention, the
244
  following answers have low relevance:</h4>""",
245
+ unsafe_allow_html=True)
246
 
247
  answer, context = result["answer"], result["context"]
248
  start_idx = context.find(answer)
249
  end_idx = start_idx + len(answer)
250
  # Hack due to this bug: https://github.com/streamlit/streamlit/issues/3190
251
+ st.write(markdown("- ..."+context[:start_idx] +
252
+ str(annotation(answer, "ANSWER", "#3e1c21")) + context[end_idx:]+"..."),
253
+ unsafe_allow_html=True)
254
  source = ""
255
+ name = unquote(result['meta']['name']).replace('_', ' ')
256
  url = result['meta']['url']
257
  source = f"[{name}]({url})"
258
+ st.markdown(
259
+ f"**Score:** {result['score']:.2f} - **Source:** {source}")
260
+
261
+
262
  main()
data/questions.txt CHANGED
@@ -16,4 +16,11 @@ Who is the log lady?
16
  #Who is Bobby Briggs' father?
17
  who is Susan Hurley
18
  Who is Mike
19
- Why did Windom Earle goes to Twin Peaks?
 
 
 
 
 
 
 
 
16
  #Who is Bobby Briggs' father?
17
  who is Susan Hurley
18
  Who is Mike
19
+ Why did Windom Earle goes to Twin Peaks?
20
+ Who plays Laura Palmer?
21
+ Who was a Twin Peaks psychiatrist?
22
+ Who was Laura's secret boyfriend?
23
+ Who plays Bobby Briggs?
24
+ Who is the bad guy?
25
+ What does Laura die from?
26
+ Why did the movie flop in the United States?