Spaces:
Runtime error
Runtime error
Update styles
Browse files- app.py +46 -14
- retriever.py +3 -3
- style.css +18 -0
app.py
CHANGED
|
@@ -3,30 +3,62 @@
|
|
| 3 |
Here's our first attempt at using data to create a table:
|
| 4 |
"""
|
| 5 |
import streamlit as st
|
| 6 |
-
import pandas as pd
|
| 7 |
-
from load_css import local_css
|
| 8 |
from retriever import do_search
|
| 9 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
local_css('style.css')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
-
st.header('Semantic search demo')
|
| 13 |
-
search = st.text_input('')
|
| 14 |
|
|
|
|
| 15 |
if search:
|
| 16 |
result = do_search(search)
|
| 17 |
-
col1, col2, col3 = st.columns(3)
|
| 18 |
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
|
| 32 |
|
|
|
|
| 3 |
Here's our first attempt at using data to create a table:
|
| 4 |
"""
|
| 5 |
import streamlit as st
|
|
|
|
|
|
|
| 6 |
from retriever import do_search
|
| 7 |
|
| 8 |
+
def local_css(file_name):
    """Inject the contents of a CSS file into the Streamlit page.

    The file at ``file_name`` is read in full, wrapped in a ``<style>``
    element and emitted through ``st.markdown`` with
    ``unsafe_allow_html=True`` so the markup is not escaped.
    """
    with open(file_name) as css_file:
        stylesheet = css_file.read()
    st.markdown(f'<style>{stylesheet}</style>', unsafe_allow_html=True)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def render_retrieved_content(content, score):
    """Build the HTML snippet that displays one retrieved passage.

    Args:
        content: Text of the passage; placed inside a ``<blockquote>``.
        score: Similarity score, or ``None`` when no score should be shown.
            A given score is rounded to three decimal places.

    Returns:
        The blockquote markup followed by a single space and, when a score
        was given, a bold "Similarity Score" label.
    """
    quote = f'<blockquote>{content} </blockquote>'
    if score is None:
        # No score: keep the trailing separator space and nothing else.
        return f'{quote} '
    rounded = round(score, 3)
    return f'{quote} <b> Similarity Score: {rounded}</b>'
|
| 19 |
+
|
| 20 |
def _show_top_hit(documents, with_score=True):
    """Render the best-ranked document, or a notice when nothing was found.

    Args:
        documents: Ranked list of retrieved documents; each entry exposes
            ``content`` and ``score`` attributes.
        with_score: When false, the similarity score is not displayed.
    """
    if not documents:
        # An empty hit list would otherwise raise IndexError on documents[0].
        st.info('No matching document found.')
        return
    top = documents[0]
    st.markdown(render_retrieved_content(top.content, top.score if with_score else None),
                unsafe_allow_html=True)


# Inject the custom stylesheet used for the blockquote styling.
local_css('style.css')
st.header('🧐 Where my docs at?')
st.markdown('✨ Imagine you have a bunch of text documents and looking for one specific passage, '
            'but you can not remember on the exact words. Just about rough content. <br><br>'
            '💡 This demo compares different search approaches that can help you to find the right '
            'information.', unsafe_allow_html=True)

# NOTE(review): the selected dataset is not passed to do_search(), which only
# accepts the query - confirm whether the choice is meant to affect retrieval.
option = st.selectbox(
    'Choose a dataset',
    ('CDU election program 2021', 'Partisan news 2019 (dutch)'))

search = st.text_input('Enter your search query')
if search:
    # do_search returns three document lists: sparse, dense base, dense adapted.
    result = do_search(search)

    st.markdown('### 🔎 Term Frequency–Inverse Document Frequency (TF-IDF)')
    # The trailing space after "in the" keeps the implicitly concatenated
    # literals from running together as "thesearched".
    st.markdown('Is a statistical approach that calculates how relevant a word is to a document '
                'in your collection. Only documents will be found that contain one of the words of '
                'the given search query. You still have to remember on exact terms that are in the '
                'searched phrase.')
    _show_top_hit(result[0], with_score=False)

    st.markdown('### 🧠 Semantic search')
    st.markdown('An alternative approach is semantic search. Instead of using words of the '
                'documents to calculate the score, we use a neural network that calculate the '
                'similarity between the query and the documents of the collection. In other words, '
                'the chance is high to find topic related documents without knowing the exact '
                'terms.')
    _show_top_hit(result[1])

    st.markdown('### 🚀 Domain adapted semantic search')
    st.markdown('If our document collection contains a lot of domain specific documents, '
                'we can not use standard models. These models were trained on a large amount of '
                'public available data, that covers probably not your domain specific words. To '
                'improve the search results, we could fine-tune the network to calculate more '
                'accurate similarities between queries and document regarding to your domain.')
    _show_top_hit(result[2])
|
| 62 |
|
| 63 |
|
| 64 |
|
retriever.py
CHANGED
|
@@ -56,9 +56,9 @@ def dense_retrieval(query, retriever='base'):
|
|
| 56 |
|
| 57 |
|
| 58 |
def do_search(query):
|
| 59 |
-
sparse_result = sparse_retrieval(query)['documents']
|
| 60 |
-
dense_base_result =
|
| 61 |
-
dense_adapted_result = dense_retrieval(query, retriever='adapted')['documents']
|
| 62 |
return sparse_result, dense_base_result, dense_adapted_result
|
| 63 |
|
| 64 |
if __name__ == '__main__':
|
|
|
|
| 56 |
|
| 57 |
|
| 58 |
def do_search(query):
    """Run ``query`` through the sparse and both dense retrievers.

    Returns:
        A 3-tuple ``(sparse, dense_base, dense_adapted)`` holding the
        ``'documents'`` entry of each retrieval result, in that order.
    """
    sparse_result = sparse_retrieval(query)['documents']
    # Query the dense retrievers in the fixed order base -> adapted.
    dense_results = [
        dense_retrieval(query, retriever=variant)['documents']
        for variant in ('base', 'adapted')
    ]
    return (sparse_result, *dense_results)
|
| 63 |
|
| 64 |
if __name__ == '__main__':
|
style.css
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Retrieved passages: light background with a grey accent bar on the left. */
blockquote {
  margin: 1.5em 10px;
  padding: 0.5em 10px;
  background: #f9f9f9;
  border-left: 10px solid #ccc;
  quotes: "\201C""\201D""\2018""\2019";
}

/* Generated box in front of the passage; its content is an empty string. */
blockquote:before {
  content: '';
  color: #ccc;
  font-size: 4em;
  line-height: 0.1em;
  margin-right: 0.25em;
  vertical-align: -0.4em;
}

/* Lay out paragraphs inside the quote inline instead of as blocks. */
blockquote p {
  display: inline;
}
|