Spaces:

spacerini
/

gaia

Sleeping

App Files Files Community

ola13 committed on Feb 20, 2023

Commit

b6da1a8

•

1 Parent(s): e55d3fc

choose corpus

Browse files

Files changed (2) hide show

.streamlit/config.toml +2 -0
app.py +27 -67

.streamlit/config.toml ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ [theme]
2	+ base="light"

app.py CHANGED Viewed

@@ -1,10 +1,7 @@
-import http.client as http_client
 import json
-import logging
 import os
 import pprint
 import re
-import string
 import streamlit as st
 import streamlit.components.v1 as components
@@ -12,30 +9,15 @@ import requests
 pp = pprint.PrettyPrinter(indent=2)
 st.set_page_config(page_title="Gaia Search", layout="wide")
 os.makedirs(os.path.join(os.getcwd(), ".streamlit"), exist_ok=True)
 with open(os.path.join(os.getcwd(), ".streamlit/config.toml"), "w") as file:
     file.write('[theme]\nbase="light"')
-LANG_MAPPING = {
-    "Arabic": "ar",
-    "Catalan": "ca",
-    "Code": "code",
-    "English": "en",
-    "Spanish": "es",
-    "French": "fr",
-    "Indonesian": "id",
-    "Indic": "indic",
-    "Niger-Congo": "nigercongo",
-    "Portuguese": "pt",
-    "Vietnamese": "vi",
-    "Chinese": "zh",
-    "Detect Language": "detect_language",
-    "All": "all",
-}
 st.sidebar.markdown(
     """
 <style>
@@ -71,25 +53,10 @@ st.sidebar.markdown(
 )
 query = st.sidebar.text_input(label="Search query", value="")
-language = st.sidebar.selectbox(
-    "Language",
-    (
-        "Arabic",
-        "Catalan",
-        "Code",
-        "English",
-        "Spanish",
-        "French",
-        "Indonesian",
-        "Indic",
-        "Niger-Congo",
-        "Portuguese",
-        "Vietnamese",
-        "Chinese",
-        "Detect Language",
-        "All",
-    ),
-    index=3,
 )
 max_results = st.sidebar.slider(
     "Maximum Number of Results",
@@ -117,15 +84,14 @@ text-align: center;
 st.sidebar.markdown(footer, unsafe_allow_html=True)
-def scisearch(query, language, num_results=10):
     try:
         query = query.strip()
         if query == "" or query is None:
             return
-        post_data = {"query": query, "k": num_results}
-        if language != "detect_language":
-            post_data["lang"] = language
         output = requests.post(
             os.environ.get("address"),
@@ -135,18 +101,10 @@ def scisearch(query, language, num_results=10):
         )
         payload = json.loads(output.text)
-        if "err" in payload:
-            if payload["err"]["type"] == "unsupported_lang":
-                detected_lang = payload["err"]["meta"]["detected_lang"]
-                return f"""
-                    <p style='font-size:18px; font-family: Arial; color:MediumVioletRed; text-align: center;'>
-                    Detected language <b>{detected_lang}</b> is not supported.<br>
-                    Please choose a language from the dropdown or type another query.
-                    </p><br><hr><br>"""
         results = payload["results"]
         highlight_terms = payload["highlight_terms"]
     except Exception as e:
         results_html = f"""
                 <p style='font-size:18px; font-family: Arial; color:MediumVioletRed; text-align: center;'>
@@ -157,7 +115,7 @@ def scisearch(query, language, num_results=10):
             """
         print(e)
-    return results, highlight_terms
 PII_TAGS = {"KEY", "EMAIL", "USER", "IP_ADDRESS", "ID", "IPv4", "IPv6"}
@@ -176,8 +134,9 @@ def process_pii(text):
 def highlight_string(paragraph: str, highlight_terms: list) -> str:
-    for term in highlight_terms:
-        paragraph = re.sub(f"\\b{term}\\b", f"<b>{term}</b>", paragraph, flags=re.I)
     paragraph = process_pii(paragraph)
     return paragraph
@@ -187,16 +146,17 @@ def process_results(hits: list, highlight_terms: list) -> str:
     for i, hit in enumerate(hits):
         res_head = f"""
                     <div class="searchresult">
-                        <h2>{i+1}. Document ID: {hit['docid']}</h2>
-                        <p>Language: <string>FIX MEEEE</string>, Score: {round(hit['score'], 2)}</p>
                     """
-        for subhit in hit["meta"]["docs"]:
-            res_head += f"""
-                        <button onclick="load_image({subhit['_id']})">Load Image</button><br>
-                        <p><img id='{subhit['_id']}' src='{subhit['URL']}'  style="width:400px;height:auto;display:none;"></p>
-                        <a href='{subhit['URL']}'>{subhit['URL']}</a>
-                        <p>{highlight_string(subhit['TEXT'], highlight_terms)}</p>
-                        """
         res_head += f"""
                     <p>{highlight_string(hit['text'], highlight_terms)}</p>
                     </div>
@@ -207,7 +167,7 @@ def process_results(hits: list, highlight_terms: list) -> str:
 if st.sidebar.button("Search"):
-    hits, highlight_terms = scisearch(query, LANG_MAPPING[language], max_results)
     html_results = process_results(hits, highlight_terms)
     rendered_results = f"""
             <div id="searchresultsarea">

 import json
 import os
 import pprint
 import re
 import streamlit as st
 import streamlit.components.v1 as components
 pp = pprint.PrettyPrinter(indent=2)
+os.environ["address"] = "http://34.79.83.149:8080"
 st.set_page_config(page_title="Gaia Search", layout="wide")
 os.makedirs(os.path.join(os.getcwd(), ".streamlit"), exist_ok=True)
 with open(os.path.join(os.getcwd(), ".streamlit/config.toml"), "w") as file:
     file.write('[theme]\nbase="light"')
 st.sidebar.markdown(
     """
 <style>
 )
 query = st.sidebar.text_input(label="Search query", value="")
+corpus = st.sidebar.selectbox(
+    "Corpus",
+    ("laion", "pile", "c4"),
+    index=0,
 )
 max_results = st.sidebar.slider(
     "Maximum Number of Results",
 st.sidebar.markdown(footer, unsafe_allow_html=True)
+def scisearch(query, corpus, num_results=10):
     try:
+        print(query, corpus, num_results)
         query = query.strip()
         if query == "" or query is None:
             return
+        post_data = {"query": query, "corpus": corpus, "k": num_results}
         output = requests.post(
             os.environ.get("address"),
         )
         payload = json.loads(output.text)
         results = payload["results"]
         highlight_terms = payload["highlight_terms"]
+        return results, highlight_terms
     except Exception as e:
         results_html = f"""
                 <p style='font-size:18px; font-family: Arial; color:MediumVioletRed; text-align: center;'>
             """
         print(e)
 PII_TAGS = {"KEY", "EMAIL", "USER", "IP_ADDRESS", "ID", "IPv4", "IPv6"}
 def highlight_string(paragraph: str, highlight_terms: list) -> str:
+    # TODO:
+    # for term in highlight_terms:
+    #    paragraph = re.sub(f"\\b{term}\\b", f"<b>{term}</b>", paragraph, flags=re.I)
     paragraph = process_pii(paragraph)
     return paragraph
     for i, hit in enumerate(hits):
         res_head = f"""
                     <div class="searchresult">
+                        <h2>{i+1}. Document ID: {hit['docid']}</h2>, Score: {round(hit['score'], 2)}</p>
                     """
+        if "meta" in hit:
+            if  hit["meta"] is not None and "docs" in hit["meta"]:
+                for subhit in hit["meta"]["docs"]:
+                    res_head += f"""
+                                <button onclick="load_image({subhit['_id']})">Load Image</button><br>
+                                <p><img id='{subhit['_id']}' src='{subhit['URL']}'  style="width:400px;height:auto;display:none;"></p>
+                                <a href='{subhit['URL']}'>{subhit['URL']}</a>
+                                <p>{highlight_string(subhit['TEXT'], highlight_terms)}</p>
+                                """
         res_head += f"""
                     <p>{highlight_string(hit['text'], highlight_terms)}</p>
                     </div>
 if st.sidebar.button("Search"):
+    hits, highlight_terms = scisearch(query, corpus, max_results)
     html_results = process_results(hits, highlight_terms)
     rendered_results = f"""
             <div id="searchresultsarea">