Spaces:
Paused
Paused
exact search
Browse files
app.py
CHANGED
@@ -5,9 +5,13 @@ import requests
|
|
5 |
from huggingface_hub import HfApi
|
6 |
|
7 |
hf_api = HfApi()
|
8 |
-
roots_datasets = {
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
-
# def get_dataset_metadata():
|
11 |
|
12 |
def get_docid_html(docid):
|
13 |
data_org, dataset, docid = docid.split("/")
|
@@ -29,7 +33,7 @@ def get_docid_html(docid):
|
|
29 |
f'style="color:#2D31FA;"'
|
30 |
f'href="https://huggingface.co/datasets/bigscience-data/{dataset}"'
|
31 |
f'target="_blank"><b>{dataset}</b></a><span style="color: #7978FF;">/{docid}</span>'
|
32 |
-
)
|
33 |
return docid_html
|
34 |
|
35 |
|
@@ -41,7 +45,9 @@ def process_pii(text):
|
|
41 |
for tag in PII_TAGS:
|
42 |
text = text.replace(
|
43 |
PII_PREFIX + tag,
|
44 |
-
"""<b><mark style="background: Fuchsia; color: Lime;">REDACTED {}</mark></b>""".format(
|
|
|
|
|
45 |
)
|
46 |
return text
|
47 |
|
@@ -68,7 +74,9 @@ def process_results(results, highlight_terms):
|
|
68 |
<a href='{}' target='_blank'>{}</a></p>""".format(
|
69 |
result["meta"]["url"], result["meta"]["url"]
|
70 |
)
|
71 |
-
if "meta" in result
|
|
|
|
|
72 |
else ""
|
73 |
)
|
74 |
docid_html = get_docid_html(result["docid"])
|
@@ -83,13 +91,13 @@ def process_results(results, highlight_terms):
|
|
83 |
return results_html + "<hr>"
|
84 |
|
85 |
|
86 |
-
def scisearch(query, language, num_results=10):
|
87 |
try:
|
88 |
query = " ".join(query.split())
|
89 |
if query == "" or query is None:
|
90 |
return ""
|
91 |
|
92 |
-
post_data = {"query": query, "k": num_results}
|
93 |
if language != "detect_language":
|
94 |
post_data["lang"] = language
|
95 |
|
@@ -157,9 +165,57 @@ def scisearch(query, language, num_results=10):
|
|
157 |
return results_html
|
158 |
|
159 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
160 |
def flag(query, language, num_results, issue_description):
|
161 |
try:
|
162 |
-
post_data = {
|
|
|
|
|
|
|
|
|
|
|
163 |
if language != "detect_language":
|
164 |
post_data["lang"] = language
|
165 |
|
@@ -194,7 +250,12 @@ if __name__ == "__main__":
|
|
194 |
with gr.Row():
|
195 |
gr.Markdown(value=description)
|
196 |
with gr.Row():
|
197 |
-
query = gr.Textbox(
|
|
|
|
|
|
|
|
|
|
|
198 |
with gr.Row():
|
199 |
lang = gr.Dropdown(
|
200 |
choices=[
|
@@ -220,7 +281,12 @@ if __name__ == "__main__":
|
|
220 |
with gr.Row():
|
221 |
k = gr.Slider(1, 100, value=10, step=1, label="Max Results")
|
222 |
with gr.Row():
|
223 |
-
|
|
|
|
|
|
|
|
|
|
|
224 |
with gr.Row():
|
225 |
results = gr.HTML(label="Results")
|
226 |
flag_description = """
|
@@ -237,16 +303,31 @@ if __name__ == "__main__":
|
|
237 |
flag_btn = gr.Button("Flag Results")
|
238 |
flag_btn.click(flag, inputs=[query, lang, k, flag_txt], outputs=[flag_txt])
|
239 |
|
240 |
-
def submit(query, lang, k):
|
|
|
241 |
query = query.strip()
|
242 |
if query is None or query == "":
|
243 |
return "", ""
|
|
|
|
|
|
|
|
|
|
|
|
|
244 |
return {
|
245 |
-
results: scisearch(query, lang, k),
|
246 |
flagging_form: gr.update(visible=True),
|
247 |
}
|
248 |
|
249 |
-
query.submit(
|
250 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
251 |
|
252 |
-
demo.launch(enable_queue=
|
|
|
5 |
from huggingface_hub import HfApi
|
6 |
|
7 |
hf_api = HfApi()
|
8 |
+
# Index the datasets listed for the "bigscience-data" author by the last path
# component of their repo id, so they can be looked up by short name.
# The access token is read from the `bigscience_data_token` environment
# variable; `os.environ.get` yields None when it is unset.
roots_datasets = {
    dataset_info.id.rsplit("/", 1)[-1]: dataset_info
    for dataset_info in hf_api.list_datasets(
        author="bigscience-data",
        use_auth_token=os.environ.get("bigscience_data_token"),
    )
}
|
14 |
|
|
|
15 |
|
16 |
def get_docid_html(docid):
|
17 |
data_org, dataset, docid = docid.split("/")
|
|
|
33 |
f'style="color:#2D31FA;"'
|
34 |
f'href="https://huggingface.co/datasets/bigscience-data/{dataset}"'
|
35 |
f'target="_blank"><b>{dataset}</b></a><span style="color: #7978FF;">/{docid}</span>'
|
36 |
+
)
|
37 |
return docid_html
|
38 |
|
39 |
|
|
|
45 |
for tag in PII_TAGS:
|
46 |
text = text.replace(
|
47 |
PII_PREFIX + tag,
|
48 |
+
"""<b><mark style="background: Fuchsia; color: Lime;">REDACTED {}</mark></b>""".format(
|
49 |
+
tag
|
50 |
+
),
|
51 |
)
|
52 |
return text
|
53 |
|
|
|
74 |
<a href='{}' target='_blank'>{}</a></p>""".format(
|
75 |
result["meta"]["url"], result["meta"]["url"]
|
76 |
)
|
77 |
+
if "meta" in result
|
78 |
+
and result["meta"] is not None
|
79 |
+
and "url" in result["meta"]
|
80 |
else ""
|
81 |
)
|
82 |
docid_html = get_docid_html(result["docid"])
|
|
|
91 |
return results_html + "<hr>"
|
92 |
|
93 |
|
94 |
+
def scisearch(query, language, num_results=10, exact_search=False):
|
95 |
try:
|
96 |
query = " ".join(query.split())
|
97 |
if query == "" or query is None:
|
98 |
return ""
|
99 |
|
100 |
+
post_data = {"query": query, "k": num_results, "exact_search": exact_search}
|
101 |
if language != "detect_language":
|
102 |
post_data["lang"] = language
|
103 |
|
|
|
165 |
return results_html
|
166 |
|
167 |
|
168 |
+
def perform_exact_search(query, num_results=10):
    """Run an exact-match search on the backend and render the hits as HTML.

    Args:
        query: Raw query string. Runs of whitespace are collapsed to single
            spaces before the query is sent.
        num_results: Value sent to the backend as ``k``.

    Returns:
        An HTML string. For a successful search, each result is preceded by
        ``<br><hr><br>`` with the first occurrence of the query wrapped in
        ``<b>``, and the whole output ends with ``<hr>``. For an empty or
        ``None`` query, an empty string. If anything raises while querying or
        rendering, an HTML error notice naming the exception type.
    """
    try:
        print("perform_exact_search")
        # Check for None before calling str methods on the query; previously
        # the None test came after `query.split()` and could never be reached.
        if query is None:
            return ""
        query = " ".join(query.split())
        if query == "":
            return ""

        post_data = {"query": query, "k": num_results, "exact_search": True}

        print("post_data", post_data)

        output = requests.post(
            "http://34.105.160.81:8080",
            headers={"Content-type": "application/json"},
            data=json.dumps(post_data),
            timeout=60,
        )

        payload = json.loads(output.text)
        # Each entry is treated as a plain string below (`find` and slicing).
        results = payload["results"]

        # NOTE(review): result text is inserted into the HTML unescaped —
        # confirm the backend output is safe to render as markup.
        pieces = []
        for result in results:
            print(result)
            pieces.append("""<br><hr><br>""")
            query_start = result.find(query)
            if query_start == -1:
                # No literal occurrence of the normalized query: show the
                # result as-is rather than slicing with a negative index,
                # which would bold the wrong span and scramble the text.
                pieces.append(result)
                continue
            query_end = query_start + len(query)
            pieces.append(result[:query_start])
            pieces.append("<b>{}</b>".format(result[query_start:query_end]))
            pieces.append(result[query_end:])
        return "".join(pieces) + "<hr>"

    except Exception as e:
        results_html = f"""
        <p style='font-size:18px; font-family: Arial; color:MediumVioletRed; text-align: center;'>
        Raised {type(e).__name__}</p>
        <p style='font-size:14px; font-family: Arial; '>
        Check if a relevant discussion already exists in the Community tab. If not, please open a discussion.
        </p>
        """
        # The error notice used to be built and then dropped, so the caller
        # received None; return it so the failure is visible in the UI.
        return results_html
|
209 |
+
|
210 |
+
|
211 |
def flag(query, language, num_results, issue_description):
|
212 |
try:
|
213 |
+
post_data = {
|
214 |
+
"query": query,
|
215 |
+
"k": num_results,
|
216 |
+
"flag": True,
|
217 |
+
"description": issue_description,
|
218 |
+
}
|
219 |
if language != "detect_language":
|
220 |
post_data["lang"] = language
|
221 |
|
|
|
250 |
with gr.Row():
|
251 |
gr.Markdown(value=description)
|
252 |
with gr.Row():
|
253 |
+
query = gr.Textbox(
|
254 |
+
lines=1,
|
255 |
+
max_lines=1,
|
256 |
+
placeholder="Type your query here...",
|
257 |
+
label="Query",
|
258 |
+
)
|
259 |
with gr.Row():
|
260 |
lang = gr.Dropdown(
|
261 |
choices=[
|
|
|
281 |
with gr.Row():
|
282 |
k = gr.Slider(1, 100, value=10, step=1, label="Max Results")
|
283 |
with gr.Row():
|
284 |
+
with gr.Column(scale=1):
|
285 |
+
exact_search = gr.Checkbox(
|
286 |
+
value=False, label="Exact Search", variant="compact"
|
287 |
+
)
|
288 |
+
with gr.Column(scale=4):
|
289 |
+
submit_btn = gr.Button("Submit")
|
290 |
with gr.Row():
|
291 |
results = gr.HTML(label="Results")
|
292 |
flag_description = """
|
|
|
303 |
flag_btn = gr.Button("Flag Results")
|
304 |
flag_btn.click(flag, inputs=[query, lang, k, flag_txt], outputs=[flag_txt])
|
305 |
|
306 |
+
def submit(query, lang, k, exact_search):
    """Handle a query submission from the search UI.

    Args:
        query: Text of the query box.
        lang: Selected language, forwarded to ``scisearch``.
        k: Maximum number of results to request.
        exact_search: When truthy, use ``perform_exact_search`` instead of
            ``scisearch``.

    Returns:
        ``("", "")`` when the query is ``None`` or blank. Otherwise a dict
        keyed by the ``results`` and ``flagging_form`` objects from the
        enclosing scope: the rendered results HTML, and an update that makes
        the flagging form visible.
    """
    print("submitting", query, lang, k, exact_search)
    # Test for None first: calling `.strip()` on None would raise before the
    # empty-query guard could run.
    if query is None:
        return "", ""
    query = query.strip()
    if query == "":
        return "", ""

    # Only the search call differs between the two modes, so build the
    # response dict once.
    if exact_search:
        results_html = perform_exact_search(query, k)
    else:
        results_html = scisearch(query, lang, k, exact_search)
    return {
        results: results_html,
        flagging_form: gr.update(visible=True),
    }
|
321 |
|
322 |
+
query.submit(
|
323 |
+
fn=submit,
|
324 |
+
inputs=[query, lang, k, exact_search],
|
325 |
+
outputs=[results, flagging_form],
|
326 |
+
)
|
327 |
+
submit_btn.click(
|
328 |
+
submit,
|
329 |
+
inputs=[query, lang, k, exact_search],
|
330 |
+
outputs=[results, flagging_form],
|
331 |
+
)
|
332 |
|
333 |
+
demo.launch(enable_queue=False, debug=True)
|