# Page header captured along with this file: "Spaces: Runtime error".
import html
import http.client as http_client
import json
import logging
import os
import re
import string

import gradio as gr
import requests
def mark_tokens_bold(string, tokens):
    """Return ``string`` as HTML with every occurrence of a token highlighted.

    All tokens are matched literally (case-sensitively) in a single pass, so
    markup inserted for one token can never be re-matched by another token
    (e.g. a token ``b`` no longer corrupts the ``<b>`` tags). Text outside and
    inside the highlights is HTML-escaped so characters such as ``<`` and
    ``&`` are displayed literally instead of being parsed as markup.

    Args:
        string: Plain text to render.
        tokens: Iterable of literal substrings to highlight. Empty tokens are
            ignored.

    Returns:
        An HTML fragment in which each match is wrapped in a pink, bold span.
    """
    # Longest tokens first: where two tokens start at the same position the
    # regex alternation then prefers the longer one ("foobar" over "foo").
    literal_tokens = sorted({t for t in tokens if t}, key=len, reverse=True)
    if not literal_tokens:
        return html.escape(string, quote=False)

    pattern = re.compile("|".join(re.escape(t) for t in literal_tokens))

    pieces = []
    cursor = 0
    for match in pattern.finditer(string):
        # quote=False: the output is element content, never an attribute value.
        pieces.append(html.escape(string[cursor:match.start()], quote=False))
        # The markup is concatenated instead of being used as a re.sub
        # template, so backslashes in a token are not treated as escapes.
        pieces.append(
            "<span style='color: #ff75b3;'><b>"
            + html.escape(match.group(0), quote=False)
            + "</b></span>"
        )
        cursor = match.end()
    pieces.append(html.escape(string[cursor:], quote=False))
    return "".join(pieces)
def process_results(results, highlight_terms):
    """Render search results as one HTML fragment.

    Args:
        results: Sequence of result dicts. Each one is read for the keys
            ``"text"``, ``"repo_license"`` (an iterable of strings),
            ``"repo_name"`` and ``"repo_path"``.
        highlight_terms: Terms passed to ``mark_tokens_bold`` to highlight
            inside each result's text.

    Returns:
        The concatenated HTML for all results, or a "No results retrieved."
        notice when ``results`` is empty.
    """
    if not results:
        return """<br><p>No results retrieved.</p><br><hr>"""

    rendered = []
    for result in results:
        text_html = mark_tokens_bold(result["text"], highlight_terms)
        # Repository metadata comes from the search backend; escape it so
        # characters such as "<" or "&" cannot inject markup into the page.
        licenses = html.escape(" | ".join(result["repo_license"]))
        repo_name = html.escape(str(result["repo_name"]))
        repo_path = html.escape(str(result["repo_path"]))
        rendered.append("""\
<p style='font-size:16px; text-align: left; color: white;'>Repository name: <span style='color: #727cd6;'>{}</span></p>
<p style='font-size:16px; text-align: left; color: white;'>Repository path: <span style='color: #727cd6;'>{}</span></p>
<p style='font-size:16px; text-align: left; color: white;'>Repository licenses: <span style='color: #727cd6;'>{}</span></p>
<br>
<pre style='height: 600px; overflow-y: scroll; overflow-x: hidden; color: #d9d9d9;border: 1px solid #ff75b3; padding: 10px'><code>{}</code></pre>
<br>
<hr>
<br>
""".format(repo_name, repo_path, licenses, text_html))
    # Join once instead of growing a string with += on every result.
    return "".join(rendered)
def scisearch(query, language, num_results=10):
    """Send ``query`` to the search backend and return the results as HTML.

    Args:
        query: Free-text query. Runs of whitespace are collapsed to single
            spaces before sending. ``None`` or a blank query short-circuits.
        language: Accepted for interface compatibility; not used here.
        num_results: Sent to the backend as ``"k"``.

    Returns:
        The HTML produced by ``process_results``, or ``""`` when the query is
        ``None`` or blank.

    Raises:
        RuntimeError: If the ``address`` environment variable is not set.
        requests.HTTPError: If the backend answers with an error status.
    """
    # Check for None before touching the string; calling .split() first
    # would raise AttributeError and make the None check unreachable.
    if query is None:
        return ""
    query = " ".join(query.split())
    if not query:
        return ""

    address = os.environ.get("address")
    if not address:
        raise RuntimeError(
            "The 'address' environment variable (search backend URL) is not set."
        )

    post_data = {"query": query, "k": num_results}
    response = requests.post(
        address,
        headers={"Content-type": "application/json"},
        data=json.dumps(post_data),
        timeout=60,
    )
    # Surface HTTP failures directly rather than as a confusing JSON/KeyError.
    response.raise_for_status()

    payload = response.json()
    return process_results(payload["results"], payload["highlight_terms"])
# Markdown/HTML banner shown at the top of the page (passed to gr.Markdown).
# NOTE(review): the two "π" characters in the heading look like mis-decoded
# emoji; the intended characters cannot be recovered from this file — TODO
# confirm against the original source and restore them.
description = """# <p style="text-align: center; color: white;"><span style='color: #ff75b3;'>π SantaCoder:</span> Dataset Search π </p>
<span style='color: white;'>When you use <a href="https://huggingface.co/bigcode/santacoder" style="color: #ff75b3;">SantaCoder</a> to generate code it might produce exact copies of code in the pretraining dataset.
In that case, the code license might have requirements to comply with.
With this search tool we aim to provide help to find out where the code came from, in order for the user to comply with licensing requirements in case the code produced by SantaCoder belongs to an already existing repository.</span>"""
if __name__ == "__main__":
    # Build the page: banner, query box, result-count slider, submit button
    # and an HTML panel that receives the rendered search results.
    demo = gr.Blocks(
        css=".gradio-container {background-color: #20233fff; color:white}"
    )

    with demo:
        with gr.Row():
            gr.Markdown(value=description)
        with gr.Row():
            query = gr.Textbox(lines=5, placeholder="Type your query here...", label="Query")
        with gr.Row():
            k = gr.Slider(1, 100, value=10, step=1, label="Max Results")
        with gr.Row():
            submit_btn = gr.Button("Submit")
        with gr.Row():
            results = gr.HTML(label="Results", value="<img src='https://huggingface.co/datasets/bigcode/admin/resolve/main/bigcode_contact.png' alt='contact' style='display: block; margin: auto; max-width: 800px;'>")

        def submit(query, k, lang="en"):
            """Run a search and return the update for the results panel.

            A missing or blank query clears the panel. Exactly one output
            value is always returned, matching the single ``results`` output
            wired up below.
            """
            # Guard against None before calling .strip().
            if query is None or not query.strip():
                return {results: ""}
            return {
                results: scisearch(query.strip(), lang, k),
            }

        # Both pressing Enter in the textbox and clicking the button search.
        query.submit(fn=submit, inputs=[query, k], outputs=[results])
        submit_btn.click(submit, inputs=[query, k], outputs=[results])

    # Enable request queuing via Blocks.queue() instead of the
    # ``enable_queue`` launch keyword, which newer Gradio releases reject.
    demo.queue()
    demo.launch(debug=True)