Spaces:
Runtime error
Runtime error
File size: 3,970 Bytes
cb35e87 385bf5d fb9e6d1 7f5bdb5 385bf5d cb35e87 7f5bdb5 cb35e87 385bf5d 7f5bdb5 655c971 e146ae1 7f5bdb5 073a510 cb35e87 4d7e580 e9efa59 073a510 92c7818 f5e91d1 073a510 cb35e87 655c971 cb35e87 655c971 cb35e87 655c971 cb35e87 655c971 cb35e87 655c971 cb35e87 28bd1d5 c7035cb cb35e87 7f5bdb5 cb35e87 f881d21 cb35e87 71e0590 cb35e87 01b1b14 cb35e87 01b1b14 cb35e87 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 |
import html
import http.client as http_client
import json
import logging
import os
import re
import string

import gradio as gr
import requests
def mark_tokens_bold(string, tokens):
    """Wrap every occurrence of the given tokens in pink, bold HTML markup.

    Args:
        string: Text to search. The parameter name shadows the ``string``
            module imported by this file; it is kept so keyword callers
            keep working.
        tokens: Iterable of literal substrings to highlight. Matching is
            case-sensitive and not restricted to word boundaries. ``None``
            and empty tokens are ignored.

    Returns:
        ``string`` with each match wrapped in
        ``<span style='color: #ff75b3;'><b>...</b></span>``.
    """
    # Drop empty tokens (an empty pattern matches at every position) and
    # duplicates. Longer tokens go first so that a token which is a prefix of
    # another one cannot pre-empt the longer match in the alternation.
    unique_tokens = sorted(
        {token for token in (tokens or ()) if token},
        key=lambda token: (-len(token), token),
    )
    if not unique_tokens:
        return string

    pattern = re.compile("|".join(re.escape(token) for token in unique_tokens))

    def _wrap(match):
        # A callable replacement is inserted verbatim, so backslashes in the
        # matched token are not interpreted as regex template escapes.
        return "<span style='color: #ff75b3;'><b>" + match.group(0) + "</b></span>"

    # One pass over the input: markup inserted for one token is never
    # re-scanned, so tokens such as "span" or "b" cannot corrupt it.
    return pattern.sub(_wrap, string)
# Markup placed around every highlighted term.
_HIGHLIGHT_OPEN = "<span style='color: #ff75b3;'><b>"
_HIGHLIGHT_CLOSE = "</b></span>"

# HTML for one search hit; the placeholders are, in order: repository name,
# repository path, licenses, highlighted file text.
_RESULT_TEMPLATE = """\
<p style='font-size:16px; text-align: left; color: white;'>Repository name: <span style='color: #727cd6;'>{}</span></p>
<p style='font-size:16px; text-align: left; color: white;'>Repository path: <span style='color: #727cd6;'>{}</span></p>
<p style='font-size:16px; text-align: left; color: white;'>Repository licenses: <span style='color: #727cd6;'>{}</span></p>
<br>
<pre style='height: 600px; overflow-y: scroll; overflow-x: hidden; color: #d9d9d9;border: 1px solid #ff75b3; padding: 10px'><code>{}</code></pre>
<br>
<hr>
<br>
"""


def _highlight_escaped(text, terms):
    """Return ``text`` HTML-escaped with every occurrence of a term highlighted."""
    needles = sorted(
        {term for term in (terms or ()) if term},
        key=lambda term: (-len(term), term),
    )
    if not needles:
        return html.escape(text)

    pattern = re.compile("|".join(re.escape(term) for term in needles))
    pieces = []
    cursor = 0
    # Match against the raw text and escape each segment separately, so that
    # terms can never match inside an entity such as "&lt;" or inside the
    # highlight markup itself.
    for match in pattern.finditer(text):
        pieces.append(html.escape(text[cursor:match.start()]))
        pieces.append(_HIGHLIGHT_OPEN + html.escape(match.group(0)) + _HIGHLIGHT_CLOSE)
        cursor = match.end()
    pieces.append(html.escape(text[cursor:]))
    return "".join(pieces)


def process_results(results, highlight_terms):
    """Render search hits as an HTML fragment.

    Args:
        results: Sequence of dicts; each is read for the keys ``"text"``,
            ``"repo_license"`` (an iterable of strings), ``"repo_name"`` and
            ``"repo_path"``.
        highlight_terms: Iterable of literal substrings to highlight inside
            each hit's text.

    Returns:
        One HTML block per hit, concatenated, or a "No results retrieved."
        notice when ``results`` is empty.

    Raises:
        KeyError: If a hit lacks one of the keys listed above.
    """
    if not results:
        return """<br><p>No results retrieved.</p><br><hr>"""

    # The text is source code and routinely contains "<", ">" and "&", so every
    # interpolated value is escaped before being embedded in markup.
    # NOTE(review): assumes the backend returns raw (not pre-escaped) text.
    return "".join(
        _RESULT_TEMPLATE.format(
            html.escape(str(result["repo_name"])),
            html.escape(str(result["repo_path"])),
            html.escape(" | ".join(result["repo_license"])),
            _highlight_escaped(result["text"], highlight_terms),
        )
        for result in results
    )
def scisearch(query, language, num_results=10):
    """Send ``query`` to the search backend and return the hits as HTML.

    Args:
        query: Free-text query; runs of whitespace are collapsed to single
            spaces before sending. ``None`` or a blank query short-circuits.
        language: Accepted for interface compatibility; not used here.
        num_results: Value sent to the backend as ``"k"``.

    Returns:
        The HTML produced by ``process_results``, or ``""`` when the query is
        ``None`` or blank.

    Raises:
        requests.HTTPError: If the backend answers with an error status.
    """
    # The None check has to come before any str method is called on query.
    if query is None:
        return ""
    query = " ".join(query.split())
    if not query:
        return ""

    post_data = {"query": query, "k": num_results}
    # The endpoint URL is read from the "address" environment variable.
    # NOTE: if the variable is unset, None is passed as the URL.
    response = requests.post(
        os.environ.get("address"),
        headers={"Content-type": "application/json"},
        data=json.dumps(post_data),
        timeout=60,
    )
    # Surface HTTP failures directly instead of as a JSON/KeyError further down.
    response.raise_for_status()

    payload = json.loads(response.text)
    return process_results(payload["results"], payload["highlight_terms"])
# Markdown/HTML intro shown at the top of the page; one literal per output line.
description = "\n".join(
    (
        """# <p style="text-align: center; color: white;"><span style='color: #ff75b3;'>🎅 SantaCoder:</span> Dataset Search 🔍 </p>""",
        """<span style='color: white;'>When you use <a href="https://huggingface.co/bigcode/santacoder" style="color: #ff75b3;">SantaCoder</a> to generate code it might produce exact copies of code in the pretraining dataset.""",
        "In that case, the code license might have requirements to comply with.",
        "With this search tool we aim to provide help to find out where the code came from, in order for the user to comply with licensing requirements in case the code produced by SantaCoder belongs to an already existing repository.</span>",
    )
)
if __name__ == "__main__":
    # Build the page: intro text, query box, result-count slider, submit
    # button and an HTML pane for the rendered hits.
    demo = gr.Blocks(
        css=".gradio-container {background-color: #20233fff; color:white}"
    )

    with demo:
        with gr.Row():
            gr.Markdown(value=description)
        with gr.Row():
            query = gr.Textbox(lines=5, placeholder="Type your query here...", label="Query")
        with gr.Row():
            k = gr.Slider(1, 100, value=10, step=1, label="Max Results")
        with gr.Row():
            submit_btn = gr.Button("Submit")
        with gr.Row():
            results = gr.HTML(label="Results", value="<img src='https://huggingface.co/datasets/bigcode/admin/resolve/main/bigcode_contact.png' alt='contact' style='display: block; margin: auto; max-width: 800px;'>")

        def submit(query, k, lang="en"):
            """Run the search for the textbox content and update the results pane."""
            # Check for None before calling str methods. Only one output
            # component is wired up, so exactly one value is returned on
            # every path, including the empty-query one.
            if query is None or not query.strip():
                return {results: ""}
            return {results: scisearch(query.strip(), lang, k)}

        # Pressing Enter in the textbox and clicking the button do the same thing.
        query.submit(fn=submit, inputs=[query, k], outputs=[results])
        submit_btn.click(submit, inputs=[query, k], outputs=[results])

    # `launch(enable_queue=...)` is deprecated in Gradio 3 and removed in
    # Gradio 4; queuing is enabled through `queue()` instead.
    demo.queue()
    demo.launch(debug=True)
|