Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 5,840 Bytes
48dc2f2 fcb283e 6203b88 48dc2f2 6127ee8 6203b88 a431d31 48dc2f2 6127ee8 48dc2f2 7666b36 48dc2f2 2cce3db 2e9d4f3 48dc2f2 a431d31 5518841 cac139e 5e62a6d b50ee79 1b0d77f 64f4368 f87f797 aa6340d f87f797 48dc2f2 6127ee8 d26028d 6127ee8 d26028d 6127ee8 48dc2f2 6203b88 c4eee53 cac139e 5ee220a cac139e 6127ee8 b0c1665 48dc2f2 6127ee8 48dc2f2 f87f797 48dc2f2 6127ee8 48dc2f2 6127ee8 48dc2f2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 |
import gradio as gr
from huggingface_hub import hf_hub_download
import json_stream as json
import gzip
import urllib
from collections import defaultdict
import gc
import sys
# Lookup table filled below: username -> {repo: [versions that list the repo]}.
usernames = defaultdict(dict)

# Version tags of The Stack, oldest first, passed through sys.intern.
versions = [sys.intern(tag) for tag in ("v1.0", "v1.1", "v1.2", "v2.0", "v2.0.1")]

for version in versions:
    print(f"Loading {version}")
    # "v1.0" is fetched from the "main" revision; every other version is
    # fetched from the revision named after it.
    branch = "main" if version == "v1.0" else version
    filepath = hf_hub_download(
        repo_id="bigcode/the-stack-username-to-repo",
        filename="username_to_repo.json.gz",
        repo_type="dataset",
        revision=branch,
    )
    with gzip.open(filepath, 'r') as f:
        # `json` is json_stream here (see the imports), so the mapping is
        # walked while the file is still open.
        data = json.load(f)
        for username, repos in data.items():
            for repo in repos:
                # Start an empty version list on first sight of the repo,
                # then record this version.
                usernames[username].setdefault(repo, []).append(version)
    # Drop the parsed data and collect garbage before loading the next version.
    del data
    gc.collect()
# Markdown shown at the top of the page: banner, project blurb, usage notes
# and the data-source / model-training provenance.
# NOTE: inside these (non-raw) literals a backslash at the end of a line is a
# line continuation, so that line is joined directly with the next one.
text = """\
![](https://huggingface.co/spaces/lvwerra/in-the-stack-gr/resolve/main/banner.png)
**_The Stack is an open governance interface between the AI community and the open source community._**
# Am I in The Stack?
As part of the BigCode project, we released and maintain [The Stack V2](https://huggingface.co/datasets/bigcode/the-stack-v2), a 67 TB dataset of source code over 600 programming languages. One of our goals in this project is to give people agency over their source code by letting them decide whether or not it should be used to develop and evaluate machine learning models, as we acknowledge that not all developers may wish to have their data used for that purpose.
""" + """\
This tool lets you check if a repository under a given username is part of The Stack dataset. Would you like to have your data removed from future versions of The Stack? You can opt-out following the instructions [here](https://www.bigcode-project.org/docs/about/the-stack/#how-can-i-request-that-my-data-be-removed-from-the-stack). Note that previous opt-outs might still be displayed in the release candidate (denoted with "-rc"), which will be removed for the release.
**Note:** The Stack v2.0 is built from public GitHub code provided by the [Software Heritage Archive](https://archive.softwareheritage.org/). It may include repositories that are no longer present on GitHub but were archived by Software Heritage. Before training the StarCoder 1 and 2 models an additional PII pipeline was run to remove names, emails, passwords and API keys from the code files. For more information see the [paper](https://arxiv.org/abs/2402.19173).
**Data source**:\
<img src="https://annex.softwareheritage.org/public/logo/software-heritage-logo-title.2048px.png" alt="Logo" style="height: 3em; vertical-align: middle;" />
**Model training**:\
- StarCoder1 was trained on repos listed in `v1.2`.
- StarCoder2 was trained on repos listed in `v2.0.1`.
"""
# Markdown for the opt-out section. `{title}` and `{body}` are filled with the
# percent-encoded issue title and body, which pre-populate the GitHub
# "new issue" form. The trailing backslashes join everything after the heading
# into a single line.
opt_out_text_template = """\
### Opt-out
If you want your data to be removed from the stack and model training \
open an issue with <a href="https://github.com/bigcode-project/opt-out-v2/issues/new?title={title}&body={body}" target="_blank">this link</a> \
(if the link doesn't work try a right click and open it in a new tab) or visit [https://github.com/bigcode-project/opt-out-v2/issues/new?&template=opt-out-request.md](https://github.com/bigcode-project/opt-out-v2/issues/new?&template=opt-out-request.md) .\
"""
# Title of the pre-filled opt-out issue; formatted with `username`.
opt_out_issue_title = "Opt-out request for {username}"

# Body of the pre-filled opt-out issue; `{repo_list}` is replaced with the
# bullet list of repositories to remove.
opt_out_issue_body = (
    "I request that the following data is removed from The Stack and StackOverflow:\n"
    "- Commits\n"
    "- GitHub issue\n"
    "- StackOverflow: <INSERT_STACKOVERFLOW_USERNAME_HERE>\n"
    "{repo_list}\n"
    "_Note_: If you don't want all resources to be included just remove the elements from the list above. "
    "If you would like to exclude all repositories and resources just add a single element \"all\" to the list.\n"
)
def issue_url(username, repos):
    """Build the opt-out Markdown snippet with a pre-filled GitHub issue link.

    Args:
        username: GitHub username, inserted into the issue title.
        repos: Iterable of repository names, rendered as a " - " bullet list
            in the issue body.

    Returns:
        ``opt_out_text_template`` with the percent-encoded issue title and
        body substituted in.
    """
    # Import the submodule explicitly: a bare `import urllib` does not
    # guarantee that `urllib.parse` has been loaded.
    from urllib.parse import quote

    # One bullet per repository; an empty list yields an empty string rather
    # than a dangling " - " bullet.
    repo_list = "\n".join(f" - {repo}" for repo in repos)
    title = quote(opt_out_issue_title.format(username=username))
    body = quote(opt_out_issue_body.format(repo_list=repo_list))
    return opt_out_text_template.format(title=title, body=body)
def check_username(username, version):
    """Report which of a user's repositories are listed in a version of The Stack.

    Args:
        username: GitHub username as typed by the visitor. Surrounding
            whitespace is ignored.
        version: Version tag of The Stack to check, e.g. ``"v2.0.1"``.

    Returns:
        A ``(result_markdown, opt_out_markdown)`` tuple. The second element
        is the snippet produced by ``issue_url`` when at least one repository
        matches, and an empty string otherwise.
    """
    # Pasted usernames often carry stray whitespace, which would otherwise
    # miss the lookup and produce a false "No".
    # NOTE(review): the lookup is case-sensitive; confirm whether the dataset
    # keys preserve the casing users are likely to type.
    username = (username or "").strip()

    # `.get` instead of indexing so unknown usernames are never inserted into
    # the `usernames` defaultdict.
    repos = [
        repo
        for repo, repo_versions in usernames.get(username, {}).items()
        if version in repo_versions
    ]
    if not repos:
        return "**No**, your code is not in The Stack.", ""

    # v2.x results link each repository to its Software Heritage visit page;
    # older versions only list the repository names.
    is_v2 = version[:2] == "v2"
    repo_word = "repository" if len(repos) == 1 else "repositories"
    if is_v2:
        header = f"**Yes**, there is code from **{len(repos)} {repo_word}** in The Stack. Check the links to see when it was archived by Software Heritage:\n\n"
        entries = [
            f"[{repo}](https://archive.softwareheritage.org/browse/origin/visits/?origin_url=https://github.com/{repo})\n\n"
            for repo in repos
        ]
    else:
        header = f"**Yes**, there is code from **{len(repos)} {repo_word}** in The Stack:\n\n"
        entries = [f"_{repo}_\n\n" for repo in repos]

    output_md = header + "".join(entries)
    return output_md.strip(), issue_url(username, repos)
# Page layout: a single row whose wide middle column (scale 6) holds all
# widgets, flanked by two empty columns (scale 1 each).
with gr.Blocks() as demo:
    with gr.Row():
        _, colum_2, _ = gr.Column(scale=1), gr.Column(scale=6), gr.Column(scale=1)
        with colum_2:
            gr.Markdown(text)
            version = gr.Dropdown(
                ["v2.0.1", "v2.0", "v1.2", "v1.1", "v1.0"],
                label="The Stack version:",
                value="v2.0.1",
            )
            username = gr.Text("", label="Your GitHub username:")
            check_button = gr.Button("Check!")
            # Output areas: the repository list and the opt-out instructions.
            repos = gr.Markdown()
            opt_out = gr.Markdown()
            # check_username returns (result_markdown, opt_out_markdown),
            # mapped onto the two Markdown outputs in that order.
            check_button.click(check_username, [username, version], [repos, opt_out])

demo.launch()