File size: 1,844 Bytes
a431d31
 
fcb283e
 
 
a431d31
 
a6c2f56
b0c1665
fcb283e
 
 
 
 
a431d31
7666b36
 
a431d31
039d726
7666b36
2cce3db
 
a431d31
2cce3db
fcb283e
a431d31
b0c1665
 
fcb283e
2cce3db
 
 
b3c4ada
b0c1665
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
from datasets import load_dataset
import streamlit as st
from huggingface_hub import hf_hub_download
import gzip
import json


@st.cache(allow_output_mutation=True)
def load_all_usernames():
    """Download and parse the username-to-repositories index.

    Fetches ``username_to_repo.json.gz`` from the
    ``bigcode/the-stack-username-to-repo`` dataset repository on the
    Hugging Face Hub and returns the decoded JSON document. The rest of
    this script treats the result as a mapping from a GitHub username to
    an iterable of repository names.

    The function is wrapped in ``st.cache`` so Streamlit can reuse the
    parsed result instead of re-reading the archive on every call.
    """
    archive_path = hf_hub_download(
        repo_id="bigcode/the-stack-username-to-repo",
        filename="username_to_repo.json.gz",
        repo_type="dataset",
    )

    # Text mode lets gzip do the UTF-8 decoding, so the JSON parser can
    # read straight from the stream.
    with gzip.open(archive_path, "rt", encoding="utf-8") as archive:
        return json.load(archive)

# --- Page header and introductory copy -------------------------------------
st.image("./banner.png", use_column_width=True)

st.title("Am I in The Stack?")
st.markdown("> The Stack is an open governance interface between the AI and open source communities.")
st.markdown("As part of the BigCode project, we released and maintain [The Stack](https://huggingface.co/datasets/bigcode/the-stack), a 3.1 TB dataset of permissively licensed source code in 30 programming languages. One of our goals in this project is to give the people who wrote this source code a choice as to whether or not it can be employed to develop and evaluate LLMs, as we acknowledge that not all developers may wish to have their data used for that purpose.")

st.markdown("This tool lets you check if a repository under a given username is part of The Stack dataset.")

# --- Lookup form -----------------------------------------------------------
usernames = load_all_usernames()
# Strip surrounding whitespace: a username pasted with a stray space or
# newline would otherwise miss the lookup and produce a false "No".
username = st.text_input("Your GitHub Username:").strip()

if st.button("Check!"):
    if not username:
        # Nothing was entered, so ask for input rather than reporting a
        # definitive (and misleading) negative result.
        st.warning("Please enter a GitHub username.")
    elif username in usernames:
        st.markdown("**Yes**, your data is in The Stack:")
        # List each repository recorded for this username as inline code.
        for repo_name in usernames[username]:
            st.markdown(f"`{repo_name}`")

        st.markdown("Would you like to have your data removed from future versions of The Stack? You can opt-out following the instructions [here](https://www.bigcode-project.org/docs/about/the-stack/#how-can-i-request-that-my-data-be-removed-from-the-stack).")
    else:
        st.markdown("**No**, your data is not in The Stack.")