File size: 1,729 Bytes
724b1ea
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import streamlit as st
import streamlit.components.v1 as components
import json

# Directory holding the per-dataset "<dataset>_bad_examples.jsonl" files that
# collect samples flagged as BAD.
BAD_EXAMPLES_PATH = "bad_examples"
# Directory holding the per-dataset "<dataset>_examples_with_stats.json" input files.
DATA_PATH = "data"

def load_jsonl(file_path):
    """Load a JSON Lines file into a list of decoded records.

    Each non-blank line of the file is parsed as one JSON document.

    Args:
        file_path: Path of the JSON Lines file to read.

    Returns:
        A list with one decoded object per non-blank line, in file order.
        An empty file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # JSON text is UTF-8, so do not depend on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        # Blank / whitespace-only lines (e.g. a stray trailing newline) carry
        # no record and would otherwise make json.loads raise.
        return [json.loads(line) for line in f if line.strip()]


# Seed the per-session sample index the first time it is needed (and again
# after it has been deleted from the session state).
if 'idx' not in st.session_state:
    st.session_state['idx'] = 0

def get_next_item():
    """Move the session's sample index forward by one."""
    st.session_state.idx = st.session_state.idx + 1

def save_and_get_next_item(sample):
    """Record ``sample`` as a bad example, then advance to the next sample.

    The sample is serialized as one JSON line and appended to the
    bad-examples file of the currently selected ``dataset``.
    """
    out_path = f'{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl'
    record = json.dumps(sample)

    with open(out_path, 'a') as out:
        out.write(f'{record}\n')

    get_next_item()


# Datasets offered in the sidebar; each one maps to a file under DATA_PATH.
datasets = [
    "gutenberg_raw",
    "stackexchange2",
    "bigcode_python_code",
    "bigcode_python_github_issues",
    "bigcode_python_jupyter_scripts_dedup_filtered",
    "books3",
    "c4",
    "s2orc_raw",
]
dataset = st.sidebar.selectbox("Dataset", datasets)

# The file carries a .json extension but is read line by line as JSON Lines.
examples_file = f'{DATA_PATH}/{dataset}_examples_with_stats.json'
data = load_jsonl(examples_file)

bad_examples_file = f'{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl'

# Create the bad-examples file if it does not exist yet; append mode leaves
# any existing content untouched.
open(bad_examples_file, 'a').close()


def reset_index():
    """Drop the stored sample index from the session state."""
    del st.session_state['idx']


def clear_bad_examples():
    """Truncate the bad-examples file of the selected dataset."""
    open(bad_examples_file, 'w').close()


st.sidebar.button("Reset Index", on_click=reset_index)

with open(bad_examples_file, "r+") as f:
    st.sidebar.download_button('Download bad example JSON file', f)

st.sidebar.button("Clear bad examples file", on_click=clear_bad_examples)

# Show the current sample with GOOD / BAD buttons.
# The index can point past the end of ``data``: after the last sample has been
# judged, when the sidebar switches to a dataset with fewer samples (the index
# is not reset on a dataset change), or when the data file is empty. Indexing
# blindly would raise IndexError, so show a message instead.
if st.session_state.idx >= len(data):
    st.info(
        f"No more examples to review in '{dataset}' ({len(data)} total). "
        "Use 'Reset Index' in the sidebar to start over."
    )
else:
    with st.form(key='checkbox', clear_on_submit=True):
        sample = data[st.session_state.idx]
        text = sample["text"]
        st.text_area(f"text id: {st.session_state.idx}", text, height=500)

        # GOOD only advances; BAD first appends the sample to the bad-examples file.
        good = st.form_submit_button('GOOD', on_click=get_next_item)
        bad = st.form_submit_button('BAD', on_click=save_and_get_next_item, args=(sample,))