|
import streamlit as st |
|
import streamlit.components.v1 as components |
|
import json |
|
|
|
# Directory the "<dataset>_examples_with_stats.json" input files are read from.
DATA_PATH = "data"
# Directory the "<dataset>_bad_examples.jsonl" output files are written to.
BAD_EXAMPLES_PATH = "bad_examples"
|
|
|
def load_jsonl(file_path):
    """Load a JSON Lines file into a list of decoded records.

    Args:
        file_path: Path of the file to read. Every non-blank line must hold
            exactly one JSON document.

    Returns:
        A list with one decoded object per non-blank line, in file order.
        An empty file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default text encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        # Blank lines (for example a trailing empty line) carry no record;
        # passing them to json.loads would raise, so they are skipped.
        return [json.loads(line) for line in f if line.strip()]
|
|
|
|
|
# Start at the first example when this session has no stored position yet.
if 'idx' not in st.session_state:
    st.session_state['idx'] = 0
|
|
|
def get_next_item():
    """Advance the session's example index by one."""
    current = st.session_state.idx
    st.session_state.idx = current + 1
|
|
|
def save_and_get_next_item(sample):
    """Record `sample` as a bad example, then move on to the next item.

    The sample is serialized with ``json.dumps`` and appended as a single
    line to the bad-examples file of the module-level ``dataset``.

    Args:
        sample: JSON-serializable record to append.
    """
    target = f'{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl'
    with open(target, 'a') as out:
        line = json.dumps(sample) + '\n'
        out.write(line)

    get_next_item()
|
|
|
|
|
# Dataset names offered in the sidebar; each one selects an input file
# named "<dataset>_examples_with_stats.json" under DATA_PATH.
datasets = [
    'gutenberg_raw',
    "stackexchange2",
    "bigcode_python_code",
    "bigcode_python_github_issues",
    "bigcode_python_jupyter_scripts_dedup_filtered",
    "books3",
    "c4",
    "s2orc_raw",
]

# Dataset currently chosen by the reviewer.
dataset = st.sidebar.selectbox("Dataset", datasets)

# Despite the ".json" suffix, the file is parsed one JSON document per line.
examples_file = f'{DATA_PATH}/{dataset}_examples_with_stats.json'
data = load_jsonl(examples_file)
|
|
|
|
|
bad_examples_file = f'{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl'

# Append mode creates the file when it is missing and leaves existing
# content untouched, so the "r+" open further down finds a file to read.
with open(bad_examples_file, 'a'):
    pass


def _reset_index():
    """Remove the stored example index from the session state."""
    del st.session_state['idx']


def _clear_bad_examples():
    """Truncate the bad-examples file of the selected dataset."""
    open(f'{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl', 'w').close()


st.sidebar.button("Reset Index", on_click=_reset_index)

# Offer the bad examples collected so far as a download.
with open(bad_examples_file, "r+") as collected:
    st.sidebar.download_button('Download bad example JSON file', collected)

st.sidebar.button("Clear bad examples file", on_click=_clear_bad_examples)
|
|
|
# Review form: shows the current example and lets the reviewer rate it.
# The lookup is guarded because the stored index can point past the end of
# `data` once the last example has been rated, or after switching to a
# dataset with fewer examples; indexing unguarded would raise IndexError.
if st.session_state.idx < len(data):
    with st.form(key='checkbox', clear_on_submit=True):
        sample = data[st.session_state.idx]
        text = sample["text"]
        st.text_area(f"text id: {st.session_state.idx}", text, height=500)

        # Both buttons act through their on_click callbacks, so their
        # return values are not kept.
        st.form_submit_button('GOOD', on_click=get_next_item)
        st.form_submit_button('BAD', on_click=save_and_get_next_item, args=(sample,))
else:
    st.info(
        f"No more examples to review in '{dataset}'. "
        "Use \"Reset Index\" in the sidebar to start over."
    )
|
|