"""Streamlit app for manually reviewing dataset samples.

One sample of the selected dataset is shown at a time.  "GOOD" moves on to
the next sample; "BAD" first appends the sample to a per-dataset JSONL file
under ``BAD_EXAMPLES_PATH`` and then moves on.
"""
import json
import os

import streamlit as st
import streamlit.components.v1 as components

BAD_EXAMPLES_PATH = "bad_examples"
DATA_PATH = "data"


def load_jsonl(file_path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        file_path: Path of a file holding one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. a trailing newline at EOF) are not records.
        return [json.loads(line) for line in f if line.strip()]


def _bad_examples_file(dataset_name):
    """Return the path of the bad-examples JSONL file for *dataset_name*."""
    return f'{BAD_EXAMPLES_PATH}/{dataset_name}_bad_examples.jsonl'


# Position of the sample currently shown; kept in session state so that it
# survives the script rerun Streamlit performs after every interaction.
if 'idx' not in st.session_state:
    st.session_state.idx = 0


def get_next_item():
    """Advance the review position to the next sample."""
    st.session_state.idx += 1


def save_and_get_next_item(sample):
    """Record *sample* as a bad example, then advance to the next sample.

    The sample is appended as a single JSON line to the bad-examples file of
    the dataset selected in the sidebar (module-level ``dataset``).

    Args:
        sample: JSON-serialisable record to store.
    """
    with open(_bad_examples_file(dataset), 'a', encoding='utf-8') as f:
        f.write(json.dumps(sample) + '\n')
    get_next_item()


def _reset_index():
    """Move the review position back to the first sample."""
    st.session_state.idx = 0


def _clear_bad_examples():
    """Truncate the bad-examples file of the selected dataset."""
    with open(_bad_examples_file(dataset), 'w', encoding='utf-8'):
        pass


datasets = [
    'gutenberg_raw',
    "stackexchange2",
    "bigcode_python_code",
    "bigcode_python_github_issues",
    "bigcode_python_jupyter_scripts_dedup_filtered",
    "books3",
    "c4",
    "s2orc_raw",
]
dataset = st.sidebar.selectbox("Dataset", datasets)
data = load_jsonl(f'{DATA_PATH}/{dataset}_examples_with_stats.json')

# Make sure the bad-examples file exists (and its directory too) so that the
# download button below always has something to open.
os.makedirs(BAD_EXAMPLES_PATH, exist_ok=True)
with open(_bad_examples_file(dataset), 'a', encoding='utf-8'):
    pass

st.sidebar.button("Reset Index", on_click=_reset_index)

with open(_bad_examples_file(dataset), 'r', encoding='utf-8') as f:
    st.sidebar.download_button('Download bad example JSON file', f)
st.sidebar.button("Clear bad examples file", on_click=_clear_bad_examples)

# NOTE(review): the index is shared by all datasets, so switching dataset
# keeps the current position -- confirm whether it should reset instead.
if st.session_state.idx >= len(data):
    # Past the last sample (end of review, or a shorter dataset was selected):
    # show a message instead of failing with an IndexError.
    st.info(
        f"No sample at index {st.session_state.idx}: "
        f"'{dataset}' has {len(data)} samples. "
        "Use 'Reset Index' to start over."
    )
else:
    with st.form(key='checkbox', clear_on_submit=True):
        sample = data[st.session_state.idx]
        text = sample["text"]
        st.text_area(f"text id: {st.session_state.idx}", text, height=500)
        st.form_submit_button('GOOD', on_click=get_next_item)
        st.form_submit_button(
            'BAD', on_click=save_and_get_next_item, args=(sample,)
        )