File size: 4,293 Bytes
c9e8e4a 3bce3fb a16fa71 41d27ac c9e8e4a fa5e188 7c0d726 8d58283 68bc50c fa5e188 c9e8e4a f4313df c9e8e4a 7c0d726 41d27ac 7c0d726 f7b6a4b 7c0d726 50f4554 7c0d726 2dc5a7a 807f36d c5fafcd 0d5adbc a7dffcb 50f4554 4bd868a 7d968ad a5b4c8d 9d2b32b 0b16412 4bd868a 0d5adbc 7036561 816c983 12798fb 29136c5 46dbbb1 0b16412 58551fa 0d5adbc a7dffcb 0b16412 cbaefe9 8d58283 0d5adbc 7036561 33147c8 29136c5 606a970 29136c5 68bc50c 99db140 596c6fa 606a970 12798fb 33147c8 12798fb 606a970 12798fb 06d2b63 33147c8 06d2b63 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
import json
import pandas as pd
import requests
from multiprocessing import Pool
from functools import partial
import streamlit as st
# Hugging Face page of the GitHub code dataset.
GITHUB_CODE = "https://huggingface.co/datasets/lvwerra/github-code"

# Figures shown in the architecture and evaluation sections; both live in the
# same image repository, so they are derived from one base URL.
_REPO_IMAGES = "https://huggingface.co/datasets/loubnabnl/repo-images/raw/main"
INCODER_IMG = f"{_REPO_IMAGES}/incoder.png"
HUMANEVAL_IMG = f"{_REPO_IMAGES}/humaneval_scores.png"

# Models selectable in the dataset and architecture sections.
MODELS = ["CodeParrot", "InCoder", "CodeGen", "PolyCoder"]
# Models selectable in the code generation demo.
GENERATION_MODELS = ["CodeParrot", "InCoder"]
@st.cache()
def load_examples():
    """Load the prompt examples offered in the code generation demo.

    Reads ``utils/examples.json`` (relative to the working directory) and
    returns the parsed JSON content. The rest of this script indexes the
    result as a sequence of dicts with ``"name"``, ``"value"`` and
    ``"length"`` keys.

    The file is decoded explicitly as UTF-8 so the result does not depend on
    the platform's default encoding.
    """
    with open("utils/examples.json", "r", encoding="utf-8") as f:
        return json.load(f)
def generate_code(model_name, gen_prompt, max_new_tokens, temperature, seed, timeout=120):
    """Request a code completion from a model's hosted prediction endpoint.

    Args:
        model_name: Model display name; it is lower-cased to build the
            endpoint URL.
        gen_prompt: Prompt text sent to the model.
        max_new_tokens: Number of tokens to generate.
        temperature: Sampling temperature.
        seed: Random seed forwarded to the endpoint.
        timeout: Seconds to wait for the endpoint before giving up, so a
            stalled request cannot block the app indefinitely.

    Returns:
        The first element of the ``"data"`` list in the JSON response.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    url = (
        f"https://hf.space/embed/loubnabnl/{model_name.lower()}-subspace/+/api/predict/"
    )
    response = requests.post(
        url=url,
        json={"data": [gen_prompt, max_new_tokens, temperature, seed]},
        timeout=timeout,
    )
    # Fail with a clear HTTP error instead of an opaque JSON/KeyError below.
    response.raise_for_status()
    return response.json()["data"][0]
def read_markdown(path):
    """Render the text file at ``path`` as Markdown on the page.

    Args:
        path: Path of the file to read; its whole content is passed to
            ``st.markdown``.

    The file is decoded explicitly as UTF-8 so rendering does not depend on
    the platform's default encoding.
    """
    with open(path, "r", encoding="utf-8") as f:
        output = f.read()
    st.markdown(output)
# Page setup: wide layout with a laptop emoji as the page icon.
st.set_page_config(page_icon=":laptop:", layout="wide")

# Sidebar: render the table of contents. Decoded explicitly as UTF-8 so the
# result does not depend on the platform's default encoding.
with open("utils/table_contents.txt", "r", encoding="utf-8") as f:
    contents = f.read()
st.sidebar.markdown(contents)

# Introduction
st.title("Code generation with 🤗")
with open("utils/intro.txt", "r", encoding="utf-8") as f:
    intro = f.read()
st.markdown(intro)
# Section 1: pretraining datasets
st.subheader("1 - Pretraining datasets")
for intro_page in ("datasets/intro.txt", "datasets/github_code.txt"):
    read_markdown(intro_page)

# Disabled preview of the Github-code dataset, kept for reference:
#st.markdown(f"Preview of some code files from Github repositories in [Github-code dataset]({GITHUB_CODE}):")
#df = pd.read_csv("utils/data_preview.csv")
#st.dataframe(df)

# Keep the model picker in the narrow left column; the description of the
# chosen model's dataset is rendered below it at full width.
col1, col2 = st.columns([1, 2])
with col1:
    selected_model = st.selectbox("", MODELS, key=1)
dataset_page = f"datasets/{selected_model.lower()}.txt"
read_markdown(dataset_page)
# Section 2: model architecture
st.subheader("2 - Model architecture")
read_markdown("architectures/intro.txt")

# Model picker in the narrow left column, description at full width.
col1, col2 = st.columns([1, 2])
with col1:
    selected_model = st.selectbox("", MODELS, key=2)
architecture_page = f"architectures/{selected_model.lower()}.txt"
read_markdown(architecture_page)

# Only InCoder has an accompanying training figure.
show_incoder_figure = selected_model == "InCoder"
if show_incoder_figure:
    st.image(INCODER_IMG, caption="Figure 1: InCoder training", width=700)
# Section 3: evaluation of the code models
st.subheader("3 - Code models evaluation")
read_markdown("evaluation/intro.txt")
# Score table image, followed by the HumanEval demo text.
st.image(
    HUMANEVAL_IMG,
    caption="Table 1: HumanEval scores",
    width=600,
)
read_markdown("evaluation/demo_humaneval.txt")
# Section 4: code generation demo - collect the user's inputs
st.subheader("4 - Code generation ✨")
# Left column: model and example pickers; middle column: spacer;
# right column: generation settings.
col1, col2, col3 = st.columns([7, 1, 6])

with col1:
    st.markdown("**Models**")
    selected_models = st.multiselect(
        "Select code generation models to compare:",
        GENERATION_MODELS,
        default=["CodeParrot"],
        key=3,
    )
    st.markdown(" ")
    st.markdown("**Examples**")
    examples = load_examples()
    example_names = [entry["name"] for entry in examples]
    # Map each example name back to its position in `examples`.
    name2id = {name: position for position, name in enumerate(example_names)}
    selected_example = st.selectbox(
        "Select one of the following examples or implement yours:", example_names
    )
    chosen_example = examples[name2id[selected_example]]
    example_text = chosen_example["value"]
    default_length = chosen_example["length"]

with col3:
    st.markdown("**Generation settings**")
    temperature = st.slider(
        "Temperature:", min_value=0.0, max_value=2.0, value=0.2, step=0.1
    )
    # The selected example supplies the initial generation length.
    max_new_tokens = st.slider(
        "Number of tokens to generate:",
        min_value=8,
        max_value=256,
        value=default_length,
        step=8,
    )
    seed = st.slider(
        "Random seed:", min_value=0, max_value=1000, value=42, step=1
    )

# The prompt box is pre-filled with the selected example and can be edited.
raw_prompt = st.text_area(
    "Generate code with prompt:",
    value=example_text,
    height=200,
)
gen_prompt = raw_prompt.strip()
# Query every selected model in parallel and show each completion.
if st.button("Generate code!"):
    with st.spinner("Generating code..."):
        # Bind the shared generation settings; only the model name varies.
        generate_parallel = partial(
            generate_code,
            gen_prompt=gen_prompt,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            seed=seed,
        )
        # The context manager shuts the worker processes down once the
        # blocking map() has returned, so repeated clicks do not leak pools.
        with Pool() as pool:
            output = pool.map(generate_parallel, selected_models)
        # map() preserves input order, so results line up with model names.
        for model_name, generated_text in zip(selected_models, output):
            st.markdown(f"**{model_name}**")
            st.code(generated_text)
|