|
import json |
|
import pandas as pd |
|
import requests |
|
from multiprocessing import Pool |
|
from functools import partial |
|
import streamlit as st |
|
|
|
|
|
# Model names offered in the pickers below; the lowercased name is also used
# to build the per-model text-file paths and the inference endpoint URL.
MODELS = ["CodeParrot", "InCoder"]

# Dataset page linked from the "Pretraining datasets" section.
GITHUB_CODE = "https://huggingface.co/datasets/lvwerra/github-code"

# Figure shown in the architecture section when InCoder is selected.
INCODER_IMG = "https://huggingface.co/datasets/loubnabnl/repo-images/raw/main/incoder.png"
|
|
|
@st.cache()
def load_examples():
    """Load the prompt examples offered in the code generation section.

    Reads ``utils/examples.json`` (relative to the working directory) and
    returns the parsed document. The function is decorated with
    ``st.cache()`` so Streamlit can reuse the result instead of re-reading
    the file.

    Returns:
        The decoded JSON content. The rest of the script indexes it as a
        sequence of dicts with ``"name"``, ``"value"`` and ``"length"`` keys.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    # JSON text is UTF-8 by specification; being explicit keeps the result
    # independent of the platform's default locale encoding.
    with open("utils/examples.json", "r", encoding="utf-8") as f:
        return json.load(f)
|
|
|
|
|
def generate_code(model_name, gen_prompt, max_new_tokens, temperature, seed, timeout=120):
    """Request a completion from the hosted inference Space of ``model_name``.

    Args:
        model_name: Model display name; it is lowercased and interpolated
            into the Space URL.
        gen_prompt: Prompt text sent as the first element of the payload.
        max_new_tokens: Forwarded unchanged as the second payload element.
        temperature: Forwarded unchanged as the third payload element.
        seed: Forwarded unchanged as the fourth payload element.
        timeout: Seconds handed to ``requests.post`` as its ``timeout``;
            without one a stalled endpoint would block the caller forever.

    Returns:
        The first element of the ``"data"`` field of the JSON response.

    Raises:
        requests.exceptions.RequestException: on connection failure, timeout,
            or a non-success HTTP status.
    """
    url = f"https://hf.space/embed/loubnabnl/{model_name.lower()}-subspace/+/api/predict/"
    payload = {"data": [gen_prompt, max_new_tokens, temperature, seed]}
    response = requests.post(url=url, json=payload, timeout=timeout)
    # Fail with a clear HTTP error instead of a KeyError/JSON decode error
    # further down when the endpoint answers with an error page.
    response.raise_for_status()
    return response.json()["data"][0]
|
|
|
|
|
|
|
|
|
|
|
# Page title followed by the introduction, read from utils/intro.txt and
# rendered with st.markdown.
# NOTE(review): the trailing characters of the title look like a mis-decoded
# emoji -- confirm against the original file before touching the literal.
st.title("Code generation with π€")
# Decode explicitly as UTF-8 so non-ASCII text renders the same regardless of
# the platform's default locale encoding.
with open("utils/intro.txt", "r", encoding="utf-8") as f:
    intro = f.read()
st.markdown(intro)
|
|
|
|
|
# Section 1: preview of the pretraining data plus a per-model description.
# NOTE(review): the trailing character of the title looks like a mis-decoded
# emoji -- confirm against the original file before touching the literal.
st.title("1 - Pretraining datasets π")
st.markdown(
    f"Preview of some code files from Github repositories in [Github-code dataset]({GITHUB_CODE}):"
)
df = pd.read_csv("utils/data_preview.csv")
st.dataframe(df)

st.header("Model")
# Explicit widget key: the same label/options pair is used again by the
# selectbox in the architecture section.
selected_model = st.selectbox("Select a code generation model", MODELS, key=1)
# The description file is named after the lowercased model name. Decode it
# explicitly as UTF-8 so rendering does not depend on the locale encoding.
with open(f"datasets/{selected_model.lower()}.txt", "r", encoding="utf-8") as f:
    text = f.read()
st.markdown(text)
|
|
|
|
|
# Section 2: architecture description of the selected model.
st.title("2 - Model architecture")
st.markdown("Most code generation models use GPT style architectures trained on code. Some use encoder-decoder architectures such as AlphaCode.")

st.header("Model")
# Explicit widget key: the same label/options pair is also used by the
# selectbox in the datasets section.
selected_model = st.selectbox("Select a code generation model", MODELS, key=2)
# The description file is named after the lowercased model name. Decode it
# explicitly as UTF-8 so rendering does not depend on the locale encoding.
with open(f"architectures/{selected_model.lower()}.txt", "r", encoding="utf-8") as f:
    text = f.read()
st.markdown(text)
# Only InCoder has an accompanying figure.
if selected_model == "InCoder":
    st.image(INCODER_IMG, caption="Figure 1: InCoder training", width=700)
|
|
|
|
|
# Section 3: evaluation overview, read from evaluation/intro.txt and rendered
# with st.markdown.
# NOTE(review): the trailing character of the title looks like a mis-decoded
# emoji -- confirm against the original file before touching the literal.
st.title("3 - Code models evaluation π")
# Decode explicitly as UTF-8 so non-ASCII text renders the same regardless of
# the platform's default locale encoding.
with open("evaluation/intro.txt", "r", encoding="utf-8") as f:
    intro = f.read()
st.markdown(intro)
|
|
|
|
|
# Section 4: interactive generation. The user picks models, a prompt and the
# sampling settings; one request per selected model is issued in parallel.
# NOTE(review): the trailing characters of the title look like a mis-decoded
# emoji -- confirm against the original file before touching the literal.
st.title("4 - Code generation π»")
st.header("Models")
selected_models = st.multiselect(
    "Select code generation models to compare", MODELS, default=["CodeParrot"], key=3
)

st.header("Examples")
examples = load_examples()
example_names = [example["name"] for example in examples]
# Map each example name to its position in `examples`.
name2id = {name: i for i, name in enumerate(example_names)}
selected_example = st.selectbox(
    "Select one of the following examples or implement yours", example_names
)
chosen_example = examples[name2id[selected_example]]
example_text = chosen_example["value"]
default_length = chosen_example["length"]

st.header("Generation settings")
temperature = st.slider(
    "Temperature:", value=0.2, min_value=0.0, step=0.1, max_value=2.0
)
# The selected example supplies the initial token budget.
max_new_tokens = st.slider(
    "Number of tokens to generate:",
    value=default_length,
    min_value=8,
    step=8,
    max_value=256,
)
seed = st.slider("Random seed:", value=42, min_value=0, step=1, max_value=1000)
gen_prompt = st.text_area(
    "Generate code with prompt:",
    value=example_text,
    height=220,
).strip()

if st.button("Generate code!"):
    if not selected_models:
        # Nothing to request; say so instead of silently showing no output.
        st.warning("Select at least one model to generate code.")
    else:
        with st.spinner("Generating code..."):
            # Bind everything except the model name, which pool.map supplies.
            generate_parallel = partial(
                generate_code,
                gen_prompt=gen_prompt,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                seed=seed,
            )
            # One worker per selected model. The context manager tears the
            # pool down after map() has returned, so worker processes do not
            # accumulate across button clicks.
            with Pool(processes=len(selected_models)) as pool:
                output = pool.map(generate_parallel, selected_models)
            # pool.map preserves input order, so results line up with names.
            for model_name, generated in zip(selected_models, output):
                st.markdown(f"**{model_name}**")
                st.code(generated)
|
|