|
import streamlit as st |
|
|
|
from defaults import ARGILLA_URL |
|
from utils import project_sidebar |
|
|
|
# Browser-tab metadata for this page. Kept ahead of every other Streamlit
# call, which Streamlit has historically required for set_page_config.
page_settings = {
    "page_title": "Domain Data Grower",
    "page_icon": "🧑🌾",
}
st.set_page_config(**page_settings)

# Draw the project sidebar provided by the local `utils` module.
project_sidebar()
|
|
|
|
|
|
|
|
|
|
|
# Page banner: app title, a rule, then the heading and one-line summary for
# this step of the workflow.
app_title = "🧑🌾 Domain Data Grower"
step_heading = "Step 3. Run the pipeline to generate synthetic data"
step_summary = "Define the distilabel pipeline for generating the dataset."

st.header(app_title)
st.divider()
st.subheader(step_heading)
st.write(step_summary)
|
|
|
|
|
|
|
|
|
|
|
# Read the values this page needs from the Streamlit session state.
# `.get` yields None for any key that has not been set, so the page can
# decide later whether enough information is available.
hub_username, project_name, hub_token = (
    st.session_state.get(state_key)
    for state_key in ("hub_username", "project_name", "hub_token")
)
|
|
|
st.divider()

st.markdown("#### 🤖 Inference configuration")

st.write(
    "Add the url of the Huggingface inference API or endpoint that your pipeline should use. You can find compatible models here:"
)

# (description, endpoint URL) pairs shown in the expander, listed from the
# most to the least resource-hungry suggestion.
recommended_models = (
    (
        "🔋Projects with sufficient resources could take advantage of LLama3 70b",
        "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-70B",
    ),
    (
        "🪫Projects with less resources could take advantage of LLama 3 8b",
        "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B",
    ),
    (
        "🍃Projects with even less resources could take advantage of Phi-2",
        "https://api-inference.huggingface.co/models/microsoft/phi-2",
    ),
)

with st.expander("🤗 Recommended Models"):
    st.write("All inference endpoint compatible models can be found via the link below")
    st.link_button(
        "🤗 Inference compatible models on the hub",
        "https://huggingface.co/models?pipeline_tag=text-generation&other=endpoints_compatible&sort=trending",
    )

    # Each suggestion is a short blurb followed by a copyable endpoint URL.
    for model_blurb, model_endpoint in recommended_models:
        st.write(model_blurb)
        st.code(model_endpoint)

    st.write("Note Huggingface Pro gives access to more compute resources")
    st.link_button(
        "🤗 Huggingface Pro",
        "https://huggingface.co/pricing",
    )
|
|
|
|
|
# Inference endpoint the generated pipeline should call; pre-filled with a
# default model URL that the user can overwrite.
default_endpoint = (
    "https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta"
)
base_url = st.text_input(
    label="Base URL for the Inference API",
    value=default_endpoint,
)
st.divider()

# Connection details for the Argilla instance that receives the dataset.
# The URL defaults to ARGILLA_URL from `defaults`; the dataset name defaults
# to the project name read from the session state.
st.markdown("#### 🔬 Argilla API details to push the generated dataset")
argilla_url = st.text_input(label="Argilla API URL", value=ARGILLA_URL)
argilla_api_key = st.text_input(label="Argilla API Key", value="owner.apikey")
argilla_dataset_name = st.text_input(
    label="Argilla Dataset Name",
    value=project_name,
)
st.divider()
|
|
|
|
|
|
|
|
|
|
|
st.markdown("## Run the pipeline")

st.markdown(
    "Once you've defined the pipeline configuration above, you can run the pipeline from your local machine."
)

# Every value below is interpolated into the shell commands shown to the
# user, so all of them must be non-empty before the commands are rendered.
# `hub_username` is included because it forms part of the clone URL.
required_values = (
    argilla_api_key,
    argilla_url,
    argilla_dataset_name,
    base_url,
    hub_username,
    project_name,
    hub_token,
)

if all(required_values):
    st.markdown(
        "To run the pipeline locally, you need to have the `distilabel` library installed. You can install it using the following command:"
    )

    install_snippet = "\n".join(
        (
            "# Install the distilabel library",
            "pip install git+https://github.com/argilla-io/distilabel.git",
        )
    )
    st.code(install_snippet, language="bash")

    st.markdown("Next, you'll need to clone your dataset repo and run the pipeline:")

    # The trailing backslashes are shell line continuations: they keep the
    # multi-line `python pipeline.py ...` invocation a single command when
    # the snippet is pasted into a terminal.
    run_snippet = "\n".join(
        (
            "# Clone the project and install the requirements",
            f"git clone https://huggingface.co/datasets/{hub_username}/{project_name}",
            f"cd {project_name}",
            "pip install -r requirements.txt",
            "",
            "# Run the pipeline",
            "python pipeline.py \\",
            f"    --argilla-api-key {argilla_api_key} \\",
            f"    --argilla-api-url {argilla_url} \\",
            f"    --argilla-dataset-name {argilla_dataset_name} \\",
            f"    --endpoint-base-url {base_url} \\",
            f"    --hub-token {hub_token}",
        )
    )
    st.code(run_snippet, language="bash")

    st.markdown(
        "👩🚀 If you want to customise the pipeline take a look in `pipeline.py` and the [distilabel docs](https://distilabel.argilla.io/)"
    )
else:
    # At least one required value is missing; ask the user to complete the form.
    st.info("Please fill all the required fields.")
|
|