|
distilabel: |
|
version: 1.0.1 |
|
pipeline: |
|
name: farming |
|
description: null |
|
steps: |
|
- step: |
|
name: load_data |
|
input_mappings: {} |
|
output_mappings: {} |
|
batch_size: 64 |
|
data: |
|
- input: animal welfare from a Family Farming perspective |
|
- input: animal welfare from an Agribusiness perspective |
|
- input: animal welfare from a Permaculture perspective |
|
- input: animal welfare from an Agroforestry perspective |
|
- input: animal welfare from a Conventional Farming perspective |
|
- input: economic growth from a Family Farming perspective |
|
- input: economic growth from an Agribusiness perspective |
|
- input: economic growth from a Permaculture perspective |
|
- input: economic growth from an Agroforestry perspective |
|
- input: economic growth from a Conventional Farming perspective |
|
- input: land from a Family Farming perspective |
|
- input: land from an Agribusiness perspective |
|
- input: land from a Permaculture perspective |
|
- input: land from an Agroforestry perspective |
|
- input: land from a Conventional Farming perspective |
|
- input: resources from a Family Farming perspective |
|
- input: resources from an Agribusiness perspective |
|
- input: resources from a Permaculture perspective |
|
- input: resources from an Agroforestry perspective |
|
- input: resources from a Conventional Farming perspective |
|
- input: efficiency from a Family Farming perspective |
|
- input: efficiency from an Agribusiness perspective |
|
- input: efficiency from a Permaculture perspective |
|
- input: efficiency from an Agroforestry perspective |
|
- input: efficiency from a Conventional Farming perspective |
|
runtime_parameters_info: |
|
- name: batch_size |
|
optional: true |
|
description: The number of rows that will contain the batches generated by |
|
the step. |
|
type_info: |
|
module: distilabel.steps.generators.data |
|
name: LoadDataFromDicts |
|
name: load_data |
|
- step: |
|
name: self-instruct |
|
input_mappings: {} |
|
output_mappings: {} |
|
input_batch_size: 8 |
|
llm: |
|
generation_kwargs: {} |
|
model_id: null |
|
endpoint_name: null |
|
endpoint_namespace: null |
|
base_url: https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2 |
|
tokenizer_id: null |
|
model_display_name: null |
|
use_openai_client: false |
|
type_info: |
|
module: distilabel.llms.huggingface.inference_endpoints |
|
name: InferenceEndpointsLLM |
|
group_generations: false |
|
num_generations: 1 |
|
num_instructions: 5 |
|
criteria_for_query_generation: 'Incorporate a diverse range of verbs, avoiding |
|
repetition. |
|
|
|
Ensure queries are compatible with AI model''s text generation functions and |
|
are limited to 1-2 sentences. |
|
|
|
Design queries to be self-contained and standalone. |
|
|
|
Blend interrogative (e.g., "What is the significance of x?") and imperative |
|
(e.g., "Detail the process of x.") styles.' |
|
application_description: 'You are an AI assistant that generates queries around |
|
the domain of farming. |
|
|
|
You should expect not basic but profound questions from your users. |
|
|
|
The queries should reflect a diversity of vision and economic positions and |
|
political positions. |
|
|
|
The queries may know about different methods of farming. |
|
|
|
The queries can be positioned politically, economically, socially, or practically. |
|
|
|
Also take into account the impact of diverse causes on diverse domains.' |
|
runtime_parameters_info: |
|
- name: input_batch_size |
|
optional: true |
|
description: The number of rows that will contain the batches processed by |
|
the step. |
|
- name: llm |
|
runtime_parameters_info: |
|
- name: generation_kwargs |
|
description: The kwargs to be propagated to either `generate` or `agenerate` |
|
methods within each `LLM`. |
|
keys: |
|
- name: max_new_tokens |
|
optional: true |
|
description: the maximum number of new tokens that the model will generate. Defaults |
|
to `128`. |
|
- name: frequency_penalty |
|
optional: true |
|
description: the frequency penalty to use for the generation. Defaults to |
|
`0.0`. Only applies if `use_openai_client=True`. |
|
- name: presence_penalty |
|
optional: true |
|
description: the presence penalty to use for the generation. Defaults |
|
to `0.0`. Only applies if `use_openai_client=True`. |
|
- name: repetition_penalty |
|
optional: true |
|
description: the repetition penalty to use for the generation. Defaults to |
|
`None`. Only applies if `use_openai_client=False`. |
|
- name: temperature |
|
optional: true |
|
description: the temperature to use for the generation. Defaults to `1.0`. |
|
- name: do_sample |
|
optional: true |
|
description: whether to use sampling for the generation. Defaults to `False`. Only |
|
applies if `use_openai_client=False`. |
|
- name: top_k |
|
optional: true |
|
description: the top-k value to use for the generation. Defaults to `0.8`, |
|
since neither `0.0` nor `1.0` are valid values in TGI. |
|
- name: top_p |
|
optional: true |
|
description: the top-p value to use for the generation. Defaults to `1.0`. |
|
- name: typical_p |
|
optional: true |
|
description: the typical-p value to use for the generation. Defaults to |
|
`0.5`. |
|
- name: endpoint_name |
|
optional: true |
|
description: The name of the Inference Endpoint to use for the LLM. |
|
- name: endpoint_namespace |
|
optional: true |
|
description: The namespace of the Inference Endpoint to use for the LLM. |
|
- name: base_url |
|
optional: true |
|
description: The base URL to use for the Inference Endpoints API requests. |
|
- name: api_key |
|
optional: true |
|
description: The API key to authenticate the requests to the Inference Endpoints |
|
API. |
|
- name: num_generations |
|
optional: true |
|
description: The number of generations to be produced per input. |
|
type_info: |
|
module: distilabel.steps.tasks.self_instruct |
|
name: SelfInstruct |
|
name: self-instruct |
|
- step: |
|
name: evol_instruction_complexity |
|
input_mappings: |
|
instruction: question |
|
output_mappings: {} |
|
input_batch_size: 8 |
|
llm: |
|
generation_kwargs: {} |
|
model_id: null |
|
endpoint_name: null |
|
endpoint_namespace: null |
|
base_url: https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2 |
|
tokenizer_id: null |
|
model_display_name: null |
|
use_openai_client: false |
|
type_info: |
|
module: distilabel.llms.huggingface.inference_endpoints |
|
name: InferenceEndpointsLLM |
|
group_generations: false |
|
num_generations: 1 |
|
num_evolutions: 2 |
|
store_evolutions: true |
|
generate_answers: false |
|
include_original_instruction: true |
|
mutation_templates: |
|
CONSTRAINTS: "I want you act as a Prompt Rewriter.\n\nYour objective is to\ |
|
\ rewrite a given prompt into a more complex version to make those famous\ |
|
\ AI systems (e.g., chatgpt and GPT4) a bit harder to handle.\n\nBut the\ |
|
\ rewritten prompt must be reasonable and must be understood and responded\ |
|
\ by humans.\n\nYour rewriting cannot omit the non-text parts such as the\ |
|
\ table and code in #The Given Prompt#:. Also, please do not omit the input\ |
|
\ in #The Given Prompt#.\n\nYou SHOULD complicate the given prompt using\ |
|
\ the following method: \nPlease add one more constraints/requirements into\ |
|
\ '#The Given Prompt#'\n\nYou should try your best not to make the #Rewritten\ |
|
\ Prompt# become verbose, #Rewritten Prompt# can only add 10 to 20 words\ |
|
\ into #The Given Prompt#.\n\n'#The Given Prompt#', '#Rewritten Prompt#',\ |
|
\ 'given prompt' and 'rewritten prompt' are not allowed to appear in #Rewritten\ |
|
\ Prompt#\n\n#The Given Prompt#:\n<PROMPT>\n#Rewritten Prompt#:\n\n" |
|
DEEPENING: "I want you act as a Prompt Rewriter.\n\nYour objective is to rewrite\ |
|
\ a given prompt into a more complex version to make those famous AI systems\ |
|
\ (e.g., chatgpt and GPT4) a bit harder to handle.\n\nBut the rewritten\ |
|
\ prompt must be reasonable and must be understood and responded by humans.\n\ |
|
\nYour rewriting cannot omit the non-text parts such as the table and code\ |
|
\ in #The Given Prompt#:. Also, please do not omit the input in #The Given\ |
|
\ Prompt#.\n\nYou SHOULD complicate the given prompt using the following\ |
|
\ method: \nIf #The Given Prompt# contains inquiries about certain issues,\ |
|
\ the depth and breadth of the inquiry can be increased.\n\nYou should try\ |
|
\ your best not to make the #Rewritten Prompt# become verbose, #Rewritten\ |
|
\ Prompt# can only add 10 to 20 words into #The Given Prompt#.\n\n'#The\ |
|
\ Given Prompt#', '#Rewritten Prompt#', 'given prompt' and 'rewritten prompt'\ |
|
\ are not allowed to appear in #Rewritten Prompt#\n\n#The Given Prompt#:\n\ |
|
<PROMPT>\n#Rewritten Prompt#:\n\n" |
|
CONCRETIZING: "I want you act as a Prompt Rewriter.\n\nYour objective is to\ |
|
\ rewrite a given prompt into a more complex version to make those famous\ |
|
\ AI systems (e.g., chatgpt and GPT4) a bit harder to handle.\n\nBut the\ |
|
\ rewritten prompt must be reasonable and must be understood and responded\ |
|
\ by humans.\n\nYour rewriting cannot omit the non-text parts such as the\ |
|
\ table and code in #The Given Prompt#:. Also, please do not omit the input\ |
|
\ in #The Given Prompt#.\n\nYou SHOULD complicate the given prompt using\ |
|
\ the following method: \nPlease replace general concepts with more specific\ |
|
\ concepts.\n\nYou should try your best not to make the #Rewritten Prompt#\ |
|
\ become verbose, #Rewritten Prompt# can only add 10 to 20 words into #The\ |
|
\ Given Prompt#.\n\n'#The Given Prompt#', '#Rewritten Prompt#', 'given prompt'\ |
|
\ and 'rewritten prompt' are not allowed to appear in #Rewritten Prompt#\n\ |
|
\n#The Given Prompt#:\n<PROMPT>\n#Rewritten Prompt#:\n\n" |
|
INCREASED_REASONING_STEPS: "I want you act as a Prompt Rewriter.\n\nYour objective\ |
|
\ is to rewrite a given prompt into a more complex version to make those\ |
|
\ famous AI systems (e.g., chatgpt and GPT4) a bit harder to handle.\n\n\ |
|
But the rewritten prompt must be reasonable and must be understood and responded\ |
|
\ by humans.\n\nYour rewriting cannot omit the non-text parts such as the\ |
|
\ table and code in #The Given Prompt#:. Also, please do not omit the input\ |
|
\ in #The Given Prompt#.\n\nYou SHOULD complicate the given prompt using\ |
|
\ the following method: \nIf #The Given Prompt# can be solved with just\ |
|
\ a few simple thinking processes, you can rewrite it to explicitly request\ |
|
\ multiple-step reasoning.\n\nYou should try your best not to make the #Rewritten\ |
|
\ Prompt# become verbose, #Rewritten Prompt# can only add 10 to 20 words\ |
|
\ into #The Given Prompt#.\n\n'#The Given Prompt#', '#Rewritten Prompt#',\ |
|
\ 'given prompt' and 'rewritten prompt' are not allowed to appear in #Rewritten\ |
|
\ Prompt#\n\n#The Given Prompt#:\n<PROMPT>\n#Rewritten Prompt#:\n\n" |
|
BREADTH: 'I want you act as a Prompt Creator. |
|
|
|
|
|
Your goal is to draw inspiration from the #Given Prompt# to create a brand |
|
new prompt. |
|
|
|
|
|
This new prompt should belong to the same domain as the #Given Prompt# but |
|
be even more rare. |
|
|
|
|
|
The LENGTH and complexity of the #Created Prompt# should be similar to that |
|
of the #Given Prompt#. |
|
|
|
|
|
The #Created Prompt# must be reasonable and must be understood and responded |
|
by humans. |
|
|
|
|
|
''#Given Prompt#'', ''#Created Prompt#'', ''given prompt'' and ''created |
|
prompt'' are not allowed to appear in #Created Prompt# |
|
|
|
|
|
#Given Prompt#: |
|
|
|
<PROMPT> |
|
|
|
#Created Prompt#: |
|
|
|
|
|
' |
|
seed: 42 |
|
runtime_parameters_info: |
|
- name: input_batch_size |
|
optional: true |
|
description: The number of rows that will contain the batches processed by |
|
the step. |
|
- name: llm |
|
runtime_parameters_info: |
|
- name: generation_kwargs |
|
description: The kwargs to be propagated to either `generate` or `agenerate` |
|
methods within each `LLM`. |
|
keys: |
|
- name: max_new_tokens |
|
optional: true |
|
description: the maximum number of new tokens that the model will generate. Defaults |
|
to `128`. |
|
- name: frequency_penalty |
|
optional: true |
|
description: the frequency penalty to use for the generation. Defaults to |
|
`0.0`. Only applies if `use_openai_client=True`. |
|
- name: presence_penalty |
|
optional: true |
|
description: the presence penalty to use for the generation. Defaults |
|
to `0.0`. Only applies if `use_openai_client=True`. |
|
- name: repetition_penalty |
|
optional: true |
|
description: the repetition penalty to use for the generation. Defaults to |
|
`None`. Only applies if `use_openai_client=False`. |
|
- name: temperature |
|
optional: true |
|
description: the temperature to use for the generation. Defaults to `1.0`. |
|
- name: do_sample |
|
optional: true |
|
description: whether to use sampling for the generation. Defaults to `False`. Only |
|
applies if `use_openai_client=False`. |
|
- name: top_k |
|
optional: true |
|
description: the top-k value to use for the generation. Defaults to `0.8`, |
|
since neither `0.0` nor `1.0` are valid values in TGI. |
|
- name: top_p |
|
optional: true |
|
description: the top-p value to use for the generation. Defaults to `1.0`. |
|
- name: typical_p |
|
optional: true |
|
description: the typical-p value to use for the generation. Defaults to |
|
`0.5`. |
|
- name: endpoint_name |
|
optional: true |
|
description: The name of the Inference Endpoint to use for the LLM. |
|
- name: endpoint_namespace |
|
optional: true |
|
description: The namespace of the Inference Endpoint to use for the LLM. |
|
- name: base_url |
|
optional: true |
|
description: The base URL to use for the Inference Endpoints API requests. |
|
- name: api_key |
|
optional: true |
|
description: The API key to authenticate the requests to the Inference Endpoints |
|
API. |
|
- name: num_generations |
|
optional: true |
|
description: The number of generations to be produced per input. |
|
- name: seed |
|
optional: true |
|
description: As `numpy` is being used in order to randomly pick a mutation |
|
method, it is useful to set a random seed. |
|
type_info: |
|
module: distilabel.steps.tasks.evol_instruct.base |
|
name: EvolInstruct |
|
name: evol_instruction_complexity |
|
- step: |
|
name: expand_columns |
|
input_mappings: {} |
|
output_mappings: {} |
|
input_batch_size: 50 |
|
columns: |
|
instructions: question |
|
runtime_parameters_info: |
|
- name: input_batch_size |
|
optional: true |
|
description: The number of rows that will contain the batches processed by |
|
the step. |
|
type_info: |
|
module: distilabel.steps.expand |
|
name: ExpandColumns |
|
name: expand_columns |
|
- step: |
|
name: clean_numbered_list |
|
input_mappings: {} |
|
output_mappings: {} |
|
input_batch_size: 50 |
|
runtime_parameters_info: |
|
- name: input_batch_size |
|
optional: true |
|
description: The number of rows that will contain the batches processed by |
|
the step. |
|
type_info: |
|
module: domain |
|
name: CleanNumberedList |
|
name: clean_numbered_list |
|
- step: |
|
name: expand_columns_evolved |
|
input_mappings: {} |
|
output_mappings: {} |
|
input_batch_size: 50 |
|
columns: |
|
evolved_instructions: evolved_questions |
|
runtime_parameters_info: |
|
- name: input_batch_size |
|
optional: true |
|
description: The number of rows that will contain the batches processed by |
|
the step. |
|
type_info: |
|
module: distilabel.steps.expand |
|
name: ExpandColumns |
|
name: expand_columns_evolved |
|
- step: |
|
name: domain_expert |
|
input_mappings: |
|
instruction: evolved_questions |
|
output_mappings: |
|
generation: domain_expert_answer |
|
input_batch_size: 8 |
|
llm: |
|
generation_kwargs: {} |
|
model_id: null |
|
endpoint_name: null |
|
endpoint_namespace: null |
|
base_url: https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2 |
|
tokenizer_id: null |
|
model_display_name: null |
|
use_openai_client: false |
|
type_info: |
|
module: distilabel.llms.huggingface.inference_endpoints |
|
name: InferenceEndpointsLLM |
|
group_generations: false |
|
num_generations: 1 |
|
runtime_parameters_info: |
|
- name: input_batch_size |
|
optional: true |
|
description: The number of rows that will contain the batches processed by |
|
the step. |
|
- name: llm |
|
runtime_parameters_info: |
|
- name: generation_kwargs |
|
description: The kwargs to be propagated to either `generate` or `agenerate` |
|
methods within each `LLM`. |
|
keys: |
|
- name: max_new_tokens |
|
optional: true |
|
description: the maximum number of new tokens that the model will generate. Defaults |
|
to `128`. |
|
- name: frequency_penalty |
|
optional: true |
|
description: the frequency penalty to use for the generation. Defaults to |
|
`0.0`. Only applies if `use_openai_client=True`. |
|
- name: presence_penalty |
|
optional: true |
|
description: the presence penalty to use for the generation. Defaults |
|
to `0.0`. Only applies if `use_openai_client=True`. |
|
- name: repetition_penalty |
|
optional: true |
|
description: the repetition penalty to use for the generation. Defaults to |
|
`None`. Only applies if `use_openai_client=False`. |
|
- name: temperature |
|
optional: true |
|
description: the temperature to use for the generation. Defaults to `1.0`. |
|
- name: do_sample |
|
optional: true |
|
description: whether to use sampling for the generation. Defaults to `False`. Only |
|
applies if `use_openai_client=False`. |
|
- name: top_k |
|
optional: true |
|
description: the top-k value to use for the generation. Defaults to `0.8`, |
|
since neither `0.0` nor `1.0` are valid values in TGI. |
|
- name: top_p |
|
optional: true |
|
description: the top-p value to use for the generation. Defaults to `1.0`. |
|
- name: typical_p |
|
optional: true |
|
description: the typical-p value to use for the generation. Defaults to |
|
`0.5`. |
|
- name: endpoint_name |
|
optional: true |
|
description: The name of the Inference Endpoint to use for the LLM. |
|
- name: endpoint_namespace |
|
optional: true |
|
description: The namespace of the Inference Endpoint to use for the LLM. |
|
- name: base_url |
|
optional: true |
|
description: The base URL to use for the Inference Endpoints API requests. |
|
- name: api_key |
|
optional: true |
|
description: The API key to authenticate the requests to the Inference Endpoints |
|
API. |
|
- name: num_generations |
|
optional: true |
|
description: The number of generations to be produced per input. |
|
type_info: |
|
module: domain |
|
name: DomainExpert |
|
name: domain_expert |
|
- step: |
|
name: keep_columns |
|
input_mappings: {} |
|
output_mappings: {} |
|
input_batch_size: 50 |
|
columns: |
|
- model_name |
|
- evolved_questions |
|
- domain_expert_answer |
|
runtime_parameters_info: |
|
- name: input_batch_size |
|
optional: true |
|
description: The number of rows that will contain the batches processed by |
|
the step. |
|
type_info: |
|
module: distilabel.steps.keep |
|
name: KeepColumns |
|
name: keep_columns |
|
- step: |
|
name: text_generation_to_argilla |
|
input_mappings: |
|
instruction: evolved_questions |
|
generation: domain_expert_answer |
|
output_mappings: {} |
|
input_batch_size: 50 |
|
dataset_name: test_3 |
|
dataset_workspace: admin |
|
api_url: https://burtenshaw-test-3-argilla-space.hf.space |
|
runtime_parameters_info: |
|
- name: input_batch_size |
|
optional: true |
|
description: The number of rows that will contain the batches processed by |
|
the step. |
|
- name: dataset_name |
|
optional: false |
|
description: The name of the dataset in Argilla. |
|
- name: dataset_workspace |
|
optional: true |
|
description: The workspace where the dataset will be created in Argilla. Defaults to |
|
`None` which means it will be created in the default workspace. |
|
- name: api_url |
|
optional: true |
|
description: The base URL to use for the Argilla API requests. |
|
- name: api_key |
|
optional: true |
|
description: The API key to authenticate the requests to the Argilla API. |
|
type_info: |
|
module: distilabel.steps.argilla.text_generation |
|
name: TextGenerationToArgilla |
|
name: text_generation_to_argilla |
|
connections: |
|
- from: load_data |
|
to: |
|
- self-instruct |
|
- from: self-instruct |
|
to: |
|
- expand_columns |
|
- from: evol_instruction_complexity |
|
to: |
|
- expand_columns_evolved |
|
- from: expand_columns |
|
to: |
|
- clean_numbered_list |
|
- from: clean_numbered_list |
|
to: |
|
- evol_instruction_complexity |
|
- from: expand_columns_evolved |
|
to: |
|
- domain_expert |
|
- from: domain_expert |
|
to: |
|
- keep_columns |
|
- from: keep_columns |
|
to: |
|
- text_generation_to_argilla |
|
- from: text_generation_to_argilla |
|
to: [] |
|
type_info: |
|
module: distilabel.pipeline.local |
|
name: Pipeline |
|
|