Alexander Watson committed on
Commit
76d1b05
·
1 Parent(s): 2106945

Add evolutionary method support

Browse files
Files changed (1) hide show
  1. app.py +161 -107
app.py CHANGED
@@ -7,12 +7,11 @@ import zipfile
7
  from io import StringIO
8
 
9
  import pandas as pd
10
- import requests
11
  import streamlit as st
12
  from datasets import load_dataset
13
  from gretel_client import Gretel
14
  from navigator_helpers import (
15
- DataSynthesisConfig,
16
  TrainingDataSynthesizer,
17
  StreamlitLogHandler,
18
  )
@@ -31,50 +30,40 @@ logger.addHandler(handler)
31
 
32
 
33
  SAMPLE_DATASET_URL = "https://gretel-public-website.s3.us-west-2.amazonaws.com/datasets/llm-training-data/dolly-examples-qa-with-context.csv"
34
- WELCOME_MARKDOWN = """
35
- Gretel Navigator is a compound AI system designed to help you create high-quality, diverse training data examples through synthetic data generation techniques. It aims to assist in scenarios where you have limited training data or want to enhance the quality and diversity of your existing dataset.
36
 
37
- Key Use Cases
38
 
39
- 1. **Create Diverse Training or Evaluation Data from a seed**: Generate diverse training or evaluation data from plain text or seed examples. This ensures your AI models are exposed to a wide range of scenarios and edge cases during training.
40
- 2. **Enhance Limited Training Data**: Expand your existing training data with additional synthetic examples generated by Gretel Navigator. This can help improve the robustness and generalization of your AI models.
41
- 3. **Mitigate Bias and Toxicity**: Generate training examples that are unbiased and non-toxic by incorporating diverse perspectives and adhering to ethical guidelines. This promotes fairness and responsible AI development.
42
- 4. **Enhance Model Performance**: Improve the performance of your AI models across various tasks by training them on domain specific synthetic data generated by Gretel Navigator.
 
43
 
44
- ## 🌟 Synthetic Data Generation
45
 
46
- Gretel Navigator utilizes an agent-based system to generate high-quality synthetic data:
 
 
 
47
 
48
- - Diverse Instruction and Response Generation
49
- - Quality Evaluation and Ranking
50
- - AI-Aligning-AI Methodology (AAA) for iterative data quality enhancement
51
- - Co-teach, suggestions, and self-teaching for iterative improvement.
52
 
53
- Leveraging these techniques, Gretel Navigator helps you create training data that leads to more robust, unbiased, and high-performing AI models.
 
 
 
54
 
55
- ## 🔧 Getting Started
56
 
57
- To start using Gretel Navigator, you'll need:
 
58
 
59
- 1. A Gretel account (free accounts are available).
60
- 2. Seed text or input/output pairs to create or augment AI training data.
61
-
62
- ## 📂 Input Data Formats
63
-
64
- Gretel Navigator supports the following formats for input data:
65
-
66
- - Seed data
67
- - Input/Output pairs (or instruction/response) with any number of ground truth or "context fields".
68
- - Plain text (ground truth data)
69
- - File formats: Hugging Face dataset, CSV, JSON, JSONL
70
-
71
- ## 📀 Output
72
-
73
- Gretel Navigator generates one additional training example per row in the input/output pair format. You can specify requirements for the input and output pairs in the configuration. Run the process multiple times to scale your data to any desired level.
74
 
75
  ---
76
 
77
- Ready to enhance your AI training data and unlock the full potential of your models? Let's get started with Gretel Navigator! 🚀
78
  """
79
 
80
 
@@ -210,17 +199,20 @@ def main():
210
  st.session_state.selected_fields = selected_fields
211
 
212
  with st.expander("Advanced Options", expanded=False):
213
-
214
  output_instruction_field = st.text_input(
215
  "Synthetic instruction field",
216
- value=st.session_state.get("output_instruction_field", "instruction"),
 
 
217
  help="Specify the name of the output field for generated instructions",
218
  )
219
  st.session_state.output_instruction_field = output_instruction_field
220
 
221
  output_response_field = st.text_input(
222
  "Synthetic response field",
223
- value=st.session_state.get("output_response_field", "response"),
 
 
224
  help="Specify the name of the output field for generated responses",
225
  )
226
  st.session_state.output_response_field = output_response_field
@@ -234,76 +226,67 @@ def main():
234
  )
235
  st.session_state.num_records = num_records
236
 
237
- num_instructions = st.number_input(
238
- "Number of diverse candidate instructions",
239
  min_value=1,
240
- value=st.session_state.get("num_instructions", 5),
241
- help="Specify the number of instructions to generate",
242
  )
243
- st.session_state.num_instructions = num_instructions
244
 
245
- num_responses = st.number_input(
246
- "Number of diverse candidateresponses",
247
  min_value=1,
248
- value=st.session_state.get("num_responses", 5),
249
- help="Specify the number of responses to generate",
 
 
 
 
 
 
 
 
 
 
250
  )
251
- st.session_state.num_responses = num_responses
252
 
253
  temperature = st.slider(
254
  "Temperature",
255
  min_value=0.0,
256
  max_value=1.0,
257
- value=st.session_state.get("temperature", 0.8),
258
  step=0.1,
259
  help="Adjust the temperature for response generation",
260
  )
261
  st.session_state.temperature = temperature
262
 
263
- max_tokens_instruction = st.slider(
264
- "Max tokens (instruction)",
265
  min_value=1,
266
  max_value=1024,
267
- value=st.session_state.get("max_tokens_instruction", 100),
268
- help="Specify the maximum number of tokens for instructions",
269
  )
270
- st.session_state.max_tokens_instruction = max_tokens_instruction
271
-
272
- max_tokens_response = st.slider(
273
- "Max tokens (response)",
274
- min_value=1,
275
- max_value=1024,
276
- value=st.session_state.get("max_tokens_response", 100),
277
- help="Specify the maximum number of tokens for responses",
278
- )
279
- st.session_state.max_tokens_response = max_tokens_response
280
 
281
  with st.expander("Model Configuration", expanded=True):
282
  st.markdown("### Primary Navigator Models")
283
 
284
- tabular_models = st.session_state.gretel.factories.get_navigator_model_list(
285
- "tabular"
286
- )
287
  navigator_tabular = st.selectbox(
288
  "Navigator Tabular",
289
- options=tabular_models,
290
- index=st.session_state.get("navigator_tabular_index", 0),
291
  help="Select the primary Navigator tabular model",
292
  )
293
- st.session_state.navigator_tabular_index = tabular_models.index(
294
- navigator_tabular
295
- )
296
 
297
- nl_models = st.session_state.gretel.factories.get_navigator_model_list(
298
- "natural_language"
299
- )
300
  navigator_llm = st.selectbox(
301
  "Navigator LLM",
302
- options=nl_models,
303
- index=st.session_state.get("navigator_llm_index", 0),
304
  help="Select the primary Navigator LLM",
305
  )
306
- st.session_state.navigator_llm_index = nl_models.index(navigator_llm)
307
 
308
  st.markdown("---")
309
  st.markdown("### AI Align AI (AAA)")
@@ -318,7 +301,7 @@ def main():
318
  )
319
  st.session_state.use_aaa = use_aaa
320
 
321
- co_teach_llms = [] # Initialize co_teach_llms with an empty list
322
 
323
  if use_aaa:
324
  st.markdown("#### Navigator Co-teaching LLMs")
@@ -326,13 +309,10 @@ def main():
326
  "Select additional Navigator LLMs for co-teaching in AAA. It is recommended to use different LLMs than the primary Navigator LLM for this step."
327
  )
328
 
329
- for model in nl_models:
330
- if model != navigator_llm:
331
- if st.checkbox(model, value=True, key=f"checkbox_{model}"):
332
- co_teach_llms.append(model)
333
- else:
334
- if st.checkbox(model, value=False, key=f"checkbox_{model}"):
335
- co_teach_llms.append(model)
336
  st.session_state.co_teach_llms = co_teach_llms
337
 
338
  st.markdown("---")
@@ -342,10 +322,7 @@ def main():
342
  "System Prompt",
343
  value=st.session_state.get(
344
  "system_prompt",
345
- "You are an AI assistant tasked with generating high-quality instruction-response pairs.\n"
346
- "Your goal is to create diverse, engaging, and informative content that covers a wide range of topics.\n"
347
- "When generating instructions, aim for clear, concise questions or commands that prompt thoughtful responses.\n"
348
- "When generating responses, provide detailed, accurate, and helpful information that directly addresses the instruction.",
349
  ),
350
  help="Specify the system prompt for the LLM",
351
  )
@@ -355,32 +332,95 @@ def main():
355
  "Instruction Format Prompt",
356
  value=st.session_state.get(
357
  "instruction_format_prompt",
358
- "A well-formulated question or command in everyday English.",
359
  ),
360
  help="Specify the format prompt for instructions",
361
  )
362
  st.session_state.instruction_format_prompt = instruction_format_prompt
363
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
364
  response_format_prompt = st.text_area(
365
  "Response Format Prompt",
366
  value=st.session_state.get(
367
  "response_format_prompt",
368
- "A well-formulated response to the question in everyday English.",
369
  ),
370
  help="Specify the format prompt for responses",
371
  )
372
  st.session_state.response_format_prompt = response_format_prompt
373
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
374
  with st.expander("Download SDK Code", expanded=False):
375
  st.markdown("### Ready to generate data at scale?")
376
  st.write(
377
  "Get started with your current configuration using the SDK code below:"
378
  )
 
379
  config_text = f"""#!pip install -Uqq git+https://github.com/gretelai/navigator-helpers.git
380
 
381
  import logging
382
  import pandas as pd
383
- from navigator_helpers import DataSynthesisConfig, TrainingDataSynthesizer
384
 
385
  # Configure the logger
386
  logging.basicConfig(level=logging.INFO, format="%(message)s")
@@ -390,34 +430,42 @@ API_KEY = "YOUR_API_KEY"
390
 
391
  df = pd.read_csv(DATASET)
392
 
393
- # Create the data synthesis configuration
394
- config = DataSynthesisConfig(
395
  input_fields={st.session_state.selected_fields},
396
  output_instruction_field="{output_instruction_field}",
397
  output_response_field="{output_response_field}",
398
- num_instructions={num_instructions},
399
- num_responses={num_responses},
 
400
  temperature={temperature},
401
- max_tokens_instruction={max_tokens_instruction},
402
- max_tokens_response={max_tokens_response},
403
  api_key=API_KEY,
404
  navigator_tabular="{navigator_tabular}",
405
  navigator_llm="{navigator_llm}",
406
  co_teach_llms={co_teach_llms},
407
  system_prompt='''{system_prompt}''',
408
  instruction_format_prompt='''{instruction_format_prompt}''',
409
- response_format_prompt='''{response_format_prompt}'''
 
 
 
 
 
 
 
410
  )
411
 
412
  # Create the training data synthesizer and perform synthesis
413
  synthesizer = TrainingDataSynthesizer(
414
  df,
415
  config,
416
- use_aaa={use_aaa},
417
- output_file="results.csv",
418
  verbose=True,
419
  )
420
- new_df = synthesizer.generate()"""
 
 
421
  st.code(config_text, language="python")
422
  st.download_button(
423
  label="Download SDK Code",
@@ -473,22 +521,29 @@ new_df = synthesizer.generate()"""
473
  handler = StreamlitLogHandler(custom_log_handler)
474
  logger.addHandler(handler)
475
 
476
- config = DataSynthesisConfig(
477
  input_fields=selected_fields,
478
  output_instruction_field=output_instruction_field,
479
  output_response_field=output_response_field,
480
- num_instructions=num_instructions,
481
- num_responses=num_responses,
 
482
  temperature=temperature,
483
- max_tokens_instruction=max_tokens_instruction,
484
- max_tokens_response=max_tokens_response,
485
  api_key=api_key,
486
  navigator_tabular=navigator_tabular,
487
  navigator_llm=navigator_llm,
488
  co_teach_llms=co_teach_llms,
489
  system_prompt=system_prompt,
490
  instruction_format_prompt=instruction_format_prompt,
 
 
 
491
  response_format_prompt=response_format_prompt,
 
 
 
 
492
  )
493
 
494
  start_time = time.time()
@@ -498,7 +553,6 @@ new_df = synthesizer.generate()"""
498
  synthesizer = TrainingDataSynthesizer(
499
  pd.DataFrame([row]),
500
  config,
501
- use_aaa=use_aaa,
502
  output_file="results.csv",
503
  verbose=True,
504
  )
 
7
  from io import StringIO
8
 
9
  import pandas as pd
 
10
  import streamlit as st
11
  from datasets import load_dataset
12
  from gretel_client import Gretel
13
  from navigator_helpers import (
14
+ InstructionResponseConfig,
15
  TrainingDataSynthesizer,
16
  StreamlitLogHandler,
17
  )
 
30
 
31
 
32
  SAMPLE_DATASET_URL = "https://gretel-public-website.s3.us-west-2.amazonaws.com/datasets/llm-training-data/dolly-examples-qa-with-context.csv"
33
+ WELCOME_MARKDOWN = """Gretel Navigator is an advanced AI system for generating high-quality, diverse synthetic data to train AI models and LLMs. It combines cutting-edge techniques from recent research with Gretel's proprietary methods to enhance your training data.
 
34
 
35
+ ### 🌟 Key Features & Techniques
36
 
37
+ - **Evolutionary Text Generation**: Inspired by WizardLM-2's diverse knowledge generation
38
+ - **AI-Aligning-AI (AAA)**: Leveraging concepts from Self-Rewarding Language Models
39
+ - **Quality Evaluation & Ranking**: Using Gretel's proprietary scoring methods
40
+ - **Instruction-Response Generation**: Influenced by StarCoder2-Instruct's approach
41
+ - **Comprehensive Training Data**: Inspired by "Textbooks Are All You Need II"
42
 
43
+ ### 🚀 Use Cases
44
 
45
+ 1. Create diverse training/evaluation data from seeds
46
+ 2. Enhance limited datasets
47
+ 3. Mitigate bias and toxicity
48
+ 4. Improve model performance with domain-specific data
49
 
50
+ ### 🔧 How It Works
 
 
 
51
 
52
+ 1. Initialize with custom configuration
53
+ 2. Generate and evolve text populations
54
+ 3. Apply AI Align AI (AAA) for quality enhancement
55
+ 4. Evaluate and output high-quality synthetic data
56
 
57
+ ### 📂 Input & Output
58
 
59
+ - **Input**: Seed data (text or input/output pairs) in various formats (CSV, JSON, JSONL, Hugging Face datasets)
60
+ - **Output**: High-quality synthetic training examples
61
 
62
+ Ready to elevate your AI training data? Let's get started with Gretel Navigator! 🚀
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
  ---
65
 
66
+ *Gretel Navigator combines techniques from recent academic research with Gretel's innovative approaches to deliver state-of-the-art synthetic data generation.*
67
  """
68
 
69
 
 
199
  st.session_state.selected_fields = selected_fields
200
 
201
  with st.expander("Advanced Options", expanded=False):
 
202
  output_instruction_field = st.text_input(
203
  "Synthetic instruction field",
204
+ value=st.session_state.get(
205
+ "output_instruction_field", "synthetic_instruction"
206
+ ),
207
  help="Specify the name of the output field for generated instructions",
208
  )
209
  st.session_state.output_instruction_field = output_instruction_field
210
 
211
  output_response_field = st.text_input(
212
  "Synthetic response field",
213
+ value=st.session_state.get(
214
+ "output_response_field", "synthetic_response"
215
+ ),
216
  help="Specify the name of the output field for generated responses",
217
  )
218
  st.session_state.output_response_field = output_response_field
 
226
  )
227
  st.session_state.num_records = num_records
228
 
229
+ num_generations = st.number_input(
230
+ "Number of generations",
231
  min_value=1,
232
+ value=st.session_state.get("num_generations", 3),
233
+ help="Specify the number of generations for the evolutionary algorithm",
234
  )
235
+ st.session_state.num_generations = num_generations
236
 
237
+ population_size = st.number_input(
238
+ "Population size",
239
  min_value=1,
240
+ value=st.session_state.get("population_size", 5),
241
+ help="Specify the population size for the evolutionary algorithm",
242
+ )
243
+ st.session_state.population_size = population_size
244
+
245
+ mutation_rate = st.slider(
246
+ "Mutation rate",
247
+ min_value=0.0,
248
+ max_value=1.0,
249
+ value=st.session_state.get("mutation_rate", 0.5),
250
+ step=0.1,
251
+ help="Adjust the mutation rate for the evolutionary algorithm",
252
  )
253
+ st.session_state.mutation_rate = mutation_rate
254
 
255
  temperature = st.slider(
256
  "Temperature",
257
  min_value=0.0,
258
  max_value=1.0,
259
+ value=st.session_state.get("temperature", 0.7),
260
  step=0.1,
261
  help="Adjust the temperature for response generation",
262
  )
263
  st.session_state.temperature = temperature
264
 
265
+ max_tokens = st.slider(
266
+ "Max tokens",
267
  min_value=1,
268
  max_value=1024,
269
+ value=st.session_state.get("max_tokens", 200),
270
+ help="Specify the maximum number of tokens for generated text",
271
  )
272
+ st.session_state.max_tokens = max_tokens
 
 
 
 
 
 
 
 
 
273
 
274
  with st.expander("Model Configuration", expanded=True):
275
  st.markdown("### Primary Navigator Models")
276
 
 
 
 
277
  navigator_tabular = st.selectbox(
278
  "Navigator Tabular",
279
+ options=["gretelai/auto"],
280
+ index=0,
281
  help="Select the primary Navigator tabular model",
282
  )
 
 
 
283
 
 
 
 
284
  navigator_llm = st.selectbox(
285
  "Navigator LLM",
286
+ options=["gretelai/gpt-auto", "gretelai/gpt-llama3-8b"],
287
+ index=0,
288
  help="Select the primary Navigator LLM",
289
  )
 
290
 
291
  st.markdown("---")
292
  st.markdown("### AI Align AI (AAA)")
 
301
  )
302
  st.session_state.use_aaa = use_aaa
303
 
304
+ co_teach_llms = []
305
 
306
  if use_aaa:
307
  st.markdown("#### Navigator Co-teaching LLMs")
 
309
  "Select additional Navigator LLMs for co-teaching in AAA. It is recommended to use different LLMs than the primary Navigator LLM for this step."
310
  )
311
 
312
+ co_teach_options = ["gretelai/gpt-llama3-8b", "gretelai/gpt-mistral7b"]
313
+ for model in co_teach_options:
314
+ if st.checkbox(model, value=True, key=f"checkbox_{model}"):
315
+ co_teach_llms.append(model)
 
 
 
316
  st.session_state.co_teach_llms = co_teach_llms
317
 
318
  st.markdown("---")
 
322
  "System Prompt",
323
  value=st.session_state.get(
324
  "system_prompt",
325
+ "You are an expert in generating balanced, context-rich questions and comprehensive answers based on given contexts. Your goal is to create question-answer pairs that are informative, detailed when necessary, and understandable without prior knowledge, while not revealing the answer in the question.",
 
 
 
326
  ),
327
  help="Specify the system prompt for the LLM",
328
  )
 
332
  "Instruction Format Prompt",
333
  value=st.session_state.get(
334
  "instruction_format_prompt",
335
+ "Generate a specific and clear question directly related to a key point in the given context. The question should include enough background information to be understood without prior knowledge, while being answerable using only the information provided. Do not reveal the answer in the question. Ensure the question is focused and can be answered concisely if the information allows, but also accommodate for more detailed responses when appropriate.",
336
  ),
337
  help="Specify the format prompt for instructions",
338
  )
339
  st.session_state.instruction_format_prompt = instruction_format_prompt
340
 
341
+ instruction_mutation_prompt = st.text_area(
342
+ "Instruction Mutation Prompt",
343
+ value=st.session_state.get(
344
+ "instruction_mutation_prompt",
345
+ "Refine this question to include necessary context for understanding, without revealing the answer. Ensure it remains clear and can be comprehensively answered using only the information in the given context. Adjust the question to allow for a concise answer if possible, but also consider if a more detailed response is warranted based on the complexity of the topic.",
346
+ ),
347
+ help="Specify the mutation prompt for instructions",
348
+ )
349
+ st.session_state.instruction_mutation_prompt = instruction_mutation_prompt
350
+
351
+ instruction_quality_prompt = st.text_area(
352
+ "Instruction Quality Prompt",
353
+ value=st.session_state.get(
354
+ "instruction_quality_prompt",
355
+ "Evaluate the quality of this question based on its specificity, inclusion of necessary context, relevance to the original context, clarity for someone unfamiliar with the topic, and ability to be answered appropriately (either concisely or in detail) without revealing the answer:",
356
+ ),
357
+ help="Specify the quality evaluation prompt for instructions",
358
+ )
359
+ st.session_state.instruction_quality_prompt = instruction_quality_prompt
360
+
361
+ instruction_complexity_target = st.slider(
362
+ "Instruction Complexity Target",
363
+ min_value=0.0,
364
+ max_value=1.0,
365
+ value=st.session_state.get("instruction_complexity_target", 0.7),
366
+ step=0.1,
367
+ help="Specify the target complexity for instructions",
368
+ )
369
+ st.session_state.instruction_complexity_target = (
370
+ instruction_complexity_target
371
+ )
372
+
373
  response_format_prompt = st.text_area(
374
  "Response Format Prompt",
375
  value=st.session_state.get(
376
  "response_format_prompt",
377
+ "Generate an informative answer to the given question. Use only the information provided in the original context. The response should be as concise as possible while fully addressing the question, including relevant context and explanations where necessary. For complex topics, provide a more detailed response. Ensure the answer provides enough background information to be understood by someone unfamiliar with the topic.",
378
  ),
379
  help="Specify the format prompt for responses",
380
  )
381
  st.session_state.response_format_prompt = response_format_prompt
382
 
383
+ response_mutation_prompt = st.text_area(
384
+ "Response Mutation Prompt",
385
+ value=st.session_state.get(
386
+ "response_mutation_prompt",
387
+ "Refine this answer to balance conciseness with comprehensiveness. For straightforward questions, aim for brevity while ensuring accuracy. For complex topics, provide more detail and context. Add relevant information from the context as needed. Verify factual accuracy and correct any inaccuracies or missing key information. Ensure the answer can be understood without prior knowledge of the topic.",
388
+ ),
389
+ help="Specify the mutation prompt for responses",
390
+ )
391
+ st.session_state.response_mutation_prompt = response_mutation_prompt
392
+
393
+ response_quality_prompt = st.text_area(
394
+ "Response Quality Prompt",
395
+ value=st.session_state.get(
396
+ "response_quality_prompt",
397
+ "Evaluate the quality of this answer based on its accuracy, appropriate level of detail (concise for simple questions, comprehensive for complex ones), relevance to the question, clarity for someone unfamiliar with the topic, inclusion of necessary background information, and whether it provides a satisfactory response using only the information from the given context:",
398
+ ),
399
+ help="Specify the quality evaluation prompt for responses",
400
+ )
401
+ st.session_state.response_quality_prompt = response_quality_prompt
402
+
403
+ response_complexity_target = st.slider(
404
+ "Response Complexity Target",
405
+ min_value=0.0,
406
+ max_value=1.0,
407
+ value=st.session_state.get("response_complexity_target", 0.8),
408
+ step=0.1,
409
+ help="Specify the target complexity for responses",
410
+ )
411
+ st.session_state.response_complexity_target = response_complexity_target
412
+
413
  with st.expander("Download SDK Code", expanded=False):
414
  st.markdown("### Ready to generate data at scale?")
415
  st.write(
416
  "Get started with your current configuration using the SDK code below:"
417
  )
418
+
419
  config_text = f"""#!pip install -Uqq git+https://github.com/gretelai/navigator-helpers.git
420
 
421
  import logging
422
  import pandas as pd
423
+ from navigator_helpers import InstructionResponseConfig, TrainingDataSynthesizer
424
 
425
  # Configure the logger
426
  logging.basicConfig(level=logging.INFO, format="%(message)s")
 
430
 
431
  df = pd.read_csv(DATASET)
432
 
433
+ # Create the instruction response configuration
434
+ config = InstructionResponseConfig(
435
  input_fields={st.session_state.selected_fields},
436
  output_instruction_field="{output_instruction_field}",
437
  output_response_field="{output_response_field}",
438
+ num_generations={num_generations},
439
+ population_size={population_size},
440
+ mutation_rate={mutation_rate},
441
  temperature={temperature},
442
+ max_tokens={max_tokens},
 
443
  api_key=API_KEY,
444
  navigator_tabular="{navigator_tabular}",
445
  navigator_llm="{navigator_llm}",
446
  co_teach_llms={co_teach_llms},
447
  system_prompt='''{system_prompt}''',
448
  instruction_format_prompt='''{instruction_format_prompt}''',
449
+ instruction_mutation_prompt='''{instruction_mutation_prompt}''',
450
+ instruction_quality_prompt='''{instruction_quality_prompt}''',
451
+ instruction_complexity_target={instruction_complexity_target},
452
+ response_format_prompt='''{response_format_prompt}''',
453
+ response_mutation_prompt='''{response_mutation_prompt}''',
454
+ response_quality_prompt='''{response_quality_prompt}''',
455
+ response_complexity_target={response_complexity_target},
456
+ use_aaa={use_aaa}
457
  )
458
 
459
  # Create the training data synthesizer and perform synthesis
460
  synthesizer = TrainingDataSynthesizer(
461
  df,
462
  config,
463
+ output_file="results.jsonl",
 
464
  verbose=True,
465
  )
466
+ new_df = synthesizer.generate()
467
+ """
468
+
469
  st.code(config_text, language="python")
470
  st.download_button(
471
  label="Download SDK Code",
 
521
  handler = StreamlitLogHandler(custom_log_handler)
522
  logger.addHandler(handler)
523
 
524
+ config = InstructionResponseConfig(
525
  input_fields=selected_fields,
526
  output_instruction_field=output_instruction_field,
527
  output_response_field=output_response_field,
528
+ num_generations=num_generations,
529
+ population_size=population_size,
530
+ mutation_rate=mutation_rate,
531
  temperature=temperature,
532
+ max_tokens=max_tokens,
 
533
  api_key=api_key,
534
  navigator_tabular=navigator_tabular,
535
  navigator_llm=navigator_llm,
536
  co_teach_llms=co_teach_llms,
537
  system_prompt=system_prompt,
538
  instruction_format_prompt=instruction_format_prompt,
539
+ instruction_mutation_prompt=instruction_mutation_prompt,
540
+ instruction_quality_prompt=instruction_quality_prompt,
541
+ instruction_complexity_target=instruction_complexity_target,
542
  response_format_prompt=response_format_prompt,
543
+ response_mutation_prompt=response_mutation_prompt,
544
+ response_quality_prompt=response_quality_prompt,
545
+ response_complexity_target=response_complexity_target,
546
+ use_aaa=use_aaa,
547
  )
548
 
549
  start_time = time.time()
 
553
  synthesizer = TrainingDataSynthesizer(
554
  pd.DataFrame([row]),
555
  config,
 
556
  output_file="results.csv",
557
  verbose=True,
558
  )