Spaces:

argilla
/

synthetic-data-generator

Running

App Files Files Community

sdiazlor HF staff committed 14 days ago

Commit

d4af916

1 Parent(s): 0d14ea5

fix minor errors and improve prompt

Browse files

Files changed (2) hide show

src/synthetic_dataset_generator/apps/rag.py +11 -5
src/synthetic_dataset_generator/pipelines/rag.py +9 -9

src/synthetic_dataset_generator/apps/rag.py CHANGED Viewed

@@ -116,7 +116,7 @@ def _preprocess_input_data(file_paths, num_rows, progress=gr.Progress(track_tqdm
     return (
         dataframe,
         gr.Dropdown(
-            choices=["chucks"],
             label="Documents column",
             value=col_doc,
             interactive=(False if col_doc == "" else True),
@@ -170,7 +170,7 @@ def generate_dataset(
     progress=gr.Progress(),
 ):
     num_rows = test_max_num_rows(num_rows)
-    progress(0.0, desc="Generating questions")
     if input_type == "prompt-input":
         chunk_generator = get_chunks_generator(
             temperature=temperature, is_sample=is_sample
@@ -399,7 +399,9 @@ def push_dataset(
     retrieval = "Retrieval" in retrieval_reranking
     reranking = "Reranking" in retrieval_reranking
-    if input_type != "prompt-input":
         dataframe, _ = load_dataset_file(
             repo_id=original_repo_id,
             file_paths=file_paths,
@@ -522,8 +524,12 @@ def push_dataset(
         )
         for item in ["context", "question", "response"]:
-            dataframe[f"{item}_length"] = dataframe[item].apply(len)
-            dataframe[f"{item}_embeddings"] = get_embeddings(dataframe[item].to_list())
         rg_dataset = client.datasets(name=repo_name, workspace=hf_user)
         if rg_dataset is None:

     return (
         dataframe,
         gr.Dropdown(
+            choices=["chunks"],
             label="Documents column",
             value=col_doc,
             interactive=(False if col_doc == "" else True),
     progress=gr.Progress(),
 ):
     num_rows = test_max_num_rows(num_rows)
+    progress(0.0, desc="Initializing dataset generation")
     if input_type == "prompt-input":
         chunk_generator = get_chunks_generator(
             temperature=temperature, is_sample=is_sample
     retrieval = "Retrieval" in retrieval_reranking
     reranking = "Reranking" in retrieval_reranking
+    if input_type == "prompt-input":
+        dataframe = pd.DataFrame(columns=["context", "question", "response"])
+    else:
         dataframe, _ = load_dataset_file(
             repo_id=original_repo_id,
             file_paths=file_paths,
         )
         for item in ["context", "question", "response"]:
+            dataframe[f"{item}_length"] = dataframe[item].apply(
+                lambda x: len(x) if x is not None else 0
+            )
+            dataframe[f"{item}_embeddings"] = get_embeddings(
+                dataframe[item].apply(lambda x: x if x is not None else "").to_list()
+            )
         rg_dataset = client.datasets(name=repo_name, workspace=hf_user)
         if rg_dataset is None:

src/synthetic_dataset_generator/pipelines/rag.py CHANGED Viewed

@@ -18,11 +18,11 @@ DEFAULT_DATASET_DESCRIPTIONS = [
 PROMPT_CREATION_PROMPT = """
-You are an AI assistant specialized in designing retrieval-augmented generation (RAG) tasks for dataset creation.
-Your task is to generate a well-structured and descriptive prompt based on the provided dataset description and company context. Respond with only the generated prompt and nothing else.
-The prompt should closely follow the style and structure of the example prompts below. Ensure that you include all relevant details from the dataset description and reflect the company context accurately.
 Description: A dataset to retrieve information from legal documents.
 Output: A dataset to retrieve information from a collection of legal documents related to the US law system and the status of contracts.
@@ -48,9 +48,9 @@ Do not include or reference the retrieval task itself in the generated chunks.
 CHUNKS_TEMPLATE = """You have been assigned to generate text chunks based on the following retrieval task: {{ task }}.
-Provide only the text chunks without explaining your process or reasoning.
-Ensure the chunks are clear, accurate, and directly relevant to the task.
 Use your general knowledge to create informative and precise outputs.
 """
@@ -145,12 +145,12 @@ def generate_pipeline_code(
     retrieval_reranking: list[str],
     num_rows: int = 10,
 ) -> str:
-    if repo_id is None:
-        subset = "default"
-        split = "train"
-    else:
         subset = get_dataset_config_names(repo_id)[0]
         split = get_dataset_split_names(repo_id, subset)[0]
     retrieval = "Retrieval" in retrieval_reranking
     reranking = "Reranking" in retrieval_reranking
     base_code = f"""

 PROMPT_CREATION_PROMPT = """
+You are an AI assistant specialized in designing retrieval-augmented generation (RAG) tasks for dataset generation.
+Your task is to generate a well-structured and descriptive prompt based on the provided dataset description. Respond with only the generated prompt and nothing else.
+The prompt should closely follow the style and structure of the example prompts below. Ensure that you include all relevant details from the dataset description.
 Description: A dataset to retrieve information from legal documents.
 Output: A dataset to retrieve information from a collection of legal documents related to the US law system and the status of contracts.
 CHUNKS_TEMPLATE = """You have been assigned to generate text chunks based on the following retrieval task: {{ task }}.
+Provide only the text chunks without explaining your process or reasoning. Do not include any additional information. Do not indicate that it is a text chunk.
+Ensure the chunks are concise, clear, and directly relevant to the task.
 Use your general knowledge to create informative and precise outputs.
 """
     retrieval_reranking: list[str],
     num_rows: int = 10,
 ) -> str:
+    if input_type == "dataset-input" and repo_id is not None:
         subset = get_dataset_config_names(repo_id)[0]
         split = get_dataset_split_names(repo_id, subset)[0]
+    else:
+        subset = "default"
+        split = "train"
     retrieval = "Retrieval" in retrieval_reranking
     reranking = "Reranking" in retrieval_reranking
     base_code = f"""