# Source: Hugging Face Space "asoria/auto-notebook-creator" (status: Running).
# File size: 13,010 Bytes

import os
import json


def replace_wildcards(
    templates, wildcards, replacements, has_numeric_columns, has_categoric_columns
):
    """Render cell templates by substituting wildcards in their sources.

    Args:
        templates: Iterable of dicts with "cell_type" and "source" keys and
            an optional "type" key ("numeric" or "categoric").
        wildcards: Sequence of placeholder substrings to look for.
        replacements: Sequence of strings, paired by position with
            ``wildcards``.
        has_numeric_columns: When falsy, templates whose "type" is
            "numeric" are left out.
        has_categoric_columns: When falsy, templates whose "type" is
            "categoric" are left out.

    Returns:
        A new list of ``{"cell_type": ..., "source": ...}`` dicts; the source
        is stripped of surrounding whitespace before substitution. The input
        templates are not modified.

    Raises:
        ValueError: If ``wildcards`` and ``replacements`` differ in length.
    """
    if len(wildcards) != len(replacements):
        raise ValueError(
            "The number of wildcards must match the number of replacements."
        )

    def _is_wanted(cell):
        # Templates without a "type" (or with any other value) always pass.
        kind = cell.get("type")
        if kind == "numeric":
            return bool(has_numeric_columns)
        if kind == "categoric":
            return bool(has_categoric_columns)
        return True

    def _render(cell):
        text = cell["source"].strip()
        # Substitutions are applied one after another, in the given order.
        for placeholder, value in zip(wildcards, replacements):
            text = text.replace(placeholder, value)
        return {"cell_type": cell["cell_type"], "source": text}

    return [_render(cell) for cell in templates if _is_wanted(cell)]


# Cell templates for the "Embeddings" notebook.
# Each entry is a dict with a "cell_type" ("markdown" or "code") and a "source"
# string. The sources contain the placeholders {dataset_name}, {first_code} and
# {longest_col}; presumably the caller fills them in through replace_wildcards
# (confirm against the call site). Other brace expressions, such as
# {text_to_search}, sit inside f-strings of the generated notebook code.
embeddings_cells = [
    # Notebook title.
    {
        "cell_type": "markdown",
        "source": """
---
# **Embeddings Notebook for {dataset_name} dataset**
---
""",
    },
    # Section 1: dependencies, imports and dataset loading.
    {
        "cell_type": "markdown",
        "source": "## 1. Setup necessary libraries and load the dataset",
    },
    {
        "cell_type": "code",
        "source": """
# Install and import necessary libraries.
!pip install pandas sentence-transformers faiss-cpu
""",
    },
    {
        "cell_type": "code",
        "source": """
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
""",
    },
    {
        "cell_type": "code",
        "source": """
# Load the dataset as a DataFrame
{first_code}
""",
    },
    {
        "cell_type": "code",
        "source": """
# Specify the column name that contains the text data to generate embeddings
column_to_generate_embeddings = '{longest_col}'
""",
    },
    # Section 2: embed the selected column and index the vectors with FAISS.
    {
        "cell_type": "markdown",
        "source": "## 2. Loading embedding model and creating FAISS index",
    },
    {
        "cell_type": "code",
        "source": """
# Remove duplicate entries based on the specified column
df = df.drop_duplicates(subset=column_to_generate_embeddings)
""",
    },
    {
        "cell_type": "code",
        "source": """
# Convert the column data to a list of text entries
text_list = df[column_to_generate_embeddings].tolist()
""",
    },
    {
        "cell_type": "code",
        "source": """
# Specify the embedding model you want to use
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
""",
    },
    {
        "cell_type": "code",
        "source": """
vectors = model.encode(text_list)
vector_dimension = vectors.shape[1]

# Initialize the FAISS index with the appropriate dimension (384 for this model)
index = faiss.IndexFlatL2(vector_dimension)

# Encode the text list into embeddings and add them to the FAISS index
index.add(vectors)
""",
    },
    # Section 3: nearest-neighbour search, using the first text entry as query.
    {
        "cell_type": "markdown",
        "source": "## 3. Perform a text search",
    },
    {
        "cell_type": "code",
        "source": """
# Specify the text you want to search for in the list
text_to_search = text_list[0]
print(f"Text to search: {text_to_search}")
""",
    },
    {
        "cell_type": "code",
        "source": """
# Generate the embedding for the search query
query_embedding = model.encode([text_to_search])
""",
    },
    {
        "cell_type": "code",
        "source": """
# Perform the search to find the 'k' nearest neighbors (adjust 'k' as needed)
D, I = index.search(query_embedding, k=10)

# Print the similar documents found
print(f"Similar documents: {[text_list[i] for i in I[0]]}")
""",
    },
]

# Cell templates for the "Exploratory Data Analysis" notebook.
# Each entry is a dict with a "cell_type" ("markdown" or "code") and a "source"
# string; the sources contain the placeholders {dataset_name} and {first_code}.
# Entries may also carry a "type" key: replace_wildcards drops "numeric"
# entries when has_numeric_columns is falsy and "categoric" entries when
# has_categoric_columns is falsy. Entries without "type" are always kept.
eda_cells = [
    # Notebook title.
    {
        "cell_type": "markdown",
        "source": """
---
# **Exploratory Data Analysis (EDA) Notebook for {dataset_name} dataset**
---
""",
    },
    # Section 1: dependencies, imports and dataset loading.
    {
        "cell_type": "markdown",
        "source": "## 1. Setup necessary libraries and load the dataset",
    },
    {
        "cell_type": "code",
        "source": """
# Install and import necessary libraries.
!pip install pandas matplotlib seaborn
""",
    },
    {
        "cell_type": "code",
        "source": """
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
""",
    },
    {
        "cell_type": "code",
        "source": """
# Load the dataset as a DataFrame
{first_code}
""",
    },
    # Section 2: structural overview of the DataFrame.
    {
        "cell_type": "markdown",
        "source": "## 2. Understanding the Dataset",
    },
    {
        "cell_type": "code",
        "source": """
# First rows of the dataset and info
print(df.head())
print(df.info())
""",
    },
    {
        "cell_type": "code",
        "source": """
# Check for missing values
print(df.isnull().sum())
""",
    },
    {
        "cell_type": "code",
        "source": """
# Identify data types of each column
print(df.dtypes)
""",
    },
    {
        "cell_type": "code",
        "source": """
# Detect duplicated rows
print(df.duplicated().sum())
""",
    },
    {
        "cell_type": "code",
        "source": """
# Generate descriptive statistics
print(df.describe())
""",
    },
    {
        "type": "categoric",
        "cell_type": "code",
        "source": """
# Unique values in categorical columns
df.select_dtypes(include=['object']).nunique()
""",
    },
    # Section 3: plots; every code cell below is gated by its "type" key.
    {
        "cell_type": "markdown",
        "source": "## 3. Data Visualization",
    },
    {
        "type": "numeric",
        "cell_type": "code",
        "source": """
# Correlation matrix for numerical columns
corr_matrix = df.corr(numeric_only=True)
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', square=True)
plt.title('Correlation Matrix')
plt.show()
""",
    },
    {
        "type": "numeric",
        "cell_type": "code",
        "source": """
# Distribution plots for numerical columns
for column in df.select_dtypes(include=['int64', 'float64']).columns:
    plt.figure(figsize=(8, 4))
    sns.histplot(df[column], kde=True)
    plt.title(f'Distribution of {column}')
    plt.xlabel(column)
    plt.ylabel('Frequency')
    plt.show()
""",
    },
    {
        "type": "categoric",
        "cell_type": "code",
        "source": """
# Count plots for categorical columns
for column in df.select_dtypes(include=['object']).columns:
    plt.figure(figsize=(8, 4))
    sns.countplot(x=column, data=df)
    plt.title(f'Count Plot of {column}')
    plt.xlabel(column)
    plt.ylabel('Count')
    plt.show()
""",
    },
    {
        "type": "numeric",
        "cell_type": "code",
        "source": """
# Box plots for detecting outliers in numerical columns
for column in df.select_dtypes(include=['int64', 'float64']).columns:
    plt.figure(figsize=(8, 4))
    sns.boxplot(df[column])
    plt.title(f'Box Plot of {column}')
    plt.xlabel(column)
    plt.show()
""",
    },
]


# Cell templates for the "Retrieval-Augmented Generation" notebook.
# Each entry is a dict with a "cell_type" ("markdown" or "code") and a "source"
# string. The sources contain the placeholders {dataset_name}, {first_code} and
# {longest_col}; presumably the caller fills them in through replace_wildcards
# (confirm against the call site). Other brace expressions, such as {context},
# sit inside f-strings of the generated notebook code.
# NOTE(review): in the generated notebook the name `model` is first bound to
# the SentenceTransformer (section 2) and later rebound to the causal LM
# (section 4), so re-running the section 3 query cell after section 4 would
# call .encode on the LM - confirm whether distinct names are wanted.
rag_cells = [
    # Notebook title.
    {
        "cell_type": "markdown",
        "source": """
---
# **Retrieval-Augmented Generation Notebook for {dataset_name} dataset**
---
""",
    },
    # Section 1: dependencies, imports and dataset loading.
    {
        "cell_type": "markdown",
        "source": "## 1. Setup necessary libraries and load the dataset",
    },
    {
        "cell_type": "code",
        "source": """
# Install and import necessary libraries.
!pip install pandas sentence-transformers faiss-cpu transformers torch huggingface_hub
""",
    },
    {
        "cell_type": "code",
        "source": """
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from huggingface_hub import InferenceClient
import pandas as pd
import faiss
import torch
""",
    },
    {
        "cell_type": "code",
        "source": """
# Load the dataset as a DataFrame
{first_code}
""",
    },
    {
        "cell_type": "code",
        "source": """
# Specify the column name that contains the text data to generate embeddings
column_to_generate_embeddings = '{longest_col}'
""",
    },
    # Section 2: embed the selected column and index the vectors with FAISS.
    {
        "cell_type": "markdown",
        "source": "## 2. Loading embedding model and creating FAISS index",
    },
    {
        "cell_type": "code",
        "source": """
# Remove duplicate entries based on the specified column
df = df.drop_duplicates(subset=column_to_generate_embeddings)
""",
    },
    {
        "cell_type": "code",
        "source": """
# Convert the column data to a list of text entries
text_list = df[column_to_generate_embeddings].tolist()
""",
    },
    {
        "cell_type": "code",
        "source": """
# Specify the embedding model you want to use
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
""",
    },
    {
        "cell_type": "code",
        "source": """
vectors = model.encode(text_list)
vector_dimension = vectors.shape[1]

# Initialize the FAISS index with the appropriate dimension (384 for this model)
index = faiss.IndexFlatL2(vector_dimension)

# Encode the text list into embeddings and add them to the FAISS index
index.add(vectors)
""",
    },
    # Section 3: retrieval for a hard-coded sample query.
    {
        "cell_type": "markdown",
        "source": "## 3. Perform a text search",
    },
    {
        "cell_type": "code",
        "source": """
# Specify the text you want to search for in the list
query = "How to cook sushi?"

# Generate the embedding for the search query
query_embedding = model.encode([query])
""",
    },
    {
        "cell_type": "code",
        "source": """
# Perform the search to find the 'k' nearest neighbors (adjust 'k' as needed)
D, I = index.search(query_embedding, k=10)

# Print the similar documents found
print(f"Similar documents: {[text_list[i] for i in I[0]]}")
""",
    },
    # Section 4: local generation with a transformers text-generation pipeline.
    {
        "cell_type": "markdown",
        "source": "## 4. Load pipeline and perform inference locally",
    },
    {
        "cell_type": "code",
        "source": """
# Adjust model name as needed
checkpoint = 'HuggingFaceTB/SmolLM-1.7B-Instruct'

device = "cuda" if torch.cuda.is_available() else "cpu" # for GPU usage or "cpu" for CPU usage

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)

generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if device == "cuda" else -1)
""",
    },
    {
        "cell_type": "code",
        "source": """
# Create a prompt with two parts: 'system' for instructions based on a 'context' from the retrieved documents, and 'user' for the query
selected_elements = [text_list[i] for i in I[0].tolist()]
context = ','.join(selected_elements)
messages = [
    {
        "role": "system",
        "content": f"You are an intelligent assistant tasked with providing accurate and concise answers based on the following context. Use the information retrieved to construct your response. Context: {context}",
    },
    {"role": "user", "content": query},
]
""",
    },
    {
        "cell_type": "code",
        "source": """
# Send the prompt to the pipeline and show the answer
output = generator(messages)
print("Generated result:")
print(output[0]['generated_text'][-1]['content']) # Print the assistant's response content
""",
    },
    # Section 5: same `messages`, sent through huggingface_hub.InferenceClient.
    # The token in the cell below is a placeholder the notebook user must edit.
    {
        "cell_type": "markdown",
        "source": "## 5. Alternatively call the inference client",
    },
    {
        "cell_type": "code",
        "source": """
# Adjust model name as needed
checkpoint = "meta-llama/Meta-Llama-3-8B-Instruct"

# Change here your Hugging Face API token
token = "hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" 

inference_client = InferenceClient(checkpoint, token=token)
output = inference_client.chat_completion(messages=messages, stream=False)
print("Generated result:")
print(output.choices[0].message.content)
""",
    },
]


def generate_rag_system_prompt():
    """Return the system prompt describing how to build a RAG notebook.

    The prompt lists the thirteen notebook steps, the required output format
    (Markdown with fenced Python snippets) and the layout in which the user
    supplies the dataset information.

    The text used to be this function's docstring, so the function returned
    ``None`` and the prompt never reached a caller; it is now returned.

    NOTE(review): the text starts directly at step 1 with no introductory
    sentence, and step 11 names 'HuggingFaceH4/zephyr-7b-beta' - confirm both
    are still intended.

    Returns:
        str: The prompt text, dedented and without surrounding blank lines.
    """
    # Local import keeps this block self-contained.
    import textwrap

    prompt = """
    1. Install necessary libraries.
    2. Import libraries.
    3. Load the dataset as a DataFrame using the provided code.
    4. Select the column for generating embeddings.
    5. Remove duplicate data.
    6. Convert the selected column to a list.
    7. Load the sentence-transformers model.
    8. Create a FAISS index.
    9. Encode a query sample.
    10. Search for similar documents using the FAISS index.
    11. Load the 'HuggingFaceH4/zephyr-7b-beta' model from the transformers library and create a pipeline.
    12. Create a prompt with two parts: 'system' for instructions based on a 'context' from the retrieved documents, and 'user' for the query.
    13. Send the prompt to the pipeline and display the answer.

    Ensure the notebook is well-organized with explanations for each step.
    The output should be Markdown content with Python code snippets enclosed in "```python" and "```".

    The user will provide the dataset information in the following format:

    ## Columns and Data Types

    ## Sample Data

    ## Loading Data code

    Use the provided code to load the dataset; do not use any other method.
    """
    # Drop the source-code indentation so the Markdown headings and the
    # numbered list start at column 0 in the prompt.
    return textwrap.dedent(prompt).strip()


def load_json_files_from_folder(folder_path):
    """Load every ``*.json`` file in a folder, keyed by its notebook title.

    Args:
        folder_path: Directory to scan. Only its direct entries are
            considered; sub-directories are not walked.

    Returns:
        dict: Maps each document's ``"notebook_title"`` value to the full
        parsed JSON document. Files are read in sorted filename order, so the
        dict order is reproducible and, when two files share a title, the
        one with the later filename wins.

    Raises:
        KeyError: If a JSON document has no ``"notebook_title"`` key.
        OSError: If the folder or one of its files cannot be read.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    components = {}

    # os.listdir returns entries in arbitrary order; sort for determinism.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".json"):
            continue
        file_path = os.path.join(folder_path, filename)

        # Decode as UTF-8 explicitly rather than relying on the
        # platform's locale default.
        with open(file_path, "r", encoding="utf-8") as json_file:
            data = json.load(json_file)
        components[data["notebook_title"]] = data

    return components