import os
import json


def replace_wildcards(
    templates, wildcards, replacements, has_numeric_columns, has_categoric_columns
):
    """Return copies of the template cells with every wildcard substituted.

    Args:
        templates: Iterable of cell dicts. Each needs "cell_type" and
            "source"; an optional "type" key ("numeric" or "categoric")
            marks cells that only apply to datasets with such columns.
        wildcards: Placeholder strings to look for in each "source".
        replacements: Replacement strings, positionally matched to
            ``wildcards``.
        has_numeric_columns: When false, cells tagged "numeric" are dropped.
        has_categoric_columns: When false, cells tagged "categoric" are
            dropped.

    Returns:
        A new list of ``{"cell_type": ..., "source": ...}`` dicts. The
        source is stripped of surrounding whitespace before substitution
        and the "type" tag is not carried over.

    Raises:
        ValueError: If ``wildcards`` and ``replacements`` differ in length.
    """
    if len(wildcards) != len(replacements):
        raise ValueError(
            "The number of wildcards must match the number of replacements."
        )

    new_templates = []
    for tmp in templates:
        cell_kind = tmp.get("type")
        if cell_kind == "numeric" and not has_numeric_columns:
            continue
        if cell_kind == "categoric" and not has_categoric_columns:
            continue

        # Plain str.replace (not str.format) so that braces belonging to the
        # generated notebook code, e.g. f-strings, are left untouched.
        tmp_text = tmp["source"].strip()
        for wildcard, replacement in zip(wildcards, replacements):
            tmp_text = tmp_text.replace(wildcard, replacement)
        new_templates.append({"cell_type": tmp["cell_type"], "source": tmp_text})

    return new_templates


# Template cells for the embeddings notebook. The "{dataset_name}",
# "{first_code}" and "{longest_col}" markers are placeholders, presumably
# filled in through replace_wildcards().
embeddings_cells = [
    {
        "cell_type": "markdown",
        "source": """
---
# **Embeddings Notebook for {dataset_name} dataset**
---
""",
    },
    {
        "cell_type": "markdown",
        "source": "## 1. Setup necessary libraries and load the dataset",
    },
    {
        "cell_type": "code",
        "source": """
# Install and import necessary libraries.
!pip install pandas sentence-transformers faiss-cpu
""",
    },
    {
        "cell_type": "code",
        "source": """
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
""",
    },
    {
        "cell_type": "code",
        "source": """
# Load the dataset as a DataFrame
{first_code}
""",
    },
    {
        "cell_type": "code",
        "source": """
# Specify the column name that contains the text data to generate embeddings
column_to_generate_embeddings = '{longest_col}'
""",
    },
    {
        "cell_type": "markdown",
        "source": "## 2. Loading embedding model and creating FAISS index",
    },
    {
        "cell_type": "code",
        "source": """
# Remove duplicate entries based on the specified column
df = df.drop_duplicates(subset=column_to_generate_embeddings)
""",
    },
    {
        "cell_type": "code",
        "source": """
# Convert the column data to a list of text entries
text_list = df[column_to_generate_embeddings].tolist()
""",
    },
    {
        "cell_type": "code",
        "source": """
# Specify the embedding model you want to use
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
""",
    },
    {
        "cell_type": "code",
        "source": """
vectors = model.encode(text_list)
vector_dimension = vectors.shape[1]

# Initialize the FAISS index with the appropriate dimension (384 for this model)
index = faiss.IndexFlatL2(vector_dimension)

# Encode the text list into embeddings and add them to the FAISS index
index.add(vectors)
""",
    },
    {
        "cell_type": "markdown",
        "source": "## 3. Perform a text search",
    },
    {
        "cell_type": "code",
        "source": """
# Specify the text you want to search for in the list
text_to_search = text_list[0]
print(f"Text to search: {text_to_search}")
""",
    },
    {
        "cell_type": "code",
        "source": """
# Generate the embedding for the search query
query_embedding = model.encode([text_to_search])
""",
    },
    {
        "cell_type": "code",
        "source": """
# Perform the search to find the 'k' nearest neighbors (adjust 'k' as needed)
D, I = index.search(query_embedding, k=10)

# Print the similar documents found
print(f"Similar documents: {[text_list[i] for i in I[0]]}")
""",
    },
]

# Template cells for the exploratory data analysis notebook. Cells carrying a
# "type" key are the ones replace_wildcards() can drop when the dataset has no
# numeric / categoric columns.
eda_cells = [
    {
        "cell_type": "markdown",
        "source": """
---
# **Exploratory Data Analysis (EDA) Notebook for {dataset_name} dataset**
---
""",
    },
    {
        "cell_type": "markdown",
        "source": "## 1. Setup necessary libraries and load the dataset",
    },
    {
        "cell_type": "code",
        "source": """
# Install and import necessary libraries.
!pip install pandas matplotlib seaborn
""",
    },
    {
        "cell_type": "code",
        "source": """
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
""",
    },
    {
        "cell_type": "code",
        "source": """
# Load the dataset as a DataFrame
{first_code}
""",
    },
    {
        "cell_type": "markdown",
        "source": "## 2. Understanding the Dataset",
    },
    {
        "cell_type": "code",
        "source": """
# First rows of the dataset and info
print(df.head())
print(df.info())
""",
    },
    {
        "cell_type": "code",
        "source": """
# Check for missing values
print(df.isnull().sum())
""",
    },
    {
        "cell_type": "code",
        "source": """
# Identify data types of each column
print(df.dtypes)
""",
    },
    {
        "cell_type": "code",
        "source": """
# Detect duplicated rows
print(df.duplicated().sum())
""",
    },
    {
        "cell_type": "code",
        "source": """
# Generate descriptive statistics
print(df.describe())
""",
    },
    {
        "type": "categoric",
        "cell_type": "code",
        "source": """
# Unique values in categorical columns
df.select_dtypes(include=['object']).nunique()
""",
    },
    {
        "cell_type": "markdown",
        "source": "## 3. Data Visualization",
    },
    {
        "type": "numeric",
        "cell_type": "code",
        "source": """
# Correlation matrix for numerical columns
corr_matrix = df.corr(numeric_only=True)
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', square=True)
plt.title('Correlation Matrix')
plt.show()
""",
    },
    {
        "type": "numeric",
        "cell_type": "code",
        "source": """
# Distribution plots for numerical columns
for column in df.select_dtypes(include=['int64', 'float64']).columns:
    plt.figure(figsize=(8, 4))
    sns.histplot(df[column], kde=True)
    plt.title(f'Distribution of {column}')
    plt.xlabel(column)
    plt.ylabel('Frequency')
    plt.show()
""",
    },
    {
        "type": "categoric",
        "cell_type": "code",
        "source": """
# Count plots for categorical columns
for column in df.select_dtypes(include=['object']).columns:
    plt.figure(figsize=(8, 4))
    sns.countplot(x=column, data=df)
    plt.title(f'Count Plot of {column}')
    plt.xlabel(column)
    plt.ylabel('Count')
    plt.show()
""",
    },
    {
        "type": "numeric",
        "cell_type": "code",
        "source": """
# Box plots for detecting outliers in numerical columns
for column in df.select_dtypes(include=['int64', 'float64']).columns:
    plt.figure(figsize=(8, 4))
    sns.boxplot(df[column])
    plt.title(f'Box Plot of {column}')
    plt.xlabel(column)
    plt.show()
""",
    },
]

# Template cells for the retrieval-augmented generation (RAG) notebook. Uses
# the same "{dataset_name}", "{first_code}" and "{longest_col}" placeholders
# as the embeddings notebook.
rag_cells = [
    {
        "cell_type": "markdown",
        "source": """
---
# **Retrieval-Augmented Generation Notebook for {dataset_name} dataset**
---
""",
    },
    {
        "cell_type": "markdown",
        "source": "## 1. Setup necessary libraries and load the dataset",
    },
    {
        "cell_type": "code",
        "source": """
# Install and import necessary libraries.
!pip install pandas sentence-transformers faiss-cpu transformers torch huggingface_hub
""",
    },
    {
        "cell_type": "code",
        "source": """
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from huggingface_hub import InferenceClient
import pandas as pd
import faiss
import torch
""",
    },
    {
        "cell_type": "code",
        "source": """
# Load the dataset as a DataFrame
{first_code}
""",
    },
    {
        "cell_type": "code",
        "source": """
# Specify the column name that contains the text data to generate embeddings
column_to_generate_embeddings = '{longest_col}'
""",
    },
    {
        "cell_type": "markdown",
        "source": "## 2. Loading embedding model and creating FAISS index",
    },
    {
        "cell_type": "code",
        "source": """
# Remove duplicate entries based on the specified column
df = df.drop_duplicates(subset=column_to_generate_embeddings)
""",
    },
    {
        "cell_type": "code",
        "source": """
# Convert the column data to a list of text entries
text_list = df[column_to_generate_embeddings].tolist()
""",
    },
    {
        "cell_type": "code",
        "source": """
# Specify the embedding model you want to use
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
""",
    },
    {
        "cell_type": "code",
        "source": """
vectors = model.encode(text_list)
vector_dimension = vectors.shape[1]

# Initialize the FAISS index with the appropriate dimension (384 for this model)
index = faiss.IndexFlatL2(vector_dimension)

# Encode the text list into embeddings and add them to the FAISS index
index.add(vectors)
""",
    },
    {
        "cell_type": "markdown",
        "source": "## 3. Perform a text search",
    },
    {
        "cell_type": "code",
        "source": """
# Specify the text you want to search for in the list
query = "How to cook sushi?"

# Generate the embedding for the search query
query_embedding = model.encode([query])
""",
    },
    {
        "cell_type": "code",
        "source": """
# Perform the search to find the 'k' nearest neighbors (adjust 'k' as needed)
D, I = index.search(query_embedding, k=10)

# Print the similar documents found
print(f"Similar documents: {[text_list[i] for i in I[0]]}")
""",
    },
    {
        "cell_type": "markdown",
        "source": "## 4. Load pipeline and perform inference locally",
    },
    {
        "cell_type": "code",
        "source": """
# Adjust model name as needed
checkpoint = 'HuggingFaceTB/SmolLM-1.7B-Instruct'

device = "cuda" if torch.cuda.is_available() else "cpu" # for GPU usage or "cpu" for CPU usage

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)

generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if device == "cuda" else -1)
""",
    },
    {
        "cell_type": "code",
        "source": """
# Create a prompt with two parts: 'system' for instructions based on a 'context' from the retrieved documents, and 'user' for the query
selected_elements = [text_list[i] for i in I[0].tolist()]
context = ','.join(selected_elements)
messages = [
    {
        "role": "system",
        "content": f"You are an intelligent assistant tasked with providing accurate and concise answers based on the following context. Use the information retrieved to construct your response. Context: {context}",
    },
    {"role": "user", "content": query},
]
""",
    },
    {
        "cell_type": "code",
        "source": """
# Send the prompt to the pipeline and show the answer
output = generator(messages)
print("Generated result:")
print(output[0]['generated_text'][-1]['content']) # Print the assistant's response content
""",
    },
    {
        "cell_type": "markdown",
        "source": "## 5. Alternatively call the inference client",
    },
    {
        "cell_type": "code",
        "source": """
# Adjust model name as needed
checkpoint = "meta-llama/Meta-Llama-3-8B-Instruct"

# Change here your Hugging Face API token
token = "hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"

inference_client = InferenceClient(checkpoint, token=token)
output = inference_client.chat_completion(messages=messages, stream=False)
print("Generated result:")
print(output.choices[0].message.content)
""",
    },
]


# NOTE(review): this function has no body besides its docstring, so calling it
# returns None; the prompt text is only reachable through ``__doc__``. If the
# intent was to hand the text to a model as a system prompt, it should be
# returned instead -- confirm against the callers before changing it.
def generate_rag_system_prompt():
    """
    1. Install necessary libraries.
    2. Import libraries.
    3. Load the dataset as a DataFrame using the provided code.
    4. Select the column for generating embeddings.
    5. Remove duplicate data.
    6. Convert the selected column to a list.
    7. Load the sentence-transformers model.
    8. Create a FAISS index.
    9. Encode a query sample.
    10. Search for similar documents using the FAISS index.
    11. Load the 'HuggingFaceH4/zephyr-7b-beta' model from the transformers library and create a pipeline.
    12. Create a prompt with two parts: 'system' for instructions based on a 'context' from the retrieved documents, and 'user' for the query.
    13. Send the prompt to the pipeline and display the answer.

    Ensure the notebook is well-organized with explanations for each step.
    The output should be Markdown content with Python code snippets enclosed in "```python" and "```".

    The user will provide the dataset information in the following format:

    ## Columns and Data Types

    ## Sample Data

    ## Loading Data code

    Use the provided code to load the dataset; do not use any other method.
    """


def load_json_files_from_folder(folder_path):
    """Load every ``*.json`` file in a folder, keyed by its notebook title.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        Dict mapping each file's "notebook_title" value to the parsed JSON
        document. Files are visited in sorted filename order, so when two
        files share a title the alphabetically later one wins.

    Raises:
        KeyError: If a JSON document has no "notebook_title" key.
        json.JSONDecodeError: If a ``.json`` file is not valid JSON.
        OSError: If the folder or a file cannot be read.
    """
    components = {}
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # result (and duplicate-title resolution) reproducible across platforms.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".json"):
            continue
        file_path = os.path.join(folder_path, filename)
        # Read as UTF-8 explicitly rather than relying on the platform's
        # locale encoding.
        with open(file_path, "r", encoding="utf-8") as json_file:
            data = json.load(json_file)
        components[data["notebook_title"]] = data
    return components