asoria HF staff committed on
Commit
aa1bdb0
·
1 Parent(s): 939f6ae

Adjust template for embeddings

Browse files
Files changed (2) hide show
  1. app.py +22 -2
  2. utils/notebook_utils.py +107 -4
app.py CHANGED
@@ -15,6 +15,8 @@ from dotenv import load_dotenv
15
  import os
16
 
17
  # TODOS:
 
 
18
  # Add template for RAG and embeddings
19
 
20
  load_dotenv()
@@ -91,6 +93,19 @@ def generate_rag_cells(dataset_id):
91
  yield from generate_cells(dataset_id, rag_cells, "rag")
92
 
93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  def generate_embedding_cells(dataset_id):
95
  yield from generate_cells(dataset_id, embeggins_cells, "embeddings")
96
 
@@ -143,9 +158,10 @@ def generate_cells(dataset_id, cells, notebook_type="eda"):
143
  first_split = list(first_config_loading_code["arguments"]["splits"].keys())[0]
144
  features, df = get_first_rows_as_df(dataset_id, first_config, first_split, 3)
145
 
 
146
  html_code = f"<iframe src='https://huggingface.co/datasets/{dataset_id}/embed/viewer' width='80%' height='560px'></iframe>"
147
- wildcards = ["{dataset_name}", "{first_code}", "{html_code}"]
148
- replacements = [dataset_id, first_code, html_code]
149
  has_numeric_columns = len(df.select_dtypes(include=["number"]).columns) > 0
150
  has_categoric_columns = len(df.select_dtypes(include=["object"]).columns) > 0
151
  cells = replace_wildcards(
@@ -248,4 +264,8 @@ with gr.Blocks(fill_height=True, fill_width=True) as demo:
248
  outputs=[code_component, go_to_notebook],
249
  )
250
 
 
 
 
 
251
  demo.launch()
 
15
  import os
16
 
17
  # TODOS:
18
+ # Validate that the dataset type fits the notebook type before generating the notebook
19
+ # Add template for training
20
  # Add template for RAG and embeddings
21
 
22
  load_dotenv()
 
93
  yield from generate_cells(dataset_id, rag_cells, "rag")
94
 
95
 
96
def longest_string_column(df):
    """Return the name of the text column that holds the longest string.

    Only columns with ``object`` or ``string`` dtype are candidates, and only
    genuine ``str`` values inside them are measured. Object columns can also
    carry lists, dicts, bytes, booleans or missing values; those are ignored
    so that a nested-structure column is never reported as the text column
    and so that non-text object columns cannot raise.

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        The label of the column whose longest string value is the longest
        overall (the first such column wins ties), or ``None`` when no
        candidate column contains a non-empty string.
    """
    longest_col = None
    max_length = 0

    for col in df.select_dtypes(include=["object", "string"]).columns:
        # Measure real strings only; ``default=0`` covers columns that hold
        # no strings at all (empty, all-missing, or purely non-text objects).
        col_length = max(
            (len(value) for value in df[col] if isinstance(value, str)),
            default=0,
        )
        if col_length > max_length:
            max_length = col_length
            longest_col = col

    return longest_col
107
+
108
+
109
  def generate_embedding_cells(dataset_id):
110
  yield from generate_cells(dataset_id, embeggins_cells, "embeddings")
111
 
 
158
  first_split = list(first_config_loading_code["arguments"]["splits"].keys())[0]
159
  features, df = get_first_rows_as_df(dataset_id, first_config, first_split, 3)
160
 
161
+ longest_col = longest_string_column(df)
162
  html_code = f"<iframe src='https://huggingface.co/datasets/{dataset_id}/embed/viewer' width='80%' height='560px'></iframe>"
163
+ wildcards = ["{dataset_name}", "{first_code}", "{html_code}", "{longest_col}"]
164
+ replacements = [dataset_id, first_code, html_code, longest_col]
165
  has_numeric_columns = len(df.select_dtypes(include=["number"]).columns) > 0
166
  has_categoric_columns = len(df.select_dtypes(include=["object"]).columns) > 0
167
  cells = replace_wildcards(
 
264
  outputs=[code_component, go_to_notebook],
265
  )
266
 
267
+ gr.Markdown(
268
+ "🚧 Note: Some code may not be compatible with datasets that contain binary data or complex structures. 🚧"
269
+ )
270
+
271
  demo.launch()
utils/notebook_utils.py CHANGED
@@ -31,9 +31,112 @@ rag_cells = [
31
  embeggins_cells = [
32
  {
33
  "cell_type": "markdown",
34
- "source": "# Embeddings Generation Notebook",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  },
36
- {"cell_type": "code", "source": ""},
37
  ]
38
 
39
  eda_cells = [
@@ -52,7 +155,7 @@ eda_cells = [
52
  {
53
  "cell_type": "code",
54
  "source": """
55
- # 1. Install and import necessary libraries.
56
  !pip install pandas matplotlib seaborn
57
  """,
58
  },
@@ -67,7 +170,7 @@ import seaborn as sns
67
  {
68
  "cell_type": "code",
69
  "source": """
70
- # 2. Load the dataset as a DataFrame
71
  {first_code}
72
  """,
73
  },
 
31
  embeggins_cells = [
32
  {
33
  "cell_type": "markdown",
34
+ "source": """
35
+ ---
36
+ # **Embeddings Notebook for {dataset_name} dataset**
37
+ ---
38
+ """,
39
+ },
40
+ {
41
+ "cell_type": "markdown",
42
+ "source": "## 1. Setup necessary libraries and load the dataset",
43
+ },
44
+ {
45
+ "cell_type": "code",
46
+ "source": """
47
+ # Install and import necessary libraries.
48
+ !pip install pandas sentence-transformers faiss-cpu
49
+ """,
50
+ },
51
+ {
52
+ "cell_type": "code",
53
+ "source": """
54
+ import pandas as pd
55
+ from sentence_transformers import SentenceTransformer
56
+ import faiss
57
+ """,
58
+ },
59
+ {
60
+ "cell_type": "code",
61
+ "source": """
62
+ # Load the dataset as a DataFrame
63
+ {first_code}
64
+ """,
65
+ },
66
+ {
67
+ "cell_type": "code",
68
+ "source": """
69
+ # Specify the column name that contains the text data to generate embeddings
70
+ column_to_generate_embeddings = '{longest_col}'
71
+ """,
72
+ },
73
+ {
74
+ "cell_type": "markdown",
75
+ "source": "## 2. Loading embedding model and creating FAISS index",
76
+ },
77
+ {
78
+ "cell_type": "code",
79
+ "source": """
80
+ # Remove duplicate entries based on the specified column
81
+ df = df.drop_duplicates(subset=column_to_generate_embeddings)
82
+ """,
83
+ },
84
+ {
85
+ "cell_type": "code",
86
+ "source": """
87
+ # Convert the column data to a list of text entries
88
+ text_list = df[column_to_generate_embeddings].tolist()
89
+ """,
90
+ },
91
+ {
92
+ "cell_type": "code",
93
+ "source": """
94
+ # Specify the embedding model you want to use
95
+ model = SentenceTransformer('distiluse-base-multilingual-cased')
96
+ """,
97
+ },
98
+ {
99
+ "cell_type": "code",
100
+ "source": """
101
+ vectors = model.encode(text_list)
102
+ vector_dimension = vectors.shape[1]
103
+
104
+ # Initialize the FAISS index with the embedding dimension reported by the encoded vectors
105
+ index = faiss.IndexFlatL2(vector_dimension)
106
+
107
+ # Add the previously encoded embeddings to the FAISS index
108
+ index.add(vectors)
109
+ """,
110
+ },
111
+ {
112
+ "cell_type": "markdown",
113
+ "source": "## 3. Perform a text search",
114
+ },
115
+ {
116
+ "cell_type": "code",
117
+ "source": """
118
+ # Specify the text you want to search for in the list
119
+ text_to_search = text_list[0]
120
+ print(f"Text to search: {text_to_search}")
121
+ """,
122
+ },
123
+ {
124
+ "cell_type": "code",
125
+ "source": """
126
+ # Generate the embedding for the search query
127
+ query_embedding = model.encode([text_to_search])
128
+ """,
129
+ },
130
+ {
131
+ "cell_type": "code",
132
+ "source": """
133
+ # Perform the search to find the 'k' nearest neighbors (adjust 'k' as needed)
134
+ D, I = index.search(query_embedding, k=10)
135
+
136
+ # Print the similar documents found
137
+ print(f"Similar documents: {[text_list[i] for i in I[0]]}")
138
+ """,
139
  },
 
140
  ]
141
 
142
  eda_cells = [
 
155
  {
156
  "cell_type": "code",
157
  "source": """
158
+ # Install and import necessary libraries.
159
  !pip install pandas matplotlib seaborn
160
  """,
161
  },
 
170
  {
171
  "cell_type": "code",
172
  "source": """
173
+ # Load the dataset as a DataFrame
174
  {first_code}
175
  """,
176
  },