lstetson committed on
Commit
0685af6
·
verified ·
1 Parent(s): 26b1bee

Upload folder using huggingface_hub

Browse files
Files changed (42) hide show
  1. .gitattributes +2 -0
  2. .gitignore +138 -0
  3. ETL_Walkthrough.ipynb +375 -0
  4. README.md +3 -9
  5. TESTING.ipynb +165 -0
  6. __init__.py +0 -0
  7. data/logs/videos_subset_more_context_load_log.json +1 -0
  8. data/single_video.db/2b08fbf8-a94d-47d9-97aa-102f2895fbd0/data_level0.bin +3 -0
  9. data/single_video.db/2b08fbf8-a94d-47d9-97aa-102f2895fbd0/header.bin +3 -0
  10. data/single_video.db/2b08fbf8-a94d-47d9-97aa-102f2895fbd0/length.bin +3 -0
  11. data/single_video.db/2b08fbf8-a94d-47d9-97aa-102f2895fbd0/link_lists.bin +0 -0
  12. data/single_video.db/chroma.sqlite3 +0 -0
  13. data/single_video.json +6 -0
  14. data/videos.json +70 -0
  15. data/videos_subset.db/5c5462a4-1205-4d8f-8261-eccc02fe61b4/data_level0.bin +3 -0
  16. data/videos_subset.db/5c5462a4-1205-4d8f-8261-eccc02fe61b4/header.bin +3 -0
  17. data/videos_subset.db/5c5462a4-1205-4d8f-8261-eccc02fe61b4/index_metadata.pickle +3 -0
  18. data/videos_subset.db/5c5462a4-1205-4d8f-8261-eccc02fe61b4/length.bin +3 -0
  19. data/videos_subset.db/5c5462a4-1205-4d8f-8261-eccc02fe61b4/link_lists.bin +3 -0
  20. data/videos_subset.db/chroma.sqlite3 +3 -0
  21. data/videos_subset.json +14 -0
  22. data/videos_subset_more_context.db/7dde0ca4-4e76-4236-abfc-198f65e02dc0/data_level0.bin +3 -0
  23. data/videos_subset_more_context.db/7dde0ca4-4e76-4236-abfc-198f65e02dc0/header.bin +3 -0
  24. data/videos_subset_more_context.db/7dde0ca4-4e76-4236-abfc-198f65e02dc0/index_metadata.pickle +3 -0
  25. data/videos_subset_more_context.db/7dde0ca4-4e76-4236-abfc-198f65e02dc0/length.bin +3 -0
  26. data/videos_subset_more_context.db/7dde0ca4-4e76-4236-abfc-198f65e02dc0/link_lists.bin +3 -0
  27. data/videos_subset_more_context.db/chroma.sqlite3 +3 -0
  28. main.py +25 -0
  29. models/__init__.py +0 -0
  30. models/etl.py +126 -0
  31. models/llm.py +50 -0
  32. models/retrieval.py +11 -0
  33. package-lock.json +23 -0
  34. package.json +5 -0
  35. requirements.txt +127 -0
  36. run_etl.py +14 -0
  37. tests/__init__.py +0 -0
  38. tests/test_main.py +43 -0
  39. tests/test_retrieval.py +20 -0
  40. utils/__init__.py +0 -0
  41. utils/embedding_utils.py +23 -0
  42. utils/general_utils.py +12 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ data/videos_subset.db/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
37
+ data/videos_subset_more_context.db/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
98
+ __pypackages__/
99
+
100
+ # Celery stuff
101
+ celerybeat-schedule
102
+ celerybeat.pid
103
+
104
+ # SageMath parsed files
105
+ *.sage.py
106
+
107
+ # Environments
108
+ .env
109
+ .venv
110
+ env/
111
+ venv/
112
+ ENV/
113
+ env.bak/
114
+ venv.bak/
115
+
116
+ # Spyder project settings
117
+ .spyderproject
118
+ .spyproject
119
+
120
+ # Rope project settings
121
+ .ropeproject
122
+
123
+ # mkdocs documentation
124
+ /site
125
+
126
+ # mypy
127
+ .mypy_cache/
128
+ .dmypy.json
129
+ dmypy.json
130
+
131
+ # Pyre type checker
132
+ .pyre/
133
+
134
+ # pytype static type analyzer
135
+ .pytype/
136
+
137
+ # Cython debug symbols
138
+ cython_debug/
ETL_Walkthrough.ipynb ADDED
@@ -0,0 +1,375 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# ETL to get the text data from the playlist\n",
8
+ "\n",
9
+ "This notebook shows the process of building the corpus of transcripts from the YouTube playlist.\n",
10
+ "\n",
11
+ "**Extract**: Pull data (transcripts) from each video. \n",
12
+ "**Transform**: Format each transcript with metadata and batch the small segments with overlap. \n",
13
+ "**Load**: Load data into our database where it will be retrieved from. "
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "code",
18
+ "execution_count": 1,
19
+ "metadata": {},
20
+ "outputs": [],
21
+ "source": [
22
+ "from models import etl\n",
23
+ "import json"
24
+ ]
25
+ },
26
+ {
27
+ "cell_type": "markdown",
28
+ "metadata": {},
29
+ "source": [
30
+ "First we load the video information. This includes the video IDs and titles."
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": 2,
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "with open('data/single_video.json') as f:\n",
40
+ " video_info = json.load(f)"
41
+ ]
42
+ },
43
+ {
44
+ "cell_type": "markdown",
45
+ "metadata": {},
46
+ "source": [
47
+ "Then we must extract the transcripts using the YouTube Transcript API. This is done over all of the videos. \n",
48
+ "This produces a list of video segments with timestamps. \n",
49
+ "Next, we format the transcript by adding metadata so that the segments are easily identified for retrieval later. \n",
50
+ "Since the original segments are small, they are batched with overlap to preserve semantic meaning."
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "code",
55
+ "execution_count": 3,
56
+ "metadata": {},
57
+ "outputs": [
58
+ {
59
+ "name": "stdout",
60
+ "output_type": "stream",
61
+ "text": [
62
+ "get_video_transcript took 0.84 seconds.\n",
63
+ "Transcript for video 5sLYAQS9sWQ fetched.\n"
64
+ ]
65
+ }
66
+ ],
67
+ "source": [
68
+ "videos = []\n",
69
+ "for video in video_info:\n",
70
+ " video_id = video[\"id\"]\n",
71
+ " video_title = video[\"title\"]\n",
72
+ " transcript = etl.get_video_transcript(video_id)\n",
73
+ " print(f\"Transcript for video {video_id} fetched.\")\n",
74
+ " if transcript:\n",
75
+ " formatted_transcript = etl.format_transcript(transcript, video_id, video_title, batch_size=5, overlap=2)\n",
76
+ " \n",
77
+ " videos.extend(formatted_transcript)"
78
+ ]
79
+ },
80
+ {
81
+ "cell_type": "markdown",
82
+ "metadata": {},
83
+ "source": [
84
+ "The last step is to load the data into a database. We will use a Chromadb database. \n",
85
+ "The embedding function is `MyEmbeddingFunction` from `utils.embedding_utils`, which uses a model from HuggingFace."
86
+ ]
87
+ },
88
+ {
89
+ "cell_type": "code",
90
+ "execution_count": 4,
91
+ "metadata": {},
92
+ "outputs": [
93
+ {
94
+ "name": "stdout",
95
+ "output_type": "stream",
96
+ "text": [
97
+ "Database created at data/single_video.db\n"
98
+ ]
99
+ }
100
+ ],
101
+ "source": [
102
+ "# Initialize the database\n",
103
+ "from utils.embedding_utils import MyEmbeddingFunction\n",
104
+ "import chromadb\n",
105
+ "\n",
106
+ "embed_text = MyEmbeddingFunction()\n",
107
+ "\n",
108
+ "db_path = \"data/single_video.db\"\n",
109
+ "client = chromadb.PersistentClient(path=db_path)\n",
110
+ "\n",
111
+ "client.create_collection(\n",
112
+ " name=\"huberman_videos\",\n",
113
+ " embedding_function=embed_text,\n",
114
+ " metadata={\"hnsw:space\": \"cosine\"}\n",
115
+ ")\n",
116
+ "\n",
117
+ "print(f\"Database created at {db_path}\")"
118
+ ]
119
+ },
120
+ {
121
+ "cell_type": "code",
122
+ "execution_count": 5,
123
+ "metadata": {},
124
+ "outputs": [
125
+ {
126
+ "name": "stdout",
127
+ "output_type": "stream",
128
+ "text": [
129
+ "Data loaded to database at data/single_video.db.\n"
130
+ ]
131
+ }
132
+ ],
133
+ "source": [
134
+ "# Add the data to the database\n",
135
+ "client = chromadb.PersistentClient(path=db_path)\n",
136
+ " \n",
137
+ "collection = client.get_collection(\"huberman_videos\")\n",
138
+ "\n",
139
+ "documents = [segment['text'] for segment in videos]\n",
140
+ "metadata = [segment['metadata'] for segment in videos]\n",
141
+ "ids = [segment['metadata']['segment_id'] for segment in videos]\n",
142
+ "\n",
143
+ "collection.add(\n",
144
+ " documents=documents,\n",
145
+ " metadatas=metadata,\n",
146
+ " ids=ids\n",
147
+ ")\n",
148
+ "\n",
149
+ "print(f\"Data loaded to database at {db_path}.\")"
150
+ ]
151
+ },
152
+ {
153
+ "cell_type": "markdown",
154
+ "metadata": {},
155
+ "source": [
156
+ "Here is some of the data:"
157
+ ]
158
+ },
159
+ {
160
+ "cell_type": "code",
161
+ "execution_count": 8,
162
+ "metadata": {},
163
+ "outputs": [
164
+ {
165
+ "name": "stdout",
166
+ "output_type": "stream",
167
+ "text": [
168
+ "Number of segments: 26\n"
169
+ ]
170
+ },
171
+ {
172
+ "data": {
173
+ "text/html": [
174
+ "<div>\n",
175
+ "<style scoped>\n",
176
+ " .dataframe tbody tr th:only-of-type {\n",
177
+ " vertical-align: middle;\n",
178
+ " }\n",
179
+ "\n",
180
+ " .dataframe tbody tr th {\n",
181
+ " vertical-align: top;\n",
182
+ " }\n",
183
+ "\n",
184
+ " .dataframe thead th {\n",
185
+ " text-align: right;\n",
186
+ " }\n",
187
+ "</style>\n",
188
+ "<table border=\"1\" class=\"dataframe\">\n",
189
+ " <thead>\n",
190
+ " <tr style=\"text-align: right;\">\n",
191
+ " <th></th>\n",
192
+ " <th>ids</th>\n",
193
+ " <th>embeddings</th>\n",
194
+ " <th>metadatas</th>\n",
195
+ " <th>documents</th>\n",
196
+ " <th>uris</th>\n",
197
+ " <th>data</th>\n",
198
+ " </tr>\n",
199
+ " </thead>\n",
200
+ " <tbody>\n",
201
+ " <tr>\n",
202
+ " <th>0</th>\n",
203
+ " <td>5sLYAQS9sWQ__0</td>\n",
204
+ " <td>[-0.11489544063806534, -0.03262839838862419, -...</td>\n",
205
+ " <td>{'segment_id': '5sLYAQS9sWQ__0', 'source': 'ht...</td>\n",
206
+ " <td>GPT, or Generative Pre-trained Transformer, is...</td>\n",
207
+ " <td>None</td>\n",
208
+ " <td>None</td>\n",
209
+ " </tr>\n",
210
+ " <tr>\n",
211
+ " <th>1</th>\n",
212
+ " <td>5sLYAQS9sWQ__12</td>\n",
213
+ " <td>[0.094169981777668, -0.10430295020341873, 0.02...</td>\n",
214
+ " <td>{'segment_id': '5sLYAQS9sWQ__12', 'source': 'h...</td>\n",
215
+ " <td>Now foundation models are pre-trained on large...</td>\n",
216
+ " <td>None</td>\n",
217
+ " <td>None</td>\n",
218
+ " </tr>\n",
219
+ " <tr>\n",
220
+ " <th>2</th>\n",
221
+ " <td>5sLYAQS9sWQ__15</td>\n",
222
+ " <td>[0.042587604373693466, -0.061460819095373154, ...</td>\n",
223
+ " <td>{'segment_id': '5sLYAQS9sWQ__15', 'source': 'h...</td>\n",
224
+ " <td>I'm talking about things like code. Now, large...</td>\n",
225
+ " <td>None</td>\n",
226
+ " <td>None</td>\n",
227
+ " </tr>\n",
228
+ " <tr>\n",
229
+ " <th>3</th>\n",
230
+ " <td>5sLYAQS9sWQ__18</td>\n",
231
+ " <td>[-0.0245895367115736, -0.058405470103025436, -...</td>\n",
232
+ " <td>{'segment_id': '5sLYAQS9sWQ__18', 'source': 'h...</td>\n",
233
+ " <td>these models can be tens of gigabytes in size ...</td>\n",
234
+ " <td>None</td>\n",
235
+ " <td>None</td>\n",
236
+ " </tr>\n",
237
+ " <tr>\n",
238
+ " <th>4</th>\n",
239
+ " <td>5sLYAQS9sWQ__21</td>\n",
240
+ " <td>[0.05348338559269905, -0.016104578971862793, -...</td>\n",
241
+ " <td>{'segment_id': '5sLYAQS9sWQ__21', 'source': 'h...</td>\n",
242
+ " <td>So to put that into perspective, a text file t...</td>\n",
243
+ " <td>None</td>\n",
244
+ " <td>None</td>\n",
245
+ " </tr>\n",
246
+ " <tr>\n",
247
+ " <th>5</th>\n",
248
+ " <td>5sLYAQS9sWQ__24</td>\n",
249
+ " <td>[0.07004527002573013, -0.08996045589447021, -0...</td>\n",
250
+ " <td>{'segment_id': '5sLYAQS9sWQ__24', 'source': 'h...</td>\n",
251
+ " <td>A lot of words just in one Gb. And how many gi...</td>\n",
252
+ " <td>None</td>\n",
253
+ " <td>None</td>\n",
254
+ " </tr>\n",
255
+ " <tr>\n",
256
+ " <th>6</th>\n",
257
+ " <td>5sLYAQS9sWQ__27</td>\n",
258
+ " <td>[0.0283487681299448, -0.11020224541425705, -0....</td>\n",
259
+ " <td>{'segment_id': '5sLYAQS9sWQ__27', 'source': 'h...</td>\n",
260
+ " <td>Yeah, that's truly a lot of text. And LLMs are...</td>\n",
261
+ " <td>None</td>\n",
262
+ " <td>None</td>\n",
263
+ " </tr>\n",
264
+ " <tr>\n",
265
+ " <th>7</th>\n",
266
+ " <td>5sLYAQS9sWQ__3</td>\n",
267
+ " <td>[-0.0700172707438469, -0.061202701181173325, -...</td>\n",
268
+ " <td>{'segment_id': '5sLYAQS9sWQ__3', 'source': 'ht...</td>\n",
269
+ " <td>And I've been using GPT in its various forms f...</td>\n",
270
+ " <td>None</td>\n",
271
+ " <td>None</td>\n",
272
+ " </tr>\n",
273
+ " <tr>\n",
274
+ " <th>8</th>\n",
275
+ " <td>5sLYAQS9sWQ__30</td>\n",
276
+ " <td>[-0.04904637485742569, -0.1277533322572708, -0...</td>\n",
277
+ " <td>{'segment_id': '5sLYAQS9sWQ__30', 'source': 'h...</td>\n",
278
+ " <td>and the more parameters a model has, the more ...</td>\n",
279
+ " <td>None</td>\n",
280
+ " <td>None</td>\n",
281
+ " </tr>\n",
282
+ " <tr>\n",
283
+ " <th>9</th>\n",
284
+ " <td>5sLYAQS9sWQ__33</td>\n",
285
+ " <td>[0.03286760300397873, -0.041724931448698044, 0...</td>\n",
286
+ " <td>{'segment_id': '5sLYAQS9sWQ__33', 'source': 'h...</td>\n",
287
+ " <td>All right, so how do they work? Well, we can t...</td>\n",
288
+ " <td>None</td>\n",
289
+ " <td>None</td>\n",
290
+ " </tr>\n",
291
+ " </tbody>\n",
292
+ "</table>\n",
293
+ "</div>"
294
+ ],
295
+ "text/plain": [
296
+ " ids embeddings \\\n",
297
+ "0 5sLYAQS9sWQ__0 [-0.11489544063806534, -0.03262839838862419, -... \n",
298
+ "1 5sLYAQS9sWQ__12 [0.094169981777668, -0.10430295020341873, 0.02... \n",
299
+ "2 5sLYAQS9sWQ__15 [0.042587604373693466, -0.061460819095373154, ... \n",
300
+ "3 5sLYAQS9sWQ__18 [-0.0245895367115736, -0.058405470103025436, -... \n",
301
+ "4 5sLYAQS9sWQ__21 [0.05348338559269905, -0.016104578971862793, -... \n",
302
+ "5 5sLYAQS9sWQ__24 [0.07004527002573013, -0.08996045589447021, -0... \n",
303
+ "6 5sLYAQS9sWQ__27 [0.0283487681299448, -0.11020224541425705, -0.... \n",
304
+ "7 5sLYAQS9sWQ__3 [-0.0700172707438469, -0.061202701181173325, -... \n",
305
+ "8 5sLYAQS9sWQ__30 [-0.04904637485742569, -0.1277533322572708, -0... \n",
306
+ "9 5sLYAQS9sWQ__33 [0.03286760300397873, -0.041724931448698044, 0... \n",
307
+ "\n",
308
+ " metadatas \\\n",
309
+ "0 {'segment_id': '5sLYAQS9sWQ__0', 'source': 'ht... \n",
310
+ "1 {'segment_id': '5sLYAQS9sWQ__12', 'source': 'h... \n",
311
+ "2 {'segment_id': '5sLYAQS9sWQ__15', 'source': 'h... \n",
312
+ "3 {'segment_id': '5sLYAQS9sWQ__18', 'source': 'h... \n",
313
+ "4 {'segment_id': '5sLYAQS9sWQ__21', 'source': 'h... \n",
314
+ "5 {'segment_id': '5sLYAQS9sWQ__24', 'source': 'h... \n",
315
+ "6 {'segment_id': '5sLYAQS9sWQ__27', 'source': 'h... \n",
316
+ "7 {'segment_id': '5sLYAQS9sWQ__3', 'source': 'ht... \n",
317
+ "8 {'segment_id': '5sLYAQS9sWQ__30', 'source': 'h... \n",
318
+ "9 {'segment_id': '5sLYAQS9sWQ__33', 'source': 'h... \n",
319
+ "\n",
320
+ " documents uris data \n",
321
+ "0 GPT, or Generative Pre-trained Transformer, is... None None \n",
322
+ "1 Now foundation models are pre-trained on large... None None \n",
323
+ "2 I'm talking about things like code. Now, large... None None \n",
324
+ "3 these models can be tens of gigabytes in size ... None None \n",
325
+ "4 So to put that into perspective, a text file t... None None \n",
326
+ "5 A lot of words just in one Gb. And how many gi... None None \n",
327
+ "6 Yeah, that's truly a lot of text. And LLMs are... None None \n",
328
+ "7 And I've been using GPT in its various forms f... None None \n",
329
+ "8 and the more parameters a model has, the more ... None None \n",
330
+ "9 All right, so how do they work? Well, we can t... None None "
331
+ ]
332
+ },
333
+ "execution_count": 8,
334
+ "metadata": {},
335
+ "output_type": "execute_result"
336
+ }
337
+ ],
338
+ "source": [
339
+ "import pandas as pd\n",
340
+ "\n",
341
+ "client = chromadb.PersistentClient('data/single_video.db')\n",
342
+ "collection= client.get_collection('huberman_videos')\n",
343
+ "\n",
344
+ "num_segments = collection.count()\n",
345
+ "sample_data = collection.peek()\n",
346
+ "\n",
347
+ "transcript_df = pd.DataFrame(sample_data)\n",
348
+ "\n",
349
+ "print(f\"Number of segments: {num_segments}\")\n",
350
+ "transcript_df"
351
+ ]
352
+ }
353
+ ],
354
+ "metadata": {
355
+ "kernelspec": {
356
+ "display_name": "Python 3",
357
+ "language": "python",
358
+ "name": "python3"
359
+ },
360
+ "language_info": {
361
+ "codemirror_mode": {
362
+ "name": "ipython",
363
+ "version": 3
364
+ },
365
+ "file_extension": ".py",
366
+ "mimetype": "text/x-python",
367
+ "name": "python",
368
+ "nbconvert_exporter": "python",
369
+ "pygments_lexer": "ipython3",
370
+ "version": "3.11.1"
371
+ }
372
+ },
373
+ "nbformat": 4,
374
+ "nbformat_minor": 2
375
+ }
README.md CHANGED
@@ -1,12 +1,6 @@
1
  ---
2
- title: Fitness QA Bot
3
- emoji: 🔥
4
- colorFrom: pink
5
- colorTo: yellow
6
  sdk: gradio
7
- sdk_version: 4.17.0
8
- app_file: app.py
9
- pinned: false
10
  ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Fitness_QA_Bot
3
+ app_file: main.py
 
 
4
  sdk: gradio
5
+ sdk_version: 4.16.0
 
 
6
  ---
 
 
TESTING.ipynb ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "from models import etl"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": null,
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "etl.main(json_path='data/single_video.json', db='data/single_video.db', batch_size=5, overlap=2)"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": 1,
24
+ "metadata": {},
25
+ "outputs": [],
26
+ "source": [
27
+ "import chromadb\n",
28
+ "from models import etl"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": 2,
34
+ "metadata": {},
35
+ "outputs": [],
36
+ "source": [
37
+ "client = chromadb.PersistentClient('data/single_video.db')\n",
38
+ "collection= client.get_collection('huberman_videos')\n",
39
+ "# collection.count()\n",
40
+ "# collection.peek()\n",
41
+ "\n",
42
+ "query_text = \"What are the components of an LLM?\"\n",
43
+ "query_embedding = etl.embed_text(query_text)\n",
44
+ "results = collection.query(query_texts=[query_text], n_results=5)"
45
+ ]
46
+ },
47
+ {
48
+ "cell_type": "code",
49
+ "execution_count": 3,
50
+ "metadata": {},
51
+ "outputs": [
52
+ {
53
+ "data": {
54
+ "text/plain": [
55
+ "{'ids': [['5sLYAQS9sWQ__33',\n",
56
+ " '5sLYAQS9sWQ__36',\n",
57
+ " '5sLYAQS9sWQ__3',\n",
58
+ " '5sLYAQS9sWQ__6',\n",
59
+ " '5sLYAQS9sWQ__27']],\n",
60
+ " 'distances': [[0.27329726119651687,\n",
61
+ " 0.3594438065792097,\n",
62
+ " 0.4730243492988927,\n",
63
+ " 0.5004446084705303,\n",
64
+ " 0.5766584257317211]],\n",
65
+ " 'metadatas': [[{'segment_id': '5sLYAQS9sWQ__33',\n",
66
+ " 'source': 'https://www.youtube.com/watch?v=5sLYAQS9sWQ&t=145.328s',\n",
67
+ " 'title': 'How Large Language Models Work',\n",
68
+ " 'video_id': '5sLYAQS9sWQ'},\n",
69
+ " {'segment_id': '5sLYAQS9sWQ__36',\n",
70
+ " 'source': 'https://www.youtube.com/watch?v=5sLYAQS9sWQ&t=154.367s',\n",
71
+ " 'title': 'How Large Language Models Work',\n",
72
+ " 'video_id': '5sLYAQS9sWQ'},\n",
73
+ " {'segment_id': '5sLYAQS9sWQ__3',\n",
74
+ " 'source': 'https://www.youtube.com/watch?v=5sLYAQS9sWQ&t=10.783s',\n",
75
+ " 'title': 'How Large Language Models Work',\n",
76
+ " 'video_id': '5sLYAQS9sWQ'},\n",
77
+ " {'segment_id': '5sLYAQS9sWQ__6',\n",
78
+ " 'source': 'https://www.youtube.com/watch?v=5sLYAQS9sWQ&t=22.544s',\n",
79
+ " 'title': 'How Large Language Models Work',\n",
80
+ " 'video_id': '5sLYAQS9sWQ'},\n",
81
+ " {'segment_id': '5sLYAQS9sWQ__27',\n",
82
+ " 'source': 'https://www.youtube.com/watch?v=5sLYAQS9sWQ&t=117.572s',\n",
83
+ " 'title': 'How Large Language Models Work',\n",
84
+ " 'video_id': '5sLYAQS9sWQ'}]],\n",
85
+ " 'embeddings': None,\n",
86
+ " 'documents': [['All right, so how do they work? Well, we can think of it like this. LLM equals three things: data, architecture, and lastly, we can think of it as training. Those three things are really the components of an LLM.',\n",
87
+ " \"data, architecture, and lastly, we can think of it as training. Those three things are really the components of an LLM. Now, we've already discussed the enormous amounts of text data that goes into these things. As for the architecture, this is a neural network and for GPT that is a transformer.\",\n",
88
+ " 'And I\\'ve been using GPT in its various forms for years. In this video we are going to number 1, ask \"what is an LLM?\" Number 2, we are going to describe how they work. And then number 3,',\n",
89
+ " 'Number 2, we are going to describe how they work. And then number 3, we\\'re going to ask, \"what are the business applications of LLMs?\" So let\\'s start with number 1, \"what is a large language model?\" Well, a large language model',\n",
90
+ " \"Yeah, that's truly a lot of text. And LLMs are also among the biggest models when it comes to parameter count. A parameter is a value the model can change independently as it learns, and the more parameters a model has, the more complex it can be. GPT-3, for example, is pre-trained on a corpus of actually 45 terabytes of data,\"]],\n",
91
+ " 'uris': None,\n",
92
+ " 'data': None}"
93
+ ]
94
+ },
95
+ "execution_count": 3,
96
+ "metadata": {},
97
+ "output_type": "execute_result"
98
+ }
99
+ ],
100
+ "source": [
101
+ "results"
102
+ ]
103
+ },
104
+ {
105
+ "cell_type": "code",
106
+ "execution_count": 10,
107
+ "metadata": {},
108
+ "outputs": [
109
+ {
110
+ "name": "stdout",
111
+ "output_type": "stream",
112
+ "text": [
113
+ "CONTEXT: All right, so how do they work? Well, we can think of it like this. LLM equals three things: data, architecture, and lastly, we can think of it as training. Those three things are really the components of an LLM.\n",
114
+ "TITLE: How Large Language Models Work\n",
115
+ "SOURCE: https://www.youtube.com/watch?v=5sLYAQS9sWQ&t=145.328s\n",
116
+ "\n",
117
+ "CONTEXT: data, architecture, and lastly, we can think of it as training. Those three things are really the components of an LLM. Now, we've already discussed the enormous amounts of text data that goes into these things. As for the architecture, this is a neural network and for GPT that is a transformer.\n",
118
+ "TITLE: How Large Language Models Work\n",
119
+ "SOURCE: https://www.youtube.com/watch?v=5sLYAQS9sWQ&t=154.367s\n",
120
+ "\n",
121
+ "CONTEXT: And I've been using GPT in its various forms for years. In this video we are going to number 1, ask \"what is an LLM?\" Number 2, we are going to describe how they work. And then number 3,\n",
122
+ "TITLE: How Large Language Models Work\n",
123
+ "SOURCE: https://www.youtube.com/watch?v=5sLYAQS9sWQ&t=10.783s\n",
124
+ "\n",
125
+ "CONTEXT: Number 2, we are going to describe how they work. And then number 3, we're going to ask, \"what are the business applications of LLMs?\" So let's start with number 1, \"what is a large language model?\" Well, a large language model\n",
126
+ "TITLE: How Large Language Models Work\n",
127
+ "SOURCE: https://www.youtube.com/watch?v=5sLYAQS9sWQ&t=22.544s\n",
128
+ "\n",
129
+ "CONTEXT: Yeah, that's truly a lot of text. And LLMs are also among the biggest models when it comes to parameter count. A parameter is a value the model can change independently as it learns, and the more parameters a model has, the more complex it can be. GPT-3, for example, is pre-trained on a corpus of actually 45 terabytes of data,\n",
130
+ "TITLE: How Large Language Models Work\n",
131
+ "SOURCE: https://www.youtube.com/watch?v=5sLYAQS9sWQ&t=117.572s\n",
132
+ "\n",
133
+ "\n"
134
+ ]
135
+ }
136
+ ],
137
+ "source": [
138
+ "from models.llm import format_context\n",
139
+ "\n",
140
+ "print(format_context(results))"
141
+ ]
142
+ }
143
+ ],
144
+ "metadata": {
145
+ "kernelspec": {
146
+ "display_name": "Python 3",
147
+ "language": "python",
148
+ "name": "python3"
149
+ },
150
+ "language_info": {
151
+ "codemirror_mode": {
152
+ "name": "ipython",
153
+ "version": 3
154
+ },
155
+ "file_extension": ".py",
156
+ "mimetype": "text/x-python",
157
+ "name": "python",
158
+ "nbconvert_exporter": "python",
159
+ "pygments_lexer": "ipython3",
160
+ "version": "3.11.1"
161
+ }
162
+ },
163
+ "nbformat": 4,
164
+ "nbformat_minor": 2
165
+ }
__init__.py ADDED
File without changes
data/logs/videos_subset_more_context_load_log.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"videos_info_path": "data/videos_subset.json", "db_path": "data/videos_subset_more_context.db", "batch_size": 15, "overlap": 10, "load_time": "2024-02-07 19:44:01.376335"}
data/single_video.db/2b08fbf8-a94d-47d9-97aa-102f2895fbd0/data_level0.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3c9fd302f000d7790aa403c2d0d8fec363fe46f30b07d53020b6e33b22435a9
3
+ size 1676000
data/single_video.db/2b08fbf8-a94d-47d9-97aa-102f2895fbd0/header.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e87a1dc8bcae6f2c4bea6d5dd5005454d4dace8637dae29bff3c037ea771411e
3
+ size 100
data/single_video.db/2b08fbf8-a94d-47d9-97aa-102f2895fbd0/length.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:135da168ae29fd55e9cbd2f05414e81b32cd0783d38340b404e315cebd7a7647
3
+ size 4000
data/single_video.db/2b08fbf8-a94d-47d9-97aa-102f2895fbd0/link_lists.bin ADDED
File without changes
data/single_video.db/chroma.sqlite3 ADDED
Binary file (311 kB). View file
 
data/single_video.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "id": "5sLYAQS9sWQ",
4
+ "title": "How Large Language Models Work"
5
+ }
6
+ ]
data/videos.json ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "id": "LYYyQcAJZfk",
4
+ "title": "Science-Supported Tools to Accelerate Your Fitness Goals | Huberman Lab Podcast"
5
+ },
6
+ {
7
+ "id": "q37ARYnRDGc",
8
+ "title": "Dr. Andy Galpin: Optimal Nutrition & Supplementation for Fitness | Huberman Lab Guest Series"
9
+ },
10
+ {
11
+ "id": "juD99_sPWGU",
12
+ "title": "Dr. Andy Galpin: Maximize Recovery to Achieve Fitness & Performance Goals | Huberman Lab"
13
+ },
14
+ {
15
+ "id": "UIy-WQCZd4M",
16
+ "title": "Dr. Andy Galpin: Optimize Your Training Program for Fitness & Longevity | Huberman Lab Guest Series"
17
+ },
18
+ {
19
+ "id": "oNkDA2F7CjM",
20
+ "title": "Dr. Andy Galpin: How to Build Physical Endurance & Lose Fat | Huberman Lab Guest Series"
21
+ },
22
+ {
23
+ "id": "CyDLbrZK75U",
24
+ "title": "Dr. Andy Galpin: Optimal Protocols to Build Strength & Grow Muscles | Huberman Lab Guest Series"
25
+ },
26
+ {
27
+ "id": "zEYE-vcVKy8",
28
+ "title": "Dr. Andy Galpin: How to Assess & Improve All Aspects of Your Fitness | Huberman Lab Guest Series"
29
+ },
30
+ {
31
+ "id": "q1Ss8sTbFBY",
32
+ "title": "Fitness Toolkit: Protocol & Tools to Optimize Physical Health | Huberman Lab Podcast #94"
33
+ },
34
+ {
35
+ "id": "DTCmprPCDqc",
36
+ "title": "Dr. Peter Attia: Exercise, Nutrition, Hormones for Vitality & Longevity | Huberman Lab Podcast #85"
37
+ },
38
+ {
39
+ "id": "UNCwdFxPtE8",
40
+ "title": "Jeff Cavaliere: Optimize Your Exercise Program with Science-Based Tools | Huberman Lab Podcast #79"
41
+ },
42
+ {
43
+ "id": "a9yFKPmPZ90",
44
+ "title": "Ido Portal: The Science & Practice of Movement | Huberman Lab Podcast #77"
45
+ },
46
+ {
47
+ "id": "tkH2-_jMCSk",
48
+ "title": "Improve Flexibility with Research-Supported Stretching Protocols | Huberman Lab Podcast #76"
49
+ },
50
+ {
51
+ "id": "IAnhFUUCq6c",
52
+ "title": "Dr. Andy Galpin: How to Build Strength, Muscle Size & Endurance | Huberman Lab Podcast #65"
53
+ },
54
+ {
55
+ "id": "GLgKkG44MGo",
56
+ "title": "Dr. Jack Feldman: Breathing for Mental & Physical Health & Performance | Huberman Lab Podcast #54"
57
+ },
58
+ {
59
+ "id": "iMvtHqLmEkI",
60
+ "title": "Dr. Duncan French: How to Exercise for Strength Gains & Hormone Optimization | Huberman Lab #45"
61
+ },
62
+ {
63
+ "id": "VQLU7gpk_X8",
64
+ "title": "How to Build Endurance in Your Brain & Body"
65
+ },
66
+ {
67
+ "id": "XLr2RKoD-oY",
68
+ "title": "Science of Muscle Growth, Increasing Strength & Muscular Recovery"
69
+ }
70
+ ]
data/videos_subset.db/5c5462a4-1205-4d8f-8261-eccc02fe61b4/data_level0.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1dc504103bbe12bbc64b0a140e9c7eac03ceb4ad42d6b251e8399e6035b6407
3
+ size 3352000
data/videos_subset.db/5c5462a4-1205-4d8f-8261-eccc02fe61b4/header.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af2a3cca693212a055b68369ac5fd6015123fe35f04aafb696bf97a944142a76
3
+ size 100
data/videos_subset.db/5c5462a4-1205-4d8f-8261-eccc02fe61b4/index_metadata.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c76fc6a10e46df31db8a10e076a256a53418217dca6d9d7833623c4c1fcbee63
3
+ size 75509
data/videos_subset.db/5c5462a4-1205-4d8f-8261-eccc02fe61b4/length.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ad502e522081d0f08a17da3b8d9af56c4a78798cd4b76926924a473e7f210fb
3
+ size 8000
data/videos_subset.db/5c5462a4-1205-4d8f-8261-eccc02fe61b4/link_lists.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:858c69984324e9f15a6fbd27b9f4efdd2ec55fe7240c7c438f39a2d8cbd808fb
3
+ size 18268
data/videos_subset.db/chroma.sqlite3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6b150b1e85be961e1bcb3b67fa9ded5a20b79bb53ee59d05f5f2b6c09d7501e
3
+ size 13717504
data/videos_subset.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "id": "LYYyQcAJZfk",
4
+ "title": "Science-Supported Tools to Accelerate Your Fitness Goals | Huberman Lab Podcast"
5
+ },
6
+ {
7
+ "id": "q37ARYnRDGc",
8
+ "title": "Dr. Andy Galpin: Optimal Nutrition & Supplementation for Fitness | Huberman Lab Guest Series"
9
+ },
10
+ {
11
+ "id": "oNkDA2F7CjM",
12
+ "title": "Dr. Andy Galpin: How to Build Physical Endurance & Lose Fat | Huberman Lab Guest Series"
13
+ }
14
+ ]
data/videos_subset_more_context.db/7dde0ca4-4e76-4236-abfc-198f65e02dc0/data_level0.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:31040549c44b1b419d5f60f849fd1abd006c4dd33bf077cf498e92360a7a890f
3
+ size 3352000
data/videos_subset_more_context.db/7dde0ca4-4e76-4236-abfc-198f65e02dc0/header.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:092185a3873be9e8dcabd69df1976197a0afb6cf7d175f3ee73ccae7283d0abc
3
+ size 100
data/videos_subset_more_context.db/7dde0ca4-4e76-4236-abfc-198f65e02dc0/index_metadata.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f2bc9be2510d046cef8a9e0aabc77141b639c086811cdbd4203b15977c7b2f6e
3
+ size 75323
data/videos_subset_more_context.db/7dde0ca4-4e76-4236-abfc-198f65e02dc0/length.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6cd4536124c25ea26dc53deadb45f017008f2815fbe38deb6e22ae1ecec8e57a
3
+ size 8000
data/videos_subset_more_context.db/7dde0ca4-4e76-4236-abfc-198f65e02dc0/link_lists.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f0e5fb78cc60a74d205b2b5797f65b15ce3ecd5d63d72f4ef82daf716e787a39
3
+ size 18200
data/videos_subset_more_context.db/chroma.sqlite3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bfa8d1ef4a95be2ae4714fbb86e740c4ed4bd334aa068fb64817f84e25857bda
3
+ size 21721088
main.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from models import llm, retrieval
2
+ import gradio as gr
3
+
4
+
5
def run_query(question, db_path="data/videos_subset_more_context.db",  # dev path
              num_rel_segments=5,
              llm_model="gpt-3.5-turbo-0125",
              llm_temp=0.1):
    """Answer ``question`` with an LLM, using retrieved transcript segments as context.

    Args:
        question: The user's question text.
        db_path: Path of the database to search (defaults to the dev database).
        num_rel_segments: Number of segments requested from the retrieval step.
        llm_model: Model name forwarded to ``llm.answer_with_context``.
        llm_temp: Temperature forwarded to ``llm.answer_with_context``.

    Returns:
        The value returned by ``llm.answer_with_context``.
    """
    segments = retrieval.get_relevant_segments(
        question,
        db_path=db_path,
        n_results=num_rel_segments,
    )
    return llm.answer_with_context(
        question,
        segments,
        model=llm_model,
        temperature=llm_temp,
    )
20
+
21
+
22
if __name__ == "__main__":
    # One text input, one text output; every submission is answered by run_query.
    demo = gr.Interface(
        fn=run_query,
        inputs="text",
        outputs="text",
    )
    demo.launch(share=True)
models/__init__.py ADDED
File without changes
models/etl.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import chromadb
3
+ from datetime import datetime
4
+
5
+ from utils.general_utils import timeit
6
+ from utils.embedding_utils import MyEmbeddingFunction
7
+ from youtube_transcript_api import YouTubeTranscriptApi
8
+
9
+
10
@timeit
def run_etl(json_path="data/videos.json", db=None, batch_size=None, overlap=None):
    """Fetch, segment and optionally store the transcripts of a list of videos.

    Args:
        json_path: JSON file containing a list of objects with "id" and
            "title" keys.
        db: Path of the database to create and fill. When falsy, nothing is
            stored and the formatted segments are printed instead.
        batch_size: Forwarded to ``format_transcript``.
        overlap: Forwarded to ``format_transcript``.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(json_path, encoding="utf-8") as f:
        video_info = json.load(f)

    videos = []
    for video in video_info:
        video_id = video["id"]
        video_title = video["title"]
        transcript = get_video_transcript(video_id)
        if not transcript:
            # Nothing usable came back (a fetch failure is reported by
            # get_video_transcript itself), so do not claim success.
            continue

        print(f"Transcript for video {video_id} fetched.")
        videos.extend(
            format_transcript(
                transcript,
                video_id,
                video_title,
                batch_size=batch_size,
                overlap=overlap,
            )
        )

    if db:
        initialize_db(db)
        load_data_to_db(db, videos)
        log_data_load(json_path, db, batch_size, overlap)
    else:
        print("No database specified. Skipping database load.")
        print(videos)
33
+
34
+
35
@timeit
def get_video_transcript(video_id):
    """Return the transcript for ``video_id``, or None when fetching fails.

    The transcript is requested in the 'en' / 'en-US' languages. Any error is
    printed rather than raised, so the caller can skip the video.
    """
    try:
        return YouTubeTranscriptApi.get_transcript(video_id, languages=['en', 'en-US'])
    except Exception as e:
        print(f"Error fetching transcript for video {video_id}: {str(e)}")
    return None
43
+
44
+
45
def format_transcript(transcript, video_id, video_title, batch_size=None, overlap=None):
    """Group transcript entries into text segments with per-segment metadata.

    Args:
        transcript: Sequence of entries, each a mapping with "text" and
            "start" keys.
        video_id: Video identifier; used in the segment id and source URL.
        video_title: Title stored in every segment's metadata.
        batch_size: Number of consecutive entries joined into one segment.
            When falsy, each entry becomes its own segment and ``overlap``
            is ignored.
        overlap: Number of entries shared by consecutive segments. ``None``
            means no overlap. Must be smaller than ``batch_size``.

    Returns:
        A list of ``{"text": ..., "metadata": {...}}`` dicts. The metadata
        holds "video_id", "segment_id" (``<video_id>__<index of first entry>``),
        "title" and "source" (video URL with a ``&t=<start>s`` suffix).

    Raises:
        ValueError: If ``overlap`` is not smaller than ``batch_size``.
    """
    base_url = f"https://www.youtube.com/watch?v={video_id}"

    if not batch_size:
        batch_size, overlap = 1, 0
    elif overlap is None:
        # A batch size without an overlap used to fail on `batch_size - None`.
        overlap = 0

    step = batch_size - overlap
    if step <= 0:
        # A zero step makes range() fail with an unrelated message and a
        # negative step silently yields no segments at all.
        raise ValueError(
            f"overlap ({overlap}) must be smaller than batch_size ({batch_size})"
        )

    formatted_data = []
    for i in range(0, len(transcript), step):
        batch = transcript[i:i + batch_size]
        start_time = batch[0]["start"]

        metadata = {
            "video_id": video_id,
            "segment_id": video_id + "__" + str(i),
            "title": video_title,
            "source": f"{base_url}&t={start_time}s",
        }
        text = " ".join(entry["text"] for entry in batch)
        formatted_data.append({"text": text, "metadata": metadata})

    return formatted_data
75
+
76
+
77
+ embed_text = MyEmbeddingFunction()
78
+
79
def initialize_db(db_path, distance_metric="cosine"):
    """Create the "huberman_videos" collection in the database at ``db_path``.

    Args:
        db_path: Path handed to ``chromadb.PersistentClient``.
        distance_metric: Value stored under the collection's "hnsw:space"
            metadata key.

    Existing data is not cleared before the collection is created.
    """
    collection_metadata = {"hnsw:space": distance_metric}

    chromadb.PersistentClient(path=db_path).create_collection(
        name="huberman_videos",
        embedding_function=embed_text,
        metadata=collection_metadata,
    )

    print(f"Database created at {db_path}")
92
+
93
+
94
def load_data_to_db(db_path, data):
    """Add formatted transcript segments to the "huberman_videos" collection.

    Args:
        db_path: Path handed to ``chromadb.PersistentClient``.
        data: List of segments, each a dict with a "text" entry and a
            "metadata" dict that contains a "segment_id".

    Returns:
        None. When ``data`` is empty the database is not touched.
    """
    if not data:
        # Nothing was produced upstream (e.g. no transcript could be fetched);
        # skip the add instead of handing the collection empty lists.
        print(f"No data to load to database at {db_path}.")
        return

    client = chromadb.PersistentClient(path=db_path)

    # NOTE(review): the collection is fetched without an embedding_function,
    # whereas initialize_db passes one at creation time. Confirm which
    # embedder Chroma applies to the documents added here.
    collection = client.get_collection("huberman_videos")

    collection.add(
        documents=[segment['text'] for segment in data],
        metadatas=[segment['metadata'] for segment in data],
        ids=[segment['metadata']['segment_id'] for segment in data],
    )

    print(f"Data loaded to database at {db_path}.")
110
+
111
+
112
def log_data_load(json_path, db_path, batch_size, overlap):
    """Write a JSON record of a data load to ``data/logs/<db name>_load_log.json``.

    Args:
        json_path: Path of the video list that was loaded.
        db_path: Path of the database that was filled. Its last path
            component, up to the first ".", names the log file.
        batch_size: Batch size used for the load.
        overlap: Overlap used for the load.
    """
    import os  # local import: only this function needs it

    log_json = json.dumps({
        "videos_info_path": json_path,
        "db_path": db_path,
        "batch_size": batch_size,
        "overlap": overlap,
        "load_time": str(datetime.now())
    })

    # Accept both path separators and ignore a trailing one, so that
    # "data\\videos.db" and "data/videos.db/" both resolve to "videos".
    db_file = db_path.replace("\\", "/").rstrip("/").split("/")[-1]
    db_name = db_file.split(".")[0]

    # The log directory may not exist on a fresh checkout.
    os.makedirs("data/logs", exist_ok=True)
    log_path = f"data/logs/{db_name}_load_log.json"

    with open(log_path, "w", encoding="utf-8") as f:
        f.write(log_json)
models/llm.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import openai
2
+ import os
3
+ from dotenv import load_dotenv
4
+
5
+ # Load environment variables from .env file
6
+ load_dotenv()
7
+
8
+ # Get the API key from the environment variable
9
+ api_key = os.getenv("OPENAI_API_KEY")
10
+
11
+ # Create the OpenAI client
12
+ client = openai.OpenAI(api_key=api_key)
13
+
14
context_result_base = "CONTEXT: {text}\nTITLE: {title}\nSOURCE: {source}\n\n"


def format_context(db_query_results):
    """Render query results as a block of CONTEXT / TITLE / SOURCE entries.

    Args:
        db_query_results: Mapping whose 'documents' and 'metadatas' entries
            each hold one list per query; only the first query's lists are
            used. Every metadata dict must provide 'title' and 'source'.

    Returns:
        One string with an entry per document, each rendered with
        ``context_result_base``; an empty string when there are no documents.
    """
    documents = db_query_results['documents'][0]
    metadatas = db_query_results['metadatas'][0]

    # Pair each document with its metadata and build the string in one pass.
    return "".join(
        context_result_base.format(
            text=document,
            title=metadata['title'],
            source=metadata['source'],
        )
        for document, metadata in zip(documents, metadatas)
    )
26
+
27
+
28
def answer_with_context(question, context, model="gpt-3.5-turbo-1106", temperature=0.5):
    """Ask the chat model to answer ``question`` using retrieved context.

    Args:
        question: The user's question, sent as the last user message.
        context: Query results in the form accepted by ``format_context``.
        model: Chat model name passed to the completion call.
        temperature: Sampling temperature passed to the completion call.

    Returns:
        The message content of the first choice in the response.
    """
    instruction = '''You are a question-answering bot. The user will ask a question about fitness and recovery. First, you will be provided relevant context. The relevant context are segments of transcripts from Andrew Huberman's playlist on fitness and recovery where he has conversations about these topics. Answer the user's question and include the video title and link to the relevant context where they talk about the topic of the user's question. When referencing relevant context, return its TITLE and SOURCE. If no context are related to the question, answer the question yourself and state that "No relevant clips were found". Use this format:
User: ```What is muscle atrophy?```
AI: ```Muscle atrophy is the decrease in size and wasting of muscle tissue.
VIDEO: Example video title
SOURCE: Example video url```
'''

    # The context travels as its own user message, ahead of the question.
    context_message = "RELEVANT CONTEXT:\n```" + format_context(context) + "```"
    messages = [
        {"role": "system", "content": instruction},
        {"role": "user", "content": context_message},
        {"role": "user", "content": question},
    ]

    response = client.chat.completions.create(
        model=model,
        temperature=temperature,
        messages=messages,
    )
    first_choice = response.choices[0]
    return first_choice.message.content
models/retrieval.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import chromadb
2
+
3
+
4
def get_relevant_segments(query, db_path, n_results=5):
    """Query the 'huberman_videos' collection of the database at ``db_path``.

    Args:
        query: Text passed to the collection as the single query text.
        db_path: Path handed to ``chromadb.PersistentClient``.
        n_results: Number of results requested from the collection.

    Returns:
        The unmodified result of ``collection.query``.
    """
    collection = chromadb.PersistentClient(db_path).get_collection('huberman_videos')
    return collection.query(query_texts=[query], n_results=n_results)
11
+
package-lock.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "ml-qa-chatbot",
3
+ "lockfileVersion": 3,
4
+ "requires": true,
5
+ "packages": {
6
+ "": {
7
+ "dependencies": {
8
+ "dotenv": "^16.4.1"
9
+ }
10
+ },
11
+ "node_modules/dotenv": {
12
+ "version": "16.4.1",
13
+ "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.4.1.tgz",
14
+ "integrity": "sha512-CjA3y+Dr3FyFDOAMnxZEGtnW9KBR2M0JvvUtXNW+dYJL5ROWxP9DUHCwgFqpMk0OXCc0ljhaNTr2w/kutYIcHQ==",
15
+ "engines": {
16
+ "node": ">=12"
17
+ },
18
+ "funding": {
19
+ "url": "https://github.com/motdotla/dotenv?sponsor=1"
20
+ }
21
+ }
22
+ }
23
+ }
package.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "dependencies": {
3
+ "dotenv": "^16.4.1"
4
+ }
5
+ }
requirements.txt ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==23.2.1
2
+ altair==5.2.0
3
+ annotated-types==0.6.0
4
+ anyio==4.2.0
5
+ asgiref==3.7.2
6
+ attrs==23.2.0
7
+ backoff==2.2.1
8
+ bcrypt==4.1.2
9
+ build==1.0.3
10
+ cachetools==5.3.2
11
+ certifi==2024.2.2
12
+ charset-normalizer==3.3.2
13
+ chroma-hnswlib==0.7.3
14
+ chromadb==0.4.22
15
+ click==8.1.7
16
+ colorama==0.4.6
17
+ coloredlogs==15.0.1
18
+ contourpy==1.2.0
19
+ cycler==0.12.1
20
+ Deprecated==1.2.14
21
+ distro==1.9.0
22
+ fastapi==0.109.2
23
+ ffmpy==0.3.1
24
+ filelock==3.13.1
25
+ flatbuffers==23.5.26
26
+ fonttools==4.47.2
27
+ fsspec==2024.2.0
28
+ google-auth==2.27.0
29
+ googleapis-common-protos==1.62.0
30
+ gradio==4.16.0
31
+ gradio_client==0.8.1
32
+ grpcio==1.60.1
33
+ h11==0.14.0
34
+ httpcore==1.0.2
35
+ httptools==0.6.1
36
+ httpx==0.26.0
37
+ huggingface-hub==0.20.3
38
+ humanfriendly==10.0
39
+ idna==3.6
40
+ importlib-metadata==6.11.0
41
+ importlib-resources==6.1.1
42
+ Jinja2==3.1.3
43
+ joblib==1.3.2
44
+ jsonschema==4.21.1
45
+ jsonschema-specifications==2023.12.1
46
+ kiwisolver==1.4.5
47
+ kubernetes==29.0.0
48
+ markdown-it-py==3.0.0
49
+ MarkupSafe==2.1.5
50
+ matplotlib==3.8.2
51
+ mdurl==0.1.2
52
+ mmh3==4.1.0
53
+ monotonic==1.6
54
+ mpmath==1.3.0
55
+ networkx==3.2.1
56
+ numpy==1.26.3
57
+ oauthlib==3.2.2
58
+ onnxruntime==1.17.0
59
+ openai==1.11.1
60
+ opentelemetry-api==1.22.0
61
+ opentelemetry-exporter-otlp-proto-common==1.22.0
62
+ opentelemetry-exporter-otlp-proto-grpc==1.22.0
63
+ opentelemetry-instrumentation==0.43b0
64
+ opentelemetry-instrumentation-asgi==0.43b0
65
+ opentelemetry-instrumentation-fastapi==0.43b0
66
+ opentelemetry-proto==1.22.0
67
+ opentelemetry-sdk==1.22.0
68
+ opentelemetry-semantic-conventions==0.43b0
69
+ opentelemetry-util-http==0.43b0
70
+ orjson==3.9.13
71
+ overrides==7.7.0
72
+ packaging==23.2
73
+ pandas==2.2.0
74
+ pillow==10.2.0
75
+ posthog==3.4.0
76
+ protobuf==4.25.2
77
+ pulsar-client==3.4.0
78
+ pyasn1==0.5.1
79
+ pyasn1-modules==0.3.0
80
+ pydantic==2.6.1
81
+ pydantic_core==2.16.2
82
+ pydub==0.25.1
83
+ Pygments==2.17.2
84
+ pyparsing==3.1.1
85
+ PyPika==0.48.9
86
+ pyproject_hooks==1.0.0
87
+ pyreadline3==3.4.1
88
+ python-dateutil==2.8.2
89
+ python-dotenv==1.0.1
90
+ python-multipart==0.0.7
91
+ pytz==2024.1
92
+ PyYAML==6.0.1
93
+ referencing==0.33.0
94
+ regex==2023.12.25
95
+ requests==2.31.0
96
+ requests-oauthlib==1.3.1
97
+ rich==13.7.0
98
+ rpds-py==0.17.1
99
+ rsa==4.9
100
+ ruff==0.2.1
101
+ safetensors==0.4.2
102
+ scikit-learn==1.4.0
103
+ scipy==1.12.0
104
+ semantic-version==2.10.0
105
+ shellingham==1.5.4
106
+ six==1.16.0
107
+ sniffio==1.3.0
108
+ starlette==0.36.3
109
+ sympy==1.12
110
+ tenacity==8.2.3
111
+ threadpoolctl==3.2.0
112
+ tokenizers==0.15.1
113
+ tomlkit==0.12.0
114
+ toolz==0.12.1
115
+ torch==2.2.0
116
+ tqdm==4.66.1
117
+ transformers==4.37.2
118
+ typer==0.9.0
119
+ typing_extensions==4.9.0
120
+ tzdata==2023.4
121
+ urllib3==2.2.0
122
+ uvicorn==0.27.0.post1
123
+ watchfiles==0.21.0
124
+ websocket-client==1.7.0
125
+ websockets==11.0.3
126
+ wrapt==1.16.0
127
+ zipp==3.17.0
run_etl.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from models.etl import run_etl
2
+
3
def parse_optional_int(raw):
    """Convert prompt input to an int; blank input means "not provided" (None)."""
    raw = raw.strip()
    return int(raw) if raw else None


if __name__ == "__main__":
    json_path = input("Enter path to JSON file: ")
    db_path = input("Enter path to database: ")
    # The prompts invite blank answers, which a bare int() rejects.
    batch_size = parse_optional_int(input("Enter batch size (leave blank for no batching): "))

    if batch_size:
        # A blank overlap means "no overlap", i.e. zero shared entries.
        overlap = parse_optional_int(input("Enter overlap (leave blank for no overlap): ")) or 0
    else:
        overlap = None

    run_etl(json_path=json_path, db=db_path, batch_size=batch_size, overlap=overlap)
tests/__init__.py ADDED
File without changes
tests/test_main.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from main import run_query
2
+ from utils.general_utils import timeit
3
+
4
@timeit
def test_with_llm_video():
    """Run one question against the single-video database and print the answer."""
    question = "What are the components of an LLM?"
    print("Question: ", question)

    result = run_query(question, db_path="data/single_video.db")

    print("Answer: ", result)
16
+
17
@timeit
def test_with_subset():
    """Run one question against the subset database and print the answer."""
    question = "How should I train for anerobic capacity?"
    print("Question: ", question)

    query_options = {
        "db_path": "data/videos_subset_more_context.db",
        "num_rel_segments": 10,
        "llm_model": "gpt-3.5-turbo-0125",
        "llm_temp": 0.1,
    }
    result = run_query(question, **query_options)

    print("Answer: ", result)
32
+
33
+
34
if __name__ == '__main__':
    choice = input("Enter 1 for test_with_subset, 2 for test_with_llm_video: ")

    tests_by_choice = {
        "1": test_with_subset,
        "2": test_with_llm_video,
    }
    selected_test = tests_by_choice.get(choice)

    if selected_test is None:
        print("Invalid choice")
        # `exit` is a convenience added by the `site` module for interactive
        # use; raising SystemExit works regardless of how Python was started.
        raise SystemExit(1)

    selected_test()
tests/test_retrieval.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from models.retrieval import get_relevant_segments
2
+ from models.llm import format_context
3
+ from utils.general_utils import timeit
4
+
5
@timeit
def test_with_subset():
    """Retrieve segments for one question from the subset database and print them."""
    question = "What methods can I use to lose weight quickly?"
    print("Question: ", question)

    segments = get_relevant_segments(question, db_path="data/videos_subset.db")

    print("Segments: ", format_context(segments))


if __name__ == "__main__":
    test_with_subset()
utils/__init__.py ADDED
File without changes
utils/embedding_utils.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import AutoTokenizer, AutoModel
3
+ from chromadb import Documents, EmbeddingFunction, Embeddings
4
+
5
+
6
+ model_name = "YituTech/conv-bert-base"
7
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
8
+ model = AutoModel.from_pretrained(model_name)
9
+
10
+
11
class MyEmbeddingFunction(EmbeddingFunction[Documents]):
    """Embedding function that mean-pools the module-level model's last hidden state."""

    def __call__(self, input: Documents) -> Embeddings:
        """Embed every document in ``input``.

        Args:
            input: The documents (strings) to embed. The parameter name is
                kept as-is even though it shadows the builtin.

        Returns:
            One embedding per document, in input order, each a plain list
            of Python floats as the ``Embeddings`` annotation declares.
        """
        embeddings_list = []

        for text in input:
            # Cap the sequence at the model's position limit; longer segments
            # would otherwise index past its position embeddings.
            tokens = tokenizer(
                text,
                return_tensors='pt',
                truncation=True,
                max_length=model.config.max_position_embeddings,
            )
            with torch.no_grad():
                outputs = model(**tokens)

            # Average over the token dimension, then drop the batch
            # dimension of size one (a single text is encoded per call).
            pooled = outputs.last_hidden_state.mean(dim=1).squeeze(0)
            embeddings_list.append(pooled.tolist())

        return embeddings_list
utils/general_utils.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+
3
def timeit(func):
    """Decorator that prints how long each call to ``func`` takes.

    The wrapped function's return value is passed through unchanged. The
    duration is rounded to two decimals and printed only when the call
    returns normally; an exception propagates without a timing line.

    Args:
        func: The callable to time.

    Returns:
        A wrapper that keeps ``func``'s name, docstring and other metadata.
    """
    from functools import wraps  # local import: the only user is this decorator

    @wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is meant for measuring intervals; unlike time.time it
        # is not affected by system clock adjustments.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        execution_time = round(time.perf_counter() - start_time, 2)
        print(f"{func.__name__} took {execution_time} seconds.")
        return result
    return wrapper
12
+