Upload folder using huggingface_hub
- .gitattributes +2 -0
- .gitignore +138 -0
- ETL_Walkthrough.ipynb +375 -0
- README.md +3 -9
- TESTING.ipynb +165 -0
- __init__.py +0 -0
- data/logs/videos_subset_more_context_load_log.json +1 -0
- data/single_video.db/2b08fbf8-a94d-47d9-97aa-102f2895fbd0/data_level0.bin +3 -0
- data/single_video.db/2b08fbf8-a94d-47d9-97aa-102f2895fbd0/header.bin +3 -0
- data/single_video.db/2b08fbf8-a94d-47d9-97aa-102f2895fbd0/length.bin +3 -0
- data/single_video.db/2b08fbf8-a94d-47d9-97aa-102f2895fbd0/link_lists.bin +0 -0
- data/single_video.db/chroma.sqlite3 +0 -0
- data/single_video.json +6 -0
- data/videos.json +70 -0
- data/videos_subset.db/5c5462a4-1205-4d8f-8261-eccc02fe61b4/data_level0.bin +3 -0
- data/videos_subset.db/5c5462a4-1205-4d8f-8261-eccc02fe61b4/header.bin +3 -0
- data/videos_subset.db/5c5462a4-1205-4d8f-8261-eccc02fe61b4/index_metadata.pickle +3 -0
- data/videos_subset.db/5c5462a4-1205-4d8f-8261-eccc02fe61b4/length.bin +3 -0
- data/videos_subset.db/5c5462a4-1205-4d8f-8261-eccc02fe61b4/link_lists.bin +3 -0
- data/videos_subset.db/chroma.sqlite3 +3 -0
- data/videos_subset.json +14 -0
- data/videos_subset_more_context.db/7dde0ca4-4e76-4236-abfc-198f65e02dc0/data_level0.bin +3 -0
- data/videos_subset_more_context.db/7dde0ca4-4e76-4236-abfc-198f65e02dc0/header.bin +3 -0
- data/videos_subset_more_context.db/7dde0ca4-4e76-4236-abfc-198f65e02dc0/index_metadata.pickle +3 -0
- data/videos_subset_more_context.db/7dde0ca4-4e76-4236-abfc-198f65e02dc0/length.bin +3 -0
- data/videos_subset_more_context.db/7dde0ca4-4e76-4236-abfc-198f65e02dc0/link_lists.bin +3 -0
- data/videos_subset_more_context.db/chroma.sqlite3 +3 -0
- main.py +25 -0
- models/__init__.py +0 -0
- models/etl.py +126 -0
- models/llm.py +50 -0
- models/retrieval.py +11 -0
- package-lock.json +23 -0
- package.json +5 -0
- requirements.txt +127 -0
- run_etl.py +14 -0
- tests/__init__.py +0 -0
- tests/test_main.py +43 -0
- tests/test_retrieval.py +20 -0
- utils/__init__.py +0 -0
- utils/embedding_utils.py +23 -0
- utils/general_utils.py +12 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/videos_subset.db/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
+data/videos_subset_more_context.db/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,138 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
ETL_Walkthrough.ipynb
ADDED
@@ -0,0 +1,375 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# ETL to get the text data from the playlist\n",
+    "\n",
+    "This notebook shows the process of building the corpus of transcripts from the YouTube playlist.\n",
+    "\n",
+    "**Extract**: Pull data (transcripts) from each video. \n",
+    "**Transform**: \n",
+    "**Load**: Load data into our database where it will be retrieved from. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from models import etl\n",
+    "import json"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "First we load the video information. This includes the video IDs and titles."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open('data/single_video.json') as f:\n",
+    "    video_info = json.load(f)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Then we must extract the transcripts using the YouTube Transcript API. This is done over all of the videos. \n",
+    "This produces a list of video segments with timestamps. \n",
+    "Next, we format the transcript by adding metadata so that the segments are easily identified for retreival later. \n",
+    "Since the original segments are small, they are batched with overlap to preserve semantic meaning."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "get_video_transcript took 0.84 seconds.\n",
+      "Transcript for video 5sLYAQS9sWQ fetched.\n"
+     ]
+    }
+   ],
+   "source": [
+    "videos = []\n",
+    "for video in video_info:\n",
+    "    video_id = video[\"id\"]\n",
+    "    video_title = video[\"title\"]\n",
+    "    transcript = etl.get_video_transcript(video_id)\n",
+    "    print(f\"Transcript for video {video_id} fetched.\")\n",
+    "    if transcript:\n",
+    "        formatted_transcript = etl.format_transcript(transcript, video_id, video_title, batch_size=5, overlap=2)\n",
+    "        \n",
+    "        videos.extend(formatted_transcript)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The last step is to load the data into a database. We will use a Chromadb database. \n",
+    "The embedding function is the ____ model from HuggingFace."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Database created at data/single_video.db\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Initialize the database\n",
+    "from utils.embedding_utils import MyEmbeddingFunction\n",
+    "import chromadb\n",
+    "\n",
+    "embed_text = MyEmbeddingFunction()\n",
+    "\n",
+    "db_path = \"data/single_video.db\"\n",
+    "client = chromadb.PersistentClient(path=db_path)\n",
+    "\n",
+    "client.create_collection(\n",
+    "    name=\"huberman_videos\",\n",
+    "    embedding_function=embed_text,\n",
+    "    metadata={\"hnsw:space\": \"cosine\"}\n",
+    ")\n",
+    "\n",
+    "print(f\"Database created at {db_path}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Data loaded to database at data/single_video.db.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Add the data to the database\n",
+    "client = chromadb.PersistentClient(path=db_path)\n",
+    "    \n",
+    "collection = client.get_collection(\"huberman_videos\")\n",
+    "\n",
+    "documents = [segment['text'] for segment in videos]\n",
+    "metadata = [segment['metadata'] for segment in videos]\n",
+    "ids = [segment['metadata']['segment_id'] for segment in videos]\n",
+    "\n",
+    "collection.add(\n",
+    "    documents=documents,\n",
+    "    metadatas=metadata,\n",
+    "    ids=ids\n",
+    ")\n",
+    "\n",
+    "print(f\"Data loaded to database at {db_path}.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here is some of the data:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Number of segments: 26\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>ids</th>\n",
+       "      <th>embeddings</th>\n",
+       "      <th>metadatas</th>\n",
+       "      <th>documents</th>\n",
+       "      <th>uris</th>\n",
+       "      <th>data</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>5sLYAQS9sWQ__0</td>\n",
+       "      <td>[-0.11489544063806534, -0.03262839838862419, -...</td>\n",
+       "      <td>{'segment_id': '5sLYAQS9sWQ__0', 'source': 'ht...</td>\n",
+       "      <td>GPT, or Generative Pre-trained Transformer, is...</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>5sLYAQS9sWQ__12</td>\n",
+       "      <td>[0.094169981777668, -0.10430295020341873, 0.02...</td>\n",
+       "      <td>{'segment_id': '5sLYAQS9sWQ__12', 'source': 'h...</td>\n",
+       "      <td>Now foundation models are pre-trained on large...</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>5sLYAQS9sWQ__15</td>\n",
+       "      <td>[0.042587604373693466, -0.061460819095373154, ...</td>\n",
+       "      <td>{'segment_id': '5sLYAQS9sWQ__15', 'source': 'h...</td>\n",
+       "      <td>I'm talking about things like code. Now, large...</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>5sLYAQS9sWQ__18</td>\n",
+       "      <td>[-0.0245895367115736, -0.058405470103025436, -...</td>\n",
+       "      <td>{'segment_id': '5sLYAQS9sWQ__18', 'source': 'h...</td>\n",
+       "      <td>these models can be tens of gigabytes in size ...</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>5sLYAQS9sWQ__21</td>\n",
+       "      <td>[0.05348338559269905, -0.016104578971862793, -...</td>\n",
+       "      <td>{'segment_id': '5sLYAQS9sWQ__21', 'source': 'h...</td>\n",
+       "      <td>So to put that into perspective, a text file t...</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>5sLYAQS9sWQ__24</td>\n",
+       "      <td>[0.07004527002573013, -0.08996045589447021, -0...</td>\n",
+       "      <td>{'segment_id': '5sLYAQS9sWQ__24', 'source': 'h...</td>\n",
+       "      <td>A lot of words just in one Gb. And how many gi...</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>5sLYAQS9sWQ__27</td>\n",
+       "      <td>[0.0283487681299448, -0.11020224541425705, -0....</td>\n",
+       "      <td>{'segment_id': '5sLYAQS9sWQ__27', 'source': 'h...</td>\n",
+       "      <td>Yeah, that's truly a lot of text. And LLMs are...</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>5sLYAQS9sWQ__3</td>\n",
+       "      <td>[-0.0700172707438469, -0.061202701181173325, -...</td>\n",
+       "      <td>{'segment_id': '5sLYAQS9sWQ__3', 'source': 'ht...</td>\n",
+       "      <td>And I've been using GPT in its various forms f...</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>5sLYAQS9sWQ__30</td>\n",
+       "      <td>[-0.04904637485742569, -0.1277533322572708, -0...</td>\n",
+       "      <td>{'segment_id': '5sLYAQS9sWQ__30', 'source': 'h...</td>\n",
+       "      <td>and the more parameters a model has, the more ...</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>5sLYAQS9sWQ__33</td>\n",
+       "      <td>[0.03286760300397873, -0.041724931448698044, 0...</td>\n",
+       "      <td>{'segment_id': '5sLYAQS9sWQ__33', 'source': 'h...</td>\n",
+       "      <td>All right, so how do they work? Well, we can t...</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "               ids                                         embeddings  \\\n",
+       "0   5sLYAQS9sWQ__0  [-0.11489544063806534, -0.03262839838862419, -...   \n",
+       "1  5sLYAQS9sWQ__12  [0.094169981777668, -0.10430295020341873, 0.02...   \n",
+       "2  5sLYAQS9sWQ__15  [0.042587604373693466, -0.061460819095373154, ...   \n",
+       "3  5sLYAQS9sWQ__18  [-0.0245895367115736, -0.058405470103025436, -...   \n",
+       "4  5sLYAQS9sWQ__21  [0.05348338559269905, -0.016104578971862793, -...   \n",
+       "5  5sLYAQS9sWQ__24  [0.07004527002573013, -0.08996045589447021, -0...   \n",
+       "6  5sLYAQS9sWQ__27  [0.0283487681299448, -0.11020224541425705, -0....   \n",
+       "7   5sLYAQS9sWQ__3  [-0.0700172707438469, -0.061202701181173325, -...   \n",
+       "8  5sLYAQS9sWQ__30  [-0.04904637485742569, -0.1277533322572708, -0...   \n",
+       "9  5sLYAQS9sWQ__33  [0.03286760300397873, -0.041724931448698044, 0...   \n",
+       "\n",
+       "                                           metadatas  \\\n",
+       "0  {'segment_id': '5sLYAQS9sWQ__0', 'source': 'ht...   \n",
+       "1  {'segment_id': '5sLYAQS9sWQ__12', 'source': 'h...   \n",
+       "2  {'segment_id': '5sLYAQS9sWQ__15', 'source': 'h...   \n",
+       "3  {'segment_id': '5sLYAQS9sWQ__18', 'source': 'h...   \n",
+       "4  {'segment_id': '5sLYAQS9sWQ__21', 'source': 'h...   \n",
+       "5  {'segment_id': '5sLYAQS9sWQ__24', 'source': 'h...   \n",
+       "6  {'segment_id': '5sLYAQS9sWQ__27', 'source': 'h...   \n",
+       "7  {'segment_id': '5sLYAQS9sWQ__3', 'source': 'ht...   \n",
+       "8  {'segment_id': '5sLYAQS9sWQ__30', 'source': 'h...   \n",
+       "9  {'segment_id': '5sLYAQS9sWQ__33', 'source': 'h...   \n",
+       "\n",
+       "                                           documents  uris  data  \n",
+       "0  GPT, or Generative Pre-trained Transformer, is...  None  None  \n",
+       "1  Now foundation models are pre-trained on large...  None  None  \n",
+       "2  I'm talking about things like code. Now, large...  None  None  \n",
+       "3  these models can be tens of gigabytes in size ...  None  None  \n",
+       "4  So to put that into perspective, a text file t...  None  None  \n",
+       "5  A lot of words just in one Gb. And how many gi...  None  None  \n",
+       "6  Yeah, that's truly a lot of text. And LLMs are...  None  None  \n",
+       "7  And I've been using GPT in its various forms f...  None  None  \n",
+       "8  and the more parameters a model has, the more ...  None  None  \n",
+       "9  All right, so how do they work? Well, we can t...  None  None "
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "client = chromadb.PersistentClient('data/single_video.db')\n",
+    "collection= client.get_collection('huberman_videos')\n",
+    "\n",
+    "num_segments = collection.count()\n",
+    "sample_data = collection.peek()\n",
+    "\n",
+    "transcript_df = pd.DataFrame(sample_data)\n",
+    "\n",
+    "print(f\"Number of segments: {num_segments}\")\n",
+    "transcript_df"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
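Note on the batching step in the walkthrough above: small transcript entries are joined into overlapping windows before embedding. A minimal sketch of that windowing, independent of the repository code (the sample segments below are made up):

# Sketch of overlapped batching as described in the ETL walkthrough.
# The segment list is a hypothetical stand-in for YouTube transcript entries.
segments = [{"text": f"sentence {i}", "start": float(i)} for i in range(10)]

batch_size, overlap = 5, 2
step = batch_size - overlap  # each window starts 3 entries after the previous one

windows = []
for i in range(0, len(segments), step):
    batch = segments[i:i + batch_size]
    windows.append({
        "start": batch[0]["start"],                  # timestamp of the first entry in the window
        "text": " ".join(s["text"] for s in batch),  # joined text that gets embedded
    })

# 10 entries with batch_size=5 and overlap=2 give windows starting at entries 0, 3, 6 and 9,
# each sharing its last two entries with the next window.
print([w["start"] for w in windows])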
README.md
CHANGED
@@ -1,12 +1,6 @@
 ---
-title:
-
-colorFrom: pink
-colorTo: yellow
+title: Fitness_QA_Bot
+app_file: main.py
 sdk: gradio
-sdk_version: 4.
-app_file: app.py
-pinned: false
+sdk_version: 4.16.0
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
TESTING.ipynb
ADDED
@@ -0,0 +1,165 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from models import etl"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "etl.main(json_path='data/single_video.json', db='data/single_video.db', batch_size=5, overlap=2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import chromadb\n",
+    "from models import etl"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "client = chromadb.PersistentClient('data/single_video.db')\n",
+    "collection= client.get_collection('huberman_videos')\n",
+    "# collection.count()\n",
+    "# collection.peek()\n",
+    "\n",
+    "query_text = \"What are the components of an LLM?\"\n",
+    "query_embedding = etl.embed_text(query_text)\n",
+    "results = collection.query(query_texts=[query_text], n_results=5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'ids': [['5sLYAQS9sWQ__33',\n",
+       "   '5sLYAQS9sWQ__36',\n",
+       "   '5sLYAQS9sWQ__3',\n",
+       "   '5sLYAQS9sWQ__6',\n",
+       "   '5sLYAQS9sWQ__27']],\n",
+       " 'distances': [[0.27329726119651687,\n",
+       "   0.3594438065792097,\n",
+       "   0.4730243492988927,\n",
+       "   0.5004446084705303,\n",
+       "   0.5766584257317211]],\n",
+       " 'metadatas': [[{'segment_id': '5sLYAQS9sWQ__33',\n",
+       "    'source': 'https://www.youtube.com/watch?v=5sLYAQS9sWQ&t=145.328s',\n",
+       "    'title': 'How Large Language Models Work',\n",
+       "    'video_id': '5sLYAQS9sWQ'},\n",
+       "   {'segment_id': '5sLYAQS9sWQ__36',\n",
+       "    'source': 'https://www.youtube.com/watch?v=5sLYAQS9sWQ&t=154.367s',\n",
+       "    'title': 'How Large Language Models Work',\n",
+       "    'video_id': '5sLYAQS9sWQ'},\n",
+       "   {'segment_id': '5sLYAQS9sWQ__3',\n",
+       "    'source': 'https://www.youtube.com/watch?v=5sLYAQS9sWQ&t=10.783s',\n",
+       "    'title': 'How Large Language Models Work',\n",
+       "    'video_id': '5sLYAQS9sWQ'},\n",
+       "   {'segment_id': '5sLYAQS9sWQ__6',\n",
+       "    'source': 'https://www.youtube.com/watch?v=5sLYAQS9sWQ&t=22.544s',\n",
+       "    'title': 'How Large Language Models Work',\n",
+       "    'video_id': '5sLYAQS9sWQ'},\n",
+       "   {'segment_id': '5sLYAQS9sWQ__27',\n",
+       "    'source': 'https://www.youtube.com/watch?v=5sLYAQS9sWQ&t=117.572s',\n",
+       "    'title': 'How Large Language Models Work',\n",
+       "    'video_id': '5sLYAQS9sWQ'}]],\n",
+       " 'embeddings': None,\n",
+       " 'documents': [['All right, so how do they work? Well, we can think of it like this. LLM equals three things: data, architecture, and lastly, we can think of it as training. Those three things are really the components of an LLM.',\n",
+       "   \"data, architecture, and lastly, we can think of it as training. Those three things are really the components of an LLM. Now, we've already discussed the enormous amounts of text data that goes into these things. As for the architecture, this is a neural network and for GPT that is a transformer.\",\n",
+       "   'And I\\'ve been using GPT in its various forms for years. In this video we are going to number 1, ask \"what is an LLM?\" Number 2, we are going to describe how they work. And then number 3,',\n",
+       "   'Number 2, we are going to describe how they work. And then number 3, we\\'re going to ask, \"what are the business applications of LLMs?\" So let\\'s start with number 1, \"what is a large language model?\" Well, a large language model',\n",
+       "   \"Yeah, that's truly a lot of text. And LLMs are also among the biggest models when it comes to parameter count. A parameter is a value the model can change independently as it learns, and the more parameters a model has, the more complex it can be. GPT-3, for example, is pre-trained on a corpus of actually 45 terabytes of data,\"]],\n",
+       " 'uris': None,\n",
+       " 'data': None}"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CONTEXT: All right, so how do they work? Well, we can think of it like this. LLM equals three things: data, architecture, and lastly, we can think of it as training. Those three things are really the components of an LLM.\n",
+      "TITLE: How Large Language Models Work\n",
+      "SOURCE: https://www.youtube.com/watch?v=5sLYAQS9sWQ&t=145.328s\n",
+      "\n",
+      "CONTEXT: data, architecture, and lastly, we can think of it as training. Those three things are really the components of an LLM. Now, we've already discussed the enormous amounts of text data that goes into these things. As for the architecture, this is a neural network and for GPT that is a transformer.\n",
+      "TITLE: How Large Language Models Work\n",
+      "SOURCE: https://www.youtube.com/watch?v=5sLYAQS9sWQ&t=154.367s\n",
+      "\n",
+      "CONTEXT: And I've been using GPT in its various forms for years. In this video we are going to number 1, ask \"what is an LLM?\" Number 2, we are going to describe how they work. And then number 3,\n",
+      "TITLE: How Large Language Models Work\n",
+      "SOURCE: https://www.youtube.com/watch?v=5sLYAQS9sWQ&t=10.783s\n",
+      "\n",
+      "CONTEXT: Number 2, we are going to describe how they work. And then number 3, we're going to ask, \"what are the business applications of LLMs?\" So let's start with number 1, \"what is a large language model?\" Well, a large language model\n",
+      "TITLE: How Large Language Models Work\n",
+      "SOURCE: https://www.youtube.com/watch?v=5sLYAQS9sWQ&t=22.544s\n",
+      "\n",
+      "CONTEXT: Yeah, that's truly a lot of text. And LLMs are also among the biggest models when it comes to parameter count. A parameter is a value the model can change independently as it learns, and the more parameters a model has, the more complex it can be. GPT-3, for example, is pre-trained on a corpus of actually 45 terabytes of data,\n",
+      "TITLE: How Large Language Models Work\n",
+      "SOURCE: https://www.youtube.com/watch?v=5sLYAQS9sWQ&t=117.572s\n",
+      "\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "from models.llm import format_context\n",
+    "\n",
+    "print(format_context(results))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
__init__.py
ADDED
File without changes
data/logs/videos_subset_more_context_load_log.json
ADDED
@@ -0,0 +1 @@
+{"videos_info_path": "data/videos_subset.json", "db_path": "data/videos_subset_more_context.db", "batch_size": 15, "overlap": 10, "load_time": "2024-02-07 19:44:01.376335"}
data/single_video.db/2b08fbf8-a94d-47d9-97aa-102f2895fbd0/data_level0.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3c9fd302f000d7790aa403c2d0d8fec363fe46f30b07d53020b6e33b22435a9
+size 1676000
data/single_video.db/2b08fbf8-a94d-47d9-97aa-102f2895fbd0/header.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e87a1dc8bcae6f2c4bea6d5dd5005454d4dace8637dae29bff3c037ea771411e
+size 100
data/single_video.db/2b08fbf8-a94d-47d9-97aa-102f2895fbd0/length.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:135da168ae29fd55e9cbd2f05414e81b32cd0783d38340b404e315cebd7a7647
+size 4000
data/single_video.db/2b08fbf8-a94d-47d9-97aa-102f2895fbd0/link_lists.bin
ADDED
File without changes
data/single_video.db/chroma.sqlite3
ADDED
Binary file (311 kB)
data/single_video.json
ADDED
@@ -0,0 +1,6 @@
+[
+    {
+        "id": "5sLYAQS9sWQ",
+        "title": "How Large Language Models Work"
+    }
+]
data/videos.json
ADDED
@@ -0,0 +1,70 @@
+[
+    {
+        "id": "LYYyQcAJZfk",
+        "title": "Science-Supported Tools to Accelerate Your Fitness Goals | Huberman Lab Podcast"
+    },
+    {
+        "id": "q37ARYnRDGc",
+        "title": "Dr. Andy Galpin: Optimal Nutrition & Supplementation for Fitness | Huberman Lab Guest Series"
+    },
+    {
+        "id": "juD99_sPWGU",
+        "title": "Dr. Andy Galpin: Maximize Recovery to Achieve Fitness & Performance Goals | Huberman Lab"
+    },
+    {
+        "id": "UIy-WQCZd4M",
+        "title": "Dr. Andy Galpin: Optimize Your Training Program for Fitness & Longevity | Huberman Lab Guest Series"
+    },
+    {
+        "id": "oNkDA2F7CjM",
+        "title": "Dr. Andy Galpin: How to Build Physical Endurance & Lose Fat | Huberman Lab Guest Series"
+    },
+    {
+        "id": "CyDLbrZK75U",
+        "title": "Dr. Andy Galpin: Optimal Protocols to Build Strength & Grow Muscles | Huberman Lab Guest Series"
+    },
+    {
+        "id": "zEYE-vcVKy8",
+        "title": "Dr. Andy Galpin: How to Assess & Improve All Aspects of Your Fitness | Huberman Lab Guest Series"
+    },
+    {
+        "id": "q1Ss8sTbFBY",
+        "title": "Fitness Toolkit: Protocol & Tools to Optimize Physical Health | Huberman Lab Podcast #94"
+    },
+    {
+        "id": "DTCmprPCDqc",
+        "title": "Dr. Peter Attia: Exercise, Nutrition, Hormones for Vitality & Longevity | Huberman Lab Podcast #85"
+    },
+    {
+        "id": "UNCwdFxPtE8",
+        "title": "Jeff Cavaliere: Optimize Your Exercise Program with Science-Based Tools | Huberman Lab Podcast #79"
+    },
+    {
+        "id": "a9yFKPmPZ90",
+        "title": "Ido Portal: The Science & Practice of Movement | Huberman Lab Podcast #77"
+    },
+    {
+        "id": "tkH2-_jMCSk",
+        "title": "Improve Flexibility with Research-Supported Stretching Protocols | Huberman Lab Podcast #76"
+    },
+    {
+        "id": "IAnhFUUCq6c",
+        "title": "Dr. Andy Galpin: How to Build Strength, Muscle Size & Endurance | Huberman Lab Podcast #65"
+    },
+    {
+        "id": "GLgKkG44MGo",
+        "title": "Dr. Jack Feldman: Breathing for Mental & Physical Health & Performance | Huberman Lab Podcast #54"
+    },
+    {
+        "id": "iMvtHqLmEkI",
+        "title": "Dr. Duncan French: How to Exercise for Strength Gains & Hormone Optimization | Huberman Lab #45"
+    },
+    {
+        "id": "VQLU7gpk_X8",
+        "title": "How to Build Endurance in Your Brain & Body"
+    },
+    {
+        "id": "XLr2RKoD-oY",
+        "title": "Science of Muscle Growth, Increasing Strength & Muscular Recovery"
+    }
+]
data/videos_subset.db/5c5462a4-1205-4d8f-8261-eccc02fe61b4/data_level0.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e1dc504103bbe12bbc64b0a140e9c7eac03ceb4ad42d6b251e8399e6035b6407
+size 3352000
data/videos_subset.db/5c5462a4-1205-4d8f-8261-eccc02fe61b4/header.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:af2a3cca693212a055b68369ac5fd6015123fe35f04aafb696bf97a944142a76
+size 100
data/videos_subset.db/5c5462a4-1205-4d8f-8261-eccc02fe61b4/index_metadata.pickle
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c76fc6a10e46df31db8a10e076a256a53418217dca6d9d7833623c4c1fcbee63
+size 75509
data/videos_subset.db/5c5462a4-1205-4d8f-8261-eccc02fe61b4/length.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6ad502e522081d0f08a17da3b8d9af56c4a78798cd4b76926924a473e7f210fb
+size 8000
data/videos_subset.db/5c5462a4-1205-4d8f-8261-eccc02fe61b4/link_lists.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:858c69984324e9f15a6fbd27b9f4efdd2ec55fe7240c7c438f39a2d8cbd808fb
+size 18268
data/videos_subset.db/chroma.sqlite3
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d6b150b1e85be961e1bcb3b67fa9ded5a20b79bb53ee59d05f5f2b6c09d7501e
+size 13717504
data/videos_subset.json
ADDED
@@ -0,0 +1,14 @@
+[
+    {
+        "id": "LYYyQcAJZfk",
+        "title": "Science-Supported Tools to Accelerate Your Fitness Goals | Huberman Lab Podcast"
+    },
+    {
+        "id": "q37ARYnRDGc",
+        "title": "Dr. Andy Galpin: Optimal Nutrition & Supplementation for Fitness | Huberman Lab Guest Series"
+    },
+    {
+        "id": "oNkDA2F7CjM",
+        "title": "Dr. Andy Galpin: How to Build Physical Endurance & Lose Fat | Huberman Lab Guest Series"
+    }
+]
data/videos_subset_more_context.db/7dde0ca4-4e76-4236-abfc-198f65e02dc0/data_level0.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:31040549c44b1b419d5f60f849fd1abd006c4dd33bf077cf498e92360a7a890f
+size 3352000
data/videos_subset_more_context.db/7dde0ca4-4e76-4236-abfc-198f65e02dc0/header.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:092185a3873be9e8dcabd69df1976197a0afb6cf7d175f3ee73ccae7283d0abc
+size 100
data/videos_subset_more_context.db/7dde0ca4-4e76-4236-abfc-198f65e02dc0/index_metadata.pickle
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f2bc9be2510d046cef8a9e0aabc77141b639c086811cdbd4203b15977c7b2f6e
+size 75323
data/videos_subset_more_context.db/7dde0ca4-4e76-4236-abfc-198f65e02dc0/length.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6cd4536124c25ea26dc53deadb45f017008f2815fbe38deb6e22ae1ecec8e57a
+size 8000
data/videos_subset_more_context.db/7dde0ca4-4e76-4236-abfc-198f65e02dc0/link_lists.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f0e5fb78cc60a74d205b2b5797f65b15ce3ecd5d63d72f4ef82daf716e787a39
+size 18200
data/videos_subset_more_context.db/chroma.sqlite3
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bfa8d1ef4a95be2ae4714fbb86e740c4ed4bd334aa068fb64817f84e25857bda
+size 21721088
main.py
ADDED
@@ -0,0 +1,25 @@
+from models import llm, retrieval
+import gradio as gr
+
+
+def run_query(question, db_path="data/videos_subset_more_context.db", #dev path
+              num_rel_segments=5,
+              llm_model="gpt-3.5-turbo-0125",
+              llm_temp=0.1):
+
+    relevant_segments = retrieval.get_relevant_segments(question,
+                                                        db_path=db_path,
+                                                        n_results=num_rel_segments)
+
+    answer = llm.answer_with_context(question,
+                                     relevant_segments,
+                                     model=llm_model,
+                                     temperature=llm_temp)
+
+    return answer
+
+
+if __name__ == "__main__":
+    demo = gr.Interface(fn=run_query, inputs="text", outputs="text")
+
+    demo.launch(share=True)
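run_query can also be called directly, outside the Gradio interface; a minimal sketch, assuming OPENAI_API_KEY is set in the environment and one of the bundled Chroma databases is present (the question is illustrative):

# Hypothetical local smoke test for main.run_query.
from main import run_query

answer = run_query(
    "How long should I rest between sets?",         # illustrative question
    db_path="data/videos_subset_more_context.db",   # database shipped in this commit
    num_rel_segments=5,
)
print(answer)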
models/__init__.py
ADDED
File without changes
models/etl.py
ADDED
@@ -0,0 +1,126 @@
+import json
+import chromadb
+from datetime import datetime
+
+from utils.general_utils import timeit
+from utils.embedding_utils import MyEmbeddingFunction
+from youtube_transcript_api import YouTubeTranscriptApi
+
+
+@timeit
+def run_etl(json_path="data/videos.json", db=None, batch_size=None, overlap=None):
+    with open(json_path) as f:
+        video_info = json.load(f)
+
+    videos = []
+    for video in video_info:
+        video_id = video["id"]
+        video_title = video["title"]
+        transcript = get_video_transcript(video_id)
+        print(f"Transcript for video {video_id} fetched.")
+        if transcript:
+            formatted_transcript = format_transcript(transcript, video_id, video_title, batch_size=batch_size, overlap=overlap)
+
+            videos.extend(formatted_transcript)
+
+    if db:
+        initialize_db(db)
+        load_data_to_db(db, videos)
+        log_data_load(json_path, db, batch_size, overlap)
+    else:
+        print("No database specified. Skipping database load.")
+        print(videos)
+
+
+@timeit
+def get_video_transcript(video_id):
+    try:
+        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['en', 'en-US'])
+        return transcript
+    except Exception as e:
+        print(f"Error fetching transcript for video {video_id}: {str(e)}")
+        return None
+
+
+def format_transcript(transcript, video_id, video_title, batch_size=None, overlap=None):
+    formatted_data = []
+    base_url = f"https://www.youtube.com/watch?v={video_id}"
+    query_params = "&t={start}s"
+
+    if not batch_size:
+        batch_size = 1
+        overlap = 0
+
+    for i in range(0, len(transcript), batch_size - overlap):
+        batch = list(transcript[i:i+batch_size])
+
+        start_time = batch[0]["start"]
+
+        text = " ".join(entry["text"] for entry in batch)
+
+        url = base_url + query_params.format(start=start_time)
+
+        metadata = {
+            "video_id": video_id,
+            "segment_id": video_id + "__" + str(i),
+            "title": video_title,
+            "source": url
+        }
+
+        segment = {"text": text, "metadata": metadata}
+
+        formatted_data.append(segment)
+
+    return formatted_data
+
+
+embed_text = MyEmbeddingFunction()
+
+def initialize_db(db_path, distance_metric="cosine"):
+    client = chromadb.PersistentClient(path=db_path)
+
+    # Clear existing data
+    # client.reset()
+
+    client.create_collection(
+        name="huberman_videos",
+        embedding_function=embed_text,
+        metadata={"hnsw:space": distance_metric}
+    )
+
+    print(f"Database created at {db_path}")
+
+
+def load_data_to_db(db_path, data):
+    client = chromadb.PersistentClient(path=db_path)
+
+    collection = client.get_collection("huberman_videos")
+
+    documents = [segment['text'] for segment in data]
+    metadata = [segment['metadata'] for segment in data]
+    ids = [segment['metadata']['segment_id'] for segment in data]
+
+    collection.add(
+        documents=documents,
+        metadatas=metadata,
+        ids=ids
+    )
+
+    print(f"Data loaded to database at {db_path}.")
+
+
+def log_data_load(json_path, db_path, batch_size, overlap):
+    log_json = json.dumps({
+        "videos_info_path": json_path,
+        "db_path": db_path,
+        "batch_size": batch_size,
+        "overlap": overlap,
+        "load_time": str(datetime.now())
+    })
+
+    db_file = db_path.split("/")[-1]
+    db_name = db_file.split(".")[0]
+    log_path = f"data/logs/{db_name}_load_log.json"
+
+    with open(log_path, "w") as f:
+        f.write(log_json)
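run_etl can be driven programmatically as well as through run_etl.py; a minimal sketch using the same paths and batching parameters as the walkthrough notebook:

# Illustrative invocation; fetching transcripts and embedding requires network access.
from models.etl import run_etl

run_etl(
    json_path="data/single_video.json",  # video IDs and titles
    db="data/single_video.db",           # ChromaDB persistence directory
    batch_size=5,                        # transcript entries per segment
    overlap=2,                           # entries shared between consecutive segments
)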
models/llm.py
ADDED
@@ -0,0 +1,50 @@
+import openai
+import os
+from dotenv import load_dotenv
+
+# Load environment variables from .env file
+load_dotenv()
+
+# Get the API key from the environment variable
+api_key = os.getenv("OPENAI_API_KEY")
+
+# Create the OpenAI client
+client = openai.OpenAI(api_key=api_key)
+
+context_result_base = "CONTEXT: {text}\nTITLE: {title}\nSOURCE: {source}\n\n"
+
+def format_context(db_query_results):
+    documents = db_query_results['documents'][0]
+    metadatas = db_query_results['metadatas'][0]
+
+    formatted_context = ""
+    for i in range(len(documents)):
+        result_text = context_result_base.format(text=documents[i], title=metadatas[i]['title'], source=metadatas[i]['source'])
+        formatted_context += result_text
+
+    return formatted_context
+
+
+def answer_with_context(question, context, model="gpt-3.5-turbo-1106", temperature=0.5):
+    formatted_context = format_context(context)
+
+    instruction = '''You are a question-answering bot. The user will ask a question about fitness and recovery. First, you will be provided relevant context. The relevant context are segments of transcripts from Andrew Huberman's playlist on fitness and recovery where he has conversations about these topics. Answer the user's question and include the video title and link to the relevant context where they talk about the topic of the user's question. When referencing relevant context, return its TITLE and SOURCE. If no context are related to the question, answer the question yourself and state that "No relevant clips were found". Use this format:
+    User: ```What is muscle atrophy?```
+    AI: ```Muscle atrophy is the decrease in size and wasting of muscle tissue.
+    VIDEO: Example video title
+    SOURCE: Example video url```
+    '''
+
+    formatted_context = "RELEVANT CONTEXT:\n```" + formatted_context + "```"
+
+    response = client.chat.completions.create(
+        model=model,
+        temperature=temperature,
+        messages=[
+            {"role": "system", "content": instruction},
+            {"role": "user", "content": formatted_context},
+            {"role": "user", "content": question}
+        ]
+    )
+
+    return response.choices[0].message.content
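format_context expects the nested lists returned by ChromaDB's collection.query; a minimal sketch with a hand-built result dict (all values are placeholders):

# Hypothetical query result shaped like a ChromaDB query() response.
from models.llm import format_context

fake_results = {
    "documents": [["Sample transcript text about rest intervals."]],
    "metadatas": [[{
        "title": "Example video title",
        "source": "https://www.youtube.com/watch?v=EXAMPLE&t=10s",
    }]],
}

print(format_context(fake_results))
# CONTEXT: Sample transcript text about rest intervals.
# TITLE: Example video title
# SOURCE: https://www.youtube.com/watch?v=EXAMPLE&t=10s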
models/retrieval.py
ADDED
@@ -0,0 +1,11 @@
+import chromadb
+
+
+def get_relevant_segments(query, db_path, n_results=5):
+    client = chromadb.PersistentClient(db_path)
+    collection= client.get_collection('huberman_videos')
+
+    results = collection.query(query_texts=[query], n_results=n_results)
+
+    return results
+
package-lock.json
ADDED
@@ -0,0 +1,23 @@
+{
+  "name": "ml-qa-chatbot",
+  "lockfileVersion": 3,
+  "requires": true,
+  "packages": {
+    "": {
+      "dependencies": {
+        "dotenv": "^16.4.1"
+      }
+    },
+    "node_modules/dotenv": {
+      "version": "16.4.1",
+      "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.4.1.tgz",
+      "integrity": "sha512-CjA3y+Dr3FyFDOAMnxZEGtnW9KBR2M0JvvUtXNW+dYJL5ROWxP9DUHCwgFqpMk0OXCc0ljhaNTr2w/kutYIcHQ==",
+      "engines": {
+        "node": ">=12"
+      },
+      "funding": {
+        "url": "https://github.com/motdotla/dotenv?sponsor=1"
+      }
+    }
+  }
+}
package.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "dependencies": {
+    "dotenv": "^16.4.1"
+  }
+}
requirements.txt
ADDED
@@ -0,0 +1,127 @@
+aiofiles==23.2.1
+altair==5.2.0
+annotated-types==0.6.0
+anyio==4.2.0
+asgiref==3.7.2
+attrs==23.2.0
+backoff==2.2.1
+bcrypt==4.1.2
+build==1.0.3
+cachetools==5.3.2
+certifi==2024.2.2
+charset-normalizer==3.3.2
+chroma-hnswlib==0.7.3
+chromadb==0.4.22
+click==8.1.7
+colorama==0.4.6
+coloredlogs==15.0.1
+contourpy==1.2.0
+cycler==0.12.1
+Deprecated==1.2.14
+distro==1.9.0
+fastapi==0.109.2
+ffmpy==0.3.1
+filelock==3.13.1
+flatbuffers==23.5.26
+fonttools==4.47.2
+fsspec==2024.2.0
+google-auth==2.27.0
+googleapis-common-protos==1.62.0
+gradio==4.16.0
+gradio_client==0.8.1
+grpcio==1.60.1
+h11==0.14.0
+httpcore==1.0.2
+httptools==0.6.1
+httpx==0.26.0
+huggingface-hub==0.20.3
+humanfriendly==10.0
+idna==3.6
+importlib-metadata==6.11.0
+importlib-resources==6.1.1
+Jinja2==3.1.3
+joblib==1.3.2
+jsonschema==4.21.1
+jsonschema-specifications==2023.12.1
+kiwisolver==1.4.5
+kubernetes==29.0.0
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+matplotlib==3.8.2
+mdurl==0.1.2
+mmh3==4.1.0
+monotonic==1.6
+mpmath==1.3.0
+networkx==3.2.1
+numpy==1.26.3
+oauthlib==3.2.2
+onnxruntime==1.17.0
+openai==1.11.1
+opentelemetry-api==1.22.0
+opentelemetry-exporter-otlp-proto-common==1.22.0
+opentelemetry-exporter-otlp-proto-grpc==1.22.0
+opentelemetry-instrumentation==0.43b0
+opentelemetry-instrumentation-asgi==0.43b0
+opentelemetry-instrumentation-fastapi==0.43b0
+opentelemetry-proto==1.22.0
+opentelemetry-sdk==1.22.0
+opentelemetry-semantic-conventions==0.43b0
+opentelemetry-util-http==0.43b0
+orjson==3.9.13
+overrides==7.7.0
+packaging==23.2
+pandas==2.2.0
+pillow==10.2.0
+posthog==3.4.0
+protobuf==4.25.2
+pulsar-client==3.4.0
+pyasn1==0.5.1
+pyasn1-modules==0.3.0
+pydantic==2.6.1
+pydantic_core==2.16.2
+pydub==0.25.1
+Pygments==2.17.2
+pyparsing==3.1.1
+PyPika==0.48.9
+pyproject_hooks==1.0.0
+pyreadline3==3.4.1
+python-dateutil==2.8.2
+python-dotenv==1.0.1
+python-multipart==0.0.7
+pytz==2024.1
+PyYAML==6.0.1
+referencing==0.33.0
+regex==2023.12.25
+requests==2.31.0
+requests-oauthlib==1.3.1
+rich==13.7.0
+rpds-py==0.17.1
+rsa==4.9
+ruff==0.2.1
+safetensors==0.4.2
+scikit-learn==1.4.0
+scipy==1.12.0
+semantic-version==2.10.0
+shellingham==1.5.4
+six==1.16.0
+sniffio==1.3.0
+starlette==0.36.3
+sympy==1.12
+tenacity==8.2.3
+threadpoolctl==3.2.0
+tokenizers==0.15.1
+tomlkit==0.12.0
+toolz==0.12.1
+torch==2.2.0
+tqdm==4.66.1
+transformers==4.37.2
+typer==0.9.0
+typing_extensions==4.9.0
+tzdata==2023.4
+urllib3==2.2.0
+uvicorn==0.27.0.post1
+watchfiles==0.21.0
+websocket-client==1.7.0
+websockets==11.0.3
+wrapt==1.16.0
+zipp==3.17.0
run_etl.py
ADDED
@@ -0,0 +1,14 @@
+from models.etl import run_etl
+
+if __name__ == "__main__":
+    json_path = input("Enter path to JSON file: ")
+    db_path = input("Enter path to database: ")
+    batch_size = int(input("Enter batch size (leave blank for no batching): "))
+
+    if batch_size:
+        overlap = int(input("Enter overlap (leave blank for no overlap): "))
+    else:
+        overlap = None
+
+
+    run_etl(json_path=json_path, db=db_path, batch_size=batch_size, overlap=overlap)
tests/__init__.py
ADDED
File without changes
tests/test_main.py
ADDED
@@ -0,0 +1,43 @@
+from main import run_query
+from utils.general_utils import timeit
+
+@timeit
+def test_with_llm_video():
+    db_path = "data/single_video.db"
+
+    question = "What are the components of an LLM?"
+    print("Question: ", question)
+
+    answer = run_query(question,
+                       db_path=db_path
+                       )
+
+    print("Answer: ", answer)
+
+@timeit
+def test_with_subset():
+    db_path = "data/videos_subset_more_context.db"
+
+    question = "How should I train for anerobic capacity?"
+    print("Question: ", question)
+
+    answer = run_query(question,
+                       db_path=db_path,
+                       num_rel_segments=10,
+                       llm_model="gpt-3.5-turbo-0125",
+                       llm_temp=0.1
+                       )
+
+    print("Answer: ", answer)
+
+
+if __name__ == '__main__':
+    choice = input("Enter 1 for test_with_subset, 2 for test_with_llm_video: ")
+
+    if choice == "1":
+        test_with_subset()
+    elif choice == "2":
+        test_with_llm_video()
+    else:
+        print("Invalid choice")
+        exit(1)
tests/test_retrieval.py
ADDED
@@ -0,0 +1,20 @@
+from models.retrieval import get_relevant_segments
+from models.llm import format_context
+from utils.general_utils import timeit
+
+@timeit
+def test_with_subset():
+    db_path = "data/videos_subset.db"
+
+    question = "What methods can I use to lose weight quickly?"
+    print("Question: ", question)
+
+    relevant_segments = get_relevant_segments(question,
+                                              db_path=db_path
+                                              )
+    formatted_segments = format_context(relevant_segments)
+
+    print("Segments: ", formatted_segments)
+
+if __name__ == "__main__":
+    test_with_subset()
utils/__init__.py
ADDED
File without changes
utils/embedding_utils.py
ADDED
@@ -0,0 +1,23 @@
+import torch
+from transformers import AutoTokenizer, AutoModel
+from chromadb import Documents, EmbeddingFunction, Embeddings
+
+
+model_name = "YituTech/conv-bert-base"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModel.from_pretrained(model_name)
+
+
+class MyEmbeddingFunction(EmbeddingFunction[Documents]):
+
+    def __call__(self, input: Documents) -> Embeddings:
+        embeddings_list = []
+
+        for text in input:
+            tokens = tokenizer(text, return_tensors='pt')
+            with torch.no_grad():
+                outputs = model(**tokens)
+            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().detach().numpy()
+            embeddings_list.append(embeddings)
+
+        return embeddings_list
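MyEmbeddingFunction mean-pools ConvBERT's last hidden state over the token dimension; a minimal sketch of calling it directly (the model weights are downloaded from HuggingFace on first use):

# Illustrative only: embed two short strings and inspect the vector size.
from utils.embedding_utils import MyEmbeddingFunction

embed = MyEmbeddingFunction()
vectors = embed(["a short test sentence", "another one"])
print(len(vectors), len(vectors[0]))  # 2 vectors; conv-bert-base uses a 768-dim hidden state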
utils/general_utils.py
ADDED
@@ -0,0 +1,12 @@
+import time
+
+def timeit(func):
+    def wrapper(*args, **kwargs):
+        start_time = time.time()
+        result = func(*args, **kwargs)
+        end_time = time.time()
+        execution_time = round(end_time - start_time, 2)
+        print(f"{func.__name__} took {execution_time} seconds.")
+        return result
+    return wrapper
+
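The timeit decorator used throughout the ETL and tests simply wraps a function and prints its wall-clock duration; a minimal usage sketch:

# Any function can be wrapped the same way; the workload here is a placeholder.
from utils.general_utils import timeit

@timeit
def slow_add(a, b):
    return a + b

slow_add(1, 2)  # prints something like "slow_add took 0.0 seconds."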