diff --git "a/notebooks/RAG_CLAUDE.ipynb" "b/notebooks/RAG_CLAUDE.ipynb" --- "a/notebooks/RAG_CLAUDE.ipynb" +++ "b/notebooks/RAG_CLAUDE.ipynb" @@ -11,32 +11,34 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "YImwMQLjASiK", - "outputId": "1177bd7c-e220-4f5f-e75f-24c2a7556604" + "outputId": "16d2a28d-714a-4d88-8b1a-dc117070bcf0" }, "outputs": [ { - "name": "stderr", "output_type": "stream", - "text": [ - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" - ] - }, - { "name": "stdout", - "output_type": "stream", "text": [ - "\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m50.4/50.4 kB\u001b[0m \u001b[31m355.1 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m27.0/27.0 MB\u001b[0m \u001b[31m18.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m227.1/227.1 kB\u001b[0m \u001b[31m4.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.0/1.0 MB\u001b[0m \u001b[31m16.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + 
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.3/2.3 MB\u001b[0m \u001b[31m36.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m891.9/891.9 kB\u001b[0m \u001b[31m10.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m76.4/76.4 kB\u001b[0m \u001b[31m3.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m77.9/77.9 kB\u001b[0m \u001b[31m2.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m318.9/318.9 kB\u001b[0m \u001b[31m8.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m396.4/396.4 kB\u001b[0m \u001b[31m15.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m290.4/290.4 kB\u001b[0m \u001b[31m12.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.3/49.3 kB\u001b[0m \u001b[31m2.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m141.9/141.9 kB\u001b[0m \u001b[31m8.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m3.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h" ] } ], @@ -46,46 +48,48 @@ }, { "cell_type": "code", - "execution_count": 25, - "metadata": {}, + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Axccge0wAajT", + "outputId": "2e28f056-293f-412e-adb8-104133755487" + }, "outputs": [ { 
- "name": "stderr", "output_type": "stream", - "text": [ - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" - ] - }, - { "name": "stdout", - "output_type": "stream", "text": [ - "Requirement already satisfied: google-api-python-client in /Users/HUAWEI/Coding/0Project-summarizer/myenv/lib/python3.11/site-packages (2.144.0)\n", - "Requirement already satisfied: httplib2<1.dev0,>=0.19.0 in /Users/HUAWEI/Coding/0Project-summarizer/myenv/lib/python3.11/site-packages (from google-api-python-client) (0.22.0)\n", - "Requirement already satisfied: google-auth!=2.24.0,!=2.25.0,<3.0.0.dev0,>=1.32.0 in /Users/HUAWEI/Coding/0Project-summarizer/myenv/lib/python3.11/site-packages (from google-api-python-client) (2.34.0)\n", - "Requirement already satisfied: google-auth-httplib2<1.0.0,>=0.2.0 in /Users/HUAWEI/Coding/0Project-summarizer/myenv/lib/python3.11/site-packages (from google-api-python-client) (0.2.0)\n", - "Requirement already satisfied: google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0.dev0,>=1.31.5 in /Users/HUAWEI/Coding/0Project-summarizer/myenv/lib/python3.11/site-packages (from google-api-python-client) (2.19.2)\n", - "Requirement already satisfied: uritemplate<5,>=3.0.1 in /Users/HUAWEI/Coding/0Project-summarizer/myenv/lib/python3.11/site-packages (from google-api-python-client) (4.1.1)\n", - "Requirement already satisfied: googleapis-common-protos<2.0.dev0,>=1.56.2 in /Users/HUAWEI/Coding/0Project-summarizer/myenv/lib/python3.11/site-packages (from google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0.dev0,>=1.31.5->google-api-python-client) (1.65.0)\n", - "Requirement already satisfied: 
protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0.dev0,>=3.19.5 in /Users/HUAWEI/Coding/0Project-summarizer/myenv/lib/python3.11/site-packages (from google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0.dev0,>=1.31.5->google-api-python-client) (5.28.0)\n", - "Requirement already satisfied: proto-plus<2.0.0dev,>=1.22.3 in /Users/HUAWEI/Coding/0Project-summarizer/myenv/lib/python3.11/site-packages (from google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0.dev0,>=1.31.5->google-api-python-client) (1.24.0)\n", - "Requirement already satisfied: requests<3.0.0.dev0,>=2.18.0 in /Users/HUAWEI/Coding/0Project-summarizer/myenv/lib/python3.11/site-packages (from google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0.dev0,>=1.31.5->google-api-python-client) (2.32.3)\n", - "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /Users/HUAWEI/Coding/0Project-summarizer/myenv/lib/python3.11/site-packages (from google-auth!=2.24.0,!=2.25.0,<3.0.0.dev0,>=1.32.0->google-api-python-client) (5.5.0)\n", - "Requirement already satisfied: pyasn1-modules>=0.2.1 in /Users/HUAWEI/Coding/0Project-summarizer/myenv/lib/python3.11/site-packages (from google-auth!=2.24.0,!=2.25.0,<3.0.0.dev0,>=1.32.0->google-api-python-client) (0.4.0)\n", - "Requirement already satisfied: rsa<5,>=3.1.4 in /Users/HUAWEI/Coding/0Project-summarizer/myenv/lib/python3.11/site-packages (from google-auth!=2.24.0,!=2.25.0,<3.0.0.dev0,>=1.32.0->google-api-python-client) (4.9)\n", - "Requirement already satisfied: pyparsing!=3.0.0,!=3.0.1,!=3.0.2,!=3.0.3,<4,>=2.4.2 in /Users/HUAWEI/Coding/0Project-summarizer/myenv/lib/python3.11/site-packages (from httplib2<1.dev0,>=0.19.0->google-api-python-client) (3.1.4)\n", - "Requirement already satisfied: pyasn1<0.7.0,>=0.4.6 in /Users/HUAWEI/Coding/0Project-summarizer/myenv/lib/python3.11/site-packages (from pyasn1-modules>=0.2.1->google-auth!=2.24.0,!=2.25.0,<3.0.0.dev0,>=1.32.0->google-api-python-client) (0.6.0)\n", - "Requirement already 
satisfied: charset-normalizer<4,>=2 in /Users/HUAWEI/Coding/0Project-summarizer/myenv/lib/python3.11/site-packages (from requests<3.0.0.dev0,>=2.18.0->google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0.dev0,>=1.31.5->google-api-python-client) (3.3.2)\n", - "Requirement already satisfied: idna<4,>=2.5 in /Users/HUAWEI/Coding/0Project-summarizer/myenv/lib/python3.11/site-packages (from requests<3.0.0.dev0,>=2.18.0->google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0.dev0,>=1.31.5->google-api-python-client) (3.8)\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/HUAWEI/Coding/0Project-summarizer/myenv/lib/python3.11/site-packages (from requests<3.0.0.dev0,>=2.18.0->google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0.dev0,>=1.31.5->google-api-python-client) (2.2.2)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /Users/HUAWEI/Coding/0Project-summarizer/myenv/lib/python3.11/site-packages (from requests<3.0.0.dev0,>=2.18.0->google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0.dev0,>=1.31.5->google-api-python-client) (2024.8.30)\n", - "\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", - "Note: you may need to restart the kernel to use updated packages.\n" + "Requirement already satisfied: google-api-python-client in /usr/local/lib/python3.10/dist-packages (2.137.0)\n", + "Collecting google-api-python-client\n", + " Downloading google_api_python_client-2.144.0-py2.py3-none-any.whl.metadata (6.7 kB)\n", + "Requirement already satisfied: httplib2<1.dev0,>=0.19.0 in /usr/local/lib/python3.10/dist-packages (from google-api-python-client) (0.22.0)\n", + "Requirement already satisfied: 
google-auth!=2.24.0,!=2.25.0,<3.0.0.dev0,>=1.32.0 in /usr/local/lib/python3.10/dist-packages (from google-api-python-client) (2.27.0)\n", + "Requirement already satisfied: google-auth-httplib2<1.0.0,>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from google-api-python-client) (0.2.0)\n", + "Requirement already satisfied: google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0.dev0,>=1.31.5 in /usr/local/lib/python3.10/dist-packages (from google-api-python-client) (2.19.2)\n", + "Requirement already satisfied: uritemplate<5,>=3.0.1 in /usr/local/lib/python3.10/dist-packages (from google-api-python-client) (4.1.1)\n", + "Requirement already satisfied: googleapis-common-protos<2.0.dev0,>=1.56.2 in /usr/local/lib/python3.10/dist-packages (from google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0.dev0,>=1.31.5->google-api-python-client) (1.65.0)\n", + "Requirement already satisfied: protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0.dev0,>=3.19.5 in /usr/local/lib/python3.10/dist-packages (from google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0.dev0,>=1.31.5->google-api-python-client) (3.20.3)\n", + "Requirement already satisfied: proto-plus<2.0.0dev,>=1.22.3 in /usr/local/lib/python3.10/dist-packages (from google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0.dev0,>=1.31.5->google-api-python-client) (1.24.0)\n", + "Requirement already satisfied: requests<3.0.0.dev0,>=2.18.0 in /usr/local/lib/python3.10/dist-packages (from google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0.dev0,>=1.31.5->google-api-python-client) (2.32.3)\n", + "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from google-auth!=2.24.0,!=2.25.0,<3.0.0.dev0,>=1.32.0->google-api-python-client) (5.5.0)\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.10/dist-packages (from google-auth!=2.24.0,!=2.25.0,<3.0.0.dev0,>=1.32.0->google-api-python-client) (0.4.0)\n", + "Requirement 
already satisfied: rsa<5,>=3.1.4 in /usr/local/lib/python3.10/dist-packages (from google-auth!=2.24.0,!=2.25.0,<3.0.0.dev0,>=1.32.0->google-api-python-client) (4.9)\n", + "Requirement already satisfied: pyparsing!=3.0.0,!=3.0.1,!=3.0.2,!=3.0.3,<4,>=2.4.2 in /usr/local/lib/python3.10/dist-packages (from httplib2<1.dev0,>=0.19.0->google-api-python-client) (3.1.4)\n", + "Requirement already satisfied: pyasn1<0.7.0,>=0.4.6 in /usr/local/lib/python3.10/dist-packages (from pyasn1-modules>=0.2.1->google-auth!=2.24.0,!=2.25.0,<3.0.0.dev0,>=1.32.0->google-api-python-client) (0.6.0)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0.dev0,>=2.18.0->google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0.dev0,>=1.31.5->google-api-python-client) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0.dev0,>=2.18.0->google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0.dev0,>=1.31.5->google-api-python-client) (3.8)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0.dev0,>=2.18.0->google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0.dev0,>=1.31.5->google-api-python-client) (2.0.7)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0.dev0,>=2.18.0->google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0.dev0,>=1.31.5->google-api-python-client) (2024.8.30)\n", + "Downloading google_api_python_client-2.144.0-py2.py3-none-any.whl (12.2 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.2/12.2 MB\u001b[0m \u001b[31m66.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hInstalling collected packages: google-api-python-client\n", + " Attempting uninstall: google-api-python-client\n", + " Found existing installation: google-api-python-client 2.137.0\n", + " 
Uninstalling google-api-python-client-2.137.0:\n", + " Successfully uninstalled google-api-python-client-2.137.0\n", + "Successfully installed google-api-python-client-2.144.0\n" ] } ], @@ -119,7 +123,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "metadata": { "id": "sFYWNeL-4xIr" }, @@ -128,7 +132,7 @@ "from youtube_transcript_api import YouTubeTranscriptApi\n", "from googleapiclient.discovery import build\n", "\n", - "api_key = \"Youtube_api\"\n", + "api_key = \"YOUR_YOUTUBE_API_KEY\"\n", "\n", "\n", "def get_playlist_video_ids(playlist_id, api_key):\n", @@ -142,7 +146,7 @@ " request = youtube.playlistItems().list(\n", " part=\"contentDetails\",\n", " playlistId=playlist_id,\n", - " maxResults=50, # Максимальное количество видео, которое можно получить за один запрос\n", + " maxResults=50, # Максимальное количество видео, которое можно получить за один запрос\n", " pageToken=next_page_token\n", " )\n", " response = request.execute()\n", @@ -152,7 +156,7 @@ "\n", " # pagination\n", " next_page_token = response.get('nextPageToken')\n", - " \n", + "\n", " if not next_page_token:\n", " break\n", "\n", @@ -161,10 +165,17 @@ }, { "cell_type": "code", - "execution_count": 27, - "metadata": {}, + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "bnMIT3apAajU", + "outputId": "f5387bde-46e1-43c1-b57d-923e2b77cb6e" + }, "outputs": [ { + "output_type": "execute_result", "data": { "text/plain": [ "{'kind': 'youtube#playlistItemListResponse',\n", @@ -267,9 +278,8 @@ " 'pageInfo': {'totalResults': 19, 'resultsPerPage': 50}}" ] }, - "execution_count": 27, "metadata": {}, - "output_type": "execute_result" + "execution_count": 6 } ], "source": [ @@ -278,7 +288,7 @@ "request = youtube.playlistItems().list(\n", " part=\"contentDetails\",\n", " playlistId='PLYSHtNPbAINnbqXjIbN-c7DorjCT6eYOQ',\n", - " maxResults=50, # Максимальное количество видео, которое можно получить 
за один запрос\n", + " maxResults=50, # Максимальное количество видео, которое можно получить за один запрос\n", " # pageToken=response.get('nextPageToken')\n", " )\n", "result1 = request.execute()\n", @@ -289,15 +299,19 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "5yDtqgsjAajV" + }, "source": [ "### EN transcripts all" ] }, { "cell_type": "code", - "execution_count": 29, - "metadata": {}, + "execution_count": null, + "metadata": { + "id": "NfYgich3AajV" + }, "outputs": [], "source": [ "\n", @@ -345,22 +359,24 @@ }, { "cell_type": "code", - "execution_count": 32, - "metadata": {}, + "execution_count": null, + "metadata": { + "id": "w_6MYrKTAajV" + }, "outputs": [], "source": [ "# Sources:\n", - "playlist_ml_en = \"https://www.youtube.com/watch?v=Gv9_4yMHFhI&list=PLblh5JKOoLUICTaGLRoHQDuF_7q2GfuJF\" \n", - "playlist_logistic_en = \"https://www.youtube.com/watch?v=yIYKR4sgzI8&list=PLblh5JKOoLUKxzEP5HA2d-Li7IJkHfXSe\" \n", - "playlist_nn_en = \"https://www.youtube.com/watch?v=zxagGtF9MeU&list=PLblh5JKOoLUIxGDQs4LFFD--41Vzf-ME1\" \n", - "playlist_stat_en = \"https://www.youtube.com/watch?v=qBigTkBLU6g&list=PLblh5JKOoLUK0FLuzwntyYI10UQFUhsY9\" \n", - "playlist_nn2_en = \"https://www.youtube.com/playlist?list=PLZHQObOWTQDNU6R1_67000Dx_ZCJB-3pi\" \n", - "playlist_linal2_en = \"https://www.youtube.com/playlist?list=PLZHQObOWTQDPD3MizzM2xVFitgF8hE_ab\" \n", + "playlist_ml_en = \"https://www.youtube.com/watch?v=Gv9_4yMHFhI&list=PLblh5JKOoLUICTaGLRoHQDuF_7q2GfuJF\"\n", + "playlist_logistic_en = \"https://www.youtube.com/watch?v=yIYKR4sgzI8&list=PLblh5JKOoLUKxzEP5HA2d-Li7IJkHfXSe\"\n", + "playlist_nn_en = \"https://www.youtube.com/watch?v=zxagGtF9MeU&list=PLblh5JKOoLUIxGDQs4LFFD--41Vzf-ME1\"\n", + "playlist_stat_en = \"https://www.youtube.com/watch?v=qBigTkBLU6g&list=PLblh5JKOoLUK0FLuzwntyYI10UQFUhsY9\"\n", + "playlist_nn2_en = \"https://www.youtube.com/playlist?list=PLZHQObOWTQDNU6R1_67000Dx_ZCJB-3pi\"\n", + "playlist_linal2_en = 
\"https://www.youtube.com/playlist?list=PLZHQObOWTQDPD3MizzM2xVFitgF8hE_ab\"\n" ] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": null, "metadata": { "id": "QYmaFtk_5O6C" }, @@ -372,8 +388,36 @@ }, { "cell_type": "code", - "execution_count": 33, - "metadata": {}, + "source": [ + "len(transcripts_ML_en)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Sko_ef_2BUP6", + "outputId": "b47cfbf2-6196-466d-fc28-b6db8ef8a959" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "99" + ] + }, + "metadata": {}, + "execution_count": 14 + } + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Cd5jJYKdAajV" + }, "outputs": [], "source": [ "# tier 2\n", @@ -384,8 +428,10 @@ }, { "cell_type": "code", - "execution_count": 34, - "metadata": {}, + "execution_count": null, + "metadata": { + "id": "1m-VG8K9AajV" + }, "outputs": [], "source": [ "# tier 3\n", @@ -395,15 +441,19 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "wpCh1vb2AajW" + }, "source": [ "### RU transcripts all" ] }, { "cell_type": "code", - "execution_count": 36, - "metadata": {}, + "execution_count": null, + "metadata": { + "id": "b6WypAXMAajW" + }, "outputs": [], "source": [ "\n", @@ -450,12 +500,14 @@ }, { "cell_type": "code", - "execution_count": 40, - "metadata": {}, + "execution_count": null, + "metadata": { + "id": "5990TRMnAajW" + }, "outputs": [], "source": [ - "# Sources Elbrus \n", - "playlist_phase_1_url = \"https://www.youtube.com/playlist?list=PLYSHtNPbAINnbqXjIbN-c7DorjCT6eYOQ\" \n", + "# Sources Elbrus\n", + "playlist_phase_1_url = \"https://www.youtube.com/playlist?list=PLYSHtNPbAINnbqXjIbN-c7DorjCT6eYOQ\"\n", "playlist_phase_2_url = 'https://www.youtube.com/playlist?list=PLYSHtNPbAINnNvDXtGNmC7-F1QRH7qTgb'\n", "playlist_phase_3_url = 'https://www.youtube.com/playlist?list=PLYSHtNPbAINlmyNNmTaqcn3BsaY8v1xgV'\n", 
"\n", @@ -468,7 +520,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": null, "metadata": { "id": "SojIu0NP5VNt" }, @@ -482,8 +534,10 @@ }, { "cell_type": "code", - "execution_count": 41, - "metadata": {}, + "execution_count": null, + "metadata": { + "id": "VEDGiNEHAajW" + }, "outputs": [], "source": [ "# other Ru\n", @@ -497,15 +551,19 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "KPB9GcSfAajW" + }, "source": [ "### Aggregate all Knowledge Base" ] }, { "cell_type": "code", - "execution_count": 42, - "metadata": {}, + "execution_count": null, + "metadata": { + "id": "fp1Jb8BiAajW" + }, "outputs": [], "source": [ "transcripts_all = [transcripts_phase_1, transcripts_phase_2, transcripts_phase_3, transcripts_NN_ru, transcripts_OOP_ru, transcripts_linal_ru, transcripts_docker_ru, \\\n", @@ -523,11 +581,24 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": null, "metadata": { - "id": "gbbajjDK5niN" + "id": "gbbajjDK5niN", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "40059afc-575f-4d7a-d8c8-ff1cfafbc69e" }, - "outputs": [], + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/sentence_transformers/cross_encoder/CrossEncoder.py:11: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. 
in jupyter console)\n", + " from tqdm.autonotebook import tqdm, trange\n" + ] + } + ], "source": [ "from langchain_core.documents import Document\n", "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", @@ -543,10 +614,44 @@ " docs.append(Document(page_content=transcript, metadata={\"title\": title}))\n", "\n", "# Split documents into chunks\n", - "text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)\n", - "split_docs = text_splitter.split_documents(docs)\n", - "\n", - "# Setup the new embeddings model\n", + "text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)\n", + "split_docs = text_splitter.split_documents(docs)\n" + ] + }, + { + "cell_type": "code", + "source": [ + "split_docs[0].page_content" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 36 + }, + "id": "y04EIegH0EbH", + "outputId": "4e1d67f6-f496-429c-e62b-fa260356e0f0" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'1. 
Probability'" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + } + }, + "metadata": {}, + "execution_count": 32 + } + ] + }, + { + "cell_type": "code", + "source": [ + "#Setup the new embeddings model\n", "model_name = \"intfloat/multilingual-e5-base\"\n", "embeddings = HuggingFaceEmbeddings(model_name=model_name)\n", "\n", @@ -559,6 +664,309 @@ "\n", "# Create the retriever for document retrieval\n", "embedding_retriever = vector_store.as_retriever(search_kwargs={\"k\": 15})" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 503, + "referenced_widgets": [ + "c5841a4ce6bb4080bfd12fe45f9ef0ac", + "b02022ee58da420e8eb3cfd9eb3f43d5", + "3461edf56f2b474898430f891d99674c", + "57e03ace47344b7d8768230e87f9a59b", + "89f056a38392430eb597e1bae5be4659", + "bc25be9987a14271a755ad0df670ca81", + "959555740acc4a6497f85eaa745e3517", + "84444adba9c64c49b18f6416775cb4c0", + "37c442a4df204027b2f34bbd3e88ce92", + "0f15db45f67d4136962279c7709d70fe", + "9fc9fa9e63014a429b8718ac7452d7db", + "48e65d69916a481e99144e0dd20f7252", + "bd7bcc8db719412295587b2e0698119f", + "0abbd878f96048c89525d33be6f6c2e8", + "28b999acf27f4b818839120a705b6126", + "605434ea75fc451795172f89008f8d95", + "fdabcf226eb84b77986df5d69020eb15", + "3935905da9af4dc099969908d6eedc2b", + "3c570f314c9948aeab372e6f470566ee", + "f365eb8705da44fea831803ef9c6cde0", + "bfe275eb45824f859c573e66b916fbe6", + "32b975e4c50c4e1396a70d4f81f59428", + "2d6b6e723b394db1bda1628a67f29cd0", + "5eb1e004b462473c91c5004119120481", + "1f8e2782862e45c9979c10716ab5317a", + "ad0a89dd4a6e4b16ba5a44717c459d82", + "3a23ba98917a485a8b5a13564a309e83", + "368b070adba9447ea065fe8c81f7ed91", + "1b294c12e7d9493f9d13b771fb8a1302", + "3119a5a291d84cbebaae86af46809302", + "bc2e3f4cba83499ca60b11363e00b015", + "da89f2ca38fb41b9b1728af45f1c6bd6", + "f2ae90fd201441069307d0db9fcafcb5", + "ddd607992f5a42598b5029a7d1c0735f", + "faedbf86ad894465803b6b5c599d044c", + 
"98c929fd22574c1fbc2b5d6b02d709e0", + "d0c12c42aa8848d78a30953fe4c06043", + "365e8d5a9fc041e69f591edee6341551", + "dba5975516064d2683be2deb6686a0fa", + "61692a8a2ce74fe598009b7e892bde1c", + "ea1626bcb92c4362a7b8e21338d64ccc", + "29f97006c32a4159bca0b40d559be2cf", + "d97633070f21444f81f8c4e80d6b0efa", + "ea8ea4ed7e1947af9ace049fffb0e5a1", + "98fe33de61954aa2a0d6417c9ff12ee3", + "273f53b7092c40d6a0d6a5b4d48da496", + "4e6143a3e0d74384956df5317939852b", + "77655507da874611be4e54a6953b7b39", + "8d7df79192974dd3b176c133f7931543", + "6331c3d717d34504ad1ffa88e94bdcd1", + "552b4de06bbd43dcaef196a9ab9842d3", + "4724d2b7db7e4c6592bfb32d5ae525fe", + "38c9a29185b642489f8ffb257b72986d", + "d77c0bf187a34cf3a66f8ab7f16c18e4", + "06aadc98340f44a3b9170c53529bb621", + "101f51b1dc994c58aea7e505678d5b60", + "55f8475d0c6f403a99a75c8a689949e9", + "b057272180204b0d9b2e8871a282695b", + "fa6ddf942f4f489980e9daa0aba85896", + "901ea41d6bef4187a2f868e64d828de9", + "ef9449e7edcc4e96b0d6fcc32cc0d9cd", + "8607bec554ba4150bfa3b73a7a7f9342", + "b3bf5dba20cd442d936883006bf7014c", + "e8e6d660042443daac145e62fc2b13a1", + "b9009138ad534f049aab8123d089db99", + "3f815eaccb1445bc87253fff0b8566fe", + "26007c6d5f7b4e26995158272fd85376", + "27b3d43982a64fe884cfd247da8a8d80", + "02d333df54f446b49b4288e1e53026f2", + "2a4f51c02020478d9ff6feef621f6521", + "fa00438099664a2db8058ad1594b39cf", + "c0e25fdc6ef04f8e9f2199c9bb2a535c", + "f01bc5455cc8494791c16a2f4223bae6", + "e9241a31ae2545e1ae5912310d955786", + "66c807881a7a4b21b026849abc385fa0", + "1e87056de38948f8a74c58163c747fde", + "069d31eeb61f48f0bd254b4fc4d0245e", + "eb22d97e7faa4129a75081c582ba6738", + "3704511fadb648988d791f24b5f053f4", + "d3e1937157734a74bc621058bdcd5807", + "9bd8fc8ff5244929825e707161243edd", + "c222d9d1e5514242a507a18a576dea34", + "829597ddccab4023b1e98517578d3f1a", + "48c828bf4d9e4ebcbdc58a1603a3ef50", + "0bee9a8e7f49497da1bdbf50e94f3783", + "67e3b43194f84b6bafa7b1f5af34c573", + "337187f84dda4191974849353af3286c", + 
"827842b86add4f5ebaab427098e016d2", + "b9436002c7934f0f96a0da2f53fc3bc9", + "71ea6d7c1a5c462bbc2c1b5e45e6980c", + "60687dbde2104e29a5a42475d41802e5", + "ba6d06200925454a912ae53d0916e70e", + "46ee806e48eb48068f65c51e706d0a21", + "4db6960655c84e3fbe6c36c1d9b001c7", + "12b392354f60499eaf57ce129f68d51e", + "3c3f34c148a94183826e4623fc0b64de", + "ff11e37f068040699e0bf563e7889e5b", + "6f467238d2344fd9b795158e839e221c", + "171ef26577734650872d856f010f3a7e", + "0abc174237c14c58881d5e47f4184d48", + "b521bc45dc5440e9938269318dff6142", + "fd522056a8604554ac34c46633ed6c19", + "583e8464244d4ca8b1d471b1870f2f92", + "238f168ef1ec401b8ba52678b261a1a9", + "daff4f1f67e84c01a4e7c5f539b3d97b", + "4c36ba21d43c465f85f545a5adc3575e", + "9a11281b56c3417eb0c98578db6f2c2b", + "0178d61509c340d2b96aa2ceb8adc4f6", + "fc0defcca1a14e5aa4a870c92c8d177a", + "7d0210ad5f954d269f16be72039328e6" + ] + }, + "id": "nOtDOvDBC8St", + "outputId": "96a533ef-e7f8-440d-8dcc-5dcb4a515825" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + ":3: LangChainDeprecationWarning: The class `HuggingFaceEmbeddings` was deprecated in LangChain 0.2.2 and will be removed in 1.0. An updated version of the class exists in the langchain-huggingface package and should be used instead. 
To use it run `pip install -U langchain-huggingface` and import as `from langchain_huggingface import HuggingFaceEmbeddings`.\n", + " embeddings = HuggingFaceEmbeddings(model_name=model_name)\n", + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89: UserWarning: \n", + "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", + "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", + "You will be able to reuse this secret in all of your notebooks.\n", + "Please note that authentication is recommended but still optional to access public models or datasets.\n", + " warnings.warn(\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "modules.json: 0%| | 0.00/387 [00:00