{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "0bb77379", "metadata": { "execution": { "iopub.execute_input": "2023-04-17T05:48:17.537446Z", "iopub.status.busy": "2023-04-17T05:48:17.536424Z", "iopub.status.idle": "2023-04-17T05:48:31.651358Z", "shell.execute_reply": "2023-04-17T05:48:31.649651Z" }, "papermill": { "duration": 14.123673, "end_time": "2023-04-17T05:48:31.654609", "exception": false, "start_time": "2023-04-17T05:48:17.530936", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting faiss-cpu\r\n", " Downloading faiss_cpu-1.7.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.0 MB)\r\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m17.0/17.0 MB\u001b[0m \u001b[31m42.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\r\n", "\u001b[?25hInstalling collected packages: faiss-cpu\r\n", "Successfully installed faiss-cpu-1.7.3\r\n", "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\r\n", "\u001b[0m" ] } ], "source": [ "# Pinned to the faiss-cpu release this notebook was last executed with (see the recorded output),\n", "# and installed via %pip so the package lands in the running kernel's environment.\n", "%pip install faiss-cpu==1.7.3" ] }, { "cell_type": "code", "execution_count": 2, "id": "3c9019a6", "metadata": { "execution": { "iopub.execute_input": "2023-04-17T05:48:31.663539Z", "iopub.status.busy": "2023-04-17T05:48:31.662590Z", "iopub.status.idle": "2023-04-17T05:48:32.984952Z", "shell.execute_reply": "2023-04-17T05:48:32.983413Z" }, "papermill": { "duration": 1.330221, "end_time": "2023-04-17T05:48:32.988111", "exception": false, "start_time": "2023-04-17T05:48:31.657890", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "# Standard library\n", "import json\n", "import os\n", "\n", "# Third-party (relative order kept as in the original cell)\n", "import datasets\n", "from datasets import Dataset\n", "import numpy as np\n", "from tqdm.notebook import tqdm\n", "import faiss\n", "\n", "# Kaggle-specific paths: inputs are read from /kaggle/input, the output goes to /kaggle/working.\n", "ARTICLES_PATH = '/kaggle/input/ysda-ml-02-05-process-json/articles.hf'\n", "EMBEDDINGS_PATH = '/kaggle/input/ysda-ml-02-05-generate-embeddings/embeddings.npy'\n", "OUTPUT_PATH = '/kaggle/working/articles.index'" ] }, { "cell_type": "code", "execution_count": 3, "id": "0271d205", "metadata": { "execution": { "iopub.execute_input": "2023-04-17T05:48:32.997788Z", "iopub.status.busy": "2023-04-17T05:48:32.996997Z", "iopub.status.idle": "2023-04-17T05:48:47.446365Z", "shell.execute_reply": "2023-04-17T05:48:47.445122Z" }, "papermill": { "duration": 14.457709, "end_time": "2023-04-17T05:48:47.449394", "exception": false, "start_time": "2023-04-17T05:48:32.991685", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "# Load the saved article dataset and the embedding array from the paths configured above.\n", "# NOTE(review): presumably embeddings[i] is the vector for articles[i] -- confirm against the notebook that produced them.\n", "articles = Dataset.load_from_disk(ARTICLES_PATH)\n", "embeddings = np.load(EMBEDDINGS_PATH)" ] }, { "cell_type": "code", "execution_count": 4, "id": "7467da22", "metadata": { "execution": { "iopub.execute_input": "2023-04-17T05:48:47.458217Z", "iopub.status.busy": "2023-04-17T05:48:47.457678Z", "iopub.status.idle": "2023-04-17T05:49:05.400026Z", "shell.execute_reply": 
"2023-04-17T05:49:05.398641Z" }, "papermill": { "duration": 17.95023, "end_time": "2023-04-17T05:49:05.402995", "exception": false, "start_time": "2023-04-17T05:48:47.452765", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "df6dfb9da874419d9783f7f156cd9963", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/2239 [00:00